In [22]:
#Regression Tutorial

#Regression -> Take continuous data and figure out the best function for the data.
#simple Regression -> Linear Equation (y=mx+b)
#Regression -> find y & x. E.g. used on stock market prediction

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import quandl #function library yang gunanya kayak buat kaggle

In [23]:
df = quandl.get('WIKI/GOOGL')
print(df.head())

#What we want are features that have some things to do with our goal
#So many data -> but we do not need all of the data
#P.S. -> Adj is when stock is split into two 1000 to two 500.

              Open    High     Low    Close      Volume  Ex-Dividend  \
Date                                                                   
2004-08-19  100.01  104.06   95.96  100.335  44659000.0          0.0   
2004-08-20  101.01  109.08  100.50  108.310  22834300.0          0.0   
2004-08-23  110.76  113.48  109.05  109.400  18256100.0          0.0   
2004-08-24  111.24  111.60  103.57  104.870  15247300.0          0.0   
2004-08-25  104.76  108.00  103.88  106.000   9188600.0          0.0   

            Split Ratio  Adj. Open  Adj. High   Adj. Low  Adj. Close  \
Date                                                                   
2004-08-19          1.0  50.159839  52.191109  48.128568   50.322842   
2004-08-20          1.0  50.661387  54.708881  50.405597   54.322689   
2004-08-23          1.0  55.551482  56.915693  54.693835   54.869377   
2004-08-24          1.0  55.792225  55.972783  51.945350   52.597363   
2004-08-25          1.0  52.542193  54.167209  52.100830   53.1

In [24]:
##Grab features

df_data = df[['Adj. Open','Adj. High','Adj. Low','Adj. Close','Adj. Volume']]
#Margin between high and low tells volatility of the market, Margin between Open&close shows how much stock price increase/decrease
#Volume shows how many trade that day
#Shows that each collumn has relation with each other

#HL_PCT -> High low Percentage

df_data['HL_PCT'] = (df_data['Adj. High']-df_data['Adj. Close']) / df_data['Adj. Close']*100
df_data['PCT_change'] = (df_data['Adj. Close']-df_data['Adj. Open']) / df_data['Adj. Open']*100

df_Regressed = df_data[['Adj. Close','HL_PCT','PCT_change','Adj. Volume']]

print(df_Regressed.head()) #Take data that we need only

            Adj. Close    HL_PCT  PCT_change  Adj. Volume
Date                                                     
2004-08-19   50.322842  3.712563    0.324968   44659000.0
2004-08-20   54.322689  0.710922    7.227007   22834300.0
2004-08-23   54.869377  3.729433   -1.227880   18256100.0
2004-08-24   52.597363  6.417469   -5.726357   15247300.0
2004-08-25   53.164113  1.886792    1.183658    9188600.0


In [27]:
import math

##Labels. Simplenya feature itu input, label itu outputnya

df_Regressed['Forecast_date'] = df_Regressed.index #since Date is index, we need to copy index into a collumn
forecast_col = 'Adj. Close'
df_Regressed.fillna(-9999, inplace=True) #fill NaN value with -99,999, and inplace True. This is used because machine learning cannot process NaN value.

forecast_out = int(math.ceil(0.1*len(df))) #round length of df*x (x in this case is 0.1. This is used to predict x percent from the dataframe. 

df_Regressed['label'] = df_Regressed[forecast_col].shift(-forecast_out) #so by using this, the data will be shifted to up based on the x value in forecast_out
df_Regressed['Forecast_date'].update(df_Regressed['Forecast_date'].shift(-forecast_out)) #so by using this, the data will be shifted to up based on the x value in forecast_out


print (df_Regressed.head()) #note the forecast date is different than the date

            Adj. Close    HL_PCT  PCT_change  Adj. Volume Forecast_date  \
Date                                                                      
2004-08-19   50.322842  3.712563    0.324968   44659000.0    2004-08-25   
2004-08-20   54.322689  0.710922    7.227007   22834300.0    2004-08-26   
2004-08-23   54.869377  3.729433   -1.227880   18256100.0    2004-08-27   
2004-08-24   52.597363  6.417469   -5.726357   15247300.0    2004-08-30   
2004-08-25   53.164113  1.886792    1.183658    9188600.0    2004-08-31   

                label  
Date                   
2004-08-19  53.164113  
2004-08-20  54.122070  
2004-08-23  53.239345  
2004-08-24  51.162935  
2004-08-25  51.343492  


In [28]:
#Define label and pass through classifier

df = df_Regressed

import numpy as np #will be able use array in py
from sklearn import preprocessing #Will use Scaling data -> So feature value will be between -1 & 1 so it can help will accuracy and processing speed.
from sklearn.model_selection import train_test_split
 #Shuffle data so that we dont have biased data and separate data
from sklearn import svm #to do regression and how simple it is on changing the algorithm we are using
from sklearn.linear_model import LinearRegression

#usually features will be x, labels will be y
X = np.array(df.drop(['label'],1))
y = np.array(df['Forecast_date','label'])

X = preprocessing.scale(X) #Before you can put data through classifier, data needs to be scaled. Scaling is like normalized with other data points. TO get an optimal scaled data, data needs also to be scaled with other value, such as including it with our training data/other values.
#disadvantage->adds processing time. Such as in high freq trading, this step is skipped

X=X[:-forecast_out+1] #This is done to make sure that data in table is only data that has value after shifted
df.dropna(inplace=True) #making sure no more null data aftar fillna

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#Cross validation will split X&Y into two different data. One used for training, other for testing
#Training 80% of the data. The data is shuffled when training (ofc connected X and Y) and testing to get unbiased data

#classifier 
clf = LinearRegression()
clf.fit(X_train, y_train) #fit->train
cld.score(X_test,y_test) #testing buat tau akurasinya. Kenapa dipisah train,test? Train kayak latihan, test kayak ulangan

ImportError: cannot import name 'cross_validation' from 'sklearn' (C:\Users\jovan\anaconda3\lib\site-packages\sklearn\__init__.py)