In [1]:
import pandas as pd 
import numpy as np
from pandas_datareader import data as pdr 
import yfinance as yf 
import matplotlib 
import matplotlib.pyplot as plt
import datetime 

In [2]:
yf.pdr_override()

# reading stock data into a DataFrame
start_date = datetime.datetime(2009, 1, 1)
end_date = datetime.datetime(2019, 10, 5)
data = pdr.get_data_yahoo('AMZN', start = start_date, end = end_date) 

[*********************100%***********************]  1 of 1 downloaded


In [3]:
# Redefining Adj Close 
data['Adj Close'] = (data.Open + data.High + data.Close) / 3
# calculating Daily returns. 
data['Returns'] = np.log(data['Close'] / data['Close'].shift(1))  
data.head(10) 

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-12-31,50.74,51.69,49.91,51.28,51.236667,7792200,
2009-01-02,51.35,54.53,51.07,54.36,53.413333,7296400,0.058328
2009-01-05,55.73,55.74,53.03,54.06,55.176667,9509800,-0.005534
2009-01-06,54.55,58.22,53.75,57.36,56.71,11080100,0.059253
2009-01-07,56.29,56.95,55.35,56.2,56.48,7942700,-0.02043
2009-01-08,54.99,57.32,54.58,57.16,56.49,6577900,0.016938
2009-01-09,56.92,57.0,54.7,55.51,56.476667,6684100,-0.029291
2009-01-12,54.12,54.3,50.87,51.92,53.446667,9556300,-0.066859
2009-01-13,50.96,53.29,50.75,51.45,51.9,7883200,-0.009094
2009-01-14,50.1,50.1,48.14,48.49,49.563333,10442600,-0.059253


In [4]:
def classifier(returns) : 
    if(returns > 0) : 
        return 1 
    else : 
        return -1

In [5]:
data['Returns'] = np.log(data.Close / data.Close.shift(1)) 

In [6]:
data.fillna(0, inplace = True) 

Checked for Infinte and Negative infinite value, found none. 

In [7]:
data['Target'] = data.Returns.apply(classifier) 

We want to build a model based on past behaviour and test our predictions on more recent data. 
* Training - 85% | Test - 15%

In [8]:
test_data = data[-int((len(data) * 0.15)):] # recent 15% 
train_data = data[:int(len(data) * 0.85)] # earlier 85% 

In [9]:
# X, y for Train and Test 
X_train = train_data.drop('Target', axis = 1) 
y_train = train_data.Target 
X_test = test_data.drop('Target', axis = 1) 
y_test = test_data.Target 

In [10]:
# Base Model: Good Old LogReg 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
base_model = LogisticRegression(penalty = 'l1', solver = 'liblinear') 
base_model.fit(X_train, y_train) 
predictions = base_model.predict(X_test) 
print('Base Model Accuracy = ', round(accuracy_score(y_test, predictions) * 100, 3))  

Base Model Accuracy =  97.291


In [11]:
def model_report(clf, X_train, y_train, X_test, y_test, train = True) : 
    from sklearn.linear_model import LogisticRegression 
    from sklearn.model_selection import cross_val_score, cross_val_predict 
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 
    
    if train: 
        print('Training Report') 
        print('Accuracy = {0:.4f} \n'.format(accuracy_score(y_train, clf.predict(X_train)))) 
        print('Classification Report: \n {} \n'.format(classification_report(y_train, clf.predict(X_train)))) 
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_train, clf.predict(X_train)))) 
        result = cross_val_score(clf, X_train, y_train, cv = 10, scoring = 'accuracy') 
        print('Cross Validataion: ') 
        print('Average Accuracy = {0:.4f}'.format(np.mean(result))) 
        print('Std_Dev Accuracy = {0:.4f}'.format(np.std(result))) 
    elif (train == False): 
        print('Test Report') 
        print('Accuracy = {0:.4f}'.format(accuracy_score(y_test, clf.predict(X_test)))) 
        print('Classification Report \n {} \n'.format(classification_report(y_test, clf.predict(X_test))))
        print('Confusion Matrix: \n {} \n'.format(confusion_matrix(y_test, clf.predict(X_test)))) 

In [12]:
model_report(base_model, X_train, y_train, X_test, y_test)   

Training Report
Accuracy = 0.9961 

Classification Report: 
               precision    recall  f1-score   support

          -1       1.00      1.00      1.00      1097
           1       1.00      1.00      1.00      1205

    accuracy                           1.00      2302
   macro avg       1.00      1.00      1.00      2302
weighted avg       1.00      1.00      1.00      2302
 

Confusion Matrix: 
 [[1093    4]
 [   5 1200]] 

Cross Validataion: 
Average Accuracy = 0.9900
Std_Dev Accuracy = 0.0204




In [13]:
model_report(base_model, X_train, y_train, X_test, y_test, False)  

Test Report
Accuracy = 0.9729
Classification Report 
               precision    recall  f1-score   support

          -1       1.00      0.94      0.97       183
           1       0.95      1.00      0.98       223

    accuracy                           0.97       406
   macro avg       0.98      0.97      0.97       406
weighted avg       0.97      0.97      0.97       406
 

Confusion Matrix: 
 [[172  11]
 [  0 223]] 



In [14]:
import pickle 
# saving model 
filename = 'base_model.sav' 
pickle.dump(base_model, open(filename, 'wb')) 
# saving data 
data.to_csv('data.csv') 