In [1]:
import pandas_datareader.data as pdd
import datetime as dt
import numpy as np
import pandas as pd
from math import sqrt
from matplotlib import pyplot as plt
import copy
from ta import *

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1,1))

import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_recall_curve, auc, average_precision_score

import warnings
warnings.filterwarnings('ignore')

## Load the data

In [2]:
start=dt.datetime(2010,1,1)
end=dt.datetime(2019,11,27)
lst=['AAPL','AMZN','MSFT','JPM','GOOGL','BA','LMT','WMT','C','IBM','MCO','UAL','BBY','BLK','NVDA','MCK','MRK','XOM','ORCL','NKE']
dfdc={}
for i in lst:
    df = pdd.DataReader(i, "quandl",start,end,api_key='xxxxxxxxx')
    df = df.reindex(index=df.index[::-1])
    df.drop(df.loc[:, 'Open':'SplitRatio'].columns, inplace=True, axis = 1)
    dfdc[i] = df

## XGBoost

### creat indicators and label the target as "1" and "0"

In [3]:
######## creat a copy file to train the model
stockDict = copy.deepcopy(dfdc) # copy of dfdc


In [4]:
######## creat indicators
for stock in stockDict:
    # Relative Strength Index (RSI)
    stockDict[stock]['RSI'] = momentum.rsi(stockDict[stock]['AdjClose'])
    # Average Directional Movement Index (ADX)
    stockDict[stock]['ADX'] = trend.adx(stockDict[stock]['AdjHigh'], stockDict[stock]['AdjLow'], stockDict[stock]['AdjClose'])
    # Parabolic Stop and Reverse (Parabolic SAR)
    stockDict[stock]['PSAR_UP'] = trend.psar_up(stockDict[stock]['AdjHigh'], stockDict[stock]['AdjLow'], stockDict[stock]['AdjClose'])
    stockDict[stock]['PSAR_DOWN'] = trend.psar_down(stockDict[stock]['AdjHigh'], stockDict[stock]['AdjLow'], stockDict[stock]['AdjClose'])
    # Exponential Moving Average (5days)
    stockDict[stock]['EMA5'] = trend.ema_indicator(stockDict[stock]['AdjClose'],n=5)
    # difference between EMA (5 days) and close price
    stockDict[stock]['EMA5_AdjClose'] = stockDict[stock]['EMA5'] - stockDict[stock]['AdjClose']
    #average true range
    stockDict[stock]['ATR'] = volatility.average_true_range(stockDict[stock]['AdjHigh'], stockDict[stock]['AdjLow'], stockDict[stock]['AdjClose'], n=14)
    #Moving Average Convergence Divergence
    stockDict[stock]['MACD'] = trend.macd(stockDict[stock]['AdjClose'], n_fast=12,n_slow=26)
    #Stochastic Oscillator
    stockDict[stock]['SR'] = momentum.stoch(stockDict[stock]['AdjHigh'],stockDict[stock]['AdjLow'], stockDict[stock]['AdjClose'], n=14)

In [5]:
###### lag indicators (5days)
lags = 5
keys = stockDict['AAPL'].keys()[5:14]
for stock in stockDict:
    keys_new = []
    for key in keys:
        for lag in range(1,lags+1):
            key_new = key + '_' + str(lag)
            stockDict[stock][key_new] = stockDict[stock][key].shift(lag)
            keys_new.append(key_new)

In [6]:
# ###### uncomment this line to see the features used to predict the target
# keys_new

In [7]:
###### target label: up_down 
for stock in stockDict:
    stockDict[stock]['Return'] = np.log(stockDict[stock]['AdjClose']/stockDict[stock]['AdjClose'].shift(1))
    stockDict[stock]['Up_Down'] = np.where(stockDict[stock]['Return'] > 0, 1, 0)  # 1 if up, 0 otherwise

### train_test split function

In [8]:
def split_train_test(df, split_ratio):
    
    trainSize = int(split_ratio*df.shape[0])
    train_set = df[:trainSize]
    test_set = df[trainSize:]
    
    X_train = train_set[keys_new]
    X_train = scaler.fit_transform(X_train) #normalize
    
    X_test = test_set[keys_new]
    X_test = scaler.transform(X_test)  #normalize
    
    y_train = train_set['Up_Down']
    y_test = test_set['Up_Down']
    
#     print(X_train.shape,y_train.shape)
#     print(X_test.shape, y_test.shape)
    
    return X_train, X_test, y_train, y_test

### RandomSearch Function to tune the parameters

In [9]:
params = {
    'learning_rate' :[0.15,0.20,0.25,0.30, 0.35],
    'n_estimators':[100,200,500,1000],
    'max_depth':[3,4,5,6,7,8],
    'min_child_weight':[1,3,5,7],
    'gamma':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
    'colsample_bytree':[0.3, 0.4, 0.5, 0.6, 0.7]
}


In [10]:
# random search of hyperparameters
def XGB_best (X_train,X_test,y_train,y_test, params):
    
    Classifier = xgb.XGBClassifier()
    random_search = RandomizedSearchCV(Classifier,param_distributions = params, \
                                       scoring = 'roc_auc', cv=5, random_state = 666)
    best_xgb = random_search.fit(X_train,y_train)
    y_pred = best_xgb.predict(X_test)
    best_estimator = random_search.best_estimator_ 
    
    return best_estimator, y_pred

### Fit the model on train dataset and make prediction on the test dataset

In [11]:
for stock in stockDict:
    # test_train split
    temp = split_train_test(stockDict[stock],0.8) #split_ratio = 0.8
    X_train = temp[0]
    X_test = temp[1]
    y_train = temp[2]
    y_test = temp[3]   
    # fit the model
    rand_xgb = XGB_best(X_train, X_test, y_train, y_test, params)
    y_pred = rand_xgb[1] 
    
    print(stock)
    print('=======================================')
#     print("best parameters:")
#     print(rand_xgb[0]) #[0] will return the best parameters
    print("accuracy score:")
    print(accuracy_score(y_test, y_pred))
    print("confusion matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("classification report:")
    print(classification_report(y_test, y_pred))
    print('=======================================')

AAPL
accuracy score:
0.47101449275362317
confusion matrix:
[[107  88]
 [131  88]]
classification report:
              precision    recall  f1-score   support

           0       0.45      0.55      0.49       195
           1       0.50      0.40      0.45       219

   micro avg       0.47      0.47      0.47       414
   macro avg       0.47      0.48      0.47       414
weighted avg       0.48      0.47      0.47       414

AMZN
accuracy score:
0.5120772946859904
confusion matrix:
[[ 86  93]
 [109 126]]
classification report:
              precision    recall  f1-score   support

           0       0.44      0.48      0.46       179
           1       0.58      0.54      0.56       235

   micro avg       0.51      0.51      0.51       414
   macro avg       0.51      0.51      0.51       414
weighted avg       0.52      0.51      0.51       414

MSFT
accuracy score:
0.5277108433734939
confusion matrix:
[[129  61]
 [135  90]]
classification report:
              precision    recall

XOM
accuracy score:
0.5084337349397591
confusion matrix:
[[117  84]
 [120  94]]
classification report:
              precision    recall  f1-score   support

           0       0.49      0.58      0.53       201
           1       0.53      0.44      0.48       214

   micro avg       0.51      0.51      0.51       415
   macro avg       0.51      0.51      0.51       415
weighted avg       0.51      0.51      0.51       415

ORCL
accuracy score:
0.46987951807228917
confusion matrix:
[[128  66]
 [154  67]]
classification report:
              precision    recall  f1-score   support

           0       0.45      0.66      0.54       194
           1       0.50      0.30      0.38       221

   micro avg       0.47      0.47      0.47       415
   macro avg       0.48      0.48      0.46       415
weighted avg       0.48      0.47      0.45       415

NKE
accuracy score:
0.4939759036144578
confusion matrix:
[[ 97 107]
 [103 108]]
classification report:
              precision    recall  

#### Codes below are for testing the whole training process before run the model fitting on all stocks

In [12]:
# ## Take 'Apple' as a sample 

# temp_ = split_train_test(stockDict['AAPL'],0.8)

# aapl_train = temp_[0] #x_train
# aapl_test = temp_[1] #x_test
# y_tr =temp_[2] #y_train
# y_ts = temp_[3] #y_test

# # timing the gridsearch on one stock
# import timeit 
# start = timeit.default_timer()
# best_aapl = XGB_best(aapl_train,aapl_test,y_tr,y_ts,params)
# stop = timeit.default_timer()
# print('Time: ', stop - start)

# y_pr = best_aapl[1]
# accuracy_score(y_pr,y_ts)

##### Tried GridSearch to tune the hypermeters, which was time consuming. 

In [13]:
## GridSearch
# from sklearn.model_selection import GridSearchCV

# def XGB_Grid (X_train,X_test,y_train,y_test, params):
    
#     Classifier = xgb.XGBClassifier()
#     grid_search = GridSearchCV(Classifier,param_grid = params, scoring = 'roc_auc', cv=5)
#     best_xgb_grid = grid_search.fit(X_train,y_train)
#     y_pred_grid = best_xgb_grid.predict(X_test)
#     best_estimator_grid = grid_search.best_estimator_ 
    
#     return best_estimator_grid, y_pred_grid

In [14]:
# for stock in stockDict:
#     # test_train split
#     temp = split_train_test(stockDict[stock],0.8) #split_ratio = 0.8
#     X_train = temp[0]
#     X_test = temp[1]
#     y_train = temp[2]
#     y_test = temp[3]   
#     # fit the model
#     rand_xgb = XGB_Grid(X_train, X_test, y_train, y_test, params)
#     y_pred = rand_xgb[1] #[0] will return the best parameters
    
#     print(stock)
#     print('=======================================')
#     print("best parameters:")
#     print(rand_xgb[0])
#     print("accuracy score:")
#     print(accuracy_score(y_test, y_pred))
#     print("confusion matrix:")
#     print(confusion_matrix(y_test, y_pred))
#     print("classification report:")
#     print(classification_report(y_test, y_pred))
#     print('=======================================')