# Random Forest Classifier

In [1]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix,classification_report,f1_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
import seaborn as sns
import pandas as pd
import numpy as np

## Using Direction as the dependent variable
### Taking AAPL Daily stock data as an example

In [2]:
freq = 'Daily'
stock = 'AAPL'

price = pd.read_csv('../encode_price/'+freq+'/'+stock+'.csv')
price = price.loc[(price.Date<='2019-12-31')&(price.Date>='2010-01-04'),:]
price = price.loc[(price.Date>='2010-01-04'),:]
y = price.direction.shift(-1).values[:-1]
predictors = pd.read_csv('../predictors/Merged/'+freq+'/'+stock+'.csv')
NLP = pd.read_csv('../predictors/NLP/'+freq+'/NYT_macro_SA.csv')
predictors = pd.merge(predictors,NLP,how='left',on=['Date'])
predictors = predictors.loc[predictors.Date <= '2019-12-31',:]
predictors.set_index('Date',inplace=True)
predictors.fillna(0,inplace=True) 
X = predictors.values[:-1]

- Random Forest

In [3]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,shuffle=False)
sm = SMOTE(sampling_strategy='minority',random_state=42,k_neighbors=4)
X_res,y_res = sm.fit_resample(X_train,y_train)
pipe = make_pipeline(MinMaxScaler(),RandomForestClassifier(criterion='entropy',max_depth = 100,max_features = 'sqrt',class_weight='balanced_subsample'))
pipe.fit(X_res,y_res)
y_pred = pipe.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.46      0.72      0.56       341
         1.0       0.56      0.29      0.38       414

    accuracy                           0.48       755
   macro avg       0.51      0.51      0.47       755
weighted avg       0.51      0.48      0.46       755



Xgboost

In [4]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,shuffle=False)
pipe = make_pipeline(StandardScaler(),XGBClassifier(base_score=0.5, booster='gbtree',objective='binary:logistic', learning_rate=0.1, max_depth=5,n_estimators=100))
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.46      0.71      0.56       341
         1.0       0.57      0.32      0.41       414

    accuracy                           0.50       755
   macro avg       0.52      0.52      0.49       755
weighted avg       0.52      0.50      0.48       755



## Using Direction2 to be the dependent variable

In [15]:
freq = 'Daily'
stock = 'AAPL'

price = pd.read_csv('../encode_price/'+freq+'/'+stock+'.csv')
price = price.loc[(price.Date<='2019-12-31')&(price.Date>='2010-01-04'),:]
price = price.loc[(price.Date>='2010-01-04'),:]
y = price.direction2.shift(-1).values[:-1]
predictors = pd.read_csv('../predictors/Merged/'+freq+'/'+stock+'.csv')
NLP = pd.read_csv('../predictors/NLP/'+freq+'/NYT_macro_SA.csv')
predictors = pd.merge(predictors,NLP,how='left',on=['Date'])
predictors = predictors.loc[predictors.Date <= '2019-12-31',:]
predictors.set_index('Date',inplace=True)
predictors.fillna(0,inplace=True) 
X = predictors.values[:-1]

- Random Forest

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,shuffle=False)
sm = SMOTE(sampling_strategy='minority',random_state=42,k_neighbors=4)
X_res,y_res = sm.fit_resample(X_train,y_train)
pipe = make_pipeline(MinMaxScaler(),RandomForestClassifier(criterion='entropy',max_depth = 100,max_features = 'sqrt',class_weight='balanced_subsample'))
pipe.fit(X_res,y_res)
y_pred = pipe.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.40      0.21      0.27       164
         1.0       0.69      0.85      0.76       339

    accuracy                           0.64       503
   macro avg       0.54      0.53      0.52       503
weighted avg       0.59      0.64      0.60       503



- Xgboost

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,shuffle=False)
pipe = make_pipeline(MinMaxScaler(),XGBClassifier(base_score=0.5, booster='gbtree',objective='binary:logistic', learning_rate=0.1, max_depth=5,n_estimators=1000))
pipe.fit(X_train,y_train)
y_pred = pipe.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.31      0.21      0.25       232
         1.0       0.69      0.80      0.74       523

    accuracy                           0.62       755
   macro avg       0.50      0.50      0.50       755
weighted avg       0.58      0.62      0.59       755



From above results we can see Random Forest and Xgboost do not perform well both on direction 1 and direction 2. But we still want to try if feature selection can help improve the performance.

## Perform Feature Selection

In [12]:
tech_list = ['SMA','EMA','STOCH_k','STOCK_d','RSI','MFI','SAR','AD','MACD','MACD_Signal','MACD_Histo','VWAP','SPY','NDAQ','PC1','PC2']
funda_list = ['pcf','PEG_trailing','dpr','npm','gpm','roa','roe','capital_ratio','de_ratio','cash_ratio','curr_ratio','inv_turn','pay_turn','sale_nwc','rd_sale','accrual']
macro_list = ['gdpr1','gdpr2','cpi','bond20yr','bond30yr','fedfunds','cpir','wpir','unemp','employ']
nlp_list = ['Pos_lag2','Pos_lag3','Neg_lag1','Neg_lag2','Neg_lag3','Neu_lag1','Neu_lag2','Neu_lag3']
stock_list = ['AAPL','AMZN','BRK-B','GOOG','JNJ','META','MSFT','NVDA','TSLA','V']

In [13]:
stock_score = dict()
for j in stock_list:
    freq = 'Daily'
    stock = j
    price = pd.read_csv('../encode_price/'+freq+'/'+stock+'.csv')
    price = price.loc[(price.Date<='2019-12-31')&(price.Date>='2010-01-04'),:]
    price = price.loc[(price.Date>='2010-01-04'),:]
    y = price.direction2.shift(-1).values[:-1]
    predictors = pd.read_csv('../predictors/Merged/'+freq+'/'+stock+'.csv')
    NLP = pd.read_csv('../predictors/NLP/Daily/NYT_macro_SA.csv')
    predictors = pd.merge(predictors,NLP,how='left',on=['Date'])
    predictors = predictors.loc[predictors.Date <= '2019-12-31',:]
    predictors.set_index('Date',inplace=True)
    predictors.fillna(0,inplace=True) 
    X = predictors.values[:-1]
    cv = 3
    scores = []
    tscv = TimeSeriesSplit(n_splits=cv)
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        pipe = make_pipeline(MinMaxScaler(),RandomForestClassifier(criterion='log_loss'))
        sfs = SequentialFeatureSelector(pipe,n_jobs = -1,n_features_to_select=5,scoring='f1_weighted')
        sfs.fit(X_train,y_train)
        X_train = sfs.transform(X_train)
        pipe.fit(X_train,y_train)
        X_test =  sfs.transform(X_test)
        y_pred = pipe.predict(X_test)
        scores.append(f1_score(y_pred,y_test,average = 'weighted'))
    average_score = np.mean(scores)
    print(j,average_score)
    stock_score[j] = average_score

AAPL 0.5713243522279255
AMZN 0.5444843373331719
BRK-B 0.49905601039933295
GOOG 0.5022725157181038
JNJ 0.5610963985000538
META 0.5990741680534032
MSFT 0.5418901716291072
NVDA 0.49386992571793
TSLA 0.5659073753244585
V 0.5131946143020236


- XGboost

In [19]:
stock_score = dict()
for j in stock_list:
    freq = 'Daily'
    stock = j
    price = pd.read_csv('../encode_price/'+freq+'/'+stock+'.csv')
    price = price.loc[(price.Date<='2019-12-31')&(price.Date>='2010-01-04'),:]
    price = price.loc[(price.Date>='2010-01-04'),:]
    y = price.direction2.shift(-1).values[:-1]
    predictors = pd.read_csv('../predictors/Merged/'+freq+'/'+stock+'.csv')
    NLP = pd.read_csv('../predictors/NLP/Daily/NYT_macro_SA.csv')
    predictors = pd.merge(predictors,NLP,how='left',on=['Date'])
    predictors = predictors.loc[predictors.Date <= '2019-12-31',:]
    predictors.set_index('Date',inplace=True)
    predictors.fillna(0,inplace=True) 
    X = predictors.values[:-1]
    cv = 3
    scores = []
    tscv = TimeSeriesSplit(n_splits=cv)
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        pipe = make_pipeline(MinMaxScaler(),XGBClassifier(base_score=0.5, booster='gbtree',objective='binary:logistic', learning_rate=0.1, max_depth=5,n_estimators=100))
        sfs = SequentialFeatureSelector(pipe,n_jobs = -1,n_features_to_select=5,scoring='f1_weighted')
        sfs.fit(X_train,y_train)
        X_train = sfs.transform(X_train)
        pipe.fit(X_train,y_train)
        X_test =  sfs.transform(X_test)
        y_pred = pipe.predict(X_test)
        scores.append(f1_score(y_pred,y_test,average = 'weighted'))
    average_score = np.mean(scores)
    print(j,average_score)
    stock_score[j] = average_score

AAPL 0.5824460888713481
AMZN 0.5339341885693225
BRK-B 0.5884912030987596
GOOG 0.5714190219889693
JNJ 0.5393843060877742
META 0.5191499644725684
MSFT 0.5683555826897859
NVDA 0.5039365334042633
TSLA 0.5536460028392852
V 0.5169206615334808


Apparrently, feature selection brings limited improvements on the performance of Random Forest and Xgboost. For classification problem we may try other models. 