In [1]:
# General libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Import Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

## Preliminary work on the dataset

In [2]:
yahoo_df = pd.read_csv("../data/IBM.csv")
yahoo_df = yahoo_df.set_index("Date")

X = yahoo_df[['ROC_4', 'ROC_5', 'ROC_6', '4 Day ROI', '6 Day ROI', '30 Day ROI',
       'RSI_3', 'RSI_4', 'CCI']]
y = yahoo_df["Up down"]

# Split in 80/20 the dataframeX
X_train_80, X_test, y_train_80, y_test = train_test_split(X, y, test_size = 0.20,
                                                          shuffle=False)

# Split in 75/25 the remaining 80 %
X_train, X_valid, y_train, y_valid = train_test_split(X_train_80, y_train_80,
                                                      test_size = 0.25, shuffle=False)


# Normalize

In [3]:
def shift_norm(df):
    #df = data
    for column in df:
        df[column]=((df[column]-df[column].mean())/df[column].std())
    return df

In [4]:
X_train_80 = shift_norm(X_train_80)
X_test = shift_norm(X_test)

In [5]:
X_train = shift_norm(X_train)
X_valid = shift_norm(X_valid)

# Random Forest

In [6]:
# Fit the model
rfc = RandomForestClassifier(random_state = 42)
rfc.fit(X_train,y_train) 

# Predict
y_pred_train = rfc.predict(X_train)
y_pred_valid = rfc.predict(X_valid)

# Compute Accuracy
train_acc = accuracy_score(y_train, y_pred_train)
valid_acc = accuracy_score(y_valid, y_pred_valid)
print("Train Accuracy: {:.5f} - Validation Accuracy: {:.5f}".format(train_acc, valid_acc))

Train Accuracy: 1.00000 - Validation Accuracy: 0.51169


### Tuning parameters (VERY SLOW!)

RFC = RandomForestClassifier()

parameters = {  'max_depth': [10, 50, 70, 100, None],
                'max_features': ['auto', 'sqrt'],
                'min_samples_leaf': [1, 2, 4],
                'min_samples_split': [2, 5, 10],
                'n_estimators': [100, 500, 1000, 2000]       
             }

best_RGB = GridSearchCV(cv=2,estimator=RFC, 
                   param_grid=parameters)

best_RGB.fit(X_train, y_train)

print(best_RGB.best_estimator_)

In [7]:
# Fit
rfc = RandomForestClassifier(max_depth=100, max_features='auto', min_samples_leaf=4,
                       min_samples_split=10)

rfc.fit(X_train_80,y_train_80)

  warn(


In [8]:
# Predict
y_pred_test = rfc.predict(X_test)
y_prob = rfc.predict_proba(X_test)[:,1]
# Compute accuracy
test_acc = accuracy_score(y_test, y_pred_test)
print ("Test Accuracy: {:.5f}".format(test_acc) )

Test Accuracy: 0.52523


# Save accuracies

In [11]:

y_pred_test = rfc.predict(X_test)
y_prob = rfc.predict_proba(X_test)[:,1]
# Compute accuracy
test_acc = accuracy_score(y_test, y_pred_test)
print ("Test Accuracy: {:.5f}".format(test_acc) )

Test Accuracy: 0.52523


In [12]:
df_prediction = pd.read_csv("../data/model_accuracy_IBM.csv", index_col = False)
df_prediction =  df_prediction.drop(columns=["Unnamed: 0"])



In [13]:
df_prediction["RF_pred"] = y_pred_test
df_prediction["RF_accuracy"] = [test_acc for x in range(len(y_pred_test))]
df_prediction["RF_prob"] = y_prob

df_prediction

Unnamed: 0,LogReg_pred,LogReg_accuracy,LogReg_prob,LGBM_pred,LGBM_accuracy,LGBM_prob,y,kNN_pred,kNN_accuracy,kNN_prob,ANN_pred,ANN_accuracy,ANN_prob,RF_pred,RF_accuracy,RF_prob
0,0,0.48785,0.499232,0,0.48972,0.482038,0,0,0.508411,0.5,0,0.483178,0.490417,0,0.525234,0.422360
1,1,0.48785,0.504807,1,0.48972,0.516175,1,0,0.508411,0.5,1,0.483178,0.506478,1,0.525234,0.504748
2,1,0.48785,0.591117,1,0.48972,0.526401,1,0,0.508411,0.5,1,0.483178,0.594908,1,0.525234,0.635901
3,1,0.48785,0.595272,1,0.48972,0.515859,1,1,0.508411,1.0,1,0.483178,0.625306,1,0.525234,0.800362
4,1,0.48785,0.590655,1,0.48972,0.515859,1,1,0.508411,1.0,1,0.483178,0.620808,1,0.525234,0.617615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1065,1,0.48785,0.543262,1,0.48972,0.536454,1,0,0.508411,0.5,1,0.483178,0.507600,1,0.525234,0.669753
1066,1,0.48785,0.570132,1,0.48972,0.521491,1,1,0.508411,1.0,1,0.483178,0.546710,1,0.525234,0.765041
1067,1,0.48785,0.542180,1,0.48972,0.537546,0,1,0.508411,1.0,1,0.483178,0.533604,1,0.525234,0.747523
1068,1,0.48785,0.569741,1,0.48972,0.521313,1,0,0.508411,0.5,1,0.483178,0.560094,1,0.525234,0.580589


In [14]:
df_prediction.to_csv("../data/model_accuracy_IBM.csv")