In [1]:
# General libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
# Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Import Logistic Regression function
# Import XGBoost Classifier
from xgboost import XGBClassifier

In [2]:
#print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
#config = tf.compat.v1.ConfigProto(device_count = {'GPU': 1})

## Preliminary work on the dataset

In [3]:
yahoo_df = pd.read_csv("../data/IBM.csv")
yahoo_df = yahoo_df.set_index("Date")

X = yahoo_df[['ROC_3', 'Momentum', 'RSI_2', 'ATR_14']]

y = yahoo_df["Up down"]

# Split in 80/20 the dataframeX
X_train_80, X_test, y_train_80, y_test = train_test_split(X, y, test_size = 0.20,
                                                          shuffle=False)

# Split in 75/25 the remaining 80 %
X_train, X_valid, y_train, y_valid = train_test_split(X_train_80, y_train_80,
                                                      test_size = 0.25, shuffle=False)


# Normalize

In [4]:
def shift_norm(df):
    #df = data
    for column in df:
        df[column]=((df[column]-df[column].mean())/df[column].std())
    return df

In [5]:
X_train_80 = shift_norm(X_train_80)
X_test = shift_norm(X_test)

In [6]:
X_train = shift_norm(X_train)
X_valid = shift_norm(X_valid)

# Gradient Boosting Classifier (XGboost)

XGboost is a library based on GradientBoostingClassifier. Many consider it as one of the best algorithms and, due to its great performance for regression and classification problems, would recommend it as a first choice in many situations. XGBoost has become famous for winning tons of Kaggle competitions, is now used in many industry-application, and is even implemented within machine-learning platforms, such as BigQuery ML.

GBoosting is a sequential technique which works on the principle of ensemble. It combines a set of weak learners and delivers improved prediction accuracy. At any instant t, the model outcomes are weighed based on the outcomes of previous instant t-1. The outcomes predicted correctly are given a lower weight and the ones miss-classified are weighted higher. This technique is followed for a classification problem while a similar technique is used for regression.

- Pros

It is extremely powerful machine learning classifier.
Accepts various types of inputs that make it more flexible.
It can be used for both regression and classification.
It gives you features important for the output.

- Cons

It takes longer time to train as it can’t be parallelized.
More likely to overfit as it obsessed with the wrong output as it learns from past mistakes.
In some cases, Tuning is very hard as it has many parameters to tune.

In [7]:
# Fit the model
XGB = XGBClassifier(random_state = 42)
XGB.fit(X_train, y_train)

# Predict
y_pred_train = XGB.predict(X_train)
y_pred_valid = XGB.predict(X_valid)

# Compute accuracy
train_acc = accuracy_score(y_train, y_pred_train)
valid_acc = accuracy_score(y_valid, y_pred_valid)
print("Train Accuracy: {:.5f} - Validation Accuracy: {:.5f}".format(train_acc, valid_acc))

Train Accuracy: 0.94886 - Validation Accuracy: 0.51263


### Tuning parameters very slow

params = { 'max_depth': [3,6,10],
           'learning_rate': [0.01, 0.05, 0.1],
           'n_estimators': [100, 500, 1000],
           'colsample_bytree': [0.3, 0.7]}

XGB = XGBClassifier(seed = 42)
best_XGB = GridSearchCV(estimator=XGB, 
                   param_grid=params,
                   scoring='accuracy', 
                   verbose=1)
best_XGB.fit(X_train, y_train)
print("Best parameters:", best_XGB.best_params_)
#print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))

In [8]:
# Fit 
XGB = XGBClassifier(colsample_bytree= 0.7, learning_rate= 0.05, max_depth= 3, n_estimators= 100)
XGB.fit(X_train_80, y_train_80)

y_pred_test = XGB.predict(X_test)
y_proba = XGB.predict_proba(X_test)


# Compute accuracy
test_acc = accuracy_score(y_test, y_pred_test)
print("Test Accuracy: {:.5f} ".format(test_acc))

# Store the Test accuracy
XGboost_test_accuracy_test = test_acc



Test Accuracy: 0.49626 


# Save accuracies

In [9]:
df_prediction = pd.read_csv("../data/model_accuracy_ibm.csv", index_col = False)
df_prediction =  df_prediction.drop(columns=["Unnamed: 0"])


In [10]:
df_prediction["XGB_pred"] = y_pred_test
df_prediction["XGB_accuracy"] = [XGboost_test_accuracy_test for x in range(len(y_pred_test))]
df_prediction["XGB_prob"] = y_proba[:,1]


In [11]:
df_prediction

Unnamed: 0,LogReg_pred,LogReg_accuracy,LogReg_prob,LGBM_pred,LGBM_accuracy,LGBM_prob,y,kNN_pred,kNN_accuracy,kNN_prob,...,SVC_pred,SVC_accuracy,SVC_prob,LSTM_price_accuracy,RNN_pred,RNN_accuracy,RNN_prob,XGB_pred,XGB_accuracy,XGB_prob
0,0,0.48785,0.499232,0,0.48972,0.482038,0,0,0.508411,0.5,...,1,0.492523,0.517458,0.857009,0,0.493458,0.488459,1,0.496262,0.515965
1,1,0.48785,0.504807,1,0.48972,0.516175,1,0,0.508411,0.5,...,1,0.492523,0.517960,0.857009,1,0.493458,0.536730,1,0.496262,0.524946
2,1,0.48785,0.591117,1,0.48972,0.526401,1,0,0.508411,0.5,...,1,0.492523,0.526435,0.857009,1,0.493458,0.626067,1,0.496262,0.520137
3,1,0.48785,0.595272,1,0.48972,0.515859,1,1,0.508411,1.0,...,1,0.492523,0.527133,0.857009,1,0.493458,0.636652,0,0.496262,0.486497
4,1,0.48785,0.590655,1,0.48972,0.515859,1,1,0.508411,1.0,...,1,0.492523,0.526471,0.857009,1,0.493458,0.635811,1,0.496262,0.545093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1065,1,0.48785,0.543262,1,0.48972,0.536454,1,0,0.508411,0.5,...,1,0.492523,0.522732,0.857009,1,0.493458,0.548477,0,0.496262,0.430383
1066,1,0.48785,0.570132,1,0.48972,0.521491,1,1,0.508411,1.0,...,1,0.492523,0.524813,0.857009,1,0.493458,0.591784,1,0.496262,0.592070
1067,1,0.48785,0.542180,1,0.48972,0.537546,0,1,0.508411,1.0,...,1,0.492523,0.522923,0.857009,1,0.493458,0.559243,1,0.496262,0.574436
1068,1,0.48785,0.569741,1,0.48972,0.521313,1,0,0.508411,0.5,...,1,0.492523,0.524824,0.857009,1,0.493458,0.595539,1,0.496262,0.584061


In [12]:
df_prediction.to_csv("../data/model_accuracy_ibm.csv")