In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [2]:
#print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
#config = tf.compat.v1.ConfigProto(device_count = {'GPU': 1})

## Preliminary work on the dataset

In [3]:
yahoo_df = pd.read_csv("../data/IBM.csv")
yahoo_df = yahoo_df.set_index("Date")
yahoo_df = yahoo_df.astype(float,errors="raise")
yahoo_df

Unnamed: 0_level_0,Open,High,Low,Close,vwap,SO,ROC_1,ROC_2,ROC_3,ROC_4,...,SRSI_10,SRSI_14,SRSI_30,Williams_14,ATR_14,CCI,APO,Differenced,Previous_differenced,principalDf
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-12-29,83.114243,83.114243,80.544930,81.261948,84.396916,22.267179,-0.293256,0.368997,0.221074,-4.494388,...,0.291869,0.176362,0.127287,-72.852250,4.786504,-77.323805,-5.074659,-0.002933,0.006642,4.695353
2001-01-02,80.783936,83.652008,80.425430,81.082695,83.802359,24.186019,-0.220586,-0.513195,0.147597,0.000000,...,0.405954,0.219505,0.153319,-71.320765,4.578513,-68.964816,-5.084234,-0.002206,-0.002933,4.464409
2001-01-03,80.066925,90.822182,80.066925,90.463669,84.105596,100.000000,11.569637,11.323530,10.997068,11.734311,...,0.620536,0.605918,0.376550,-12.075488,5.402079,6.359477,-4.714618,0.115696,-0.002206,-10.650756
2001-01-04,90.583176,95.363289,88.611374,89.089386,84.328010,88.995212,-1.519154,9.874722,9.632354,9.310851,...,0.589364,0.641378,0.290968,-33.333347,5.582057,74.412839,-4.093357,-0.015192,0.115696,-6.566460
2001-01-05,89.806404,90.523422,86.998085,89.866158,84.493911,95.215334,0.871901,-0.660499,10.832721,10.588240,...,0.740951,0.630793,0.458128,-29.206347,5.307828,51.328590,-3.672032,0.008719,-0.015192,-7.013639
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-03-24,128.330002,129.369995,127.800003,129.250000,126.652805,100.000000,0.740450,0.147220,0.897731,0.380557,...,0.786216,0.615170,0.372530,-6.950430,2.167827,93.469804,1.295513,0.007404,-0.005889,-2.986045
2022-03-25,129.500000,131.399994,129.309998,131.350006,126.922471,100.000000,1.624763,2.377243,1.774375,2.537080,...,0.859787,0.701564,0.487151,-0.573913,2.165449,131.060998,1.639551,0.016248,0.007404,-5.610019
2022-03-28,130.820007,131.500000,129.600006,131.470001,127.184762,100.000000,0.091355,1.717602,2.470770,1.867351,...,0.863068,0.725094,0.505181,-0.340508,2.130055,121.945084,1.982885,0.000914,0.016248,-5.438178
2022-03-29,132.039993,132.839996,130.429993,131.940002,127.762773,100.000000,0.357497,0.449179,2.081240,2.837100,...,0.844640,0.723586,0.567238,-8.866940,2.167381,134.478125,2.355962,0.003575,0.000914,-5.582980


# Train, test, val

In [4]:
X = yahoo_df[['ROC_4', 'ROC_5', 'ROC_6', '4 Day ROI', '6 Day ROI', '30 Day ROI',
       'RSI_3', 'RSI_4', 'CCI']]
y = yahoo_df["Up down"]

In [5]:
# Split in 80/20 the dataframeX
X_train_80, X_test, y_train_80, y_test = train_test_split(X, y, test_size = 0.20,
                                                          shuffle=False)

# Split in 75/25 the remaining 80 %
X_train, X_valid, y_train, y_valid = train_test_split(X_train_80, y_train_80,
                                                      test_size = 0.25, shuffle=False)

# Normalize

In [6]:
def normalize(df):
    #df = data
    for column in df:
        df[column]=((df[column]-df[column].mean())/df[column].std())
    return df

In [7]:
X_train_80 = normalize(X_train_80)
X_test = normalize(X_test)


In [8]:
X_train = normalize(X_train)
X_valid = normalize(X_valid)

# Light Gradient Boosting Machine


In [9]:
# Build LightGBM Model
valid_data = lgb.Dataset(X_valid, label = y_valid)
train_data = lgb.Dataset(X_train, label = y_train)
test_data = lgb.Dataset(X_train_80, label = y_train_80)

In [10]:
# Select Hyper-Parameters
params = {'boosting_type': 'dart',
          "num_leaves":[2],
          'max_depth' : [10],
          'objective': 'multiclass',
          'nthread': 4,
          'learning_rate': 0.0125,
          'max_bin': 100,
          'reg_lambda': 1.4,
          'num_class' : 2,  
          'metric' : 'multi_logloss'}

In [11]:
# Train the model
lgbm = lgb.train(params, train_data)

# Predict
pred_train = lgbm.predict(X_train)
pred_valid = lgbm.predict(X_valid)

y_pred_train = np.argmax(pred_train, axis = 1)
y_pred_valid = np.argmax(pred_valid, axis = 1)

# Compute accuracy
train_acc = accuracy_score(y_train, y_pred_train)
valid_acc = accuracy_score(y_valid, y_pred_valid)
print("Train Accuracy: {:.5f} - Validation Accuracy: {:.5f}".format(train_acc, valid_acc))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 900
[LightGBM] [Info] Number of data points in the train set: 3207, number of used features: 9
[LightGBM] [Info] Start training from score -0.753226
[LightGBM] [Info] Start training from score -0.636474
Train Accuracy: 0.52915 - Validation Accuracy: 0.50140


# Tuning very slow

In [12]:

param_test ={
            'num_leaves': sp_randint(6, 50), 
             'min_child_samples': sp_randint(100, 500), 
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'subsample': sp_uniform(loc=0.2, scale=0.8), 
             'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

#This parameter defines the number of HP points to be tested
n_HP_points_to_test = 100

#n_estimators is set to a "large value". The actual number of trees build will depend on early stopping and 5000 define only the absolute maximum
clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=5000)
gs = RandomizedSearchCV(
    estimator=clf, param_distributions=param_test, 
    n_iter=n_HP_points_to_test,
    scoring='roc_auc',
    cv=3,
    refit=True,
    random_state=314,
    verbose=True)

gs.fit(X_train, y_train)
print('Best score reached: {} with params: {} '.format(gs.best_score_, gs.best_params_))

In [13]:
params = {'boosting_type': 'dart',
          "learning_rate":0.001,
          "num_iterations":1000,
        'colsample_bytree': 0.9242593373375023,
          'min_child_samples': 282,
          'min_child_weight': 10.0, 
          'num_leaves': 36, 
          'reg_alpha': 0, 
          'reg_lambda': 0.1, 
          'subsample': 0.4955174534902478,
          'objective': 'multiclass',
          'metric' : 'multi_logloss',
          'num_class' : 2, #Away, Draw and Home,
         }

In [14]:
len(pred_train)

3207

In [15]:
# Train the model
lgbm = lgb.train(params, train_data)

# Predict
pred_train = lgbm.predict(X_train)
pred_valid = lgbm.predict(X_valid)

y_pred_train = np.argmax(pred_train, axis = 1)
y_pred_valid = np.argmax(pred_valid, axis = 1)

# Compute accuracy
train_acc = accuracy_score(y_train, y_pred_train)
valid_acc = accuracy_score(y_valid, y_pred_valid)
print("Train Accuracy: {:.5f} - Validation Accuracy: {:.5f}".format(train_acc, valid_acc))

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 900
[LightGBM] [Info] Number of data points in the train set: 3207, number of used features: 9
[LightGBM] [Info] Start training from score -0.753226
[LightGBM] [Info] Start training from score -0.636474




































Train Accuracy: 0.56595 - Validation Accuracy: 0.50140


In [16]:
# Predict
pred_test = lgbm.predict(X_test)

y_pred_test = np.argmax(pred_test, axis = 1)

# Compute accuracy
test_acc = accuracy_score(y_test, y_pred_test)
print("Test Accuracy: {:.5f}".format(test_acc))

Test Accuracy: 0.48972


# Save accuracies

In [17]:
# Train the test set

# Predict
pred = lgbm.predict(X_test)
y_pred = np.argmax(pred, axis = 1)

# Compute accuracy
acc = accuracy_score(y_test, y_pred)
print("Test Accuracy: {:.5f}".format(acc))

# Store the test and the validation accuracy
LGBM = acc

Test Accuracy: 0.48972


In [23]:
df_prediction = pd.read_csv("../data/model_accuracy_IBM.csv", index_col = False)
df_prediction =  df_prediction.drop(columns=["Unnamed: 0"])

In [24]:
#df_prediction = pd.read_csv("../data/model_accuracy.csv", index_col = False)
df_prediction["LGBM_pred"] = y_pred
df_prediction["LGBM_accuracy"] = [LGBM for x in range(len(y_pred))]
df_prediction["LGBM_prob"] = pred[:,1]
df_prediction

Unnamed: 0,LogReg_pred,LogReg_accuracy,LogReg_prob,LGBM_pred,LGBM_accuracy,LGBM_prob,y,kNN_pred,kNN_accuracy,kNN_prob,ANN_pred,ANN_accuracy,ANN_prob
0,0,0.48785,0.499232,0,0.48972,0.482038,0,0,0.508411,0.5,0,0.483178,0.490417
1,1,0.48785,0.504807,1,0.48972,0.516175,1,0,0.508411,0.5,1,0.483178,0.506478
2,1,0.48785,0.591117,1,0.48972,0.526401,1,0,0.508411,0.5,1,0.483178,0.594908
3,1,0.48785,0.595272,1,0.48972,0.515859,1,1,0.508411,1.0,1,0.483178,0.625306
4,1,0.48785,0.590655,1,0.48972,0.515859,1,1,0.508411,1.0,1,0.483178,0.620808
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1065,1,0.48785,0.543262,1,0.48972,0.536454,1,0,0.508411,0.5,1,0.483178,0.507600
1066,1,0.48785,0.570132,1,0.48972,0.521491,1,1,0.508411,1.0,1,0.483178,0.546710
1067,1,0.48785,0.542180,1,0.48972,0.537546,0,1,0.508411,1.0,1,0.483178,0.533604
1068,1,0.48785,0.569741,1,0.48972,0.521313,1,0,0.508411,0.5,1,0.483178,0.560094


In [25]:
df_prediction.to_csv("../data/model_accuracy_IBM.csv")
