#### Tune for xgboost

Modules

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, LSTM, Activation, Dropout
from tensorflow.keras.optimizers import Adam, SGD
from kerastuner.tuners import BayesianOptimization

seed = 444 # random number seed, reused as random_state in train_test_split

#### Utility functions

In [2]:
def plot_confusion_matrix(the_pred,
                          the_true,
                          save_plot=False,
                          the_path=None):
    """Draw a confusion-matrix heatmap for the two classes "0" and "1".

    The predictions are handed to ``metrics.confusion_matrix`` as its first
    argument, so the rows of the matrix (y axis, labelled "Predicted") follow
    ``the_pred`` and the columns (x axis, labelled "True") follow ``the_true``.

    Args:
        the_pred: predicted class labels.
        the_true: ground-truth class labels.
        save_plot: when truthy, the figure is written to ``the_path``.
        the_path: destination file of the figure; required when ``save_plot``
            is set.

    Returns:
        The confusion matrix that was plotted.

    Raises:
        ValueError: if ``save_plot`` is set but no ``the_path`` was given.
    """
    # fail before any plotting work instead of deep inside savefig
    if save_plot and the_path is None:
        raise ValueError("the_path must be provided when save_plot is True")

    # generate confusion matrix
    confusion_matrix = metrics.confusion_matrix(the_pred, the_true)

    # generate the plot
    class_names = ["0", "1"]
    plot_df = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(plot_df, annot=True, cmap="YlGnBu", fmt="g", ax=ax)
    plt.xticks(rotation=30)
    plt.xlabel("True")
    plt.ylabel("Predicted")

    # if save plot
    if save_plot:
        fig.savefig(the_path, dpi=500)

    # hand the matrix back so callers that assign the result get real data
    return confusion_matrix

def model_performance(y_test,y_pred):
    """Collect classification metrics in a one-row DataFrame.

    Args:
        y_test: ground-truth class labels.
        y_pred: predicted class labels.

    Returns:
        A DataFrame with a single row and the columns ``accuracy``,
        ``Down_precision`` (precision of class '0'), ``Up_precision``
        (precision of class '1'), ``f1_score`` and ``recall_score`` (both
        weighted averages).

    Raises:
        KeyError: if the classification report has no entry for class '0'
            or class '1'.
    """
    # build the report once and read both per-class precisions from it
    report = metrics.classification_report(y_test, y_pred, output_dict=True)

    return pd.DataFrame({
        'accuracy': [metrics.accuracy_score(y_test, y_pred)],
        'Down_precision': [report['0']['precision']],
        'Up_precision': [report['1']['precision']],
        'f1_score': [metrics.f1_score(y_test, y_pred, average='weighted')],
        'recall_score': [metrics.recall_score(y_test, y_pred, average='weighted')],
    })

Load data

In [3]:
# features.csv sits in the Data folder next to the current working directory
data_path = os.path.join(os.path.dirname(os.getcwd()), "Data/features.csv")
data = pd.read_csv(data_path)

# keep only the rows dated before 2020, then discard the date column
is_before_2020 = data["Date"] < "2020-01-01"
data = data.loc[is_before_2020].drop(columns=["Date"])

# split off the label column; every remaining column is a feature
data_y = data.pop("direction").values
data_x = data.values

Normalize the data: 0 mean and unit variance

In [4]:
# macd is treated as categorical, so it is kept out of the scaling step
condition_macd = data.columns == "macd"
macd = data_x[:, condition_macd]

# standardize every feature except macd
# NOTE(review): the scaler is fitted on all rows, i.e. before the
# train/validation split performed later in this notebook
scaler = StandardScaler()
data_x_nomalized = scaler.fit_transform(data_x[:, ~condition_macd])

# put macd back as the trailing column
data_x_nomalized = np.hstack((data_x_nomalized, macd))

##### Boosting tree: xgboost

Convert data to xgboost format

In [None]:
# 80% of the rows go to training, the remainder to validation
trainX, validationX, trainY, validationY = train_test_split(
    data_x_nomalized, data_y, train_size=0.8, random_state=seed
)

# wrap both splits as xgboost DMatrix objects
trainX_xg = xgb.DMatrix(data=trainX, label=trainY) # TODO delete
validationX_xg = xgb.DMatrix(data=validationX, label=validationY) # TODO delete

Use the grid search to find the best params

In [None]:
# set up param grid (5 * 6 * 19 * 49 = 27,930 parameter combinations)
eta = [0.1, 0.015, 0.01, 0.0015, 0.001]
gamma = [0.5, 1.0, 1.5, 2.0, 2.5, 0]
max_depth = list(range(1, 20))
boosting_rounds = list(range(2, 51))
parameters = {
    "n_estimators": boosting_rounds,
    "learning_rate": eta,
    "gamma": gamma,
    "max_depth": max_depth
}

# make sure the output folder exists *before* the long search starts, so a
# missing directory cannot make the final pickle dump fail
model_path = os.path.join(os.getcwd(), "models/xgboost.pickle")
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# set up classifier
xgboost_classifier = xgb.XGBClassifier(objective="binary:logistic")
xg_gridSearch = GridSearchCV(
    xgboost_classifier,
    parameters,
    scoring="accuracy",
    verbose=2,
    n_jobs=-1,
)

# find params
xg_gridSearch.fit(trainX, trainY)

# extract tuned model and persist it
xgboost_tuned = xg_gridSearch.best_estimator_
with open(model_path, 'wb') as f:
    pickle.dump(xgboost_tuned, f)

# print
print("Best Params:")
print(xg_gridSearch.best_params_)
print("Best Score:")
print(xg_gridSearch.best_score_)

# Best Params:
# {'gamma': 0.5, 'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 24}
# Best Score:
# 0.7090909090909092

Model performance on validation dataset

In [None]:
# load pickle
# NOTE: pickle.load can execute arbitrary code; only load a model file that
# was written by this notebook itself
with open(os.path.join(os.getcwd(), "models/xgboost.pickle"), "rb") as f:
    xgboost_tuned = pickle.load(f)

# generate prediction on validation dataset and map it to 0/1 labels
xgboost_pred = xgboost_tuned.predict(validationX)
xgboost_pred = [1 if i >= 0.5 else 0 for i in xgboost_pred]

# make sure the figure's folder exists, otherwise saving the plot fails
confusion_matrix_path = os.path.join(os.getcwd(), "confusion_matrix/xgboost.png")
os.makedirs(os.path.dirname(confusion_matrix_path), exist_ok=True)

# confusion matrix
xgboost_confusion_matrix = plot_confusion_matrix(
    xgboost_pred,
    validationY,
    save_plot=True,
    the_path=confusion_matrix_path,
)

# model performance
xgboost_performance = model_performance(validationY, xgboost_pred)
print(xgboost_performance)

#    accuracy  Down_precision  Up_precision  f1_score  recall_score
# 0  0.547619        0.666667      0.527778  0.481481      0.547619

#### LSTM

Preprocess the data

In [5]:
# shape of the feature matrix before macd is expanded
data_shape = data_x_nomalized.shape

# macd is the last column: swap it for its two-column one-hot encoding
macd = to_categorical(data_x_nomalized[:, -1], num_classes=2)
data_x_nomalized = np.hstack((data_x_nomalized[:, :-1], macd))

# the labels are one-hot encoded as well
data_y_encoded = to_categorical(data_y, num_classes=2)

# 80/20 train/validation split on the encoded data
trainX, validationX, trainY, validationY = train_test_split(
    data_x_nomalized, data_y_encoded, train_size=0.8, random_state=seed
)

In [6]:
# LSTM layers expect [samples, time steps, features]; insert a time axis of length 1
trainX = np.expand_dims(trainX, axis=1)
validationX = np.expand_dims(validationX, axis=1)

Build model functions:

In [7]:
# build model
def build_model_LSTM(hp, input_shape=(1, 27)):
    """Build and compile a stacked-LSTM classifier for one tuning trial.

    Args:
        hp: hyperparameter object handed in by the tuner; its ``Int``,
            ``Float`` and ``Choice`` methods are used to declare the search
            space.
        input_shape: ``(time steps, features)`` of a single sample. Defaults
            to ``(1, 27)``, the value that used to be hard-coded, so calling
            the function with ``hp`` alone behaves as before.

    Returns:
        A compiled ``Sequential`` model: an LSTM input layer with dropout,
        two optional groups of 0-3 LSTM+dropout layers, a final LSTM layer
        and a two-unit sigmoid output, trained with Adam on binary
        cross-entropy and tracking accuracy.
    """
    # model
    model = Sequential()

    # input layer
    input_num_unit = hp.Int("input_layer_units", 50, 100, 10)
    input_drop_ratio = hp.Float("input_layer_dropRatio", 0.0, 0.5, 0.1)
    model.add(LSTM(units=input_num_unit, input_shape=input_shape, return_sequences=True))
    model.add(Dropout(input_drop_ratio))

    # layer set
    # LSTM layers set 1: 0-3 layers that all share one width and one dropout ratio
    num_layers_1 = hp.Int("num_layers_set_1", 0, 3)
    num_units_1 = hp.Int("num_units_set_1", 50, 100, 10)
    dropRatio_1 = hp.Float("dropRatio_set_1", 0.0, 0.5, 0.1)
    for _ in range(num_layers_1):
        model.add(LSTM(units=num_units_1, return_sequences=True))
        model.add(Dropout(dropRatio_1))

    # LSTM layer set 2: same idea with narrower layers
    num_layers_2 = hp.Int("num_layers_set_2", 0, 3)
    num_units_2 = hp.Int("num_units_set_2", 10, 50, 10)
    dropRatio_2 = hp.Float("dropRatio_set_2", 0.0, 0.5, 0.1)
    for _ in range(num_layers_2):
        model.add(LSTM(units=num_units_2, return_sequences=True))
        model.add(Dropout(dropRatio_2))

    # LSTM last layer: return_sequences=False so the dense head gets one vector per sample
    num_units_3 = hp.Int("num_units_set_3", 10, 50, 10)
    model.add(LSTM(units=num_units_3, return_sequences=False))

    # output layer
    model.add(Dense(2, activation='sigmoid'))

    # tune learning rate
    lr = hp.Choice("lr", values=[0.1, 0.015, 0.01, 0.0015, 0.001])
    model.compile(
        optimizer=Adam(learning_rate=lr), loss="binary_crossentropy",
        metrics=["accuracy"]
    )

    return model

Bayesian optimization: search for the best hyperparameters

In [10]:
# Bayesian search over the LSTM search space; results are stored under
# <cwd>/tune_LSTM and, with overwrite=False, an existing project is reused
tuner_LSTM = BayesianOptimization(
    build_model_LSTM,
    directory=os.getcwd(),
    project_name="tune_LSTM",
    overwrite=False,
    objective="val_accuracy",
    max_trials=100,
    executions_per_trial=3,
)

# every trial trains for 100 epochs and is scored on the validation split
tuner_LSTM.search(
    x=trainX,
    y=trainY,
    validation_data=(validationX, validationY),
    epochs=100,
    batch_size=30,
    verbose=1,
)

INFO:tensorflow:Reloading Oracle from existing project /Users/lihaohang/Desktop/FE-595-Final/Models_train/tune_LSTM/oracle.json
INFO:tensorflow:Reloading Tuner from /Users/lihaohang/Desktop/FE-595-Final/Models_train/tune_LSTM/tuner0.json
INFO:tensorflow:Oracle triggered exit


ANN Model

Restore the training and validation data to their 2-D shape (undo the LSTM reshape):

In [None]:
# drop the length-1 time axis again: [samples, 1, features] -> [samples, features]
trainX = np.squeeze(trainX, axis=1)
validationX = np.squeeze(validationX, axis=1)

Build model function:

In [None]:
def build_model_ANN(hp, n_features=27):
    """Build and compile a fully connected classifier for one tuning trial.

    Args:
        hp: hyperparameter object handed in by the tuner; its ``Int`` and
            ``Choice`` methods are used to declare the search space.
        n_features: number of input features per sample. Defaults to ``27``,
            the value that used to be hard-coded, so calling the function
            with ``hp`` alone behaves as before.

    Returns:
        A compiled ``Sequential`` model: two optional groups of 0-3 ReLU
        dense layers followed by a two-unit sigmoid output, trained with
        Adam on binary cross-entropy and tracking accuracy.
    """
    # model
    model = Sequential()
    model.add(Input(shape=(n_features,)))

    # tune layers
    # dense layers set 1: 0-3 layers that all share one width
    num_layers_1 = hp.Int("n_layers_1", 0, 3)
    num_units_1 = hp.Int("n_units_1", 50, 100, 10)
    for _ in range(num_layers_1):
        model.add(Dense(num_units_1))
        model.add(Activation("relu"))

    # dense layer set 2: same idea with narrower layers
    num_layers_2 = hp.Int("n_layers_2", 0, 3)
    num_units_2 = hp.Int("n_units_2", 10, 50, 10)
    for _ in range(num_layers_2):
        model.add(Dense(num_units_2))
        model.add(Activation("relu"))

    # output layer
    model.add(Dense(2, activation="sigmoid"))

    # tune learning rate
    lr = hp.Choice("lr", values=[0.1, 0.015, 0.01, 0.0015, 0.001])
    model.compile(
        optimizer=Adam(learning_rate=lr), loss="binary_crossentropy",
        metrics=["accuracy"]
    )

    return model

Bayesian optimization: search for the best hyperparameters

In [None]:
# Bayesian search over the ANN search space; results are stored under
# <cwd>/tune_ANN and are not overwritten (overwrite=False)
tuner_ANN = BayesianOptimization(
    build_model_ANN,
    directory=os.getcwd(),
    project_name="tune_ANN",
    overwrite=False,
    objective="val_accuracy",
    max_trials=100,
    executions_per_trial=3,
)

# every trial trains for 100 epochs and is scored on the validation split
tuner_ANN.search(
    x=trainX,
    y=trainY,
    validation_data=(validationX, validationY),
    epochs=100,
    batch_size=30,
    verbose=1,
)

In [None]:
# retrieve the result: hyperparameter values of the first entry returned by
# get_best_hyperparameters()
tuner_ANN.get_best_hyperparameters()[0].values

In [None]:
# show the tuner's summary of the ANN search
tuner_ANN.results_summary()