# Machine Learning Models for stock prediction - non transformation version

# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import warnings
warnings.filterwarnings("ignore")

## Import data

The data was downloaded from Bloomberg on:
* Exchange rate of Vietnam with its major trading partners: the China and the US
* Precious metal spot price and future price: Gold, Silver, Palladium, Platinum
* Global Stock Indices: Hang Seng Index, Nasdaq 100, Nasdaq Composite, Nikkei 225, SP500, DOJI, Shanghai Shenzhen CSI3000, Shanghai Shenzhen Composite and Singapore Stock Index
* Volatility stock index: VIX Index

The data will be imported from previous EDA session, which has been cleaned.

In [121]:
# Import data
data = pd.read_csv('data.csv', index_col = ['Date'])

# Feature Extraction

We will generate the following set of features:
- Return of n lag days
- Lags price of n lag days
- Diff price of n lag days

In [126]:
# Generate lag function

def generate_lag(df, n_lags):
    """
    Generate n lag for each columns of the dataframe
    
    """
     # df_copy
    df2 = df.copy()
    
    # Create column list
    new_columns = []
    
    # Define n_lags:
    lag_range = range(1, n_lags + 1)
    
    # Generate new columns with lags
    for col in df2.columns:
        for i in lag_range:
            new_col_name = f"{col}_lag_{i}"
            df2[new_col_name] = df2[col].shift(periods = i)
            new_columns.append(new_col_name)
    
    return df2[new_columns]

In [119]:
# Generate return
def generate_return(df, n_lags):
    """
    Generate n lag return for each columns of the dataframe
    
    """
    # df_copy
    df2 = df.copy()
    
    # Create column list
    new_columns = []
    
    # Define lag range
    lag_range = range(1, n_lags + 1)
    
    # Generate new columns with lags
    for col in df2.columns:
        for i in lag_range:
            new_col_name = f"{col}_return_{i}"
            df2[new_col_name] = df2[col].pct_change(periods = i)
            new_columns.append(new_col_name)
    
    return df2[new_columns]

In [125]:
# Generate diff
def generate_diff(df, n_lags):
    """
    Generate n lag diff for each columns of the dataframe
    
    """
     # df_copy
    df2 = df.copy()
    
    # Create column list
    new_columns = []
    
    # Define n_lags:
    lag_range = range(1, n_lags + 1)
    
    # Generate new columns with lags
    for col in df2.columns:
        for i in lag_range:
            new_col_name = f"{col}_return_{i}"
            df2[new_col_name] = df2[col].diff(periods = i)
            new_columns.append(new_col_name)
    
    return df2[new_columns]

In [127]:
# Generate lag
data_lag = generate_lag(data, 10)
data_ret = generate_return(data, 10)
data_diff = generate_diff(data, 10)

In [129]:
# Merge all of dataframes into one big data
data_features = pd.concat([data, data_lag, data_ret, data_diff], axis = 1)
data_features.head()

Unnamed: 0_level_0,index_sp500,spot_palladium,index_shsz_csi300,index_vni,index_sp500_lag_1,index_sp500_lag_2,index_sp500_lag_3,index_sp500_lag_4,index_sp500_lag_5,index_sp500_lag_6,...,index_vni_return_1,index_vni_return_2,index_vni_return_3,index_vni_return_4,index_vni_return_5,index_vni_return_6,index_vni_return_7,index_vni_return_8,index_vni_return_9,index_vni_return_10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7/2/2007,1519.43,368.5,16130,994.17,,,,,,,...,,,,,,,,,,
7/3/2007,1524.87,365.0,16131,977.3,1519.43,,,,,,...,-16.87,,,,,,,,,
7/4/2007,1522.19,366.0,16130,1013.56,1524.87,1519.43,,,,,...,36.26,19.39,,,,,,,,
7/5/2007,1525.4,365.5,16130,1012.82,1522.19,1524.87,1519.43,,,,...,-0.74,35.52,18.65,,,,,,,
7/6/2007,1530.44,367.0,16135,1010.53,1525.4,1522.19,1524.87,1519.43,,,...,-2.29,-3.03,33.23,16.36,,,,,,


In [130]:
# Drop missing values
data_features = data_features.iloc[10:]

<class 'pandas.core.frame.DataFrame'>
Index: 4165 entries, 7/16/2007 to 6/30/2023
Columns: 124 entries, index_sp500 to index_vni_return_10
dtypes: float64(123), int64(1)
memory usage: 4.0+ MB


In [132]:
data_features.head()

Unnamed: 0_level_0,index_sp500,spot_palladium,index_shsz_csi300,index_vni,index_sp500_lag_1,index_sp500_lag_2,index_sp500_lag_3,index_sp500_lag_4,index_sp500_lag_5,index_sp500_lag_6,...,index_vni_return_1,index_vni_return_2,index_vni_return_3,index_vni_return_4,index_vni_return_5,index_vni_return_6,index_vni_return_7,index_vni_return_8,index_vni_return_9,index_vni_return_10
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7/16/2007,1549.52,367.5,16130,995.83,1552.5,1547.7,1518.76,1510.12,1531.85,1530.44,...,-19.9,-21.85,-34.02,-27.21,-14.44,-14.7,-16.99,-17.73,18.53,1.66
7/17/2007,1549.37,364.5,16130,1000.33,1549.52,1552.5,1547.7,1518.76,1510.12,1531.85,...,4.5,-15.4,-17.35,-29.52,-22.71,-9.94,-10.2,-12.49,-13.23,23.03
7/18/2007,1546.17,368.75,16135,994.73,1549.37,1549.52,1552.5,1547.7,1518.76,1510.12,...,-5.6,-1.1,-21.0,-22.95,-35.12,-28.31,-15.54,-15.8,-18.09,-18.83
7/19/2007,1553.08,371.13,16140,984.43,1546.17,1549.37,1549.52,1552.5,1547.7,1518.76,...,-10.3,-15.9,-11.4,-31.3,-33.25,-45.42,-38.61,-25.84,-26.1,-28.39
7/20/2007,1534.1,372.0,16137,981.31,1553.08,1546.17,1549.37,1549.52,1552.5,1547.7,...,-3.12,-13.42,-19.02,-14.52,-34.42,-36.37,-48.54,-41.73,-28.96,-29.22


# Machine Learning models - Regression

We will perform the test on several models to determine which models will perform best. The models are:
- Linear Regression (include Ridge and Lasso for avoid multicolinearity)
- Decision Trees Regressor
- Gradient Boosting Regressor
- XGBoost Regressor

## Train test split - 70% 20% 10%

In [12]:
# Define train test split
def train_test_split(df, target):
    # Define train, cv, test time
    train_time = int(round(len(df) * 0.7))
    cv_time = int(round(len(df) * 0.1))
    
    # Define X, y
    X = df.drop(target, axis = 1)
    y = df[target]
    
    # Train test split
    X_train = X.iloc[:train_time]
    X_cv = X.iloc[train_time : (train_time + cv_time)]
    X_test = X.iloc[(train_time + cv_time):]
    
    y_train = y.iloc[:train_time]
    y_cv = y.iloc[train_time : (train_time + cv_time)]
    y_test = y.iloc[(train_time + cv_time):]
    
    # Print out to check shape
    print(X_train.shape)
    print(X_cv.shape)
    print(X_test.shape)
    
    print(y_train.shape)
    print(y_cv.shape)
    print(y_test.shape)
    
    return X_train, y_train, X_cv, y_cv, X_test, y_test

In [22]:
# Train test split the dataset
X_train, y_train, X_cv, y_cv, X_test, y_test = train_test_split(data, 'index_vni')

(2922, 3)
(418, 3)
(835, 3)
(2922,)
(418,)
(835,)


In [31]:
# Import scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Scale the data
X_train_scaled = scaler.fit_transform(X_train)
X_cv_scaled = scaler.transform(X_cv)
X_test_scaled = scaler.transform(X_test)

## Prepare models

We will prepare regression models. We will fit on non-tuned models first to see the model, then based on the result, we will perform further hyperparameters tunning if needed

In [77]:
# Import model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

In [85]:
# Call out object
lr = LinearRegression()
ridge = Ridge(alpha = 15)
lasso = Lasso(alpha = 20)
dtr = DecisionTreeRegressor(criterion = 'squared_error', max_depth = 10, min_samples_split = 4)
gb = GradientBoostingRegressor()
xgb = XGBRegressor()
rf = RandomForestRegressor(criterion = 'squared_error', max_depth = 10, min_samples_split = 4)

# Set models list
models = [lr, ridge, lasso, dtr, gb, xgb, rf]

## Test models

In [86]:
# Import metrics
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse

def rmse(mse):
    rmse = np.sqrt(mse)

In [87]:
# Define evaluate model
def evaluate_model(model, X_train, y_train, X_cv, y_cv):
    
    # Fit model and obtain result
    model.fit(X_train, y_train)
    y_pred_cv = model.predict(X_cv)
    MAE = mae(y_cv, y_pred_cv)
    MSE = mse(y_cv, y_pred_cv)
    RMSE = rmse(MSE)
    
    # Print result
    print(f"{model} result: \n"
          f"MAE: {MAE} \n"
          f"MSE: {MSE} \n"
          f"RMSE: {RMSE} \n")

In [88]:
# Test the model
for model in models:
    evaluate_model(model, X_train_scaled, y_train, X_cv_scaled, y_cv)

LinearRegression() result: 
MAE: 102.65879809516349 
MSE: 21204.551532465335 
RMSE: None 

Ridge(alpha=15) result: 
MAE: 99.68983842747956 
MSE: 19937.125148123094 
RMSE: None 

Lasso(alpha=20) result: 
MAE: 87.66394658271142 
MSE: 9232.927680218125 
RMSE: None 

DecisionTreeRegressor(max_depth=10, min_samples_split=4) result: 
MAE: 48.391081638755985 
MSE: 4055.2495698626776 
RMSE: None 

GradientBoostingRegressor() result: 
MAE: 40.860416708907614 
MSE: 3682.6169748885495 
RMSE: None 

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=No

XGBoost Regressor has the lowest error, and shows potentials to tunning for more. Furthermore, we just simply fit scaled data without any feature extraction.