# Stock Selection with Machine Learning
## Targets: 
1. Train 4 models to achieve the accuracy target
2. Backtest the models to calculate the profits generated from both stock price changes and dividend yield
3. Include dividend yield during model training and compare the outcome with the previous results

## 1. Package imports and initialization

In [None]:
# Basics and visualization
import pandas as pd
import numpy as np
import umap
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
from sklearn.pipeline import make_pipeline, Pipeline
import json
from numpy import vstack

# Data Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.utils import resample
import os
from sklearn import model_selection

# Models to be considered
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import xgboost as xgb
import torch
from torch import nn
from torch.autograd import Variable
import torchvision
from torchvision.transforms import ToTensor, Lambda
from torch.optim import SGD
from torch.nn.init import xavier_uniform_

# Evaluation
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score
from sklearn.metrics import mean_squared_error as MSE, r2_score, mean_absolute_error
from torch.nn import MSELoss
from math import sqrt

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Root folder that holds the input data and receives all generated output.
path = "E:/NUS_Exchange/Research/Datasets/"

# Create every working folder independently, so that one folder that already
# exists no longer prevents the remaining ones from being created.
# Parents are listed before their children because os.mkdir does not create
# intermediate folders.
for _folder in ("Output", "Data", "Output/GridSearchCV_Result", "Output/HoldOutValidation"):
    try:
        os.mkdir(path + _folder)
    except FileExistsError:
        print("Folder already exists")
    except OSError as err:
        # Still best effort (the notebook keeps running), but the real
        # problem is reported instead of being mislabelled as "already exists".
        print(f"Could not create folder {path + _folder}: {err}")

## 2. Data Processing

In [None]:
# Reading csv files from local repository
def readX(pathname):
    """
    Load one feature (X) CSV file into a DataFrame.

    The first column of the file becomes the index and the first row
    supplies the column names.

    Parameters
    ----------
    pathname : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    return pd.read_csv(pathname, index_col=[0], header=[0])

### 2.1 Data loading

In [None]:
def list_dir(file_dir, csv_files=None):
    """
    Recursively collect the paths of all ``.csv`` files below ``file_dir``.

    Parameters
    ----------
    file_dir : str
        Directory to scan; sub-directories are visited recursively.
    csv_files : list, optional
        List that receives the matching paths. When omitted, the
        module-level ``list_csv`` list is used (and created if it does not
        exist yet), so existing ``list_dir(path)`` calls keep filling that
        global exactly as before.

    Returns
    -------
    list
        The list the paths were appended to.
    """
    if csv_files is None:
        # Backward compatibility: callers read the result from the global.
        csv_files = globals().setdefault("list_csv", [])
    for entry in os.listdir(file_dir):
        entry_path = os.path.join(file_dir, entry)
        if os.path.isdir(entry_path):
            # A directory is only descended into, never reported as a CSV
            # file, even if its name happens to end in ".csv".
            list_dir(entry_path, csv_files)
        elif os.path.splitext(entry_path)[1] == '.csv':
            csv_files.append(entry_path)
    return csv_files
 
 
if __name__ == '__main__':
    # The accumulator has to exist at module level before the scan starts:
    # list_dir appends every CSV path it finds to `list_csv`.
    list_csv = []
    # Folder holding the per-stock CSV files.
    paths = r'E:/NUS_Exchange/Research/Datasets/Data/ASX_stockconsidering'
    list_dir(file_dir=paths)
    print(list_csv)

### 2.2 Stock price data scaling

### Use one stock as an example

In [None]:
# Load the first discovered CSV and re-index it by parsed timestamps.
AIA_df = readX(list_csv[0]).assign(Date=lambda frame: pd.to_datetime(frame.index))
AIA_df = AIA_df.set_index(['Date'], drop=True)

# Boundaries of the train/validation period (1 -> 3) and the test period (4 -> 5).
split_date1, split_date2, split_date3, split_date4, split_date5 = map(
    pd.Timestamp,
    ('2017-01-03', '2019-11-05', '2021-04-09', '2021-04-12', '2022-09-08'),
)

# Label-based slices on the date index (both end points are included).
TV_AIA_df = AIA_df.loc[split_date1:split_date3]
Test_AIA_df = AIA_df.loc[split_date4:split_date5]

# Model inputs; dividends are kept in separate frames and are not part of
# the feature matrix.
feature_columns = [
    "Volume", "OpenClose_spread", "Highlow_spread",
    "5_Days_MA", "10_Days_MA", "15_Days_MA", "30_Days_MA",
    "5_Days_VAR", "15_Days_VAR", "30_Days_VAR",
    "15_Days_EWMA", "15_Days_RSI", "15_Days_MFI", "15_Days_ATR",
    "ForceIndex", "Typical_MACD",
]
X_train = TV_AIA_df[feature_columns]
X_test = Test_AIA_df[feature_columns]
X_train_dividend = TV_AIA_df[["Dividends"]]
X_test_dividend = Test_AIA_df[["Dividends"]]

# Regression target: the price column of each period.
Y_train = TV_AIA_df[["Price"]]
Y_test = Test_AIA_df[["Price"]]

# Draw both price series on one set of axes.
plt.figure(figsize=(10, 6))
ax = Y_train['Price'].plot()
Y_test['Price'].plot(ax=ax)
plt.legend(['train', 'validation'])

In [None]:
# Candidate feature scalers; the grid search tries each of them in turn.
scales = [
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(),
]

## 3. Test and Criteria

In [None]:
def getScore(pipe_lr, X_valid, y_valid, prob = False):
    """
    Print the ROC AUC score of a fitted estimator on validation data.

    Parameters
    ----------
    pipe_lr : estimator or pipeline
        Fitted object exposing ``predict`` (and ``predict_proba`` when
        ``prob`` is true).
    X_valid : array-like
        Validation features.
    y_valid : array-like
        True labels of the validation samples.
    prob : bool, default False
        If true, score with the second column of ``predict_proba``;
        otherwise score with the output of ``predict``.

    Returns
    -------
    None
        The score is printed, not returned.
    """
    # Only run the prediction that is actually scored; previously `predict`
    # was always executed and its result thrown away when `prob` was set.
    if prob:
        y_score = pipe_lr.predict_proba(X_valid)[:, 1]
    else:
        y_score = pipe_lr.predict(X_valid)
    print('ROC: %.4f' % roc_auc_score(y_true=y_valid, y_score=y_score))

def evaluate_model(test_dl, model):
    """
    Compute the mean squared error of ``model`` over all batches of ``test_dl``.

    Parameters
    ----------
    test_dl : iterable
        Yields ``(inputs, targets)`` pairs; ``targets`` must support
        ``.numpy()`` and is reshaped to a single column per batch.
    model : callable
        Called as ``model(inputs)``; must return a torch tensor.

    Returns
    -------
    float
        MSE between the stacked targets and the stacked predictions.
    """
    predictions, actuals = [], []
    # Evaluation needs no gradients, so skip building the autograd graph.
    with torch.no_grad():
        for inputs, targets in test_dl:
            yhat = model(inputs).detach().numpy()
            actual = targets.numpy()
            # Turn the targets into a column so they stack like predictions.
            actual = actual.reshape((len(actual), 1))
            predictions.append(yhat)
            actuals.append(actual)
    predictions, actuals = vstack(predictions), vstack(actuals)
    return MSE(actuals, predictions)

def getPrediction_ANN(predict_dl, model):
    """
    Run ``model`` over every batch of ``predict_dl`` and return the predictions.

    Parameters
    ----------
    predict_dl : iterable
        Yields ``(inputs, targets)`` pairs; the targets are ignored.
    model : callable
        Called as ``model(inputs)``; must return a torch tensor.

    Returns
    -------
    pandas.DataFrame
        All batch predictions stacked vertically, in iteration order.
    """
    predictions = []
    # Prediction needs no gradients, so skip building the autograd graph.
    with torch.no_grad():
        for inputs, _targets in predict_dl:
            predictions.append(model(inputs).detach().numpy())
    return pd.DataFrame(data=vstack(predictions))



Rolling windows

In [None]:
# Rolling-window configuration read by RollingTrain / TrainRolling.
Totaldatas = 720    # number of rows in each training slice
Window = 30         # rows in each validation slice, and the step between slices
Rollingtimes = 12   # how many consecutive windows are evaluated

# Shared log that receives the MSE of every evaluated window.
model_mse = []

def RollingTrain(X_train, Y_train, times, selectedModel, scale):
    """
    Fit and score ``selectedModel`` on one rolling window.

    The training slice is the ``Totaldatas`` rows starting at
    ``Window * times``; the validation slice is the ``Window`` rows that
    directly follow it. The resulting MSE is appended to the module-level
    ``model_mse`` list.

    Parameters
    ----------
    X_train, Y_train : pandas.DataFrame
        Full feature and target frames, sliced positionally with ``iloc``.
    times : int
        Zero-based index of the window.
    selectedModel : estimator
        Object with ``fit`` / ``predict``; it is refitted in place.
    scale : transformer
        Object with ``fit`` / ``transform``; it is refitted in place.

    Returns
    -------
    None
    """
    train_start = Window * times
    train_end = Totaldatas + train_start
    valid_end = train_end + Window

    X1 = X_train.iloc[train_start:train_end]
    Y1 = Y_train.iloc[train_start:train_end]
    X1_ = X_train.iloc[train_end:valid_end]
    Y1_ = Y_train.iloc[train_end:valid_end]

    scaler = scale.fit(X1)
    X1_scaled = scaler.transform(X1)
    # Reuse the statistics learned from the training slice. Refitting on the
    # validation slice (the former fit_transform) scaled the two slices
    # differently and used validation data for preprocessing.
    X1__scaled = scaler.transform(X1_)

    model = selectedModel.fit(X1_scaled, Y1)
    prediction = model.predict(X1__scaled)
    mse = MSE(Y1_, prediction)
    model_mse.append(mse)

def TrainRolling(Rollingtimes, selectedModel, scale):
    """
    Evaluate ``selectedModel`` on ``Rollingtimes`` consecutive rolling windows.

    Each window is processed by ``RollingTrain`` on the module-level
    ``X_train`` / ``Y_train`` frames, which appends one MSE per window to
    the module-level ``model_mse`` list.

    Parameters
    ----------
    Rollingtimes : int
        Number of windows to evaluate.
    selectedModel : estimator
        Model passed on to ``RollingTrain``.
    scale : transformer
        Scaler passed on to ``RollingTrain``.

    Returns
    -------
    float
        Mean MSE over the windows evaluated by this call.
    """
    # model_mse is shared and never cleared, so remember where this run
    # starts; averaging the whole list would blend in the scores of every
    # model evaluated earlier.
    first_new = len(model_mse)
    for i in range(Rollingtimes):
        RollingTrain(X_train, Y_train, i, selectedModel, scale)
    return np.mean(model_mse[first_new:])


## 4. Random Forest

### 4.1 Parameters to be considered

In [None]:
# Random-forest hyper-parameter grid; RandomForestTraining tries every
# combination of the four lists below.
rf_n_estimators = [
    50, 100, 300, 500, 800, 1200,
]
rf_max_depth = [
    None, 20, 100, 300, 500, 1000,
]
rf_max_leaf_nodes = [
    1000, 500, 200, 100,
]
rf_min_samples_split = [
    2, 10, 20, 50,
]


In [None]:
# Description of every configuration tried by RandomForestTraining.
rf_parameters = []
def RandomForestTraining():
    """
    Grid-search a RandomForestRegressor with rolling-window validation.

    Every combination of ``rf_n_estimators``, ``rf_max_depth``,
    ``rf_max_leaf_nodes``, ``rf_min_samples_split`` and ``scales`` is scored
    with ``TrainRolling``. For each combination one score is appended to the
    returned list and one description is appended to the module-level
    ``rf_parameters`` list.

    Returns
    -------
    list
        One ``TrainRolling`` result per configuration, in grid order.
    """
    from itertools import product

    averageMSE = []
    # product() varies the right-most list fastest, which is the same order
    # the equivalent nested for-loops would visit the grid in.
    grid = product(rf_n_estimators, rf_max_depth, rf_max_leaf_nodes, rf_min_samples_split, scales)
    for n_estimators, max_depth, max_leaf_nodes, min_samples_split, scaler in grid:
        rf = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_leaf_nodes=max_leaf_nodes,
            min_samples_split=min_samples_split,
        )
        averageMSE.append(TrainRolling(Rollingtimes, rf, scaler))
        rf_parameters.append(["rf_n_estimators: ", n_estimators, "; rf_max_depth:", max_depth, "; rf_max_leaf_nodes: ", max_leaf_nodes, "; rf_min_samples_split: ",
                              min_samples_split, "; scaler: ", scaler])
    return averageMSE

In [None]:
# Run the random-forest grid search; x receives the list returned by
# RandomForestTraining (one score per tried configuration).
x = RandomForestTraining()