## Install / Import Libraries

In [None]:
! pip install yfinance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import re
from scipy.sparse import csr_matrix

from tqdm.auto import tqdm
from pathlib import Path
import os

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

import yfinance

import pickle


## Helper Functions

### load sp500 data & prepare technical index

In [None]:
# (Yahoo ticker, output column) for every stock whose daily return is used as a feature.
# NOTE: 'tsla_Cloae' is misspelled, but the pipelines in this notebook select the column
# by exactly this name, so it must stay as it is unless every use is renamed together.
_STOCK_CLOSE_COLUMNS = (('AAPL', 'appl_Close'), ('MSFT', 'msft_Close'), ('AMZN', 'amzn_Close'),
                        ('NVDA', 'nvda_Close'), ('BRK-B', 'brk_Close'), ('GOOGL', 'googl_Close'),
                        ('TSLA', 'tsla_Cloae'), ('META', 'meta_Close'), ('XOM', 'xom_Close'),
                        ('JPM', 'jpm_Close'))


def _close_pct_change(ticker, start_date, end_date, column_name):
  """Download one ticker and return a (Date, <column_name>) frame of close-to-close percentage changes."""
  history = yfinance.Ticker(ticker).history(start = start_date, end = end_date)
  history.reset_index(inplace = True)
  # rebuild Date from the calendar date only, so it matches the index frame's Date key in the merge
  history['Date'] = pd.to_datetime(history['Date'].dt.date)
  return pd.DataFrame({'Date': history['Date'], column_name: history['Close'].pct_change()})


def load_sp500_data(start_date = '2014-12-31', end_date = '2021-01-05'):
  """Build the daily S&P 500 feature/label table used by every model in this notebook.

  Parameters
  ----------
  start_date, end_date : str
      Download window handed to yfinance for the index and for every stock.

  Returns
  -------
  pandas.DataFrame
      Columns, in order: Date, Open, High, Low, Volume, Close, Close+1day, up_down,
      MA10, MA20, MA30, followed by one daily-return column per entry of
      ``_STOCK_CLOSE_COLUMNS``. ``up_down`` is 1 when the next close is above the
      current close, otherwise 0. Rows are limited to 2014-12-31 .. 2020-12-31 and any
      row containing a missing value is dropped.
  """
  sp_history = yfinance.Ticker('^GSPC').history(start = start_date, end = end_date)
  sp_history.reset_index(inplace = True)
  sp_history['Date'] = pd.to_datetime(sp_history['Date'].dt.date)
  # selecting the wanted columns already discards 'Dividends' / 'Stock Splits'
  sp_history = sp_history.loc[:, ['Date', 'Open', 'High', 'Low', 'Volume', 'Close']].copy()

  # label: direction of the next trading day's close; the last row has no "next" close and is dropped
  sp_history['Close+1day'] = sp_history['Close'].shift(-1)
  sp_history.dropna(inplace = True)
  sp_history['up_down'] = (sp_history['Close+1day'] > sp_history['Close']).astype('int64')

  # simple moving averages of the index close
  for window in (10, 20, 30):
    sp_history['MA' + str(window)] = sp_history['Close'].rolling(window).mean()

  # left-join each stock's daily return onto the index rows
  for ticker, column_name in _STOCK_CLOSE_COLUMNS:
    stock_returns = _close_pct_change(ticker, start_date, end_date, column_name)
    sp_history = sp_history.merge(stock_returns, on = 'Date', how = 'left')

  # The study window is fixed regardless of the download window; the default end_date
  # reaches past it, presumably so the last 2020 session still has a next-day close.
  in_window = (sp_history['Date'] >= '2014-12-31') & (sp_history['Date'] <= '2020-12-31')
  sp_history = sp_history[in_window].copy()
  # removes the warm-up rows of the moving averages / returns and days a stock has no data
  sp_history.dropna(inplace = True)

  return sp_history

### load text dataset & combine sp500

In [None]:
def load_fold_dataset(directory: 'str', 
                      file_name: 'str',
                      sp500_fn: 'function'):
  """Return the market features and labels covering the date span of one fold file.

  Parameters
  ----------
  directory, file_name :
      Location of a fold CSV that has a ``timestamp`` column.
  sp500_fn :
      Zero-argument callable returning the market table (``Date``, ``Close``,
      ``up_down`` columns, with the feature columns starting at position 8).

  Returns
  -------
  (data_X, data_y)
      ``data_X`` holds Date, Close and every column from position 8 on;
      ``data_y`` holds Date and up_down. Both are restricted to the rows whose
      Date lies between the earliest and latest timestamp of the fold file.
  """
  fold_rows = pd.read_csv(Path(directory) / file_name)

  # use the true earliest/latest timestamp instead of the first/last row,
  # so the window does not depend on how the CSV happens to be ordered
  timestamps = pd.to_datetime(fold_rows['timestamp'])
  start_date, end_date = timestamps.min(), timestamps.max()

  sp500 = sp500_fn()
  column_subset_X = ['Date', 'Close'] + list(sp500.columns[8:])

  in_fold = (sp500['Date'] >= start_date) & (sp500['Date'] <= end_date)
  data_X = sp500.loc[in_fold, column_subset_X].copy()
  data_y = sp500.loc[in_fold, ['Date', 'up_down']].copy()

  return data_X, data_y

In [None]:
def load_dataset(sp500_fn: 'function', 
                 training_data = True):
  """Split the market table chronologically into a fitting part and an evaluation part.

  Parameters
  ----------
  sp500_fn :
      Zero-argument callable returning the market table (``Date``, ``Close``,
      ``up_down`` columns, with the feature columns starting at position 8).
  training_data : bool
      True  -> fit on dates up to 2018-12-31, evaluate on calendar year 2019.
      False -> fit on dates up to 2019-12-31, evaluate on 2020-01-01 onwards.

  Returns
  -------
  (X_fit, X_eval, y_fit, y_eval)
      X frames hold Date, Close and every column from position 8 on;
      y frames hold Date and up_down.
  """
  sp500 = sp500_fn()

  column_subset_X = ['Date', 'Close'] + list(sp500.columns[8:])
  data_X = sp500.loc[:, column_subset_X].copy()
  data_y = sp500.loc[:, ['Date', 'up_down']].copy()

  if training_data:
    fit_end, eval_start, eval_end = '2018-12-31', '2019-01-01', '2019-12-31'
  else:
    fit_end, eval_start, eval_end = '2019-12-31', '2020-01-01', None

  # one pair of masks, applied to both X and y so the two can never drift apart
  dates = sp500['Date']
  fit_mask = dates <= fit_end
  eval_mask = dates >= eval_start
  if eval_end is not None:
    eval_mask = eval_mask & (dates <= eval_end)

  return (data_X[fit_mask].copy(), data_X[eval_mask].copy(),
          data_y[fit_mask].copy(), data_y[eval_mask].copy())

### function to prepare 10 folds data

In [None]:
def prepare_folds_dataset(cv_path: 'str'):
  """Load the train/valid market data for every cross-validation fold under ``cv_path``.

  Each sub-directory of ``cv_path`` is one fold (named 'fold-1' .. 'fold-10' in this
  project). Inside it, a file whose name contains 'train' defines the training span and
  a file whose name contains 'valid' defines the validation span.

  Returns
  -------
  dict
      ``{fold_name: {'train': (X, y) or None, 'valid': (X, y) or None}}``. The ten
      expected fold names are always present; a split stays None when no matching
      file was found.
  """
  cv_path = Path(cv_path)
  fold_dataset = {f'fold-{k}': {'train': None, 'valid': None} for k in range(1, 11)}

  # Download the market table once and share it: load_fold_dataset only reads from it
  # and returns filtered copies, so re-downloading it for every file is wasted work.
  sp500 = load_sp500_data()

  def shared_sp500():
    return sp500

  for fold_dir in cv_path.iterdir():
    if not fold_dir.is_dir():
      continue  # stray files next to the fold folders cannot be iterated
    splits = fold_dataset.setdefault(fold_dir.name, {'train': None, 'valid': None})

    for split_file in fold_dir.iterdir():
      if 'train' in split_file.name:
        split_name = 'train'
      elif 'valid' in split_file.name:
        split_name = 'valid'
      else:
        continue
      splits[split_name] = load_fold_dataset(directory = fold_dir,
                                             file_name = split_file.name,
                                             sp500_fn = shared_sp500)

  return fold_dataset

### load Loughran-McDonald sentiment dictionary

In [None]:
def loughran_mcdonald_dict(directory: 'str', file_name: 'str'):
  """Load the Loughran-McDonald master dictionary as a word -> 0/1 category-flag table.

  Parameters
  ----------
  directory, file_name :
      Location of the master dictionary CSV.

  Returns
  -------
  pandas.DataFrame
      Columns Word (lower-cased) plus Negative, Positive, Uncertainty, Litigious,
      Strong_Modal, Weak_Modal and Constraining, each 1 where the source value is
      greater than zero and 0 otherwise. Rows without a word are removed and the
      index is renumbered from 0.
  """
  master = pd.read_csv(Path(directory) / file_name)

  category_columns = ['Negative', 'Positive', 'Uncertainty', 'Litigious',
                      'Strong_Modal', 'Weak_Modal', 'Constraining']
  data_part = master[['Word'] + category_columns].copy()

  data_part['Word'] = data_part['Word'].str.lower()

  # any positive source value means "word belongs to this category"
  data_part[category_columns] = (data_part[category_columns] > 0).astype('int64')

  # Drop rows whose word was read as missing, wherever they sit in the file, instead of
  # relying on a fixed row number. NOTE(review): presumably this is a word such as
  # "NULL" that read_csv treats as NA - confirm against the dictionary file.
  data_part = data_part.dropna(subset = ['Word']).reset_index(drop = True)

  return data_part

### gridsearchcv, using 10 folds dataset (Linear SVC)

In [None]:
def gridsearchcv(folds_data: 'dict', parametergrid: 'list', model: 'function'):
  """Score every parameter combination on each of the ten pre-built folds.

  Parameters
  ----------
  folds_data : dict
      ``{fold_name: {'train': (X, y), 'valid': (X, y)}}`` where each ``y`` has an
      ``up_down`` column.
  parametergrid :
      Iterable of dicts mapping estimator parameter names to values
      (e.g. a ``ParameterGrid``).
  model :
      Estimator/pipeline exposing ``set_params``, ``fit``, ``predict`` and ``score``.
      It is configured in place, so after the call it holds the last combination and
      the fit from the last fold.

  Returns
  -------
  dict
      Column-oriented scores with keys parameter, fold, train_accuracy, train_f1,
      valid_accuracy and valid_f1 (one entry per combination and fold; F1 is macro).
  """
  folds = [f'fold-{k}' for k in range(1, 11)]
  score_dict = {'parameter': [], 'fold':[], 'train_accuracy':[], 'train_f1':[], 'valid_accuracy': [], 'valid_f1': []}

  for parameter in tqdm(parametergrid):
    # apply every entry of the combination, whatever the parameter names are
    model_initialized = model.set_params(**parameter)

    for fold in folds:
      X_train, y_train = folds_data[fold]['train']
      X_valid, y_valid = folds_data[fold]['valid']
      train_labels = y_train['up_down'].values
      valid_labels = y_valid['up_down'].values

      model_initialized.fit(X_train, train_labels)

      train_acc = model_initialized.score(X_train, train_labels)
      train_f1 = f1_score(train_labels, model_initialized.predict(X_train), average = 'macro')
      valid_acc = model_initialized.score(X_valid, valid_labels)
      valid_f1 = f1_score(valid_labels, model_initialized.predict(X_valid), average = 'macro')

      # record the row only once everything was computed, so all lists stay the same length
      score_dict['parameter'].append(parameter)
      score_dict['fold'].append(fold)
      score_dict['train_accuracy'].append(train_acc)
      score_dict['train_f1'].append(train_f1)
      score_dict['valid_accuracy'].append(valid_acc)
      score_dict['valid_f1'].append(valid_f1)

  return score_dict


### function to visualize gridsearchcv scores (linearSVC)

In [None]:
def visualize_gridsearchcv(cv_result: 'df'):
  """Plot mean validation macro-F1 (with +/- 1 std error bars) against the LinearSVC C value.

  ``cv_result`` is the aggregated grid-search frame: its first column is
  ``classification__C`` and its last two columns are the mean and std of ``valid_f1``.
  Every point is labelled with its mean (black) and its std (red).
  """
  plt.figure(figsize = (12, 6))

  c_values = cv_result['classification__C']
  f1_mean = cv_result[('valid_f1', 'mean')]
  f1_std = cv_result[('valid_f1', 'std')]

  plt.plot(c_values, f1_mean, label = 'valid f1 macro', color = 'blue')
  plt.errorbar(x = c_values, y = f1_mean, yerr = f1_std, label = '+/- 1 std', color = 'blue')

  for record in cv_result.itertuples():
    # positional access: [0] is the index, [1] the C value, the last two the valid_f1 mean/std
    x_pos, mean_value, std_value = record[1], record[-2], record[-1]
    plt.annotate(f'{mean_value:.4f}', xy = (x_pos + 0.0001, mean_value + 0.0002), color = 'black', fontsize = 10)
    plt.annotate(f'std: {std_value:.4f}', xy = (x_pos + 0.0001, mean_value + 0.005), color = 'red', fontsize = 10)

  plt.legend()
  plt.xlim((0, 0.055))
  plt.xlabel('linear SVC, C value')
  plt.ylabel('valid f1 score (macro)')
  plt.show()

### gridsearchcv, using 10 folds dataset (XGBoost)

In [None]:
def gridsearchcv_xgb(folds_data: 'dict', parametergrid: 'list', model: 'function'):
  """Score every XGBoost parameter combination on each of the ten pre-built folds.

  Parameters
  ----------
  folds_data : dict
      ``{fold_name: {'train': (X, y), 'valid': (X, y)}}`` where each ``y`` has an
      ``up_down`` column.
  parametergrid :
      Iterable of dicts mapping estimator parameter names to values
      (e.g. a ``ParameterGrid``).
  model :
      Estimator/pipeline exposing ``set_params``, ``fit``, ``predict`` and ``score``.
      It is configured in place, so after the call it holds the last combination and
      the fit from the last fold.

  Returns
  -------
  dict
      Column-oriented scores with keys parameter, fold, train_accuracy, train_f1,
      valid_accuracy and valid_f1 (one entry per combination and fold; F1 is macro).
  """
  folds = [f'fold-{k}' for k in range(1, 11)]
  score_dict = {'parameter': [], 'fold':[], 'train_accuracy':[], 'train_f1':[], 'valid_accuracy': [], 'valid_f1': []}

  for parameter in tqdm(parametergrid):
    # apply every entry of the combination, whatever the parameter names are
    model_initialized = model.set_params(**parameter)

    for fold in folds:
      X_train, y_train = folds_data[fold]['train']
      X_valid, y_valid = folds_data[fold]['valid']
      train_labels = y_train['up_down'].values
      valid_labels = y_valid['up_down'].values

      model_initialized.fit(X_train, train_labels)

      train_acc = model_initialized.score(X_train, train_labels)
      train_f1 = f1_score(train_labels, model_initialized.predict(X_train), average = 'macro')
      valid_acc = model_initialized.score(X_valid, valid_labels)
      valid_f1 = f1_score(valid_labels, model_initialized.predict(X_valid), average = 'macro')

      # record the row only once everything was computed, so all lists stay the same length
      score_dict['parameter'].append(parameter)
      score_dict['fold'].append(fold)
      score_dict['train_accuracy'].append(train_acc)
      score_dict['train_f1'].append(train_f1)
      score_dict['valid_accuracy'].append(valid_acc)
      score_dict['valid_f1'].append(valid_f1)

  return score_dict

### function to visualize gridsearchcv result (XGBoost)

In [None]:
def visualize_gridsearchcv_xgb(cv_result: 'df'):
  """Plot mean validation macro-F1 against learning rate, one stacked panel per max_depth.

  ``cv_result`` is the aggregated XGBoost grid-search frame with the columns
  ``classification__learning_rate``, ``classification__max_depth`` and the
  ``('valid_f1', 'mean')`` / ``('valid_f1', 'std')`` pair. Each panel shows the mean
  with +/- 1 std error bars and labels every point with its mean and its std.
  """
  df_cv_result = cv_result[['classification__learning_rate',
                            'classification__max_depth',
                            'valid_f1']].copy()

  df_cv_result.rename(columns = {'classification__learning_rate': 'learning_rate', 
                                 'classification__max_depth': 'max_depth'}, 
                      inplace = True)

  # one panel per depth actually present in the results
  depth_list = sorted(df_cv_result['max_depth'].unique())

  plt.figure(figsize = (8, 6))

  for idx, depth_no in enumerate(depth_list):
    ax = plt.subplot(len(depth_list), 1, idx + 1)

    at_depth = df_cv_result.loc[df_cv_result['max_depth'] == depth_no]
    learning_rates = at_depth['learning_rate']
    f1_mean = at_depth[('valid_f1', 'mean')].values
    f1_std = at_depth[('valid_f1', 'std')].values

    ax.plot(learning_rates, f1_mean,
            color = 'blue', 
            linewidth = 1, 
            label = f'depth: {depth_no}')
    ax.errorbar(x = learning_rates, y = f1_mean, yerr = f1_std,
                label = '+/- 1 std', color = 'blue')

    for rate, mean_value, std_value in zip(learning_rates, f1_mean, f1_std):
      ax.annotate(f'{mean_value:.4f}', (rate, mean_value + 0.001), fontsize = 10)
      # Anchor the std label relative to the mean: positioned at the std value itself it
      # falls below the fixed y-range and is clipped. The offset leaves about one text
      # line between it and the mean label.
      ax.annotate(f'std: {std_value:.4f}', (rate, mean_value + 0.03), color = 'red', fontsize = 10)

    ax.set_ylim((0.3, 0.6))
    ax.set_ylabel('mean f1', fontsize = 12)
    plt.yticks(fontsize=9)
    ax.legend(fontsize = 10)

    # only the bottom panel keeps its x tick labels
    if idx != len(depth_list) - 1:
      ax.tick_params(labelbottom=False)

    if idx == 0:
      plt.title('validation f1 macro of the grid search cv results')

  plt.tight_layout(pad = 0.5)
  plt.xlabel('XGBoost, learning rate', fontsize = 12)
  plt.show()

### function to visualize confusion matrix (binary label)

In [None]:
def performance_metrics_binary(model, data, true_y, train_valid_test: 'str'):
  """Score a fitted binary classifier, draw its confusion matrix and print the scores.

  Parameters
  ----------
  model :
      Fitted estimator exposing ``predict`` and ``score``.
  data, true_y :
      Features and true labels to evaluate on.
  train_valid_test : str
      Name of the data split; only used in the printed summary line.

  Returns
  -------
  (accuracy, macro_f1)
      ``model.score(data, true_y)`` and the macro-averaged F1 of the predictions.
  """
  predictions = model.predict(data)
  acc = model.score(data, true_y)
  macro_f1 = f1_score(true_y, predictions, average = 'macro')

  plt.figure(figsize = (4, 4))
  sns.set(font_scale=1.2)

  class_names = ['Negative', 'Positive']
  # normalize='pred' scales the matrix per predicted label (column)
  heatmap_ax = sns.heatmap(confusion_matrix(true_y, predictions, normalize = 'pred'),
                           cmap="Blues",
                           annot = True,
                           fmt=".2f", annot_kws={'size': 15},
                           xticklabels=class_names,
                           yticklabels=class_names)
  heatmap_ax.set(xlabel='Predicted Label', ylabel='True Label', title = 'price movement')

  plt.show()
  print(f'\n{train_valid_test} accuracy: {acc}, {train_valid_test} f1 score: {macro_f1}')
  return acc, macro_f1

## Load Data

### prepare loughran-mcdonald sentiment dictionary

In [None]:
# Build the lower-cased word / category-flag table from the Loughran-McDonald master dictionary CSV.
lm_sent_dict = loughran_mcdonald_dict(directory = '../data/TF-IDF Models', 
                                      file_name = 'Loughran-McDonald_MasterDictionary_1993-2021.csv')

### load 10 folds dataset

In [None]:
# Reuse the cached fold data when the pickle exists; otherwise rebuild it from the fold CSVs.
# NOTE(review): pickle.load executes whatever the file contains - only load a file this
# project produced itself.
folds_data_path = Path(r'../data/TF-IDF Models/Intermediate Output/dict_folds_data.pickle')

if not folds_data_path.is_file():
  dict_folds_data = prepare_folds_dataset(cv_path = r'../data/TF-IDF Models/Cross Validation_fold_data')
else:
  with folds_data_path.open('rb') as f_1:
    dict_folds_data = pickle.load(f_1)

### load and split training/ valid dataset

In [None]:
# training_data = True: fit on dates up to 2018-12-31, validate on calendar year 2019.
X_train, X_valid, y_train, y_valid = load_dataset(sp500_fn = load_sp500_data, training_data = True)

## Build LinearSVC Pipeline

### initialize necessary functions

In [None]:
# Scaler applied to the price / moving-average / stock-return columns inside the pipeline.
minmaxscaler_price = MinMaxScaler()

### build pipeline (binary label)

In [None]:
# Columns the model sees: index close, its moving averages and the ten stock returns.
# 'tsla_Cloae' is spelled the way the data frame actually names the column.
price_feature_columns = ['Close', 'MA10', 'MA20', 'MA30',
                         'appl_Close', 'msft_Close', 'amzn_Close',
                         'nvda_Close', 'brk_Close', 'googl_Close',
                         'tsla_Cloae', 'meta_Close', 'xom_Close', 'jpm_Close']

# single-step sub-pipeline that rescales the selected columns
ma_pipeline = Pipeline(steps = [('norm_price', minmaxscaler_price)])

column_transformer = ColumnTransformer(transformers = [('price_ma', ma_pipeline, price_feature_columns)])

# preprocessing followed by a class-balanced linear SVM
linear_svc = LinearSVC(C= 0.05, class_weight = 'balanced')
clf_pipeline_binary = Pipeline(steps = [('preprocessing', column_transformer),
                                        ('classification', linear_svc)])

### 10-fold gridsearchcv (Linear SVC)

In [None]:
# Candidate values for the LinearSVC step; the 'classification__' prefix addresses the pipeline step of that name.
parameters_grid = {'classification__C': [0.001, 0.005, 0.01, 0.05], 
                   'classification__penalty': ['l2']}

# expand the grid into one dict per parameter combination
parameters = ParameterGrid(parameters_grid)

# per-fold train/valid accuracy and macro-F1 for every combination
result = gridsearchcv(folds_data = dict_folds_data, parametergrid = parameters, model = clf_pipeline_binary)

In [None]:
# one row per (parameter combination, fold)
df_result = pd.DataFrame(result)

# unpack the parameter dicts into one list per hyper-parameter, aligned row-for-row with df_result
parameters_grid_1 = {name: [combination[name] for combination in df_result['parameter']]
                     for name in ('classification__C', 'classification__penalty')}

df_result_1 = df_result.merge(pd.DataFrame(parameters_grid_1), how = 'left', left_index = True, right_index = True)

# mean and std over the folds for every metric, per parameter combination;
# iloc[:, 1:] leaves out the 'parameter' column of dicts
metric_aggregations = {metric: ['mean', 'std']
                       for metric in ('train_accuracy', 'train_f1', 'valid_accuracy', 'valid_f1')}
df_result_complete = (df_result_1.iloc[:,1:]
                      .groupby(['classification__C', 'classification__penalty'], as_index = False)
                      .agg(metric_aggregations))

In [None]:
# Preview the first per-fold score rows.
df_result.head()

In [None]:
# idxmax returns an index *label*, so the row is looked up with .loc rather than positional .iloc
df_result_complete.loc[df_result_complete[('valid_f1', 'mean')].idxmax()] # the best combination of parameters 

### visualize gridsearchcv result (Linear SVC)

In [None]:
# Plot mean validation macro-F1 (+/- 1 std) against C for the LinearSVC grid search.
visualize_gridsearchcv(cv_result = df_result_complete)

### confirm the best model (Linear SVC)

In [None]:
# Rebuild the LinearSVC pipeline from scratch (fresh, unfitted steps) with C = 0.05
# and fit it on the training split.
price_feature_columns = ['Close', 'MA10', 'MA20', 'MA30',
                         'appl_Close', 'msft_Close', 'amzn_Close',
                         'nvda_Close', 'brk_Close', 'googl_Close',
                         'tsla_Cloae', 'meta_Close', 'xom_Close', 'jpm_Close']

minmaxscaler_price = MinMaxScaler()
ma_pipeline = Pipeline(steps = [('norm_price', minmaxscaler_price)])
column_transformer = ColumnTransformer(transformers = [('price_ma', ma_pipeline, price_feature_columns)])

linear_svc = LinearSVC(C= 0.05, class_weight = 'balanced')
clf_pipeline_binary = Pipeline(steps = [('preprocessing', column_transformer),
                                        ('classification', linear_svc)])

clf_pipeline_binary.fit(X_train, y_train['up_down'].values)

### performance evaluation and confusion matrix (train dataset)

In [None]:
# Accuracy, macro-F1 and confusion matrix of the fitted LinearSVC pipeline on the data it was trained on.
accuracy_score_train, f1score_train = performance_metrics_binary(model = clf_pipeline_binary, 
                                                                 data = X_train, 
                                                                 true_y = y_train['up_down'].values, 
                                                                 train_valid_test = 'train')

### performance evaluation & confusion matrix (valid dataset)

In [None]:
# Same evaluation on the held-out validation split.
accuracy_score_valid, f1score_valid = performance_metrics_binary(model = clf_pipeline_binary, 
                                                                 data = X_valid, 
                                                                 true_y = y_valid['up_down'].values, 
                                                                 train_valid_test = 'valid')

## Build XGBoost Classifier

### build pipeline

In [None]:
# XGBoost pipeline: same scaled feature columns as the LinearSVC pipeline, different classifier.
xgb_feature_columns = ['Close', 'MA10', 'MA20', 'MA30',
                       'appl_Close', 'msft_Close', 'amzn_Close',
                       'nvda_Close', 'brk_Close', 'googl_Close',
                       'tsla_Cloae', 'meta_Close', 'xom_Close', 'jpm_Close']

minmaxscaler_price_xgboost = MinMaxScaler()
ma_pipeline_xg = Pipeline(steps = [('norm_price', minmaxscaler_price_xgboost)])
column_transformer_xg = ColumnTransformer(transformers = [('price_ma', ma_pipeline_xg, xgb_feature_columns)])

# starting configuration; learning_rate and max_depth are overwritten by the grid search
xgb_settings = {'booster': 'gbtree',
                'min_split_loss': 0.01,
                'learning_rate': 0.01,
                'max_depth': 3,
                'n_estimators': 1000,
                'scale_pos_weight': 0.9}
clf_xgb = XGBClassifier(**xgb_settings)

clf_pipeline_binary_xgboost = Pipeline(steps = [('preprocessing', column_transformer_xg),
                                                ('classification', clf_xgb)])

### 10-fold gridsearchcv (XGBoost Classifier)

In [None]:
# Candidate learning rates and tree depths for the XGBoost step of the pipeline.
parameters_grid_xgb = {'classification__learning_rate': [0.0005, 0.0007, 0.001], 
                   'classification__max_depth': [3, 4 , 5]}

# expand the grid into one dict per parameter combination
parameters_xgb = ParameterGrid(parameters_grid_xgb)

# per-fold train/valid accuracy and macro-F1 for every combination
result_xgb = gridsearchcv_xgb(folds_data = dict_folds_data, parametergrid = parameters_xgb, model = clf_pipeline_binary_xgboost)

In [None]:
# one row per (parameter combination, fold)
df_result_xgb = pd.DataFrame(result_xgb)

# unpack the parameter dicts into one list per hyper-parameter, aligned row-for-row with df_result_xgb
parameters_grid_xgb_1 = {name: [combination[name] for combination in df_result_xgb['parameter']]
                         for name in ('classification__learning_rate', 'classification__max_depth')}

df_result_xgb_1 = df_result_xgb.merge(pd.DataFrame(parameters_grid_xgb_1), how = 'left', left_index = True, right_index = True)

# mean and std over the folds for every metric, per parameter combination;
# iloc[:, 1:] leaves out the 'parameter' column of dicts
xgb_metric_aggregations = {metric: ['mean', 'std']
                           for metric in ('train_accuracy', 'train_f1', 'valid_accuracy', 'valid_f1')}
df_result_xgb_complete = (df_result_xgb_1.iloc[:,1:]
                          .groupby(['classification__learning_rate', 'classification__max_depth'], as_index = False)
                          .agg(xgb_metric_aggregations))

In [None]:
# idxmax returns an index *label*, so the row is looked up with .loc rather than positional .iloc
df_result_xgb_complete.loc[df_result_xgb_complete[('valid_f1', 'mean')].idxmax()] # the best combination of parameters 

In [None]:
# Display the aggregated XGBoost grid-search table (mean / std per parameter combination).
df_result_xgb_complete

In [None]:
# Validation-F1 view of the XGBoost results with shorter parameter column names.
xgb_columns_of_interest = ['classification__learning_rate',
                           'classification__max_depth',
                           'valid_f1']
xgb_short_names = {'classification__learning_rate': 'learning_rate',
                   'classification__max_depth': 'max_depth'}

df_cv_result = df_result_xgb_complete[xgb_columns_of_interest].rename(columns = xgb_short_names)

### visualize gridsearchcv result (XGBoost)

In [None]:
# One panel per max_depth: mean validation macro-F1 against learning rate.
visualize_gridsearchcv_xgb(df_result_xgb_complete)

### confirm the best model

In [None]:
# Rebuild the XGBoost pipeline from scratch with learning_rate = 0.001 and max_depth = 4
# and fit it on the training split.
xgb_feature_columns = ['Close', 'MA10', 'MA20', 'MA30',
                       'appl_Close', 'msft_Close', 'amzn_Close',
                       'nvda_Close', 'brk_Close', 'googl_Close',
                       'tsla_Cloae', 'meta_Close', 'xom_Close', 'jpm_Close']

minmaxscaler_price_xgboost = MinMaxScaler()
ma_pipeline_xg = Pipeline(steps = [('norm_price', minmaxscaler_price_xgboost)])
column_transformer_xg = ColumnTransformer(transformers = [('price_ma', ma_pipeline_xg, xgb_feature_columns)])

xgb_settings = {'booster': 'gbtree',
                'min_split_loss': 0.01,
                'learning_rate': 0.001,
                'max_depth': 4,
                'n_estimators': 1000,
                'scale_pos_weight': 0.9}
clf_xgb = XGBClassifier(**xgb_settings)

clf_pipeline_binary_xgboost = Pipeline(steps = [('preprocessing', column_transformer_xg),
                                                ('classification', clf_xgb)])

clf_pipeline_binary_xgboost.fit(X_train, y_train['up_down'].values)

### performance evaluation & confusion matrix (train)

In [None]:
# Scores of the fitted XGBoost pipeline on the data it was trained on; kept under the
# *_train names so they are not mistaken for validation scores.
accuracy_score_train, f1score_train = performance_metrics_binary(model = clf_pipeline_binary_xgboost, 
                                                                 data = X_train, 
                                                                 true_y = y_train['up_down'].values, 
                                                                 train_valid_test = 'train')

### performance evaluation & confusion matrix (valid)

In [None]:
# Accuracy, macro-F1 and confusion matrix of the fitted XGBoost pipeline on the validation split.
accuracy_score_valid, f1score_valid = performance_metrics_binary(model = clf_pipeline_binary_xgboost, 
                                                                 data = X_valid, 
                                                                 true_y = y_valid['up_down'].values, 
                                                                 train_valid_test = 'valid')

## Build Stacking Model

In [None]:
# Stack the two pipelines: a logistic regression combines the LinearSVC and XGBoost outputs.
logistic = LogisticRegression()
base_learners = [('linearsvc', clf_pipeline_binary),
                 ('xgboost', clf_pipeline_binary_xgboost)]
stack_pipeline = StackingClassifier(estimators = base_learners, final_estimator = logistic)

stack_pipeline.fit(X_train, y_train['up_down'].values)

### performance evaluation & confusion matrix (training data)

In [None]:
# Accuracy, macro-F1 and confusion matrix of the fitted stacking model on the data it was trained on.
accuracy_score_train, f1score_train = performance_metrics_binary(model = stack_pipeline, 
                                                                 data = X_train, 
                                                                 true_y = y_train['up_down'].values, 
                                                                 train_valid_test = 'train')

### performance evaluation & confusion matrix (valid data)

In [None]:
# Same evaluation of the stacking model on the validation split.
accuracy_score_valid, f1score_valid = performance_metrics_binary(model = stack_pipeline, 
                                                                 data = X_valid, 
                                                                 true_y = y_valid['up_down'].values, 
                                                                 train_valid_test = 'valid')

### 10-fold cross validation (Monte Carlo)

In [None]:
# Fit and score a freshly built stacking model on each of the ten pre-built folds.
folds = ['fold-1', 'fold-2', 'fold-3', 'fold-4', 'fold-5', 'fold-6', 'fold-7', 'fold-8', 'fold-9', 'fold-10']
mccv_score_dict = {'fold':[], 'train_accuracy':[], 'train_f1':[], 'valid_accuracy': [], 'valid_f1': []}

# Columns fed to both base learners; 'tsla_Cloae' is spelled the way the data frame names it.
price_feature_columns = ['Close', 'MA10', 'MA20', 'MA30',
                         'appl_Close', 'msft_Close', 'amzn_Close',
                         'nvda_Close', 'brk_Close', 'googl_Close',
                         'tsla_Cloae', 'meta_Close', 'xom_Close', 'jpm_Close']

for fold in folds:
  # Fold-local names: the notebook-level X_train / X_valid / y_train / y_valid split
  # must not be replaced by the data of the last fold.
  X_fold_train, y_fold_train = dict_folds_data[fold]['train']
  X_fold_valid, y_fold_valid = dict_folds_data[fold]['valid']
  fold_train_labels = y_fold_train['up_down'].values
  fold_valid_labels = y_fold_valid['up_down'].values

  # LinearSVC base learner (new, unfitted objects for every fold)
  minmaxscaler_price = MinMaxScaler()
  ma_pipeline = Pipeline(steps = [('norm_price', minmaxscaler_price)])
  column_transformer = ColumnTransformer(transformers = [('price_ma', ma_pipeline, price_feature_columns)])
  clf_pipeline_binary = Pipeline(steps = [('preprocessing', column_transformer), 
                                          ('classification',LinearSVC(C= 0.05, class_weight = 'balanced'))])

  # XGBoost base learner
  minmaxscaler_price_xgboost = MinMaxScaler()
  ma_pipeline_xg = Pipeline(steps = [('norm_price', minmaxscaler_price_xgboost)])
  column_transformer_xg = ColumnTransformer(transformers = [('price_ma', ma_pipeline_xg, price_feature_columns)])
  clf_xgb = XGBClassifier(booster = 'gbtree', min_split_loss = 0.01, learning_rate = 0.001, max_depth = 4, n_estimators = 1000, scale_pos_weight = 0.9)
  clf_pipeline_binary_xgboost = Pipeline(steps = [('preprocessing', column_transformer_xg), 
                                                  ('classification',clf_xgb)])

  # logistic regression on top of the two base learners
  logistic = LogisticRegression()
  stack_pipeline = StackingClassifier(estimators = [('linearsvc', clf_pipeline_binary), ('xgboost', clf_pipeline_binary_xgboost)], 
                                      final_estimator = logistic)

  stack_pipeline.fit(X_fold_train, fold_train_labels)
  train_acc = stack_pipeline.score(X_fold_train, fold_train_labels)
  predicted_y_train = stack_pipeline.predict(X_fold_train)
  train_f1 = f1_score(fold_train_labels, predicted_y_train, average = 'macro')
  valid_acc = stack_pipeline.score(X_fold_valid, fold_valid_labels)
  predicted_y_valid = stack_pipeline.predict(X_fold_valid)
  valid_f1 = f1_score(fold_valid_labels, predicted_y_valid, average = 'macro')

  mccv_score_dict['fold'].append(fold)
  mccv_score_dict['train_accuracy'].append(train_acc)
  mccv_score_dict['train_f1'].append(train_f1)
  mccv_score_dict['valid_accuracy'].append(valid_acc)
  mccv_score_dict['valid_f1'].append(valid_f1)

In [None]:
# Per-fold scores of the stacking model, followed by an 'average' row and a 'std' row.
df_mccv_score_stacking = pd.DataFrame(mccv_score_dict)

# both summaries are computed from the fold rows only, before any summary row is appended
metric_columns = ['train_accuracy', 'train_f1', 'valid_accuracy', 'valid_f1']
fold_means = [df_mccv_score_stacking[metric].mean() for metric in metric_columns]
fold_stds = [df_mccv_score_stacking[metric].std() for metric in metric_columns]

df_mccv_score_stacking.loc[len(df_mccv_score_stacking)] = ['average'] + fold_means
df_mccv_score_stacking.loc[len(df_mccv_score_stacking)] = ['std'] + fold_stds

In [None]:
# Display the per-fold scores together with the appended 'average' and 'std' rows.
df_mccv_score_stacking

In [None]:
# 2x2 grid: accuracy on the left (black), macro-F1 on the right (red);
# training scores in the top row, validation scores in the bottom row.
plt.figure(figsize = (14, 6))

# the last two rows of the frame are the 'average' and 'std' summaries, not folds
fold_labels = df_mccv_score_stacking.iloc[:-2,0]

# (subplot position, metric column position, legend label, colour, y-limits, bottom-row panel?)
panel_specs = [(1, 1, 'train_accuracy', 'black', (0.3, 0.6), False),
               (3, 3, 'valid_accuracy', 'black', (0.3, 0.6), True),
               (2, 2, 'train_f1', 'red', (0.1, 0.4), False),
               (4, 4, 'valid_f1', 'red', (0.1, 0.4), True)]

panels = []
for position, column_idx, metric_label, colour, y_limits, is_bottom_row in panel_specs:
  panel = plt.subplot(2, 2, position)
  panel.plot(fold_labels, df_mccv_score_stacking.iloc[:-2,column_idx], label = metric_label, color = colour, linewidth = 0.5)
  # row 10 is the 'average' row that follows the ten folds
  panel.axhline(df_mccv_score_stacking.iloc[10, column_idx], label = f'avg {metric_label}', color = colour, linewidth = 0.2, linestyle = '--')
  panel.set_ylim(y_limits)
  if is_bottom_row:
    plt.xticks(rotation = 90)
  else:
    # top-row panels hide their x tick labels
    panel.tick_params(labelbottom=False)
  panel.legend()
  panels.append(panel)

ax1, ax2, ax3, ax4 = panels
plt.show()


## Stacking Models : performance evaluation & visualization (train on training & valid dataset and test on test dataset)

In [None]:
# Load the data for the final evaluation: (X_all_train, y_all_train) is used below to fit
# the stacking model and (X_test, y_test) to score it.
# NOTE(review): training_data = False presumably makes load_dataset return the combined
# train+valid split plus the held-out test split — confirm against load_dataset.
X_all_train, X_test, y_all_train, y_test = load_dataset(sp500_fn = load_sp500_data, training_data = False)


In [None]:
# Final stacking model: a LinearSVC pipeline and an XGBoost pipeline, each fed the same
# min-max-scaled price columns, combined by a logistic-regression final estimator.
# The stack is fitted on X_all_train, then scored on X_all_train and on X_test.

# Columns handed to both base learners.
# NOTE(review): 'tsla_Cloae' looks like a misspelling of 'tsla_Close'; it is kept verbatim
# because it has to match whatever column name the dataset actually carries — confirm upstream.
price_feature_columns = [
    'Close', 'MA10', 'MA20', 'MA30',
    'appl_Close', 'msft_Close', 'amzn_Close',
    'nvda_Close', 'brk_Close', 'googl_Close',
    'tsla_Cloae', 'meta_Close', 'xom_Close', 'jpm_Close',
]

# --- base learner 1: LinearSVC on scaled prices -----------------------------------------
minmaxscaler_price = MinMaxScaler()
ma_pipeline = Pipeline(steps=[('norm_price', minmaxscaler_price)])
column_transformer = ColumnTransformer(
    transformers=[('price_ma', ma_pipeline, list(price_feature_columns))]
)
clf_pipeline_binary = Pipeline(steps=[
    ('preprocessing', column_transformer),
    ('classification', LinearSVC(C=0.05, class_weight='balanced')),
])

# --- base learner 2: XGBoost on scaled prices (own scaler / transformer instances) ------
minmaxscaler_price_xgboost = MinMaxScaler()
ma_pipeline_xg = Pipeline(steps=[('norm_price', minmaxscaler_price_xgboost)])
column_transformer_xg = ColumnTransformer(
    transformers=[('price_ma', ma_pipeline_xg, list(price_feature_columns))]
)
clf_xgb = XGBClassifier(
    booster='gbtree',
    min_split_loss=0.01,
    learning_rate=0.001,
    max_depth=4,
    n_estimators=1000,
    scale_pos_weight=0.9,
)
clf_pipeline_binary_xgboost = Pipeline(steps=[
    ('preprocessing', column_transformer_xg),
    ('classification', clf_xgb),
])

# --- stack: logistic regression over the two base learners ------------------------------
logistic = LogisticRegression()
stack_pipeline = StackingClassifier(
    estimators=[
        ('linearsvc', clf_pipeline_binary),
        ('xgboost', clf_pipeline_binary_xgboost),
    ],
    final_estimator=logistic,
)

# Fit on the full training data and score on that same data.
train_labels = y_all_train['up_down'].values
stack_pipeline.fit(X_all_train, train_labels)
train_all_acc = stack_pipeline.score(X_all_train, train_labels)
predicted_y_all_train = stack_pipeline.predict(X_all_train)
train_all_f1 = f1_score(train_labels, predicted_y_all_train, average='macro')

# Score on the test data.
test_labels = y_test['up_down'].values
test_acc = stack_pipeline.score(X_test, test_labels)
predicted_y_test = stack_pipeline.predict(X_test)
test_f1 = f1_score(test_labels, predicted_y_test, average='macro')

# One value per metric, kept in lists so the dict converts directly to a one-row DataFrame.
mccv_score_dict_all = {
    'train_accuracy': [train_all_acc],
    'train_f1': [train_all_f1],
    'test_accuracy': [test_acc],
    'test_f1': [test_f1],
}

In [None]:
# Show the train / test accuracy and macro-F1 of the final stacking model as a one-row table.
pd.DataFrame(mccv_score_dict_all)

### performance evaluation & confusion matrix (train & valid)

In [None]:
# Evaluate the fitted stack on X_all_train, the same data it was fitted on.
# NOTE(review): performance_metrics_binary is defined elsewhere in this notebook; judging by
# the names its result is unpacked into, it returns (accuracy, F1) — confirm there.
train_valid_labels = y_all_train['up_down'].values
accuracy_score_valid, f1score_valid = performance_metrics_binary(
    model=stack_pipeline,
    data=X_all_train,
    true_y=train_valid_labels,
    train_valid_test='train_valid',
)

### performance evaluation & confusion matrix (test)

In [None]:
# Evaluate the fitted stack on the test data.
# NOTE(review): the results land in accuracy_score_valid / f1score_valid even though they are
# test-set metrics, overwriting any values stored under those names by an earlier cell.
# The names are kept so the notebook's globals stay the same; consider *_test names.
held_out_labels = y_test['up_down'].values
accuracy_score_valid, f1score_valid = performance_metrics_binary(
    model=stack_pipeline,
    data=X_test,
    true_y=held_out_labels,
    train_valid_test='test',
)