## Install / Import Libraries

In [None]:
! pip install -U spacy
! python -m spacy download en_core_web_lg
! pip install yfinance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import re
from scipy.sparse import csr_matrix

import spacy
from tqdm.auto import tqdm
from pathlib import Path
import os

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import pickle

from xgboost import XGBClassifier

import yfinance


## Helper Functions

### load sp500 data & prepare technical index

In [None]:
def load_sp500_data(start_date = '2014-12-31', end_date = '2021-01-05'):
  """Download S&P 500 daily prices and build the price-feature table.

  The returned frame has one row per trading day with:
    * 'Date', 'Open', 'High', 'Low', 'Volume', 'Close' of the ^GSPC index,
    * 'Close+1day' (next row's close) and the label 'up_down'
      (1 when the next close is higher than the current close, else 0),
    * 'MA10', 'MA20', 'MA30' rolling means of 'Close',
    * the daily percentage change of the close of ten large constituents.

  Args:
    start_date: first date requested from yfinance.
    end_date: end date requested from yfinance.

  Returns:
    pandas.DataFrame restricted to 2014-12-31 .. 2020-12-31 with every row
    that contains a missing value removed.
  """
  def _history(symbol):
    # Daily history of one ticker with 'Date' as a timezone-free midnight timestamp column.
    history = yfinance.Ticker(symbol).history(start = start_date, end = end_date)
    history = history.reset_index()
    history['Date'] = pd.to_datetime(history['Date'].dt.date)
    return history

  sp_history = _history('^GSPC')[['Date', 'Open', 'High', 'Low', 'Volume', 'Close']].copy()
  sp_history['Close+1day'] = sp_history['Close'].shift(-1)
  sp_history.dropna(inplace = True)  # the last day has no next-day close
  sp_history['up_down'] = (sp_history['Close+1day'] > sp_history['Close']).astype(int)

  for window in (10, 20, 30):
    sp_history['MA'+str(window)] = sp_history['Close'].rolling(window).mean()

  # (ticker, output column). 'tsla_Cloae' is misspelled on purpose: the
  # ColumnTransformer definitions in this file select the column by that name.
  constituents = [('AAPL', 'appl_Close'), ('MSFT', 'msft_Close'), ('AMZN', 'amzn_Close'),
                  ('NVDA', 'nvda_Close'), ('BRK-B', 'brk_Close'), ('GOOGL', 'googl_Close'),
                  ('TSLA', 'tsla_Cloae'), ('META', 'meta_Close'), ('XOM', 'xom_Close'),
                  ('JPM', 'jpm_Close')]

  for symbol, column in constituents:
    history = _history(symbol)
    returns = pd.DataFrame({'Date': history['Date'], column: history['Close'].pct_change()})
    sp_history = sp_history.merge(returns, on = 'Date', how = 'left')

  # NOTE(review): this window is fixed and does not follow start_date/end_date.
  in_window = (sp_history['Date'] >= '2014-12-31') & (sp_history['Date'] <= '2020-12-31')
  sp_history = sp_history[in_window].copy()
  # Removes the warm-up rows of the rolling means / pct_change and unmatched dates.
  sp_history.dropna(inplace = True)

  return sp_history

### tokenize & clean data

In [None]:
def tokenization_clean(text: 'str', nlp_model: 'spacy_model'):
  """Return `text` as a space-joined string of cleaned, lower-cased lemmas.

  `nlp_model` is called on `text`; every resulting token that is flagged by
  any attribute in `skip_flags` is discarded. The lemma of each remaining
  token is lower-cased and stripped of characters that are neither word
  characters nor whitespace.
  """
  skip_flags = ('is_stop', 'is_punct', 'like_num', 'like_url', 'is_space',
                'like_email', 'is_left_punct', 'is_right_punct', 'is_digit',
                'is_currency')

  kept = []
  for tok in nlp_model(text):
    if any(getattr(tok, flag) for flag in skip_flags):
      continue
    kept.append(re.sub(r'[^\w\s]', '', tok.lemma_.lower()))

  return ' '.join(kept)

### load text dataset & combine sp500

In [None]:
def load_fold_dataset(directory: 'str', 
                      file_name: 'str',
                      sp500_fn: 'function', 
                      fn_tokenization_clean: 'function', 
                      nlp_model):
  """Load one fold CSV, tokenize it per day and join it with the price table.

  The CSV must provide 'timestamp' and 'text' columns. Texts sharing a
  timestamp are concatenated with a space, cleaned with
  `fn_tokenization_clean(text, nlp_model)` and left-joined onto the frame
  returned by `sp500_fn()` on Date == timestamp. Rows with any missing value
  (including days without text) are dropped.

  Returns:
    (X, y): X holds 'Date', 'Close', every price-table column from position 7
    onwards and 'tokenized_text'; y holds 'Date' and 'up_down'.
  """
  tqdm.pandas()  # registers Series.progress_apply
  raw = pd.read_csv(Path(directory) / file_name)
  daily = raw[['timestamp', 'text']].groupby('timestamp', as_index = False).agg({'text':' '.join})
  sp500 = sp500_fn()

  daily['tokenized_text'] = daily['text'].progress_apply(lambda doc: fn_tokenization_clean(doc, nlp_model))
  daily['timestamp'] = pd.to_datetime(daily['timestamp'])

  # NOTE(review): with load_sp500_data as sp500_fn, position 7 is 'up_down',
  # so the label column is also carried inside X.
  feature_columns = ['Date', 'Close', *sp500.columns[7:], 'tokenized_text']

  merged = sp500.merge(daily[['timestamp', 'tokenized_text']],
                       how = 'left', left_on = 'Date', right_on = 'timestamp').dropna()

  features = merged.loc[:, feature_columns].copy()
  labels = merged.loc[:, ['Date', 'up_down']]
  return features, labels

In [None]:
def load_dataset(directory: 'str', 
                 file_name: 'str', 
                 sp500_fn: 'function',
                 fn_tokenization_clean: 'function', 
                 nlp_model, 
                 training_data = True):
  """Load a {timestamp: text} dictionary file and join it with the price table.

  The file must contain a single Python dict literal mapping a timestamp to
  the text of that day. Each text is cleaned with
  `fn_tokenization_clean(text, nlp_model)` and left-joined onto the frame
  returned by `sp500_fn()` on Date == timestamp; rows with any missing value
  are dropped.

  Args:
    directory: folder containing the file.
    file_name: name of the dictionary file (read as UTF-8).
    sp500_fn: zero-argument callable returning the price DataFrame.
    fn_tokenization_clean: callable `(text, nlp_model) -> str`.
    nlp_model: forwarded to `fn_tokenization_clean`.
    training_data: when truthy, split by date; otherwise return everything.

  Returns:
    training_data truthy: (X_train, X_valid, y_train, y_valid), where train
    is Date <= 2018-12-31 and valid is Date >= 2019-01-01.
    training_data falsy: (X, y) for all rows.

  Raises:
    ValueError / SyntaxError: if the file is not a plain Python literal.
  """
  import ast  # local import: only needed to parse the dictionary file

  tqdm.pandas()  # registers Series.progress_apply
  file_path = Path(directory) / file_name

  with open(file_path, 'r', encoding="utf-8") as f:
    # literal_eval only accepts literals, so a tampered data file cannot
    # execute code the way eval() would.
    data = ast.literal_eval(f.read())

  data = pd.DataFrame({'timestamp':list(data.keys()), 'text': list(data.values())})
  sp500 = sp500_fn()

  data['tokenized_text'] = data['text'].progress_apply(lambda x: fn_tokenization_clean(x, nlp_model))
  data['timestamp'] = pd.to_datetime(data['timestamp'])

  column_subset_X = ['Date','Close'] + list(sp500.columns[7:]) + ['tokenized_text']

  data_merge = sp500.merge(data, how = 'left', left_on = 'Date', right_on = 'timestamp')
  data_merge = data_merge.dropna()
  data_merge_X = data_merge.loc[:,column_subset_X].copy()
  data_merge_y = data_merge.loc[:,['Date', 'up_down']]

  # Previously a value that was neither True nor False fell through and
  # returned None; now any falsy value yields the unsplit data.
  if not training_data:
    return data_merge_X, data_merge_y

  X_train = data_merge_X[data_merge_X['Date']<='2018-12-31'].copy()
  X_valid = data_merge_X[data_merge_X['Date']>='2019-01-01'].copy()
  y_train = data_merge_y[data_merge_y['Date']<='2018-12-31'].copy()
  y_valid = data_merge_y[data_merge_y['Date']>='2019-01-01'].copy()
  return X_train, X_valid, y_train, y_valid

### function to prepare 10 folds data

In [None]:
def prepare_folds_dataset(cv_path: 'str'):
  """Load the train/valid data of every cross-validation fold under `cv_path`.

  `cv_path` is expected to contain one sub-directory per fold, named
  'fold-1' .. 'fold-10'. Inside a fold directory, a file whose name contains
  'train' is loaded as the training split and a file whose name contains
  'valid' as the validation split (checked in that order), each via
  `load_fold_dataset` with the module-level `nlp_model`.

  Returns:
    dict mapping fold name -> {'train': (X, y) or None, 'valid': (X, y) or None}.

  Raises:
    KeyError: if a sub-directory is not one of the ten expected fold names
      and contains a train/valid file.
  """
  cv_path = Path(cv_path)
  fold_dataset = {f'fold-{k}': {'train': None, 'valid': None} for k in range(1, 11)}

  for fold_dir in cv_path.iterdir():
    if not fold_dir.is_dir():
      # Stray files next to the fold folders (e.g. OS metadata) are not folds.
      continue

    for data_file in fold_dir.iterdir():
      for split in ('train', 'valid'):
        if split in data_file.name:
          fold_dataset[fold_dir.name][split] = load_fold_dataset(directory = fold_dir,
                                                                 file_name = data_file.name,
                                                                 sp500_fn = load_sp500_data,
                                                                 fn_tokenization_clean = tokenization_clean,
                                                                 nlp_model = nlp_model)
          break  # a file is assigned to at most one split

  return fold_dataset

### load loughran_mcdonald sentiment dictionary

In [None]:
def loughran_mcdonald_dict(directory: 'str', file_name: 'str'):
  """Load the Loughran-McDonald master dictionary as a binary word/sentiment table.

  Args:
    directory: folder containing the dictionary CSV.
    file_name: name of the CSV; it must provide the 'Word' column and the
      seven sentiment columns listed in `sentiment_columns`.

  Returns:
    pandas.DataFrame with a lower-cased 'Word' column followed by the seven
    sentiment columns, each 1 where the source value is > 0 and 0 otherwise.
    Rows without a word are removed and the index is renumbered from 0.
  """
  file_path = Path(directory) / file_name
  data = pd.read_csv(file_path)

  sentiment_columns = ['Negative', 'Positive', 'Uncertainty', 'Litigious',
                       'Strong_Modal', 'Weak_Modal', 'Constraining']
  data_part = data[['Word'] + sentiment_columns].copy()

  data_part['Word'] = data_part['Word'].str.lower()
  data_part[sentiment_columns] = (data_part[sentiment_columns] > 0).astype(int)

  # Drop rows whose word was parsed as missing (previously a hard-coded row
  # index, which only worked for one specific version of the file).
  data_part = data_part.dropna(subset = ['Word']).reset_index(drop = True)

  return data_part

### multiplication of doc-term matrix and term-sentiment matrix

In [None]:
# Term-sentiment matrices already built, keyed by (loader, directory, file name).
_SENTIMENT_MATRIX_CACHE = {}

def combine_count_sent(data, 
                       loughran_mcdonald_dict_fn = loughran_mcdonald_dict, 
                       directory = '../data/TF-IDF Models', 
                       file_name = 'Loughran-McDonald_MasterDictionary_1993-2021.csv'):
  """Turn a document-term count matrix into per-document sentiment counts.

  Args:
    data: sparse document-term matrix whose columns follow the row order of
      the sentiment dictionary (it is built with the dictionary's 'Word'
      column as vocabulary elsewhere in this file).
    loughran_mcdonald_dict_fn: callable `(directory, file_name) -> DataFrame`
      whose first column is the word and whose remaining columns are 0/1
      sentiment indicators.
    directory: folder of the dictionary file.
    file_name: name of the dictionary file.

  Returns:
    Dense numpy array of shape (documents, sentiment categories).
  """
  # This function runs on every fit/transform/predict of the pipeline, so the
  # dictionary is parsed once per distinct source instead of on every call.
  cache_key = (loughran_mcdonald_dict_fn, str(directory), file_name)
  sparse_sent_dict = _SENTIMENT_MATRIX_CACHE.get(cache_key)
  if sparse_sent_dict is None:
    sent_dict = loughran_mcdonald_dict_fn(directory, file_name)
    sparse_sent_dict = csr_matrix(sent_dict.iloc[:,1:].values)
    _SENTIMENT_MATRIX_CACHE[cache_key] = sparse_sent_dict

  # '@' is always the matrix product, for scipy sparse matrices and arrays alike.
  news_sentiment = data @ sparse_sent_dict

  return news_sentiment.toarray()

### gridsearchcv, using 10 folds dataset (Linear SVC)

In [None]:
def gridsearchcv(folds_data: 'dict', parametergrid: 'list', model: 'function'):
  """Evaluate every parameter combination on the ten pre-built folds.

  Args:
    folds_data: dict 'fold-1'..'fold-10' -> {'train': (X, y), 'valid': (X, y)},
      where each y has an 'up_down' column.
    parametergrid: iterable of dicts mapping parameter names accepted by
      `model.set_params` to values.
    model: estimator/pipeline offering set_params, fit and predict. It is
      re-configured and re-fitted in place.

  Returns:
    dict of equally long lists with keys 'parameter', 'fold',
    'train_accuracy', 'train_f1', 'valid_accuracy', 'valid_f1'
    (F1 is macro-averaged); one entry per (parameter, fold) pair.
  """
  folds = [f'fold-{k}' for k in range(1, 11)]
  score_dict = {'parameter': [], 'fold':[], 'train_accuracy':[], 'train_f1':[], 'valid_accuracy': [], 'valid_f1': []}

  for parameter in tqdm(parametergrid):
    # Apply the whole combination instead of three hard-coded keys, so any
    # grid can be searched with this function.
    model_initialized = model.set_params(**parameter)

    for fold in folds:
      X_train, y_train = folds_data[fold]['train']
      X_valid, y_valid = folds_data[fold]['valid']
      true_y_train = y_train['up_down'].values
      true_y_valid = y_valid['up_down'].values

      model_initialized.fit(X_train, true_y_train)

      # Predict each split once and derive both metrics from it; a
      # classifier's score() is the accuracy of these same predictions.
      predicted_y_train = model_initialized.predict(X_train)
      predicted_y_valid = model_initialized.predict(X_valid)

      score_dict['parameter'].append(parameter)
      score_dict['fold'].append(fold)
      score_dict['train_accuracy'].append(accuracy_score(true_y_train, predicted_y_train))
      score_dict['train_f1'].append(f1_score(true_y_train, predicted_y_train, average = 'macro'))
      score_dict['valid_accuracy'].append(accuracy_score(true_y_valid, predicted_y_valid))
      score_dict['valid_f1'].append(f1_score(true_y_valid, predicted_y_valid, average = 'macro'))

  return score_dict


### function to visualize gridsearchcv scores (linearSVC)

In [None]:
def visualize_gridsearchcv(cv_result: 'sklearn_gridsearch_model'):
  """Plot mean validation F1 against the LinearSVC C value.

  `cv_result` is a DataFrame with the columns 'classification__C',
  'preprocessing__tfidf__tfidf_vectorizer__max_features' and 'valid_f1'.
  One panel is drawn per entry of `features_list`, stacked vertically.
  """
  features_list = [2000, 4000, 6000, 8000, 10000, 12000]
  line_colors = ['#3498DB']*len(features_list)

  wanted = ['classification__C',
            'preprocessing__tfidf__tfidf_vectorizer__max_features',
            'valid_f1']
  df_cv_result = cv_result[wanted].rename(
      columns = {'classification__C': 'linearSVC_C',
                 'preprocessing__tfidf__tfidf_vectorizer__max_features': 'tfidf_feature'})

  plt.figure(figsize = (8, 6))
  last_panel = len(features_list) - 1

  for idx, (feature_no, line_color) in enumerate(zip(features_list, line_colors)):
    ax = plt.subplot(6, 1, idx+1)
    panel_rows = df_cv_result['tfidf_feature'] == feature_no
    ax.plot(df_cv_result.loc[panel_rows, 'linearSVC_C'],
            df_cv_result.loc[panel_rows, 'valid_f1'],
            color = line_color,
            linewidth = 1,
            label = f'tfidf_feature number: {feature_no}')

    ax.set_ylabel('mean f1', fontsize = 8)
    plt.yticks(fontsize=8)
    ax.legend(fontsize = 8)

    # Only the bottom panel keeps its x tick labels.
    if idx != last_panel:
      ax.tick_params(labelbottom=False)

    # The figure title sits on the top panel.
    if idx == 0:
      plt.title('validation f1 score of the grid search cv results')

  plt.xlabel('linear SVC, C value')
  plt.tight_layout(pad = 0.2)
  plt.show()

### gridsearchcv, using 10 folds dataset (XGBoost)

In [None]:
def gridsearchcv_xgb(folds_data: 'dict', parametergrid: 'list', model: 'function'):
  """Evaluate every XGBoost parameter combination on the ten pre-built folds.

  Args:
    folds_data: dict 'fold-1'..'fold-10' -> {'train': (X, y), 'valid': (X, y)},
      where each y has an 'up_down' column.
    parametergrid: iterable of dicts mapping parameter names accepted by
      `model.set_params` to values.
    model: estimator/pipeline offering set_params, fit and predict. It is
      re-configured and re-fitted in place.

  Returns:
    dict of equally long lists with keys 'parameter', 'fold',
    'train_accuracy', 'train_f1', 'valid_accuracy', 'valid_f1'
    (F1 is macro-averaged); one entry per (parameter, fold) pair.
  """
  folds = [f'fold-{k}' for k in range(1, 11)]
  score_dict = {'parameter': [], 'fold':[], 'train_accuracy':[], 'train_f1':[], 'valid_accuracy': [], 'valid_f1': []}

  for parameter in tqdm(parametergrid):
    # Apply the whole combination instead of two hard-coded keys, so further
    # hyper-parameters can be added to the grid without touching this code.
    model_initialized = model.set_params(**parameter)

    for fold in folds:
      X_train, y_train = folds_data[fold]['train']
      X_valid, y_valid = folds_data[fold]['valid']
      true_y_train = y_train['up_down'].values
      true_y_valid = y_valid['up_down'].values

      model_initialized.fit(X_train, true_y_train)

      # Predict each split once and derive both metrics from it; a
      # classifier's score() is the accuracy of these same predictions.
      predicted_y_train = model_initialized.predict(X_train)
      predicted_y_valid = model_initialized.predict(X_valid)

      score_dict['parameter'].append(parameter)
      score_dict['fold'].append(fold)
      score_dict['train_accuracy'].append(accuracy_score(true_y_train, predicted_y_train))
      score_dict['train_f1'].append(f1_score(true_y_train, predicted_y_train, average = 'macro'))
      score_dict['valid_accuracy'].append(accuracy_score(true_y_valid, predicted_y_valid))
      score_dict['valid_f1'].append(f1_score(true_y_valid, predicted_y_valid, average = 'macro'))

  return score_dict

### function to visualize gridsearchcv result (XGBoost)

In [None]:
def visualize_gridsearchcv_xgb(cv_result: 'sklearn gridsearch model'):
  """Plot mean validation F1 against the XGBoost learning rate.

  `cv_result` is a DataFrame with the columns 'classification__learning_rate',
  'classification__max_depth' and 'valid_f1'. One line is drawn per entry of
  `depth_list`, all in the same axes.
  """
  depth_list = [3, 4, 5]
  line_colors = ['#0099FF', '#2D88C5', '#3F7090']

  wanted = ['classification__learning_rate', 'classification__max_depth', 'valid_f1']
  df_cv_result = cv_result[wanted].rename(
      columns = {'classification__learning_rate': 'learning_rate',
                 'classification__max_depth': 'max_depth'})

  plt.figure(figsize = (8, 6))

  for depth_no, line_color in zip(depth_list, line_colors):
    depth_rows = df_cv_result['max_depth'] == depth_no
    plt.plot(df_cv_result.loc[depth_rows, 'learning_rate'],
             df_cv_result.loc[depth_rows, 'valid_f1'],
             color = line_color, label = f'max_depth: {depth_no}')

  plt.xlabel('XGBoost, learning rate')
  plt.ylabel('mean valid f1')
  plt.tight_layout(pad = 0.5)
  plt.legend()
  plt.show()

### function to visualize confusion matrix (binary label)

In [None]:
def performance_metrics_binary(model, data, true_y, train_valid_test: 'str'):
  """Show a normalised confusion matrix and report accuracy and macro F1.

  Args:
    model: fitted estimator offering predict and score.
    data: features passed to the model.
    true_y: true binary labels for `data`.
    train_valid_test: name of the split, used only in the printed summary.

  Returns:
    (accuracy, macro F1) of `model` on `data`.
  """
  predicted_y = model.predict(data)
  accuracy = model.score(data, true_y)
  macro_f1 = f1_score(true_y, predicted_y, average = 'macro')

  plt.figure(figsize = (4, 4))
  sns.set(font_scale=1.2)

  # normalize='pred' scales the matrix over the predicted labels (columns).
  cm_result = confusion_matrix(true_y, predicted_y, normalize = 'pred')
  class_names = ['Negative', 'Positive']

  heatmap_ax = sns.heatmap(cm_result,
                           cmap="Blues",
                           annot = True,
                           fmt=".2f", annot_kws={'size': 15},
                           xticklabels=class_names,
                           yticklabels=class_names)
  heatmap_ax.set(xlabel='Predicted Label', ylabel='True Label', title = 'price movement')

  plt.show()
  print(f'\n{train_valid_test} accuracy: {accuracy}, {train_valid_test} f1 score: {macro_f1}')
  return accuracy, macro_f1

## Load Data

### initialize spacy model

In [None]:
# Load the large English spaCy pipeline once; it is reused for every tokenization call below.
nlp_model = spacy.load('en_core_web_lg')

### prepare loughran-mcdonald sentiment dictionary

In [None]:
# Word/sentiment table; its 'Word' column is used below as the CountVectorizer vocabulary.
lm_sent_dict = loughran_mcdonald_dict(directory = '../data/TF-IDF Models', 
                                      file_name = 'Loughran-McDonald_MasterDictionary_1993-2021.csv')

### load 10 folds dataset

In [None]:
# Reuse the cached folds when the pickle exists; otherwise rebuild them from the raw fold files.
folds_data_path = Path(r'../data/TF-IDF Models/Intermediate Output/dict_folds_data.pickle')

if folds_data_path.is_file():
  # NOTE(review): pickle.load can execute code stored in the file — only load a cache you created yourself.
  with open(folds_data_path, 'rb') as f_1:
    dict_folds_data = pickle.load(f_1)

else:
  # NOTE(review): the rebuilt folds are not written back to folds_data_path in this cell.
  dict_folds_data = prepare_folds_dataset(cv_path = r'../data/TF-IDF Models/Cross Validation_fold_data')

## Build LinearSVC Pipeline

### initialize necessary functions

In [None]:
# Components of the LinearSVC pipeline assembled in the next cell.
lm_countvectorizer = CountVectorizer(vocabulary = lm_sent_dict ['Word'])  # counts only the dictionary words
tfidf_vectorizer = TfidfVectorizer(ngram_range = (1, 2), max_features = 10000)  # unigrams and bigrams; max_features is overridden by the grid search
minmaxscaler = MinMaxScaler()  # scales the sentiment counts
minmaxscaler_price = MinMaxScaler()  # scales the price features
fn_combine = FunctionTransformer(combine_count_sent)  # word counts -> per-category sentiment counts
fn_transform_sparse = FunctionTransformer(csr_matrix)  # converts the scaled counts back to a sparse matrix

### build pipeline (binary label)

In [None]:
# TF-IDF features of the cleaned text.
tfidf_pipline = Pipeline(steps=[('tfidf_vectorizer', tfidf_vectorizer)])

# Dictionary word counts -> sentiment counts -> scaled -> sparse.
lm_pipeline = Pipeline(steps=[
  ('count_vectorizer', lm_countvectorizer),
  ('matrix_mul', fn_combine),
  ('norm', minmaxscaler),
  ('sparse_matrix', fn_transform_sparse),
])

# Scaled price features.
ma_pipeline = Pipeline(steps=[('norm_price', minmaxscaler_price)])

# 'tsla_Cloae' matches the column name produced by load_sp500_data.
column_transformer = ColumnTransformer(transformers=[
  ('price_ma', ma_pipeline, [
    'Close', 'MA10', 'MA20', 'MA30',
    'appl_Close', 'msft_Close', 'amzn_Close',
    'nvda_Close', 'brk_Close', 'googl_Close',
    'tsla_Cloae', 'meta_Close', 'xom_Close', 'jpm_Close',
  ]),
  ('tfidf', tfidf_pipline, 'tokenized_text'),
  ('lm_count', lm_pipeline, 'tokenized_text'),
])

clf_pipeline_binary = Pipeline(steps=[
  ('preprocessing', column_transformer),
  ('classification', LinearSVC(C=0.05, class_weight='balanced')),
])

### GridSearchCV

In [None]:
# Search grid: TF-IDF vocabulary size x LinearSVC C value, L2 penalty only.
parameters_grid = {'preprocessing__tfidf__tfidf_vectorizer__max_features':[2000, 4000, 6000, 8000, 10000, 12000],
                   'classification__C': [0.001, 0.005, 0.01, 0.05], 
                   'classification__penalty': ['l2']}

parameters = ParameterGrid(parameters_grid)

# Fit and score every combination on each of the ten folds.
result = gridsearchcv(folds_data = dict_folds_data, parametergrid = parameters, model = clf_pipeline_binary)

In [None]:
# Spread each run's parameter dict over one column per hyper-parameter.
parameters_grid_1 = {'preprocessing__tfidf__tfidf_vectorizer__max_features':[],
                   'classification__C': [], 
                   'classification__penalty': []}

df_result = pd.DataFrame(result)

for parameter in df_result['parameter']:
  for key, value in parameter.items():
    parameters_grid_1[key].append(value)

df_result_1 = df_result.merge(pd.DataFrame(parameters_grid_1), how = 'left', left_index = True, right_index = True)

# Average the scores over the folds for every parameter combination.
# 'parameter' (dicts) and 'fold' (strings) are dropped by name first: averaging
# a string column raises TypeError on pandas >= 2.0.
df_result_complete = (df_result_1.drop(columns = ['parameter', 'fold'])
                                 .groupby(['preprocessing__tfidf__tfidf_vectorizer__max_features',
                                           'classification__C',
                                           'classification__penalty'], as_index = False)
                                 .mean())

In [None]:
# Plot the fold-averaged validation F1 of the LinearSVC grid search.
visualize_gridsearchcv(df_result_complete)

In [None]:
# the best combination of parameters
# idxmax() returns an index label, so the row is selected with .loc (label-based), not .iloc.
df_result_complete.loc[df_result_complete['valid_f1'].idxmax()]

### confirm best model

In [None]:
# Load the daily article summaries joined with the price features, split by date into train and valid.
X_train, X_valid, y_train, y_valid = load_dataset(directory = '../data/TF-IDF Models/News Article Text File_Agg Daily', 
                                                  file_name = 'articles_2015_2019_concated_summaries.txt', 
                                                  sp500_fn = load_sp500_data,
                                                  fn_tokenization_clean = tokenization_clean,
                                                  nlp_model = nlp_model, 
                                                  training_data = True)

In [None]:
# Rebuild the LinearSVC pipeline from fresh, unfitted components
# (TF-IDF vocabulary of 4000, C = 0.05) and fit it on the training split.
lm_countvectorizer = CountVectorizer(vocabulary=lm_sent_dict['Word'])
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=4000)
minmaxscaler = MinMaxScaler()
minmaxscaler_price = MinMaxScaler()
fn_combine = FunctionTransformer(combine_count_sent)
fn_transform_sparse = FunctionTransformer(csr_matrix)

tfidf_pipline = Pipeline(steps=[('tfidf_vectorizer', tfidf_vectorizer)])

lm_pipeline = Pipeline(steps=[
  ('count_vectorizer', lm_countvectorizer),
  ('matrix_mul', fn_combine),
  ('norm', minmaxscaler),
  ('sparse_matrix', fn_transform_sparse),
])

ma_pipeline = Pipeline(steps=[('norm_price', minmaxscaler_price)])

# 'tsla_Cloae' matches the column name produced by load_sp500_data.
column_transformer = ColumnTransformer(transformers=[
  ('price_ma', ma_pipeline, [
    'Close', 'MA10', 'MA20', 'MA30',
    'appl_Close', 'msft_Close', 'amzn_Close',
    'nvda_Close', 'brk_Close', 'googl_Close',
    'tsla_Cloae', 'meta_Close', 'xom_Close', 'jpm_Close',
  ]),
  ('tfidf', tfidf_pipline, 'tokenized_text'),
  ('lm_count', lm_pipeline, 'tokenized_text'),
])

clf_pipeline_binary = Pipeline(steps=[
  ('preprocessing', column_transformer),
  ('classification', LinearSVC(C=0.05, class_weight='balanced')),
])

clf_pipeline_binary.fit(X_train, y_train['up_down'].values)

### performance evaluation & confusion matrix (train dataset)

In [None]:
# LinearSVC pipeline: confusion matrix, accuracy and macro F1 on the training split.
accuracy_score_train, f1score_train = performance_metrics_binary(model = clf_pipeline_binary, 
                                                                 data = X_train, 
                                                                 true_y = y_train['up_down'].values, 
                                                                 train_valid_test = 'train')

### performance evaluation & confusion matrix (valid dataset)

In [None]:
# LinearSVC pipeline: confusion matrix, accuracy and macro F1 on the validation split.
accuracy_score_valid, f1score_valid = performance_metrics_binary(model = clf_pipeline_binary, 
                                                                 data = X_valid, 
                                                                 true_y = y_valid['up_down'].values, 
                                                                 train_valid_test = 'valid')

## Build XGBoost Classifier

### initialize necessary functions

In [None]:
# Components of the XGBoost pipeline assembled in the next cell.
lm_countvectorizer_xgboost = CountVectorizer(vocabulary = lm_sent_dict ['Word'])  # counts only the dictionary words
tfidf_vectorizer_xgboost = TfidfVectorizer(ngram_range = (1, 2), max_features = 10000)  # unigrams and bigrams
minmaxscaler_lm_xgboost = MinMaxScaler()  # scales the sentiment counts
minmaxscaler_price_xgboost = MinMaxScaler()  # scales the price features
fn_combine_xgboost = FunctionTransformer(combine_count_sent)  # word counts -> per-category sentiment counts
fn_transform_sparse_xgboost = FunctionTransformer(csr_matrix)  # converts the scaled counts back to a sparse matrix
# learning_rate and max_depth are overridden by the grid search below.
# NOTE(review): tree_method='gpu_hist' presumably needs a GPU-enabled XGBoost build — confirm for the target environment.
clf_xgb = XGBClassifier(booster = 'gbtree', tree_method='gpu_hist', min_split_loss = 0.01, learning_rate = 0.01, max_depth = 3, n_estimators = 1000, scale_pos_weight = 0.9)

In [None]:
# TF-IDF features of the cleaned text.
tfidf_pipline_xg = Pipeline(steps=[('tfidf_vectorizer', tfidf_vectorizer_xgboost)])

# Dictionary word counts -> sentiment counts -> scaled -> sparse.
lm_pipeline_xg = Pipeline(steps=[
  ('count_vectorizer', lm_countvectorizer_xgboost),
  ('matrix_mul', fn_combine_xgboost),
  ('norm', minmaxscaler_lm_xgboost),
  ('sparse_matrix', fn_transform_sparse_xgboost),
])

# Scaled price features.
ma_pipeline_xg = Pipeline(steps=[('norm_price', minmaxscaler_price_xgboost)])

# 'tsla_Cloae' matches the column name produced by load_sp500_data.
column_transformer_xg = ColumnTransformer(transformers=[
  ('price_ma', ma_pipeline_xg, [
    'Close', 'MA10', 'MA20', 'MA30',
    'appl_Close', 'msft_Close', 'amzn_Close',
    'nvda_Close', 'brk_Close', 'googl_Close',
    'tsla_Cloae', 'meta_Close', 'xom_Close', 'jpm_Close',
  ]),
  ('tfidf', tfidf_pipline_xg, 'tokenized_text'),
  ('lm_count', lm_pipeline_xg, 'tokenized_text'),
])

clf_pipeline_binary_xgboost = Pipeline(steps=[
  ('preprocessing', column_transformer_xg),
  ('classification', clf_xgb),
])

### GridSearchCV

In [None]:
# Search grid: XGBoost learning rate x maximum tree depth.
parameters_grid_xgb = {'classification__learning_rate': [0.0005, 0.0007, 0.001], 
                   'classification__max_depth': [3, 4 , 5]}

parameters_xgb = ParameterGrid(parameters_grid_xgb)

# Fit and score every combination on each of the ten folds.
result_xgb = gridsearchcv_xgb(folds_data = dict_folds_data, parametergrid = parameters_xgb, model = clf_pipeline_binary_xgboost)

In [None]:
# Spread each run's parameter dict over one column per hyper-parameter.
parameters_grid_xgb_1 = {'classification__learning_rate': [], 'classification__max_depth': []}

df_result_xgb = pd.DataFrame(result_xgb)

for parameter in df_result_xgb['parameter']:
  for key, value in parameter.items():
    parameters_grid_xgb_1[key].append(value)

df_result_xgb_1 = df_result_xgb.merge(pd.DataFrame(parameters_grid_xgb_1), how = 'left', left_index = True, right_index = True)

# Average the scores over the folds for every parameter combination.
# 'parameter' (dicts) and 'fold' (strings) are dropped by name first: averaging
# a string column raises TypeError on pandas >= 2.0.
df_result_xgb_complete = (df_result_xgb_1.drop(columns = ['parameter', 'fold'])
                                         .groupby(['classification__learning_rate', 'classification__max_depth'], as_index = False)
                                         .mean())


In [None]:
# Plot the fold-averaged validation F1 of the XGBoost grid search.
visualize_gridsearchcv_xgb(df_result_xgb_complete)

In [None]:
# the best combination of parameters
# idxmax() returns an index label, so the row is selected with .loc (label-based), not .iloc.
df_result_xgb_complete.loc[df_result_xgb_complete['valid_f1'].idxmax()]

### confirm the best model

In [None]:
# Rebuild the XGBoost pipeline from fresh, unfitted components
# (learning_rate = 0.0005, max_depth = 5).
lm_countvectorizer_xgboost = CountVectorizer(vocabulary=lm_sent_dict['Word'])
tfidf_vectorizer_xgboost = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
minmaxscaler_lm_xgboost = MinMaxScaler()
minmaxscaler_price_xgboost = MinMaxScaler()
fn_combine_xgboost = FunctionTransformer(combine_count_sent)
fn_transform_sparse_xgboost = FunctionTransformer(csr_matrix)
clf_xgb = XGBClassifier(
  booster='gbtree',
  tree_method='gpu_hist',
  min_split_loss=0.01,
  learning_rate=0.0005,
  max_depth=5,
  n_estimators=1000,
  scale_pos_weight=0.9,
)

tfidf_pipline_xg = Pipeline(steps=[('tfidf_vectorizer', tfidf_vectorizer_xgboost)])

lm_pipeline_xg = Pipeline(steps=[
  ('count_vectorizer', lm_countvectorizer_xgboost),
  ('matrix_mul', fn_combine_xgboost),
  ('norm', minmaxscaler_lm_xgboost),
  ('sparse_matrix', fn_transform_sparse_xgboost),
])

ma_pipeline_xg = Pipeline(steps=[('norm_price', minmaxscaler_price_xgboost)])

# 'tsla_Cloae' matches the column name produced by load_sp500_data.
column_transformer_xg = ColumnTransformer(transformers=[
  ('price_ma', ma_pipeline_xg, [
    'Close', 'MA10', 'MA20', 'MA30',
    'appl_Close', 'msft_Close', 'amzn_Close',
    'nvda_Close', 'brk_Close', 'googl_Close',
    'tsla_Cloae', 'meta_Close', 'xom_Close', 'jpm_Close',
  ]),
  ('tfidf', tfidf_pipline_xg, 'tokenized_text'),
  ('lm_count', lm_pipeline_xg, 'tokenized_text'),
])

clf_pipeline_binary_xgboost = Pipeline(steps=[
  ('preprocessing', column_transformer_xg),
  ('classification', clf_xgb),
])

In [None]:
# Fit the XGBoost pipeline on the training split.
clf_pipeline_binary_xgboost.fit(X_train, y_train['up_down'].values)


In [None]:
# XGBoost pipeline: confusion matrix, accuracy and macro F1 on the training split.
# These are training metrics, so they are stored in the *_train variables
# (they were previously written to the *_valid names).
accuracy_score_train, f1score_train = performance_metrics_binary(model = clf_pipeline_binary_xgboost,
                                                                 data = X_train,
                                                                 true_y = y_train['up_down'].values,
                                                                 train_valid_test = 'train')

In [None]:
# XGBoost pipeline: confusion matrix, accuracy and macro F1 on the validation split.
accuracy_score_valid, f1score_valid = performance_metrics_binary(model = clf_pipeline_binary_xgboost, 
                                                                 data = X_valid, 
                                                                 true_y = y_valid['up_down'].values, 
                                                                 train_valid_test = 'valid')

## Build Stacking Models

In [None]:
# Stack the LinearSVC and XGBoost pipelines; a logistic regression combines their outputs.
logistic = LogisticRegression()
stack_pipeline = StackingClassifier(estimators = [('linearsvc', clf_pipeline_binary), ('xgboost', clf_pipeline_binary_xgboost)], 
                                    final_estimator = logistic)

# Fit the stacked model on the training split.
stack_pipeline.fit(X_train, y_train['up_down'].values)

### performance evaluation & confusion matrix (training data)

In [None]:
# Stacked model: confusion matrix, accuracy and macro F1 on the training split.
accuracy_score_train, f1score_train = performance_metrics_binary(model = stack_pipeline, 
                                                                 data = X_train, 
                                                                 true_y = y_train['up_down'].values, 
                                                                 train_valid_test = 'train')

### performance evaluation & confusion matrix (valid data)

In [None]:
# Metrics of the stacking ensemble on the validation split.
accuracy_score_valid, f1score_valid = performance_metrics_binary(
  train_valid_test='valid',
  model=stack_pipeline,
  data=X_valid,
  true_y=y_valid['up_down'].values,
)

### cross validation (Monte Carlo CV) / 10 folds

In [None]:
# Monte Carlo cross-validation of the stacking ensemble.
# For every pre-built split in `dict_folds_data` a fresh ensemble (LinearSVC + XGBoost,
# combined by logistic regression) is fitted on the fold's training part, and accuracy
# and macro-F1 are recorded for both the training and the validation part.
folds = [f'fold-{i}' for i in range(1, 11)]   # 'fold-1' ... 'fold-10'
mccv_score_dict = {'fold':[], 'train_accuracy':[], 'train_f1':[], 'valid_accuracy': [], 'valid_f1': []}

# Numeric columns that both base models pass through a MinMaxScaler.
# NOTE(review): 'tsla_Cloae' is spelled exactly as elsewhere in this notebook; presumably it
# matches the column name produced by the data-loading code -- confirm before renaming.
price_feature_columns = ['Close', 'MA10', 'MA20', 'MA30',
                         'appl_Close', 'msft_Close', 'amzn_Close',
                         'nvda_Close', 'brk_Close', 'googl_Close',
                         'tsla_Cloae', 'meta_Close', 'xom_Close', 'jpm_Close']

for fold in tqdm(folds):
  # NOTE: these assignments overwrite the notebook-level X_train / y_train / X_valid / y_valid.
  X_train, y_train = dict_folds_data[fold]['train']
  X_valid, y_valid = dict_folds_data[fold]['valid']

  #--- base model 1: LinearSVC -----------------------------------------------------------------------------------

  lm_countvectorizer = CountVectorizer(vocabulary = lm_sent_dict['Word'])   # counts restricted to a fixed vocabulary
  tfidf_vectorizer = TfidfVectorizer(ngram_range = (1, 2), max_features = 4000)
  minmaxscaler = MinMaxScaler()
  minmaxscaler_price = MinMaxScaler()
  fn_combine = FunctionTransformer(combine_count_sent)
  fn_transform_sparse = FunctionTransformer(csr_matrix)

  tfidf_pipline = Pipeline(steps = [('tfidf_vectorizer', tfidf_vectorizer)])

  # word counts -> combine_count_sent -> scaling -> back to a sparse matrix
  lm_pipeline = Pipeline(steps = [('count_vectorizer', lm_countvectorizer),
                                  ('matrix_mul', fn_combine),
                                  ('norm', minmaxscaler),
                                  ('sparse_matrix', fn_transform_sparse)])

  ma_pipeline = Pipeline(steps = [('norm_price', minmaxscaler_price)])

  column_transformer = ColumnTransformer(transformers = [('price_ma', ma_pipeline, list(price_feature_columns)),
                                                         ('tfidf', tfidf_pipline, 'tokenized_text'),
                                                         ('lm_count', lm_pipeline, 'tokenized_text')])

  clf_pipeline_binary = Pipeline(steps = [('preprocessing', column_transformer),
                                          ('classification', LinearSVC(C = 0.05, class_weight = 'balanced'))])

  #--- base model 2: XGBoost (same feature layout, larger TF-IDF vocabulary) --------------------------------------

  lm_countvectorizer_xgboost = CountVectorizer(vocabulary = lm_sent_dict['Word'])
  tfidf_vectorizer_xgboost = TfidfVectorizer(ngram_range = (1, 2), max_features = 10000)
  minmaxscaler_lm_xgboost = MinMaxScaler()
  minmaxscaler_price_xgboost = MinMaxScaler()
  fn_combine_xgboost = FunctionTransformer(combine_count_sent)
  fn_transform_sparse_xgboost = FunctionTransformer(csr_matrix)
  clf_xgb = XGBClassifier(booster = 'gbtree', tree_method='gpu_hist', min_split_loss = 0.01, learning_rate = 0.0005, max_depth = 5, n_estimators = 1000, scale_pos_weight = 0.9)

  tfidf_pipline_xg = Pipeline(steps = [('tfidf_vectorizer', tfidf_vectorizer_xgboost)])

  lm_pipeline_xg = Pipeline(steps = [('count_vectorizer', lm_countvectorizer_xgboost),
                                     ('matrix_mul', fn_combine_xgboost),
                                     ('norm', minmaxscaler_lm_xgboost),
                                     ('sparse_matrix', fn_transform_sparse_xgboost)])

  ma_pipeline_xg = Pipeline(steps = [('norm_price', minmaxscaler_price_xgboost)])

  column_transformer_xg = ColumnTransformer(transformers = [('price_ma', ma_pipeline_xg, list(price_feature_columns)),
                                                            ('tfidf', tfidf_pipline_xg, 'tokenized_text'),
                                                            ('lm_count', lm_pipeline_xg, 'tokenized_text')])

  clf_pipeline_binary_xgboost = Pipeline(steps = [('preprocessing', column_transformer_xg),
                                                  ('classification', clf_xgb)])

  #--- stacking ensemble ------------------------------------------------------------------------------------------

  logistic = LogisticRegression()
  stack_pipeline = StackingClassifier(estimators = [('linearsvc', clf_pipeline_binary), ('xgboost', clf_pipeline_binary_xgboost)],
                                      final_estimator = logistic)

  stack_pipeline.fit(X_train, y_train['up_down'].values)

  # Predict each split once and derive both metrics from the same predictions.
  # (classifier .score() is accuracy on .predict(), so calling both ran the ensemble twice.)
  predicted_y_train = stack_pipeline.predict(X_train)
  train_acc = accuracy_score(y_train['up_down'].values, predicted_y_train)
  train_f1 = f1_score(y_train['up_down'].values, predicted_y_train, average = 'macro')

  predicted_y_valid = stack_pipeline.predict(X_valid)
  valid_acc = accuracy_score(y_valid['up_down'].values, predicted_y_valid)
  valid_f1 = f1_score(y_valid['up_down'].values, predicted_y_valid, average = 'macro')

  mccv_score_dict['fold'].append(fold)
  mccv_score_dict['train_accuracy'].append(train_acc)
  mccv_score_dict['train_f1'].append(train_f1)
  mccv_score_dict['valid_accuracy'].append(valid_acc)
  mccv_score_dict['valid_f1'].append(valid_f1)

In [None]:
# Per-fold score table, followed by two summary rows: the mean ('average') and the
# sample standard deviation ('std') of each metric across the folds.
df_mccv_score_stacking = pd.DataFrame(mccv_score_dict)

# Both statistics are computed from the fold rows only, before any summary row is appended.
metric_columns = ['train_accuracy', 'train_f1', 'valid_accuracy', 'valid_f1']
fold_means = df_mccv_score_stacking[metric_columns].mean()
fold_stds = df_mccv_score_stacking[metric_columns].std()

df_mccv_score_stacking.loc[len(df_mccv_score_stacking)] = ['average', *fold_means]
df_mccv_score_stacking.loc[len(df_mccv_score_stacking)] = ['std', *fold_stds]

df_mccv_score_stacking

In [None]:
# 2x2 grid of per-fold scores: accuracy in the left column (black), macro-F1 in the
# right column (red); training scores on the top row, validation scores on the bottom.
# Each panel also shows the across-fold average as a dashed horizontal line.
plt.figure(figsize = (14, 6))

# The last two rows of the table are the 'average' and 'std' summary rows. Both the fold
# rows and the average row are addressed relative to the end of the table, so the plot
# stays correct if the number of folds changes (the average row was hard-coded as row 10).
fold_rows = df_mccv_score_stacking.iloc[:-2]
average_row = df_mccv_score_stacking.iloc[-2]

# (subplot position, metric column, colour, y-limits, is bottom row)
panels = [(1, 'train_accuracy', 'black', (0.3, 0.6), False),
          (3, 'valid_accuracy', 'black', (0.3, 0.6), True),
          (2, 'train_f1', 'red', (0.2, 0.6), False),
          (4, 'valid_f1', 'red', (0.2, 0.6), True)]

for position, metric, color, y_limits, is_bottom_row in panels:
  ax = plt.subplot(2, 2, position)
  ax.plot(fold_rows['fold'], fold_rows[metric], label = metric, color = color, linewidth = 0.5)
  ax.axhline(average_row[metric], label = f'avg {metric}', color = color, linewidth = 0.2, linestyle = '--')
  ax.set_ylim(y_limits)
  if is_bottom_row:
    # only the bottom panels carry fold labels; rotate them so they do not overlap
    plt.xticks(rotation = 90)
  else:
    ax.tick_params(labelbottom=False)
  ax.legend()

plt.show()

## Stacking Models: performance evaluation & visualization (train on training & valid dataset and test on test dataset)

### load train/ valid/ test dataset

In [None]:
# Load the two corpora used for the final fit and evaluation: the 2015-2019 summaries
# file for fitting and the 2020 summaries file for testing.
# Every argument except the file name is shared by both calls.
shared_load_kwargs = dict(
  directory='../data/TF-IDF Models/News Article Text File_Agg Daily',
  sp500_fn=load_sp500_data,
  fn_tokenization_clean=tokenization_clean,
  nlp_model=nlp_model,
  training_data=False,
)

X_all_train, y_all_train = load_dataset(file_name='articles_2015_2019_concated_summaries.txt',
                                        **shared_load_kwargs)

X_all_test, y_all_test = load_dataset(file_name='articles_2020_concated_summaries.txt',
                                      **shared_load_kwargs)

### initialize & train stacking model

In [None]:
# Final stacking ensemble, fitted on the full 2015-2019 data (X_all_train / y_all_train).
# Two base pipelines share the same feature layout -- scaled price columns, TF-IDF of the
# tokenized text, and fixed-vocabulary word counts passed through combine_count_sent --
# and a logistic regression combines their outputs.

# Numeric columns scaled with a MinMaxScaler in both base models.
# NOTE(review): 'tsla_Cloae' is spelled exactly as elsewhere in this notebook; presumably it
# matches the column name produced by the data-loading code -- confirm before renaming.
price_feature_columns = [
  'Close', 'MA10', 'MA20', 'MA30',
  'appl_Close', 'msft_Close', 'amzn_Close',
  'nvda_Close', 'brk_Close', 'googl_Close',
  'tsla_Cloae', 'meta_Close', 'xom_Close', 'jpm_Close',
]

#--- base model 1: LinearSVC (TF-IDF capped at 4000 features) ---------------------------------------------------

# scaled price / moving-average branch
minmaxscaler_price = MinMaxScaler()
ma_pipeline = Pipeline(steps=[('norm_price', minmaxscaler_price)])

# TF-IDF branch (unigrams + bigrams)
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=4000)
tfidf_pipline = Pipeline(steps=[('tfidf_vectorizer', tfidf_vectorizer)])

# word-count branch: counts -> combine_count_sent -> scaling -> sparse matrix
lm_countvectorizer = CountVectorizer(vocabulary=lm_sent_dict['Word'])
fn_combine = FunctionTransformer(combine_count_sent)
minmaxscaler = MinMaxScaler()
fn_transform_sparse = FunctionTransformer(csr_matrix)
lm_pipeline = Pipeline(steps=[
  ('count_vectorizer', lm_countvectorizer),
  ('matrix_mul', fn_combine),
  ('norm', minmaxscaler),
  ('sparse_matrix', fn_transform_sparse),
])

column_transformer = ColumnTransformer(transformers=[
  ('price_ma', ma_pipeline, list(price_feature_columns)),
  ('tfidf', tfidf_pipline, 'tokenized_text'),
  ('lm_count', lm_pipeline, 'tokenized_text'),
])

clf_pipeline_binary = Pipeline(steps=[
  ('preprocessing', column_transformer),
  ('classification', LinearSVC(C=0.05, class_weight='balanced')),
])

#--- base model 2: XGBoost (TF-IDF capped at 10000 features) ----------------------------------------------------

minmaxscaler_price_xgboost = MinMaxScaler()
ma_pipeline_xg = Pipeline(steps=[('norm_price', minmaxscaler_price_xgboost)])

tfidf_vectorizer_xgboost = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
tfidf_pipline_xg = Pipeline(steps=[('tfidf_vectorizer', tfidf_vectorizer_xgboost)])

lm_countvectorizer_xgboost = CountVectorizer(vocabulary=lm_sent_dict['Word'])
fn_combine_xgboost = FunctionTransformer(combine_count_sent)
minmaxscaler_lm_xgboost = MinMaxScaler()
fn_transform_sparse_xgboost = FunctionTransformer(csr_matrix)
lm_pipeline_xg = Pipeline(steps=[
  ('count_vectorizer', lm_countvectorizer_xgboost),
  ('matrix_mul', fn_combine_xgboost),
  ('norm', minmaxscaler_lm_xgboost),
  ('sparse_matrix', fn_transform_sparse_xgboost),
])

column_transformer_xg = ColumnTransformer(transformers=[
  ('price_ma', ma_pipeline_xg, list(price_feature_columns)),
  ('tfidf', tfidf_pipline_xg, 'tokenized_text'),
  ('lm_count', lm_pipeline_xg, 'tokenized_text'),
])

clf_xgb = XGBClassifier(
  booster='gbtree',
  tree_method='gpu_hist',
  min_split_loss=0.01,
  learning_rate=0.0005,
  max_depth=5,
  n_estimators=1000,
  scale_pos_weight=0.9,
)

clf_pipeline_binary_xgboost = Pipeline(steps=[
  ('preprocessing', column_transformer_xg),
  ('classification', clf_xgb),
])

#--- stacking ensemble ------------------------------------------------------------------------------------------

logistic = LogisticRegression()
base_estimators = [
  ('linearsvc', clf_pipeline_binary),
  ('xgboost', clf_pipeline_binary_xgboost),
]
stack_pipeline = StackingClassifier(estimators=base_estimators, final_estimator=logistic)

stack_pipeline.fit(X_all_train, y_all_train['up_down'].values)

In [None]:
# Metrics of the final stacking ensemble on the data it was fitted on (labelled 'train_valid').
accuracy_score_all_train, f1score_all_train = performance_metrics_binary(
  train_valid_test='train_valid',
  model=stack_pipeline,
  data=X_all_train,
  true_y=y_all_train['up_down'].values,
)

### performance evaluation & confusion matrix (test dataset)

In [None]:
# Metrics of the final stacking ensemble on the held-out 2020 test data.
accuracy_score_all, f1score_all = performance_metrics_binary(
  train_valid_test='test',
  model=stack_pipeline,
  data=X_all_test,
  true_y=y_all_test['up_down'].values,
)