## Install / Import Libraries

In [None]:
! pip install -U spacy
! python -m spacy download en_core_web_lg

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import re
from scipy.sparse import csr_matrix 

import spacy
from tqdm.auto import tqdm
from pathlib import Path
import os

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, OneHotEncoder
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import ParameterGrid

import pickle

# Reset matplotlib's runtime configuration to the library defaults for this session.
mpl.rcParams.update(mpl.rcParamsDefault)

## Helper Functions

### tokenize & clean data

In [None]:
def tokenization_clean(text: 'str', nlp_model: 'spacy_model'):
  """Lemmatise ``text`` and return the cleaned tokens as one space-joined string.

  Args:
    text: raw document text.
    nlp_model: callable that turns ``text`` into an iterable of tokens exposing
      ``lemma_`` and the boolean flags tested below (a loaded spaCy pipeline).

  Returns:
    str: lower-cased lemmas with punctuation characters stripped, separated by
    single spaces. Stop words, punctuation, numbers, URLs, e-mails, whitespace
    and currency tokens are dropped, as are tokens that become empty once
    their punctuation is removed.
  """
  doc = nlp_model(text)

  def _is_noise(tok):
    # Any one of these flags is enough to discard the token.
    return (tok.is_stop
            or tok.is_punct
            or tok.like_num
            or tok.like_url
            or tok.is_space
            or tok.like_email
            or tok.is_left_punct
            or tok.is_right_punct
            or tok.is_digit
            or tok.is_currency)

  # Strip every character that is neither a word character nor whitespace.
  stripped_lemmas = (re.sub(r'[^\w\s]', '', tok.lemma_.lower())
                     for tok in doc
                     if not _is_noise(tok))

  # A symbol-only lemma (e.g. '+') is reduced to '' above; joining it would leave
  # stray double/leading/trailing spaces in the result, so skip empty strings.
  return ' '.join(lemma for lemma in stripped_lemmas if lemma)

### load dataset

In [None]:
def load_fold_dataset(directory: 'str', 
                      file_name: 'str', 
                      fn_tokenization_clean: 'function', 
                      nlp_model):
  """Read one cross-validation CSV and add a cleaned-text column.

  Args:
    directory: folder containing the CSV file.
    file_name: name of the CSV file inside ``directory``.
    fn_tokenization_clean: callable ``(text, nlp_model) -> str`` applied to every
      value of the 'text' column.
    nlp_model: model object forwarded to ``fn_tokenization_clean``.

  Returns:
    tuple: ``(frame, labels)`` where ``frame`` is the full CSV content plus a
    'tokenized_text' column and a datetime-parsed 'timestamp' column, and
    ``labels`` is a one-column copy holding 'sentiment'.
  """
  # Registers `progress_apply` on pandas objects so the cleaning step shows a progress bar.
  tqdm.pandas()

  fold_frame = pd.read_csv(Path(directory) / file_name)

  def _clean(raw_text):
    return fn_tokenization_clean(raw_text, nlp_model)

  fold_frame['tokenized_text'] = fold_frame['text'].progress_apply(_clean)
  fold_frame['timestamp'] = pd.to_datetime(fold_frame['timestamp'])

  labels = fold_frame[['sentiment']].copy()
  return fold_frame, labels

In [None]:
def load_dataset(directory: 'str', file_name: 'str', fn_tokenization_clean: 'function', 
                 nlp_model, training_data = True):
  """Read the article CSV, tokenise the text and build feature/label frames.

  Args:
    directory: folder containing the CSV file.
    file_name: name of the CSV file inside ``directory``.
    fn_tokenization_clean: callable ``(text, nlp_model) -> str`` applied to every
      value of the 'text' column.
    nlp_model: model object forwarded to ``fn_tokenization_clean``.
    training_data: if truthy, split chronologically into a training part
      (calendar year 2018 or earlier) and a validation part (calendar year 2019);
      otherwise return all rows unsplit.

  Returns:
    ``(X_train, X_valid, y_train, y_valid)`` when ``training_data`` is truthy,
    else ``(X, y)``. Feature frames hold 'timestamp', 'text', 'tokenized_text';
    label frames hold 'timestamp' and 'sentiment', with 'positive' encoded as 1
    and every other value as 0.
  """
  tqdm.pandas()
  data = pd.read_csv(Path(directory) / file_name)

  data['tokenized_text'] = data['text'].progress_apply(lambda x: fn_tokenization_clean(x, nlp_model))

  feature_cols = ['timestamp', 'text', 'tokenized_text']
  label_cols = ['timestamp', 'sentiment']

  def _select(rows):
    # Build the (features, labels) pair for the selected rows.
    X = data.loc[rows, feature_cols].copy()
    y = data.loc[rows, label_cols].copy()
    y['sentiment'] = (y['sentiment'] == 'positive').astype('int64')
    return X, y

  if training_data:
    # Split on the parsed calendar year rather than comparing raw strings against
    # '2018-12-31' / '2019-12-31': a string such as '2018-12-31 09:30:00' sorts
    # after '2018-12-31', which would drop articles published on Dec 31 from both parts.
    # The 'timestamp' column itself is returned unchanged.
    year = pd.to_datetime(data['timestamp']).dt.year
    X_train, y_train = _select(year <= 2018)
    X_valid, y_valid = _select(year == 2019)
    return X_train, X_valid, y_train, y_valid

  return _select(slice(None))

### function to prepare 10 fold datasets

In [None]:
def prepare_folds_dataset(cv_path: 'str'):
  """Load and tokenise the train/valid CSV of every fold directory under ``cv_path``.

  Each sub-directory of ``cv_path`` is treated as one fold, keyed by the directory
  name. Inside it, a file whose name contains 'train' becomes the training split
  and a file whose name contains 'valid' becomes the validation split.

  Relies on the module-level ``nlp_model`` and ``tokenization_clean`` being
  defined before this function is called.

  Args:
    cv_path: directory holding one sub-directory per fold.

  Returns:
    dict: ``{fold_name: {'train': (data, y), 'valid': (data, y)}}``. The keys
    'fold-1' ... 'fold-10' are always present (with ``None`` entries for splits
    that were not found); other fold directory names are added when they contain
    a train/valid file.
  """
  cv_path = Path(cv_path)
  fold_dataset = {f'fold-{k}': {'train': None, 'valid': None} for k in range(1, 11)}

  for fold_dir in sorted(cv_path.iterdir()):
    if not fold_dir.is_dir():
      # Stray files next to the fold folders cannot be iterated; skip them.
      continue

    for data_file in sorted(fold_dir.iterdir()):
      # 'train' takes precedence when a file name contains both markers.
      if 'train' in data_file.name:
        split = 'train'
      elif 'valid' in data_file.name:
        split = 'valid'
      else:
        continue

      X_fold, y_fold = load_fold_dataset(directory = fold_dir,
                                         file_name = data_file.name,
                                         fn_tokenization_clean = tokenization_clean,
                                         nlp_model = nlp_model)

      fold_entry = fold_dataset.setdefault(fold_dir.name, {'train': None, 'valid': None})
      fold_entry[split] = (X_fold, y_fold)

  return fold_dataset

### load Loughran-McDonald sentiment dictionary

In [None]:
def loughran_mcdonald_dict(directory: 'str', file_name: 'str'):
  """Load the Loughran-McDonald master dictionary as a word -> category-flag table.

  Args:
    directory: folder containing the dictionary CSV.
    file_name: name of the CSV file inside ``directory``.

  Returns:
    pandas.DataFrame with a lower-cased 'Word' column followed by the seven
    category columns ('Negative', 'Positive', 'Uncertainty', 'Litigious',
    'Strong_Modal', 'Weak_Modal', 'Constraining'), each binarised to 1 when the
    source value is greater than 0 and 0 otherwise. Rows without a word are
    removed and the index is renumbered from 0.
  """
  file_path = Path(directory) / file_name
  data = pd.read_csv(file_path)

  category_cols = ['Negative', 'Positive', 'Uncertainty', 'Litigious',
                   'Strong_Modal', 'Weak_Modal', 'Constraining']
  data_part = data[['Word'] + category_cols].copy()

  data_part['Word'] = data_part['Word'].str.lower()

  # Any positive source value marks membership of the category.
  data_part[category_cols] = (data_part[category_cols] > 0).astype('int64')

  # Drop rows whose word was read as missing, wherever they are, instead of
  # relying on a hard-coded row number that only holds for one file version.
  # (Presumably a literal entry such as "NULL" that read_csv parses as NaN.)
  data_part = data_part.dropna(subset = ['Word']).reset_index(drop = True)

  return data_part

### multiplication of doc-term matrix and term-sentiment matrix

In [None]:
# Term-sentiment matrices already built, keyed by (loader, directory, file name).
# This function runs on every fit/predict of the pipeline, so without the cache
# the dictionary CSV would be re-read and re-parsed on each call.
_SENTIMENT_MATRIX_CACHE = {}

def combine_count_sent(data, 
                       loughran_mcdonald_dict_fn = loughran_mcdonald_dict, 
                       directory = '../data/TF-IDF Models', 
                       file_name = 'Loughran-McDonald_MasterDictionary_1993-2021.csv'):
  """Project a document-term count matrix onto the sentiment categories.

  Args:
    data: sparse document-term matrix whose columns follow the row order of the
      sentiment dictionary (one column per dictionary word).
    loughran_mcdonald_dict_fn: callable ``(directory, file_name) -> DataFrame``
      whose first column is the word and whose remaining columns are the
      category flags.
    directory: folder passed to ``loughran_mcdonald_dict_fn``.
    file_name: file name passed to ``loughran_mcdonald_dict_fn``.

  Returns:
    numpy.ndarray of shape (n_documents, n_categories): per document, the summed
    counts of the words flagged for each category.
  """
  cache_key = (loughran_mcdonald_dict_fn, str(directory), str(file_name))
  sparse_sent_dict = _SENTIMENT_MATRIX_CACHE.get(cache_key)

  if sparse_sent_dict is None:
    sent_dict = loughran_mcdonald_dict_fn(directory, file_name)
    # Skip the word column; keep only the category flags.
    sparse_sent_dict = csr_matrix(sent_dict.iloc[:,1:].values)
    _SENTIMENT_MATRIX_CACHE[cache_key] = sparse_sent_dict

  # `@` is always the matrix product, whereas `*` means element-wise
  # multiplication for SciPy sparse *arrays*.
  news_sentiment = data @ sparse_sent_dict

  return news_sentiment.toarray()

### function of 10 fold gridsearchcv

In [None]:
def gridsearchcv(folds_data: 'dict', parametergrid: 'list', model: 'function'):
  """Fit and score ``model`` for every parameter combination on every fold.

  Args:
    folds_data: mapping ``fold_name -> {'train': (X, y), 'valid': (X, y)}`` where
      each ``y`` has a 'sentiment' column. 'positive' is encoded as 1 and any
      other value as 0. Folds are processed in the mapping's iteration order.
    parametergrid: iterable of parameter dicts (e.g. a ``ParameterGrid``); each
      dict is passed unchanged to ``model.set_params``.
    model: estimator/pipeline providing ``set_params``, ``fit`` and ``predict``.
      It is re-configured and re-fitted in place, so after the call it carries
      the last parameter combination and the last fold's fit.

  Returns:
    dict of equal-length lists with keys 'parameter', 'fold', 'train_accuracy',
    'train_f1', 'valid_accuracy', 'valid_f1' (one entry per parameter/fold pair;
    F1 is macro-averaged).
  """
  score_dict = {'parameter': [], 'fold':[], 'train_accuracy':[], 'train_f1':[], 'valid_accuracy': [], 'valid_f1': []}

  def _encode(y):
    return (y['sentiment'] == 'positive').astype('int64').values

  for parameter in tqdm(parametergrid):
    # set_params returns the estimator itself; no copy is made.
    model_initialized = model.set_params(**parameter)

    for fold in folds_data:
      X_train, y_train = folds_data[fold]['train']
      X_valid, y_valid = folds_data[fold]['valid']
      y_train_encoded = _encode(y_train)
      y_valid_encoded = _encode(y_valid)

      model_initialized.fit(X_train, y_train_encoded)

      # Predict once per split and derive both metrics from the same predictions;
      # a classifier's score() is the accuracy of predict(), so calling it as well
      # would only repeat the whole transform + predict pass.
      predicted_y_train = model_initialized.predict(X_train)
      predicted_y_valid = model_initialized.predict(X_valid)

      score_dict['parameter'].append(parameter)
      score_dict['fold'].append(fold)
      score_dict['train_accuracy'].append(accuracy_score(y_train_encoded, predicted_y_train))
      score_dict['train_f1'].append(f1_score(y_train_encoded, predicted_y_train, average = 'macro'))
      score_dict['valid_accuracy'].append(accuracy_score(y_valid_encoded, predicted_y_valid))
      score_dict['valid_f1'].append(f1_score(y_valid_encoded, predicted_y_valid, average = 'macro'))

  return score_dict

### function to visualize gridsearchcv scores

In [None]:
def visualize_gridsearchcv(cv_result: 'sklearn gridsearch model', binary = True):
  """Plot mean validation F1 (with +/- 1 std error bars) against the LinearSVC C value.

  One panel is drawn per TF-IDF ``max_features`` value in
  [2000, 4000, 6000, 8000, 10000].

  Args:
    cv_result: aggregated grid-search table (anything ``pd.DataFrame`` accepts)
      containing the C column, the TF-IDF ``max_features`` column and a
      'valid_f1' column with 'mean' and 'std' sub-columns.
    binary: when equal to True the C column is read from 'classification__C',
      otherwise from 'classification__estimator__C'.

  Returns:
    None. The figure is shown with ``plt.show()``.
  """
  palette = ['#AED6F1', '#5DADE2', '#2E86C1', '#21618C', '#1B2631']
  features_list = [2000, 4000, 6000, 8000, 10000]

  feature_col = 'preprocessing__tfidf__tfidf_vectorizer__max_features'
  mean_key = ('valid_f1', 'mean')
  std_key = ('valid_f1', 'std')

  # Only the name of the C column differs between the two modes.
  if binary != True:
    c_col = 'classification__estimator__C'
  else:
    c_col = 'classification__C'

  df_cv_result = pd.DataFrame(cv_result)[[c_col, feature_col, 'valid_f1']].copy()
  df_cv_result.rename(columns = {c_col: 'linearSVC_C', feature_col: 'tfidf_feature'},
                      inplace = True)

  fig, ax = plt.subplots(5, 1, figsize = (11, 7))
  for idx, (feature_no, line_color) in enumerate(zip(features_list, palette)):
    panel = ax[idx]

    # Rows belonging to the current max_features value.
    rows = df_cv_result['tfidf_feature'] == feature_no
    c_values = df_cv_result.loc[rows, 'linearSVC_C']
    f1_mean = df_cv_result.loc[rows, mean_key]
    f1_std = df_cv_result.loc[rows, std_key]

    panel.plot(c_values,
               f1_mean,
               color = line_color,
               linewidth = 0.5,
               label = f'tfidf_feature number: {feature_no}')

    panel.errorbar(c_values,
                   f1_mean,
                   f1_std,
                   linestyle='None',
                   marker='o',
                   markersize=3,
                   color = line_color)

    panel.set_ylabel('mean valid f1', fontsize =7)
    panel.legend(fontsize = 8)
    panel.set_ylim(0.75, 0.79)
    panel.tick_params(labelsize = 8, pad = 0.8)

    # Label every point with its mean F1, nudged slightly up and to the right.
    for x, y in zip(c_values, f1_mean):
      panel.annotate('{:.4f}'.format(y), (x+0.0005, y+0.001), fontsize = 9)

    if idx == 0:
      panel.set_title('f1 score (macro) of the grid search cv results (error bar: +/- 1 std)')

  plt.xlabel('linear SVC, C value', fontsize = 7)
  plt.show()

### function to visualize confusion matrix (binary label)

In [None]:
def performance_metrics_binary(model, data, true_y, train_valid: 'str'):
  """Report accuracy and macro F1 of ``model`` and draw its normalised confusion matrix.

  Args:
    model: fitted estimator providing ``predict`` and ``score``.
    data: features passed to ``model.predict`` / ``model.score``.
    true_y: ground-truth labels for ``data``.
    train_valid: label used in the printed summary (e.g. 'train' or 'valid').

  Returns:
    tuple: ``(accuracy, macro_f1)`` where accuracy is ``model.score(data, true_y)``.
  """
  predicted_y = model.predict(data)
  accuracy = model.score(data, true_y)
  macro_f1 = f1_score(true_y, predicted_y, average = 'macro')

  plt.figure(figsize = (4, 4))
  sns.set(font_scale=1.2)

  # normalize='true' divides each row by its total, i.e. by the true-class count.
  cm_result = confusion_matrix(true_y, predicted_y, normalize = 'true')

  heatmap_ax = sns.heatmap(cm_result,
                           cmap="Blues",
                           annot = True,
                           fmt=".2f",
                           annot_kws={'size': 15},
                           xticklabels=['Negative', 'Positive'],
                           yticklabels=['Negative', 'Positive'])
  heatmap_ax.set(xlabel='Predicted Label', ylabel='True Label', title = 'sentiment')

  plt.show()
  print(f'\n{train_valid} accuracy: {accuracy}, {train_valid} f1 score: {macro_f1}')
  return accuracy, macro_f1

## Load Data

### initialize spacy model

In [None]:
# Load the large English spaCy pipeline once; `prepare_folds_dataset` reads this
# module-level `nlp_model`, so this cell must run before the folds are prepared.
nlp_model = spacy.load('en_core_web_lg')

### prepare loughran-mcdonald sentiment dictionary

In [None]:
# Word -> binary category flags table; its 'Word' column later serves as the fixed
# vocabulary of the CountVectorizer in the sentiment branch of the pipeline.
lm_sent_dict = loughran_mcdonald_dict(directory = '../data/TF-IDF Models', 
                                      file_name = 'Loughran-McDonald_MasterDictionary_1993-2021.csv')

### prepare 10 folds of dataset

In [None]:
# Location of the pre-tokenised fold data cached as a pickle.
folds_data_path = Path(r'../data/TF-IDF Models/Intermediate Output/dict_folds_data_text_only.pickle')

# Reuse the cached folds when the pickle exists; otherwise tokenise every fold CSV from scratch.
# NOTE: pickle.load can execute arbitrary code - only load a file this project produced.
# NOTE(review): the fallback branch never writes the pickle, so the cache is only
# ever populated outside this cell - confirm where it is supposed to be saved.
if folds_data_path.is_file():
  with open(folds_data_path, 'rb') as f_1:
    dict_folds_data = pickle.load(f_1)

else:
  dict_folds_data = prepare_folds_dataset(cv_path = r'../data/TF-IDF Models/Cross Validation_fold_data')

## Binary Label Classifier & Pipeline

### initialize necessary functions

In [None]:
# Counts only the words of the Loughran-McDonald dictionary (fixed vocabulary, fixed column order).
lm_countvectorizer = CountVectorizer(vocabulary = lm_sent_dict ['Word'])
# TF-IDF over unigrams and bigrams; max_features is overridden during the grid search below.
tfidf_vectorizer = TfidfVectorizer(ngram_range = (1, 2), max_features = 10000)
# Rescales the per-category sentiment counts to the [0, 1] range.
minmaxscaler = MinMaxScaler()
# Multiplies the dictionary word counts by the word -> category flag matrix.
fn_combine = FunctionTransformer(combine_count_sent)
# Converts the scaled dense array back to a sparse matrix.
fn_transform_sparse = FunctionTransformer(csr_matrix)

### build pipeline (binary label)

In [None]:
# Branch 1: TF-IDF features of the cleaned text.
tfidf_pipline = Pipeline(steps = [('tfidf_vectorizer', tfidf_vectorizer)])

# Branch 2: dictionary word counts -> per-category sentiment counts -> min-max scaled -> sparse.
lm_pipeline = Pipeline(steps = [('count_vectorizer', lm_countvectorizer), 
                                ('matrix_mul', fn_combine), 
                                ('norm', minmaxscaler), 
                                ('sparse_matrix', fn_transform_sparse)])

# Both branches read the same 'tokenized_text' column; their outputs are concatenated column-wise.
column_transformer = ColumnTransformer(transformers = [('tfidf', tfidf_pipline, 'tokenized_text'),
                                                       ('lm_count', lm_pipeline, 'tokenized_text')])

# Full model: feature extraction followed by a class-weighted linear SVM.
# The step names ('preprocessing', 'classification') are referenced by the grid-search parameter keys.
clf_pipeline_binary = Pipeline(steps = [('preprocessing', column_transformer), 
                                        ('classification',LinearSVC(C= 0.9, class_weight = 'balanced'))])

### prepare X and y (binary)

In [None]:
# Tokenise the 2015-2019 articles and split them chronologically into a training part
# and a 2019 validation part (see load_dataset); labels are encoded as positive = 1, else 0.
X_train, X_valid, y_train, y_valid = load_dataset(directory = '../data/TF-IDF Models/News Article Text File',
                                                  file_name = 'articles_2015_2019.csv', 
                                                  fn_tokenization_clean = tokenization_clean, 
                                                  nlp_model = nlp_model, 
                                                  training_data = True)

### GridSearchCV

In [None]:
# Search space: TF-IDF vocabulary size x LinearSVC regularisation strength (penalty fixed to l2).
# The keys are parameter paths inside `clf_pipeline_binary`.
parameters_grid = {'preprocessing__tfidf__tfidf_vectorizer__max_features':[2000, 4000, 6000, 8000, 10000],
                   'classification__C': [0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 1.2, 1.3], 
                   'classification__penalty': ['l2']}

grid = ParameterGrid(parameters_grid)

# NOTE: gridsearchcv calls set_params/fit on `clf_pipeline_binary` itself, so after this cell
# the pipeline holds the last parameter combination and the last fold's fit.
grid_score_dict = gridsearchcv(folds_data = dict_folds_data, 
                               parametergrid = grid, 
                               model = clf_pipeline_binary)

In [None]:
# One list per parameter name, filled below from the per-run parameter dicts.
parameters_grid_1 = {'preprocessing__tfidf__tfidf_vectorizer__max_features':[],
                   'classification__C': [], 
                   'classification__penalty': []}

# One row per (parameter combination, fold) run.
df_result_binary = pd.DataFrame(grid_score_dict)

# Unpack each run's parameter dict into the per-name lists, keeping row order.
for parameter in df_result_binary['parameter']:
  for key, value in parameter.items():
    parameters_grid_1[key].append(value)

# Attach the parameter columns by row position, drop the raw 'parameter' dict column
# (the first column, hence iloc[:, 1:]), then aggregate the fold scores of each
# parameter combination into mean and standard deviation.
df_result_binary_1 = df_result_binary.merge(pd.DataFrame(parameters_grid_1), how = 'left', left_index = True, right_index = True)
df_result_binary_complete = df_result_binary_1.iloc[:,1:].groupby(['preprocessing__tfidf__tfidf_vectorizer__max_features',
                                                                   'classification__C',
                                                                   'classification__penalty'], as_index = False).agg({'train_accuracy':['mean', 'std'], 'train_f1':['mean', 'std'], 'valid_accuracy': ['mean', 'std'], 'valid_f1': ['mean', 'std']})

In [None]:
# Display the mean/std of every metric for each parameter combination.
df_result_binary_complete

In [None]:
# Plot mean validation F1 (+/- 1 std) against C, one panel per max_features value.
visualize_gridsearchcv(df_result_binary_complete, binary = True)

In [None]:
# Row with the highest mean validation F1. idxmax() returns an index *label*,
# so it is looked up with .loc rather than the positional .iloc.
df_result_binary_complete.loc[df_result_binary_complete[('valid_f1', 'mean')].idxmax()]

### initialize model

In [None]:
# Rebuild every component from scratch so nothing fitted or configured during the
# grid search (which modified the previous pipeline instance in place) carries over.
# NOTE(review): max_features = 10000 and C = 0.1 are hard-coded here - presumably the
# best combination reported by the idxmax cell above; confirm they still match.
lm_countvectorizer = CountVectorizer(vocabulary = lm_sent_dict ['Word'])
tfidf_vectorizer = TfidfVectorizer(ngram_range = (1, 2), max_features = 10000)
minmaxscaler = MinMaxScaler()
fn_combine = FunctionTransformer(combine_count_sent)
fn_transform_sparse = FunctionTransformer(csr_matrix)

# Branch 1: TF-IDF features of the cleaned text.
tfidf_pipline = Pipeline(steps = [('tfidf_vectorizer', tfidf_vectorizer)])

# Branch 2: dictionary word counts -> per-category sentiment counts -> min-max scaled -> sparse.
lm_pipeline = Pipeline(steps = [('count_vectorizer', lm_countvectorizer), 
                                ('matrix_mul', fn_combine), 
                                ('norm', minmaxscaler), 
                                ('sparse_matrix', fn_transform_sparse)])

# Both branches read the 'tokenized_text' column.
column_transformer = ColumnTransformer(transformers = [('tfidf', tfidf_pipline, 'tokenized_text'),
                                                       ('lm_count', lm_pipeline, 'tokenized_text')])

clf_pipeline_binary = Pipeline(steps = [('preprocessing', column_transformer), 
                                        ('classification', LinearSVC(C= 0.1, class_weight = 'balanced'))])

### cross validation (Monte Carlo CV) / 10 folds

In [None]:
# Re-evaluate the chosen configuration on each of the ten pre-built folds.
folds = ['fold-1', 'fold-2', 'fold-3', 'fold-4', 'fold-5', 'fold-6', 'fold-7', 'fold-8', 'fold-9', 'fold-10']
mccv_score_dict = {'fold':[], 'train_accuracy':[], 'train_f1':[], 'valid_accuracy': [], 'valid_f1': []}

for fold in tqdm(folds):
  X_train_fold, y_train_fold = dict_folds_data[fold]['train']
  X_valid_fold, y_valid_fold = dict_folds_data[fold]['valid']

  # Encode the labels: 'positive' -> 1, anything else -> 0.
  y_train_fold_encoded = y_train_fold['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
  y_valid_fold_encoded = y_valid_fold['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

  clf_pipeline_binary.fit(X_train_fold, y_train_fold_encoded.values)

  # Predict once per split and derive accuracy and F1 from the same predictions;
  # a classifier's score() is the accuracy of predict(), so calling both would
  # repeat the whole transform + predict pass.
  predicted_y_train = clf_pipeline_binary.predict(X_train_fold)
  train_acc = accuracy_score(y_train_fold_encoded.values, predicted_y_train)
  train_f1 = f1_score(y_train_fold_encoded.values, predicted_y_train, average = 'macro')

  predicted_y_valid = clf_pipeline_binary.predict(X_valid_fold)
  valid_acc = accuracy_score(y_valid_fold_encoded.values, predicted_y_valid)
  valid_f1 = f1_score(y_valid_fold_encoded.values, predicted_y_valid, average = 'macro')

  mccv_score_dict['fold'].append(fold)
  mccv_score_dict['train_accuracy'].append(train_acc)
  mccv_score_dict['train_f1'].append(train_f1)
  mccv_score_dict['valid_accuracy'].append(valid_acc)
  mccv_score_dict['valid_f1'].append(valid_f1)

In [None]:
# One row per fold, rebuilt from the score dict each time this cell runs.
df_mccv_result = pd.DataFrame(mccv_score_dict)
# Append a summary row with the mean of every metric; the means are computed
# before the row is added, so they cover the fold rows only.
df_mccv_result.loc[len(df_mccv_result)] = ['average', 
                                           df_mccv_result['train_accuracy'].mean(), 
                                           df_mccv_result['train_f1'].mean(), 
                                           df_mccv_result['valid_accuracy'].mean(), 
                                           df_mccv_result['valid_f1'].mean()]

# Append the standard deviations; iloc[:-1] skips the 'average' row just added
# so that only the fold rows enter the calculation.
df_mccv_result.loc[len(df_mccv_result)] = ['std', 
                                           df_mccv_result.iloc[:-1,1].std(), 
                                           df_mccv_result.iloc[:-1,2].std(), 
                                           df_mccv_result.iloc[:-1,3].std(), 
                                           df_mccv_result.iloc[:-1,4].std()]
df_mccv_result

### confirm the best model of binary label

In [None]:
# Build a fresh pipeline (the instance used in the fold loop above is still fitted on
# the last fold) and fit it on the full chronological training split.
# NOTE(review): max_features = 10000 and C = 0.1 are hard-coded - presumably the
# selected configuration; confirm against the grid-search result.
lm_countvectorizer = CountVectorizer(vocabulary = lm_sent_dict ['Word'])
tfidf_vectorizer = TfidfVectorizer(ngram_range = (1, 2), max_features = 10000)
minmaxscaler = MinMaxScaler()
fn_combine = FunctionTransformer(combine_count_sent)
fn_transform_sparse = FunctionTransformer(csr_matrix)

# Branch 1: TF-IDF features of the cleaned text.
tfidf_pipline = Pipeline(steps = [('tfidf_vectorizer', tfidf_vectorizer)])

# Branch 2: dictionary word counts -> per-category sentiment counts -> min-max scaled -> sparse.
lm_pipeline = Pipeline(steps = [('count_vectorizer', lm_countvectorizer), 
                                ('matrix_mul', fn_combine), 
                                ('norm', minmaxscaler), 
                                ('sparse_matrix', fn_transform_sparse)])

# Both branches read the 'tokenized_text' column.
column_transformer = ColumnTransformer(transformers = [('tfidf', tfidf_pipline, 'tokenized_text'),
                                                       ('lm_count', lm_pipeline, 'tokenized_text')])

clf_pipeline_binary = Pipeline(steps = [('preprocessing', column_transformer), 
                                        ('classification', LinearSVC(C= 0.1, class_weight = 'balanced'))])

# y_train['sentiment'] was already encoded to 0/1 by load_dataset.
clf_pipeline_binary.fit(X_train, y_train['sentiment'].values)

### performance evaluation & visualization (train dataset)

In [None]:
# Accuracy, macro F1 and normalised confusion matrix on the data the model was fitted on.
accuracy_score_train, f1score_train = performance_metrics_binary(model = clf_pipeline_binary, data = X_train, true_y = y_train['sentiment'].values, train_valid = 'train')

### performance evaluation & visualization (valid dataset)

In [None]:
# Same metrics on the held-out 2019 validation split.
accuracy_score_valid, f1score_valid = performance_metrics_binary(model = clf_pipeline_binary, data = X_valid, true_y = y_valid['sentiment'].values, train_valid = 'valid')