## Install / Import Libraries

In [None]:
# Download Language model
! python -m spacy download en_core_web_lg

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import re
import torch
from torch import nn
from torch.utils.data import DataLoader
import torchmetrics
from torchmetrics import Accuracy, F1Score
import scipy
from pathlib import Path
import os
import pickle
from tqdm.auto import tqdm
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from transformers import AutoTokenizer,AutoModel,AutoConfig
from summarizer import Summarizer
from sklearn.preprocessing import OneHotEncoder

## GloVe Embedding Preparation

### check if GloVe embedding file exists

In [None]:
import requests
from pathlib import Path
import zipfile

# Download and unpack the GloVe 6B archive unless the embedding file is already on disk.
path = Path(r'../data')
glove_path = path / 'golve'
glove_zip_path = glove_path / 'golve_6B.zip'
# The loading cell reads this exact file, so test for it rather than for the folder:
# a folder left behind by an interrupted download must not count as "exists".
glove_txt_path = glove_path / 'glove.6B.100d.txt'

if glove_txt_path.is_file():
  print('file exists')
else:
  glove_path.mkdir(parents = True, exist_ok = True)

  # Re-use a complete archive from an earlier run; anything else is downloaded again.
  if not zipfile.is_zipfile(glove_zip_path):
    # Stream the response to disk in chunks instead of buffering the whole archive in memory,
    # and fail on HTTP errors instead of saving an error page as the zip file.
    with requests.get(r'https://nlp.stanford.edu/data/glove.6B.zip', stream = True, timeout = 60) as request:
      request.raise_for_status()
      with open(glove_zip_path, 'wb') as f:
        for chunk in request.iter_content(chunk_size = 1 << 20):
          f.write(chunk)

  with zipfile.ZipFile(glove_zip_path, 'r') as zip_ref:
    zip_ref.extractall(glove_path)
    print('finished downloading')

### load GloVe embedding file

In [None]:
# Parse the GloVe text file: the first whitespace-separated field of every line is the
# token, the remaining fields are its vector components.
vocab, embeddings = [], []

with open('../data/golve/glove.6B.100d.txt', 'r', encoding="utf-8") as file:
  for token, *components in map(str.split, file):
    vocab.append(token)
    embeddings.append(np.asarray(components, "float32"))

### extract word & GloVe embedding

In [None]:
vocab_arr = np.array(vocab)
embs_arr = np.array(embeddings)

# Put the two special tokens at indices 0 and 1. np.concatenate promotes to the wider
# string dtype, whereas np.insert would cast the new tokens to vocab_arr's existing dtype.
vocab_arr = np.concatenate((np.array(['<pad>', '<unk>']), vocab_arr))

# Keep the special rows in the dtype of the loaded vectors so that stacking does not
# upcast the whole embedding matrix.
pad_emb_arr = np.zeros((1, embs_arr.shape[1]), dtype = embs_arr.dtype)   #embedding for '<pad>' token.
unk_emb_arr = np.mean(embs_arr, axis=0, keepdims=True)    #embedding for '<unk>' token.

# Row order matches vocab_arr: '<pad>', '<unk>', then the GloVe vocabulary.
embs_arr = np.vstack((pad_emb_arr, unk_emb_arr, embs_arr))
vocab_dict = {word: idx for idx, word in enumerate(vocab_arr)}

## Helper Functions

### function to tokenize & clean text

In [None]:
def tokenization_clean(text: str, nlp_model) -> str:
  """Tokenize `text` with `nlp_model` and return the cleaned lemmas joined by single spaces.

  Args:
    text: raw text to process.
    nlp_model: callable returning an iterable of tokens that expose `lemma_` and the
      boolean flags tested below (a loaded spaCy pipeline in this notebook).

  Returns:
    Lower-cased lemmas with every character other than word characters and whitespace
    removed. Tokens flagged as stop word, punctuation, number-like, URL-like, whitespace,
    e-mail-like, digit or currency are skipped, as are tokens that end up empty after
    cleaning, so the result never contains blank tokens or doubled spaces.
  """
  doc = nlp_model(text)
  cleaned_tokens = []
  for tok in doc:
    if (tok.is_stop
        or tok.is_punct
        or tok.like_num
        or tok.like_url
        or tok.is_space
        or tok.like_email
        or tok.is_left_punct
        or tok.is_right_punct
        or tok.is_digit
        or tok.is_currency):
      continue

    cleaned = re.sub(r'[^\w\s]', '', tok.lemma_.lower()).strip()
    # A lemma made only of symbols (e.g. '--') is empty here; keeping it would inject
    # a blank token that is later looked up as an unknown word.
    if cleaned:
      cleaned_tokens.append(cleaned)

  return ' '.join(cleaned_tokens)

### function to load news data

In [None]:
def load_dataset(directory: 'str', file_name: 'str', fn_tokenization_clean: 'function', 
                 nlp_model, training_data = True):
  """Read an article CSV, tokenize its text and return features plus binary sentiment labels.

  Args:
    directory: folder containing the CSV.
    file_name: name of the CSV file; it must provide 'timestamp', 'text' and 'sentiment'.
    fn_tokenization_clean: callable `(text, nlp_model) -> str` applied to every 'text' value.
    nlp_model: passed through to `fn_tokenization_clean`.
    training_data: when truthy, split by timestamp into a training part (before 2019)
      and a validation part (2019); otherwise return the whole file.

  Returns:
    `(X_train, X_valid, y_train, y_valid)` when `training_data` is truthy, else `(X, y)`.
    X frames hold 'timestamp', 'text', 'tokenized_text'; y frames hold 'timestamp' and
    'sentiment', encoded as 1 for 'positive' and 0 for anything else.
  """
  tqdm.pandas()
  data = pd.read_csv(Path(directory) / file_name)

  data['tokenized_text'] = data['text'].progress_apply(lambda x: fn_tokenization_clean(x, nlp_model))

  feature_cols = ['timestamp', 'text', 'tokenized_text']

  def _select(mask):
    # Slice features and encoded labels for the rows picked by `mask`.
    X = data.loc[mask, feature_cols].copy()
    y = data.loc[mask, ['timestamp', 'sentiment']].copy()
    y['sentiment'] = y['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
    return X, y

  if training_data:
    timestamps = data['timestamp']
    # Exclusive upper bounds: with string comparison, '2018-12-31 10:00' sorts after
    # '2018-12-31', so inclusive '<= YYYY-12-31' bounds would drop rows from the last
    # day of the year whenever the timestamp carries a time component.
    # NOTE(review): assumes ISO-like 'YYYY-MM-DD...' timestamp strings - confirm against the CSV.
    X_train, y_train = _select(timestamps < '2019-01-01')
    X_valid, y_valid = _select((timestamps >= '2019-01-01') & (timestamps < '2020-01-01'))
    return X_train, X_valid, y_train, y_valid

  X, y = _select(slice(None))
  return X, y

### function to load a fold dataset

In [None]:
def load_fold_dataset(directory: 'str', 
                      file_name: 'str', 
                      fn_tokenization_clean: 'function', 
                      nlp_model):
  """Read one fold CSV and return `(frame, labels)`.

  The frame is the CSV content plus a 'tokenized_text' column (the result of
  `fn_tokenization_clean(text, nlp_model)` for every 'text' value) with 'timestamp'
  parsed to datetime. `labels` is a copy of the un-encoded 'sentiment' column as a
  one-column DataFrame.
  """
  tqdm.pandas()
  fold_frame = pd.read_csv(Path(directory) / file_name)

  def _clean(raw_text):
    return fn_tokenization_clean(raw_text, nlp_model)

  fold_frame['tokenized_text'] = fold_frame['text'].progress_apply(_clean)
  fold_frame['timestamp'] = pd.to_datetime(fold_frame['timestamp'])
  return fold_frame, fold_frame[['sentiment']].copy()

### function to prepare 10 folds of dataset

In [None]:
def prepare_folds_dataset(cv_path: 'str', multiple_label = True):
  """Load the train/valid CSV of every fold directory under `cv_path`.

  Args:
    cv_path: directory whose sub-directories are named after folds ('fold-1' ... 'fold-10').
      Inside each, a file with 'train' in its name is the training split and a file with
      'valid' in its name is the validation split.
    multiple_label: kept for backward compatibility and not used; `load_fold_dataset`
      takes no such argument, and forwarding it raised a TypeError.

  Returns:
    dict mapping fold name -> {'train': (X, y) or None, 'valid': (X, y) or None}. The ten
    standard folds are always present; a split stays None when no matching file is found.

  Relies on the notebook-level `nlp_model` and `tokenization_clean`.
  """
  cv_path = Path(cv_path)
  fold_dataset = {f'fold-{k}': {'train': None, 'valid': None} for k in range(1, 11)}

  for fold_dir in cv_path.iterdir():
    # Stray files next to the fold folders cannot be iterated; skip them.
    if not fold_dir.is_dir():
      continue

    for data_file in fold_dir.iterdir():
      if 'train' in data_file.name:
        split = 'train'
      elif 'valid' in data_file.name:
        split = 'valid'
      else:
        continue

      loaded = load_fold_dataset(directory = fold_dir,
                                 file_name = data_file.name,
                                 fn_tokenization_clean = tokenization_clean,
                                 nlp_model = nlp_model)

      # setdefault also accepts fold folders beyond the ten pre-registered names.
      fold_dataset.setdefault(fold_dir.name, {'train': None, 'valid': None})[split] = loaded

  return fold_dataset

### function to combine datasets

In [None]:
def combine_onehotencoder(data_original, data_encoder):
  """Concatenate both frames column-wise and return the result without the
  'text', 'sentiment' and 'topics' columns. Neither input frame is modified."""
  merged = pd.concat([data_original, data_encoder], axis = 1)
  return merged.drop(columns = ['text', 'sentiment', 'topics'])

### mapping word to index of embedding

In [None]:
def word2idx(data: 'DataFrame', target: 'DataFrame', max_length, vocab = None):
  """Convert tokenized text into fixed-length lists of vocabulary indices.

  Args:
    data: frame with a 'tokenized_text' column of whitespace-separated tokens.
    target: frame with a 'sentiment' column.
    max_length: length of every returned index list; longer texts are truncated,
      shorter ones are right-padded with the '<pad>' index.
    vocab: token -> index mapping containing '<pad>' and '<unk>'. Defaults to the
      notebook-level `vocab_dict`.

  Returns:
    `(index_lists, labels)`: one list of `max_length` ints per row of `data`, and
    `target['sentiment'].values`.
  """
  if vocab is None:
    vocab = vocab_dict
  pad_id, unk_id = vocab['<pad>'], vocab['<unk>']

  tok_word_id_list = []
  for tok in data['tokenized_text']:
    # split() without a separator ignores empty fields, so an empty text or a doubled
    # space does not produce '' tokens that would be looked up as '<unk>'.
    tok_word_id = [vocab.get(word, unk_id) for word in tok.split()][:max_length]
    tok_word_id.extend([pad_id] * (max_length - len(tok_word_id)))
    tok_word_id_list.append(tok_word_id)

  labels = target['sentiment'].values
  return tok_word_id_list, labels

### datasets & dataloader

In [None]:
class Word_Embedding_Dataset():
    """Index-addressable pairing of `data` entries with the `targets` entry at the same position."""

    def __init__(self, data, targets):
        self.targets = targets
        self.data = data

    def __getitem__(self, idx):
        # Hand back copies of the indexed entries rather than the stored objects.
        sample = self.data[idx].copy()
        label = self.targets[idx].copy()
        return sample, label

    def __len__(self):
        # The length follows `data`.
        return len(self.data)

#############################################################
def emb_make_loader(X, Y, shuffle=False,batch_size=400, drop = True):
    """Wrap index sequences `X` and labels `Y` in a DataLoader.

    `X` gets a trailing axis of size 1 before being stored in a Word_Embedding_Dataset.
    `shuffle` and `batch_size` are forwarded to the DataLoader; `drop` is forwarded as
    `drop_last`.
    """
    dataset = Word_Embedding_Dataset(data=np.expand_dims(X, -1), targets=Y)
    return DataLoader(dataset, batch_size=batch_size, shuffle = shuffle, drop_last = drop)
#############################################################

## Load data

In [None]:
# Load the 'en_core_web_lg' spaCy pipeline; it is handed to the tokenization helper as nlp_model.
nlp_model = spacy.load('en_core_web_lg')

### prepare X and y

In [None]:
# Tokenize the article CSV and split it by timestamp into training and validation
# features (X_*) and binary sentiment labels (y_*).
X_train, X_valid, y_train, y_valid = load_dataset(directory = '../data/GloVe Model/News Article Text File', 
                                                  file_name = 'articles_2015_2019.csv', 
                                                  fn_tokenization_clean = tokenization_clean, 
                                                  nlp_model = nlp_model,
                                                  training_data = True)

In [None]:
# Map every tokenized article to a list of 200 vocabulary indices (truncated or padded)
# and pull out the sentiment labels as arrays.
X_train_idx, y_train_1 = word2idx(data = X_train, target = y_train, max_length = 200)
X_valid_idx, y_valid_1 = word2idx(data = X_valid, target = y_valid, max_length = 200)

In [None]:
# Batches of 200 samples in dataset order (shuffle = False); an incomplete final batch
# is discarded (drop = True).
dataloader_all_train = emb_make_loader(X_train_idx, y_train_1, shuffle = False, batch_size=200, drop = True)
dataloader_all_valid = emb_make_loader(X_valid_idx, y_valid_1, shuffle = False, batch_size=200, drop = True)

### prepare 10 folds of dataset

In [None]:
folds_data_path = Path(r'../data/GloVe Model/Intermediate Output/dict_folds_data_text_only.pickle')

if folds_data_path.is_file():
  # NOTE: pickle.load can execute arbitrary code; only load a cache file you produced yourself.
  with open(folds_data_path, 'rb') as f_1:
    dict_folds_data = pickle.load(f_1)

else:
  dict_folds_data = prepare_folds_dataset(cv_path = r'../data/GloVe Model/Cross Validation_fold_data')

  # Store the tokenized folds so that the next run takes the cached branch above
  # instead of tokenizing every fold again.
  folds_data_path.parent.mkdir(parents = True, exist_ok = True)
  with open(folds_data_path, 'wb') as f_1:
    pickle.dump(dict_folds_data, f_1)

## LSTM Model

In [None]:
class LSTMClassifier_2(nn.Module):
  """Frozen pretrained embeddings -> four parallel convolutions -> bidirectional LSTM -> linear head.

  The input is a batch of index sequences of shape (batch, seq_len, 1). Each sequence is
  embedded, passed through four Conv2d layers whose kernels span 1, 2, 3 and 4 consecutive
  tokens over the full embedding width, max-pooled by 3 along the token axis and squashed
  with tanh. The four results are concatenated along the token axis and fed, one scalar
  per step, to a bidirectional LSTM; its flattened outputs go through dropout, tanh and
  a linear layer producing `output` logits per sample.

  Args:
    emb_size, emb_dim: kept for backward compatibility; the embedding table takes its
      shape from `emb_arr`.
    batch: nominal batch size, stored as `batch_size` and used as the default of
      `init_hidden`. `forward` accepts any batch size.
    emb_arr: 2-D array-like of pretrained embedding vectors, row 0 being the padding row.
    input_dim: LSTM input size; the convolution stack yields one feature per step, so 1.
    hidden_dim: LSTM hidden size per direction.
    n_layer: number of stacked LSTM layers.
    dropout: dropout probability inside the LSTM and on its output.
    output: number of output logits.
    device: device on which layers and states are created.
    seq_len: number of tokens per input sequence (default 200); determines the input
      width of the linear head.
  """
  def __init__(self, emb_size, emb_dim, batch, emb_arr, input_dim, hidden_dim, n_layer, dropout, output, device, seq_len = 200):
    super(LSTMClassifier_2, self).__init__()
    if seq_len < 6:
      # The widest kernel (4 tokens) must leave at least 3 positions for the pooling window.
      raise ValueError(f'seq_len must be at least 6, got {seq_len}')

    self.hidden_size = hidden_dim
    self.num_layers = n_layer
    self.batch_size = batch
    self.device = device

    emb_arr = torch.as_tensor(emb_arr).clone().detach().to(dtype = torch.float, device = device)
    # from_pretrained is a classmethod: call it on the class and pass padding_idx here,
    # instead of building a randomly initialised nn.Embedding that is thrown away.
    self.word_embedding = nn.Embedding.from_pretrained(emb_arr, freeze = True, padding_idx = 0)
    emb_width = emb_arr.shape[1]

    # Kernel height = number of consecutive tokens; kernel width = full embedding vector.
    self.conv_1 = nn.Conv2d(in_channels = 1, 
                            out_channels = 1, 
                            kernel_size = (1, emb_width), 
                            stride = 1, 
                            device = device)
    self.pool_1 = nn.MaxPool2d(kernel_size = (3, 1))
    self.tanh_1 = nn.Tanh()

    self.conv_2 = nn.Conv2d(in_channels = 1, 
                            out_channels = 1, 
                            kernel_size = (2, emb_width), 
                            stride = 1,
                            device = device)
    self.pool_2 = nn.MaxPool2d(kernel_size = (3, 1))
    self.tanh_2 = nn.Tanh()

    self.conv_3 = nn.Conv2d(in_channels = 1, 
                            out_channels = 1, 
                            kernel_size = (3, emb_width), 
                            stride = 1,
                            device = device)
    self.pool_3 = nn.MaxPool2d(kernel_size = (3, 1))
    self.tanh_3 = nn.Tanh()

    self.conv_4 = nn.Conv2d(in_channels = 1, 
                            out_channels = 1, 
                            kernel_size = (4, emb_width), 
                            stride = 1,
                            device = device)
    self.pool_4 = nn.MaxPool2d(kernel_size = (3, 1))
    self.tanh_4 = nn.Tanh()

    self.lstm = nn.LSTM(input_dim, hidden_dim, n_layer, dropout = dropout, batch_first = True, bidirectional = True).to(device)
    self.dropout = nn.Dropout(dropout)
    self.tanh_5 = nn.Tanh()
    self.flatten = nn.Flatten()

    # A kernel of height k leaves seq_len - k + 1 positions; pooling by 3 floors that to a
    # third. For seq_len = 200 this is 66 + 66 + 66 + 65 = 263 steps.
    pooled_len = sum((seq_len - k + 1) // 3 for k in (1, 2, 3, 4))
    self.fc1 = nn.Linear(pooled_len*hidden_dim*2, output, device = device)

  def init_hidden(self, batch_size = None):
    """Return zero-filled (h0, c0) for `batch_size` sequences (default: `self.batch_size`)."""
    if batch_size is None:
      batch_size = self.batch_size
    state_shape = (self.num_layers*2, batch_size, self.hidden_size)   # *2: bidirectional
    h0 = torch.zeros(state_shape, dtype = torch.float, device = self.device)
    c0 = torch.zeros(state_shape, dtype = torch.float, device = self.device)

    return h0, c0
  
  def forward(self, x):
    """Map index sequences of shape (batch, seq_len, 1) to logits of shape (batch, output)."""
    x = x.to(self.device).type(torch.int)
    x = torch.swapaxes(x, 1, 2)                       # (batch, 1, seq_len)

    x_emb = self.word_embedding(x).type(torch.float)  # (batch, 1, seq_len, emb_width)

    out_1 = self.tanh_1(self.pool_1(self.conv_1(x_emb)))
    out_2 = self.tanh_2(self.pool_2(self.conv_2(x_emb)))
    out_3 = self.tanh_3(self.pool_3(self.conv_3(x_emb)))
    out_4 = self.tanh_4(self.pool_4(self.conv_4(x_emb)))

    out = torch.concat([out_1, out_2, out_3, out_4], dim = 2)   # (batch, 1, pooled_len, 1)

    # Drop only the channel axis; a bare squeeze() would also remove a batch axis of size 1.
    out = out.squeeze(1)                              # (batch, pooled_len, 1)

    # Size the zero initial state from the actual batch so partial batches work.
    h0, c0 = self.init_hidden(out.size(0))
    output, (hn, cn) = self.lstm(out, (h0, c0))
    output = self.dropout(output)
    output = self.tanh_5(output)
    output = self.flatten(output)
    output1 = self.fc1(output)
    return output1

def logit_to_prob(logit):
  """Cast `logit` to float, remove all size-1 dimensions and apply the sigmoid."""
  return torch.sigmoid(logit.type(torch.float).squeeze())

def prob_to_label(prob):
  """Remove all size-1 dimensions from `prob` and round each value to the nearest integer."""
  return torch.round(prob.squeeze())

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

def metric_accuracy_f1(y_pred_label_tuple, y, device):
    """Return `(accuracy, macro F1)` of predicted labels against `y`.

    Both tensors are detached and moved to the CPU before scoring; `device` is not used.
    """
    y_true = y.detach().cpu()
    y_hat = y_pred_label_tuple.detach().cpu()
    return accuracy_score(y_true, y_hat), f1_score(y_true, y_hat, average = 'macro')

def train_step(dataloader, model, logit_to_prob_fn, prob_to_label_fn, metric_fn, optimizer, device):
  """Train `model` for one pass over `dataloader` and return the mean batch metrics.

  Args:
    dataloader: sized iterable of `(X, y)` batches.
    model: module mapping `X` to one logit per sample.
    logit_to_prob_fn: callable turning logits into probabilities.
    prob_to_label_fn: callable turning probabilities into hard labels.
    metric_fn: callable `(pred_labels, y, device) -> (accuracy, f1)`.
    optimizer: optimizer over `model`'s parameters.
    device: device on which the batches and the model are placed.

  Returns:
    dict with 'train_loss' (0-dim tensor detached from the graph), 'train_accuracy'
    and 'train_f1', each averaged over the batches.

  Raises:
    ValueError: if `dataloader` yields no batches.
  """
  n_batches = len(dataloader)
  if n_batches == 0:
    raise ValueError('dataloader contains no batches; the metrics cannot be averaged')

  model = model.to(device)
  model.train()
  bceloss = nn.BCEWithLogitsLoss().to(device)
  train_loss, train_accuracy, train_f1 = 0, 0, 0

  for X, y in dataloader:
    X, y = X.to(device), y.to(device)
    y_pred_logit = model(X)
    # Use the converters passed by the caller rather than notebook-level globals.
    y_pred_label = prob_to_label_fn(logit_to_prob_fn(y_pred_logit))
    # Shape the logits like the targets; a bare squeeze() would also collapse a batch of one.
    batch_loss = bceloss(y_pred_logit.reshape(y.shape), y.type(torch.float))
    # Accumulate a detached copy so that each batch's autograd graph can be released.
    train_loss = train_loss + batch_loss.detach()

    batch_accuracy, batch_f1 = metric_fn(y_pred_label, y, device)
    train_accuracy += batch_accuracy
    train_f1 += batch_f1

    # zero gradient
    optimizer.zero_grad()

    # backpropagation
    batch_loss.backward()

    # Optimize parameters
    optimizer.step()
  
  train_loss /= n_batches
  train_accuracy /= n_batches
  train_f1 /= n_batches

  return {'train_loss': train_loss, 'train_accuracy': train_accuracy, 'train_f1': train_f1}


def test_step(dataloader, model, logit_to_prob_fn, prob_to_label_fn, metric_fn, device):
  model = model.to(device)
  model.eval()

  with torch.inference_mode():
    bceloss = nn.BCEWithLogitsLoss().to(device)
    test_loss, test_accuracy, test_f1 = 0, 0, 0

    for batch, (X, y) in enumerate(dataloader):
      X, y = X.to(device), y.to(device)
      y_pred_logit_tuple = model(X)
      y_pred_prob_tuple = logit_to_prob(y_pred_logit_tuple).to(device)
      y_pred_label_tuple = prob_to_label(y_pred_prob_tuple).to(device)
      batch_loss = bceloss(y_pred_logit_tuple.squeeze(), y.type(torch.float)).to(device)
      test_loss += batch_loss

      batch_test_accuracy, batch_test_f1 = metric_fn(y_pred_label_tuple, y, device)
      test_accuracy += batch_test_accuracy
      test_f1 += batch_test_f1
    
    test_loss /= len(dataloader)
    test_accuracy /= len(dataloader)
    test_f1 /= len(dataloader)
  return {'test_loss': test_loss, 'test_accuracy': test_accuracy, 'test_f1': test_f1}

In [None]:
# Build the classifier on the GPU when one is available, otherwise on the CPU.
# batch = 200 matches the batch_size of the DataLoaders created earlier.
# NOTE(review): the model is constructed before torch.manual_seed(42) is called in the
# training cell, so its initial weights are not governed by that seed.
emb_size = len(vocab_dict)
emb_dim = embs_arr.shape[1]
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = LSTMClassifier_2(emb_size = emb_size, emb_dim = emb_dim, batch = 200, emb_arr = embs_arr, 
                       input_dim = 1, hidden_dim = 10, n_layer = 10, dropout = 0.2, output = 1, device = device).to(device)


In [None]:
from tqdm.auto import tqdm
epochs = 20
torch.manual_seed(42)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
optimizer = torch.optim.Adam(params = model.parameters(), lr = 0.0006)

# Per-epoch history: one list per metric, one entry appended per epoch.
complete_record = {name: [] for name in ('train_loss', 'valid_loss',
                                         'train_accuracy', 'valid_accuracy',
                                         'train_f1', 'valid_f1')}

# Arguments shared by the training pass and the validation pass.
shared_step_args = dict(model = model,
                        logit_to_prob_fn = logit_to_prob,
                        prob_to_label_fn = prob_to_label,
                        metric_fn = metric_accuracy_f1,
                        device = device)

for epoch in tqdm(range(epochs)):
  print(f'Epoch: {epoch} -----------------')
  train_output = train_step(dataloader = dataloader_all_train, optimizer = optimizer, **shared_step_args)
  test_output = test_step(dataloader = dataloader_all_valid, **shared_step_args)

  # The losses come back as tensors and are stored as plain floats.
  complete_record['train_loss'].append(train_output['train_loss'].item())
  complete_record['valid_loss'].append(test_output['test_loss'].item())
  for metric_name in ('accuracy', 'f1'):
    complete_record[f'train_{metric_name}'].append(train_output[f'train_{metric_name}'])
    complete_record[f'valid_{metric_name}'].append(test_output[f'test_{metric_name}'])

  print(f'train_loss: {train_output["train_loss"]: .5f}, test_lost: {test_output["test_loss"]: .5f}, train_acc: {train_output["train_accuracy"]: .2f}, test_acc: {test_output["test_accuracy"]: .2f}, train_f1: {train_output["train_f1"]: .5f}, test_f1: {test_output["test_f1"]: .5f}') 

In [None]:
pd.DataFrame(complete_record)

### 10-fold Monte Carlo CV

In [None]:
folds = ['fold-1', 'fold-2', 'fold-3', 'fold-4', 'fold-5', 'fold-6', 'fold-7', 'fold-8', 'fold-9', 'fold-10']
mccv_score_dict = dict.fromkeys(folds)

for fold in folds:
  print(f'{fold}:')
  X_train_fold, y_train_fold = dict_folds_data[fold]['train']
  X_valid_fold, y_valid_fold = dict_folds_data[fold]['valid']

  # The fold labels are stored un-encoded; map 'positive' to 1 and everything else to 0.
  y_train_fold_encoded = y_train_fold['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
  y_valid_fold_encoded = y_valid_fold['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

  y_train_fold_encoded = pd.DataFrame(y_train_fold_encoded, columns = ['sentiment'])
  y_valid_fold_encoded = pd.DataFrame(y_valid_fold_encoded, columns = ['sentiment'])

  # Fold-specific names so that the full-data arrays (y_train_1 / y_valid_1) are not overwritten.
  X_train_fold_idx, y_train_fold_1 = word2idx(data = X_train_fold, target = y_train_fold_encoded, max_length = 200)
  X_valid_fold_idx, y_valid_fold_1 = word2idx(data = X_valid_fold, target = y_valid_fold_encoded, max_length = 200)
  dataloader_fold_train = emb_make_loader(X_train_fold_idx, y_train_fold_1, shuffle = False, batch_size=200, drop = True)
  dataloader_fold_valid = emb_make_loader(X_valid_fold_idx, y_valid_fold_1, shuffle = False, batch_size=200, drop = True)

  # Seed before the model is built so that every fold starts from the same initial weights.
  torch.manual_seed(42)
  emb_size = len(vocab_dict)
  emb_dim = embs_arr.shape[1]
  device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
  model = LSTMClassifier_2(emb_size = emb_size, emb_dim = emb_dim, batch = 200, emb_arr = embs_arr, 
                        input_dim = 1, hidden_dim = 10, n_layer = 10, dropout = 0.2, output = 1, device = device).to(device)

  epochs = 20
  optimizer = torch.optim.Adam(params = model.parameters(), lr = 0.0006)
  complete_record = {'train_loss': [], 'valid_loss': [], 
                    'train_accuracy': [], 'valid_accuracy': [], 
                    'train_f1': [], 'valid_f1': []}

  for epoch in tqdm(range(epochs)):
    # Train and validate on THIS fold's loaders. Using the full-data loaders here makes
    # all ten folds score the same split and the cross-validation meaningless.
    train_output = train_step(dataloader = dataloader_fold_train, 
                              model = model, 
                              logit_to_prob_fn = logit_to_prob, 
                              prob_to_label_fn = prob_to_label, 
                              metric_fn = metric_accuracy_f1,
                              optimizer = optimizer, 
                              device = device)
    
    test_output = test_step(dataloader = dataloader_fold_valid, 
                            model = model, 
                            logit_to_prob_fn = logit_to_prob, 
                            prob_to_label_fn = prob_to_label,
                            metric_fn = metric_accuracy_f1,
                            device = device)
    complete_record['train_loss'].append(train_output['train_loss'].item())
    complete_record['valid_loss'].append(test_output['test_loss'].item())
    complete_record['train_accuracy'].append(train_output['train_accuracy'])
    complete_record['valid_accuracy'].append(test_output['test_accuracy'])
    complete_record['train_f1'].append(train_output['train_f1'])
    complete_record['valid_f1'].append(test_output['test_f1'])
  
  # Per-epoch history of this fold.
  mccv_score_dict[fold] = complete_record



In [None]:
# Summarize the cross-validation: one row per fold holding the metrics of its last epoch,
# followed by an 'average' row and a 'std' row.
pd.set_option('display.precision', 10)

folds = ['fold-1', 'fold-2', 'fold-3', 'fold-4', 'fold-5', 'fold-6', 'fold-7', 'fold-8', 'fold-9', 'fold-10']

df_extracted = pd.DataFrame({'train_loss': [], 'valid_loss': [], 
                             'train_accuracy': [], 'valid_accuracy': [], 
                             'train_f1': [], 'valid_f1': []})

for fold in folds:
  df = pd.DataFrame(mccv_score_dict[fold])
  # Last row of the per-epoch history = metrics after the final epoch of this fold.
  df_extracted.loc[len(df_extracted)] = df.iloc[-1,:].to_dict().values()

df_extracted['folds'] = folds
# Means are taken while the frame still contains only the fold rows.
df_extracted.loc[len(df_extracted)] = [df_extracted['train_loss'].mean(), df_extracted['valid_loss'].mean(), 
                                       df_extracted['train_accuracy'].mean(), df_extracted['valid_accuracy'].mean(), 
                                       df_extracted['train_f1'].mean(), df_extracted['valid_f1'].mean(), 'average']

# iloc[:-1] leaves out the 'average' row just appended, so the std covers the fold rows only.
df_extracted.loc[len(df_extracted)] = [df_extracted.iloc[:-1,0].std(), df_extracted.iloc[:-1,1].std(), 
                                       df_extracted.iloc[:-1,2].std(), df_extracted.iloc[:-1,3].std(), 
                                       df_extracted.iloc[:-1,4].std(), df_extracted.iloc[:-1,5].std(), 'std']

# Show the 'folds' label column first.
changed_column = ['folds'] + list(df_extracted.columns[:-1])
df_extracted[changed_column]                                  

### confirm the best model

#### load/ prepare data

In [None]:
# Rebuild the train/validation split, index sequences and DataLoaders with the same file
# and arguments as in the 'Load data' section.
# NOTE(review): this tokenizes the whole CSV a second time; the objects built in the
# 'Load data' section could presumably be reused if they are still in memory - confirm.
X_train, X_valid, y_train, y_valid = load_dataset(directory = '../data/GloVe Model/News Article Text File', 
                                                  file_name = 'articles_2015_2019.csv', 
                                                  fn_tokenization_clean = tokenization_clean, 
                                                  nlp_model = nlp_model,
                                                  training_data = True)

X_train_idx, y_train_1 = word2idx(data = X_train, target = y_train, max_length = 200)
X_valid_idx, y_valid_1 = word2idx(data = X_valid, target = y_valid, max_length = 200)

dataloader_all_train = emb_make_loader(X_train_idx, y_train_1, shuffle = False, batch_size=200, drop = True)
dataloader_all_valid = emb_make_loader(X_valid_idx, y_valid_1, shuffle = False, batch_size=200, drop = True)

In [None]:
from tqdm.auto import tqdm

# Seed before the model is built so that the initial weights are reproducible as well.
torch.manual_seed(42)

emb_size = len(vocab_dict)
emb_dim = embs_arr.shape[1]
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model_final = LSTMClassifier_2(emb_size = emb_size, emb_dim = emb_dim, batch = 200, emb_arr = embs_arr,
                               input_dim = 1, hidden_dim = 10, n_layer = 10, dropout = 0.2, output = 1, device = device).to(device)

epochs = 20
optimizer = torch.optim.Adam(params = model_final.parameters(), lr = 0.0006)
complete_record_final = {'train_loss': [], 'valid_loss': [],
                         'train_accuracy': [], 'valid_accuracy': [],
                         'train_f1': [], 'valid_f1': []}

for epoch in tqdm(range(epochs)):
  print(f'Epoch: {epoch} -----------------')
  train_output = train_step(dataloader = dataloader_all_train, 
                            model = model_final, 
                            logit_to_prob_fn = logit_to_prob, 
                            prob_to_label_fn = prob_to_label, 
                            metric_fn = metric_accuracy_f1,
                            optimizer = optimizer, 
                            device = device)
  
  test_output = test_step(dataloader = dataloader_all_valid, 
                          model = model_final, 
                          logit_to_prob_fn = logit_to_prob, 
                          prob_to_label_fn = prob_to_label,
                          metric_fn = metric_accuracy_f1,
                          device = device)

  # Fill the history that was declared above but never written to.
  complete_record_final['train_loss'].append(train_output['train_loss'].item())
  complete_record_final['valid_loss'].append(test_output['test_loss'].item())
  complete_record_final['train_accuracy'].append(train_output['train_accuracy'])
  complete_record_final['valid_accuracy'].append(test_output['test_accuracy'])
  complete_record_final['train_f1'].append(train_output['train_f1'])
  complete_record_final['valid_f1'].append(test_output['test_f1'])

### performance evaluation & confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix
np.random.seed(42)

# Evaluate the model trained in the 'confirm the best model' section. The name `model`
# still refers to the network of the last cross-validation fold.
model_final.eval()
with torch.inference_mode():
  pred_y_batch_list = []
  true_y_batch_list = []

  for X, y in dataloader_all_valid:
    logit_validate = model_final(X)
    probability_validate = logit_to_prob(logit_validate)
    label_validation = prob_to_label(probability_validate)

    # Collect on the CPU so that predictions and labels can be handed to scikit-learn.
    pred_y_batch_list.append(label_validation.cpu())
    true_y_batch_list.append(y)
  
  pred_y_batch_all = torch.cat(pred_y_batch_list, dim = 0)
  true_y_batch_all = torch.cat(true_y_batch_list, dim = 0)

  # normalize = 'true': each row (true class) sums to 1.
  confusion_matrix_result = confusion_matrix(true_y_batch_all, pred_y_batch_all.to('cpu'), normalize = 'true')

In [None]:
import seaborn as sns

# Draw the confusion matrix (normalized over the true labels when it was computed) as an
# annotated heatmap; rows are true labels, columns are predicted labels.
sns.set(font_scale=1.2)
plt.figure(figsize = (4, 4))
confusion_matrix_result_heatmap = sns.heatmap(confusion_matrix_result, 
                                              cmap="Blues", 
                                              annot = True, 
                                              fmt=".2f", annot_kws={'size': 15}, 
                                              xticklabels=['Negative', 'Positive'], 
                                              yticklabels=['Negative', 'Positive'])

confusion_matrix_result_heatmap.set(xlabel='Predicted Label', ylabel='True Label', title = 'Sentiment')
# Overall accuracy and macro-averaged F1 on the collected validation predictions.
acc_score = accuracy_score(true_y_batch_all, pred_y_batch_all.to('cpu'))
f1_score_macro = f1_score(true_y_batch_all, pred_y_batch_all.to('cpu'), average = 'macro')
plt.show()
print(f'\nvalid accuracy score: {acc_score}, valid f1 score: {f1_score_macro}')