# Step 3A: BERT Model 1 - BERT sentiment analysis using US airline sentiment dataset (equalized sentiment count)

## 1. Required imports

### 1.1 Import required libraries

In [1]:
#---Neural Network libraries
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup            #huggingface transformers library
import tensorflow as tf                                                                                                  #tensorflow library         
import torch                                                                                                             #pytorch library
import torch.nn.functional as Func
import gc
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler                        #for loading data into our model

#---Data processing
from sklearn.model_selection import train_test_split                                                                     #for splitting data into training, testing and validation
from sklearn.metrics import f1_score
import pandas as pd                                                                                                      #for using data in the form of dataframes
import numpy as np
import re                                                                                                                #for data manipulation when cleaning datasets
import os

#---Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [2]:
import ipywidgets as widgets
widgets.IntSlider()

IntSlider(value=0)

### 1.2 Import training data to train the BERT model
- *This dataset is from kaggle - https://www.kaggle.com/crowdflower/twitter-airline-sentiment?select=Tweets.csv*
- *We only pick the required columns from the dataset*

In [4]:
training_data = pd.read_csv(r"C:\Users\Karthik\Desktop\Dissertation\BERT\Training dataset\Tweets.csv")
training_data = training_data[['tweet_id', 'airline_sentiment', 'text']]
training_data.set_index('tweet_id', inplace=True)
training_data.head()

Unnamed: 0_level_0,airline_sentiment,text
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1
570306133677760513,neutral,@VirginAmerica What @dhepburn said.
570301130888122368,positive,@VirginAmerica plus you've added commercials t...
570301083672813571,neutral,@VirginAmerica I didn't today... Must mean I n...
570301031407624196,negative,@VirginAmerica it's really aggressive to blast...
570300817074462722,negative,@VirginAmerica and it's a really big bad thing...


### 1.3 Import reddit and twitter data for which we have to populate the sentiments
- *we will use them on the model after the model is trained with the training dataset from kaggle*
- *we save data in two dictionaries; one for reddit and one for twitter to make it more organized*

In [3]:
tick_list =  ['AMC', 'DKNG', 'TSLA', 'AMD', 'BABA']

In [4]:
reddit_df_dict = {tick: pd.read_pickle(f"C:\\Users\\Karthik\\Desktop\\Dissertation\\Reddit\\consolidated_pickle_files\\reddit_{tick}_df_for_BERT.pkl") for tick in tick_list}
# reddit_df_dict  

In [5]:
reddit_df_dict['TSLA']

Unnamed: 0,body,created_utc,id,top,year,month,day,ticker,YearMonDay
0,Non-troll post. I started 2 weeks ago and have...,1609500199,ghp72zs,top,21,Jan,01,TSLA,21Jan01
1,we eat cornbread on new years day to ensure a ...,1609524808,ghqo6qb,top,21,Jan,01,TSLA,21Jan01
2,TSLA 850 EOD,1612203941,glmp0gv,top,21,Feb,01,TSLA,21Feb01
3,joe weisenthal is the ultimate chad( who else ...,1612207510,glmyg66,top,21,Feb,01,TSLA,21Feb01
4,TSLA 🚀🚀🚀🚀🚀,1612211972,gln9v98,top,21,Feb,01,TSLA,21Feb01
...,...,...,...,...,...,...,...,...,...
15125,TSLA Drill Team 6 reporting for duty,1609359381,ghj65jr,top,20,Dec,30,TSLA,20Dec30
15126,TSLA’s still having them TSLA days I see. Join...,1609359393,ghj66f1,top,20,Dec,30,TSLA,20Dec30
15127,TSLA 🎰 EOY coming,1609360555,ghj8jam,top,20,Dec,30,TSLA,20Dec30
15128,Thank fuck I didn't dump my TSLA calls yesterday,1609361591,ghjamup,top,20,Dec,30,TSLA,20Dec30


In [6]:
twitter_df_dict = {tick: pd.read_pickle(f"C:\\Users\\Karthik\\Desktop\\Dissertation\\Twitter\\consolidated_pickle_files\\twitter_{tick}_df_for_BERT.pkl") for tick in tick_list}

In [7]:
twitter_df_dict['TSLA']

Unnamed: 0,date,content,likeCount,lang,cleaned_content,year,month,day,ticker,YearMonDay
0,2021-09-18 19:56:59+00:00,@The_RockTrading Bullish on $TSLA this week &a...,1,en,Bullish on $TSLA this week & $Aapl,21,Sep,18,TSLA,21Sep18
1,2021-09-18 04:16:52+00:00,$TSLA now is the same as $aapl in the 80s ! @e...,1,en,$TSLA now is the same as $aapl in the 80s !,21,Sep,18,TSLA,21Sep18
2,2021-09-16 19:01:28+00:00,"Added more $TSLA and $aapl to long, because I ...",0,en,"Added more $TSLA and $aapl to long, because I ...",21,Sep,16,TSLA,21Sep16
3,2021-09-16 15:23:53+00:00,@NeilRog49855230 @Gays4Tesla @TheMaverickWS Th...,3,en,There is plenty of information available on-li...,21,Sep,16,TSLA,21Sep16
4,2021-09-16 15:23:11+00:00,There is plenty of information available on-li...,0,en,There is plenty of information available on-li...,21,Sep,16,TSLA,21Sep16
...,...,...,...,...,...,...,...,...,...,...
179489,2020-06-01 18:42:22+00:00,Normally volatile Tesla $TSLA is the IBD Stock...,1,en,Normally volatile Tesla $TSLA is the IBD Stock...,20,Jun,01,TSLA,20Jun01
179490,2020-06-01 18:42:10+00:00,Normally volatile Tesla $TSLA is the IBD Stock...,1,en,Normally volatile Tesla $TSLA is the IBD Stock...,20,Jun,01,TSLA,20Jun01
179491,2020-06-01 18:41:35+00:00,$TSLA up $51.00 from next suggested buy entry ...,1,en,$TSLA up $51.00 from next suggested buy entry ...,20,Jun,01,TSLA,20Jun01
179492,2020-06-01 18:41:32+00:00,@Desert_Trader81 $TSLA on the move crossing HO...,0,en,$TSLA on the move crossing HOD $885 !!!! 900 w...,20,Jun,01,TSLA,20Jun01


## 2. Prepare training dataset

### 2.1 Get equal label counts

In [6]:
training_data.airline_sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [7]:
training_data = pd.concat([training_data.query('airline_sentiment=="negative"').sample(n=2363),training_data.query('airline_sentiment=="neutral"').sample(n=2363),training_data.query('airline_sentiment=="positive"')])

In [8]:
training_data.airline_sentiment.value_counts()

neutral     2363
negative    2363
positive    2363
Name: airline_sentiment, dtype: int64

### 2.2 One-Hot encode the labels
*There are 3 lables so we can encode the lables with 0,1 and 2*

In [9]:
training_data['label'] = training_data.airline_sentiment.replace({'neutral':0, 'positive':1, 'negative':2})
training_data.head()

Unnamed: 0_level_0,airline_sentiment,text,label
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
569893064342437888,negative,@AmericanAir Thanks for the response.Tough nig...,2
569740152966238208,negative,@AmericanAir I've been on hold ANOTHER hour an...,2
569875203473297408,negative,@united UA1130 Flight was a nightmare!! From p...,2
570306529947193344,negative,@AmericanAir I slept in the miami airport due ...,2
569311060903268352,negative,"@VirginAmerica a lot of ""apologies"" being thro...",2


### 2.3 Data cleaning

#### 2.3.1 Define data cleaning function

In [8]:
def text_preprocessing(text):
    text = re.sub(r'(@.*?)[\s]', ' ', text) #remove hashtags
    text = re.sub(r'http\S+', '', text)    #remove urls
    text = re.sub(r'&amp;amp', '&', text)  #remove double amps
    text = re.sub(r'\&amp;', '&', text)    #remove single amps
    text = re.sub(r'\s+', ' ', text)       #reduce multiple spaces into a single space
    return text

#### 2.3.2 Clean the data

In [None]:
#---Make a copy of the original raw text for future reference if required
training_data['original_text'] = training_data['text']
#---Use the above function to clean the tweet data
training_data['text'] = training_data.text.apply(lambda x: text_preprocessing(x))
training_data.head()

### 2.4 Split the cleaned training data into train and validation datasets
- *we do not need a test dataset. This is because we are using the training dataset only to train the BERT model*
- *since we are going to use the BERT model to classify a different corpus (reddit and twitter data), splitting the data into training and validation datasets serves our purpose*
- *this will also help in having a bigger training dataset*

In [11]:
#---we will use scikit-learn's train_test_split to classify records into train, test or validation dataset
x_train, x_val, y_train, y_val =  train_test_split(training_data.index.values,          #x values/ input values 
                                                   training_data.label.values,          #y values/ output values/ lables
                                                   test_size=0.25,                      #percentage of data to be used for test dataset
                                                   random_state=17,                     #shuffles the data prior to splitting
                                                   stratify=training_data.label.values  #stratification helps when the distribution of data is uneven like in our case where,
                                                                                        #we have a lot of negative compared to neutral and positive labels                                                   
)

#---add a new column to training data to classify records into train and val datasets
training_data['dataset'] = 'NA'
training_data.loc[x_train, 'dataset'] = 'train'
training_data.loc[x_val, 'dataset'] = 'val'

training_data.groupby(['airline_sentiment', 'label', 'dataset']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text,original_text
airline_sentiment,label,dataset,Unnamed: 3_level_1,Unnamed: 4_level_1
negative,2,train,1769,1769
negative,2,val,594,594
neutral,0,train,1766,1766
neutral,0,val,597,597
positive,1,train,1761,1761
positive,1,val,602,602


### 2.5 Tokenize and encode data

#### 2.5.1 Find the max tweet length in the training data
- *This is required to be input into the tokenizer*
- *BERT expects all its inputs to be of the same length so shorter sentences will be padded to maintain the length*

In [12]:
MAX_LEN = training_data['text'].str.len().max()
MAX_LEN

172

#### 2.5.2 Initialize huggingface tokenizer and tokenize and encode train and val datasets

- *In this step we will be tokenizing and encoding the input side of our datasets (i.e) the tweet comments*
- *We will be using batch_encode_plus() method in the tokenizer as our input to the tokenizer will be an array of the tweet column in training_data [training_data[training_data.dataset=='train'].text.values]*
- Ref:*https://huggingface.co/transformers/internal/tokenization_utils.html#pretrainedtokenizerbase*

In [52]:
#---Initialize bert-base-uncased tokenizer and select the option to convert all text to lowercase
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [14]:
#---Tokenize and encode train dataset
train_enc = tokenizer.batch_encode_plus(training_data[training_data.dataset=='train'].text.values, 
                                           add_special_tokens=True, 
                                           return_attention_mask=True, 
                                           pad_to_max_length=True, 
                                           max_length=MAX_LEN, 
                                           return_tensors='pt'
                                           )

#---Tokenize and encode val dataset
val_enc = tokenizer.batch_encode_plus(training_data[training_data.dataset=='val'].text.values, 
                                         add_special_tokens=True, 
                                         return_attention_mask=True, 
                                         pad_to_max_length=True, 
                                         max_length=MAX_LEN, 
                                         return_tensors='pt'
                                         )

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


### 2.6 Create tensor datasets from the encoded datasets from previous step
- *We must split the input_ids(encoded tokens) and attention masks(says which values a model should work on since we padded tweets shorter than 172) from the previous step*
- *we must create a tensor dataset with both input(encoded tweets from last step) and also outputs (the one-hot encoded lables)*
- *we create a tensor with the outputs so that the TensorDataset method can combine both inputs and outputs*

In [15]:
#---Get the input ids and attention masks from the tokenizer outputs
train_input_ids = train_enc['input_ids']
train_attn_masks = train_enc['attention_mask']
val_input_ids = val_enc['input_ids']
val_attn_masks = val_enc['attention_mask']

#---Create tensors with the output data
train_labels = torch.tensor(training_data[training_data.dataset=='train'].label.values)
val_labels = torch.tensor(training_data[training_data.dataset=='val'].label.values)

#---Create the required tensor datasets which will be used in the dataloader
train_tensor = TensorDataset(train_input_ids, train_attn_masks, train_labels)
val_tensor = TensorDataset(val_input_ids, val_attn_masks, val_labels)

len(train_tensor), len(val_tensor)

(5296, 1793)

### 2.7 Create dataloaders
- *We create two dataloaders - one for training data and one for validation data*
- *We use random sampling*
- *We use a  batch size of 32 in consideration of the low computing power at hand. It is also known that smaller batch sizes converge quicker - this seems like a good choice considering our limited training data*


In [16]:
batch_size = 32

# We Need two different dataloder
train_dloader = DataLoader(train_tensor, sampler=RandomSampler(train_tensor), batch_size=batch_size)
val_dloader = DataLoader(val_tensor, sampler=RandomSampler(val_tensor), batch_size=batch_size)

## 3. BERT model and performance definitions
- *Define BERT model, optimizer, scheduler and performance metrics*
- *Ignore the warning because we are going to train the model before using it to classify reddit and twitter data

### 3.1 Define the BERT model

In [18]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(training_data.label.unique()),    #Used for better code reproducability instead of initializing a constant value
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

### 3.2 Move the model to GPU for better performance

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device) 
print(device)

### 3.3 Define evaluation function with performance metrics

https://huggingface.co/transformers/main_classes/output.html

In [22]:
def model_eval(validation_dataloader):
    #---set the model in evaluation mode to disable dropout layers, batch normalization, etc
    model.eval()
    
    #---Initialize some variables to calculate model performance post evaluation
    total_loss = 0                          #counter to keep a sum of loss values across all batches
    predicted_class, true_class = [], []    #counters to keep track of all the predicted and true classes across all batches
    
    #---Use model to predict with validation dataset
    for batch in tqdm(validation_dataloader):
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                                                         #move the batch elements to GPU for faster performance
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}
        
        #---enable no_grad option in pytorch as there will be no back-propogation during evaluation
        with torch.no_grad():
            #---Predict outputs using the model
            model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]             #Example output in every iteration: tensor(0.2658, device='cuda:0')
        total_loss += loss.item()
        #print('model op 0:', loss)
        
        #---get the predicted outputs (logits) for current batch and append to predicted_class list
        logits = model_output[1]                          #List of lists [[0.1 0.2 0.3][0.1 0.2 0.3]...32 lists inside a list]
        #move the logits to cpu and convert to numpy since we will compute the simple math error calculations in cpu and not GPU
        logits = logits.detach().cpu().numpy()            #array of list of lists array([[0.1 0.2 0.3][0.1 0.2 0.3]...32 lists inside a list])
        predicted_class.append(logits)                    #list containing array of list of lists [array, array, ...]
        #print('model op 1:', logits)
        #print('pred_class:', predicted_class)
        
        #---get the original true labels and append to true_class list
        true_labels = input_dict['labels'].cpu().numpy()   #array of list of labels [0,1,2,2,1,0,...32 labels]
        true_class.append(true_labels)                     #list of array of list of labels [array([32 labels]), array([32 labels]),...]
        #print('true_class:', true_class)
    
    #---Compute average loss
    average_loss = total_loss/len(validation_dataloader) 
    
    #---Compute prediction accuracy
    predictions = np.concatenate(predicted_class, axis=0)  #list of lists [[0.1 0.2 0.3][0.1 0.2 0.3]....]
    true_vals = np.concatenate(true_class, axis=0)         #List of lists
    correct_predictions = np.sum(np.argmax(predictions, axis=1).flatten() == true_vals)
    prediction_accuracy = correct_predictions / len(training_data[training_data.dataset=='val'])
    #print('predicaitons', predictions)
    #print('true_vals', true_vals)
    
    #---Compute f1 score
    flattened_predictions = np.argmax(predictions, axis=1).flatten()    #List of values in predictions
    flattened_labels = true_vals.flatten()                              #List of values in true_vals
    f1score = f1_score(flattened_labels, flattened_predictions, average='weighted')
    #print('preds_flat:', preds_flat)
    #print('labels_flat:', labels_flat)
    
    return average_loss, predictions, true_vals, prediction_accuracy, f1score

### 3.4 Clear GPU cache

In [9]:
gc.collect()
torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=True)



## 4. Trial and error to find the best parameters
- *The original paper for BERT worked on batch sizes of (16 and 32) and learning rates of (5e-5, 4e-5, 3e-5, 2e-5, 1e-5)*
- REF: *https://arxiv.org/abs/1810.04805*
- *We prefer AdamW optimizer function since it is the most widely used general purpose optimizer and Adam was the optimization function used in the original paper on BERT*
- *The scheduler will help bring the learning rate down if it senses overfitting*

### 4.1 Learning Rate: 4e-5, Batch size = 32

In [39]:
epochs = 3
lr = 4e-5
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dloader)*epochs)

In [25]:
training_loss_dict = {}
validation_loss_dict = {}

for epoch in tqdm(range(1, epochs+1)):
    
    #---set the model to training mode
    model.train()         
    
    #counter to keep a sum of loss values across all epochs
    total_loss = 0

    # Setting up the Progress bar to Moniter the progress of training
    pbar = tqdm(train_dloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in pbar:
        
        #---set zero_grad so that gradient values are not accumulated across batches
        model.zero_grad()
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
         
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}       

        #---Predict outputs using the model
        model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]
        total_loss += loss.item()
        
        #---compute and clip gradients
        loss.backward()                                           #compute gradients via backpropogation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #clip gradients to prevent gradient explosion issue

        #---Use optimizer and scheduler to tune the model during epochs
        optimizer.step()
        scheduler.step()
        
        pbar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    #---save the model to disk        
    #torch.save(model.state_dict(), r'C:\Users\Karthik\Desktop\Dissertation\BERT\Model\BERT_trained.model')
        
    #---calculate metrics 
    loss_train_avg = total_loss/len(train_dloader)            
    val_loss, predictions, true_vals, val_accuracy, f1score = model_eval(val_dloader)

    training_loss_dict[epoch] = loss_train_avg
    validation_loss_dict[epoch] = val_loss
    
    tqdm.write("\u0332".join(f'\nEpoch {epoch}'))
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'validation accuracy: {val_accuracy}')
    tqdm.write(f'f1 score: {f1score}')
    
torch.save(model, f"C:\\Users\\Karthik\\Desktop\\Dissertation\\BERT\\Models\\BERT_TwitterUSAirline_Batch_{batch_size}_LR_{lr}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=166.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=57.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲1
Training loss: 0.6218656095036541
Validation loss: 0.5115847117022464
validation accuracy: 0.795872838817624
f1 score: 0.7968358762379277


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=166.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=57.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲2
Training loss: 0.34559505634817733
Validation loss: 0.5546318757298746
validation accuracy: 0.8014500836586727
f1 score: 0.7993793464665216


HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=166.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=57.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲3
Training loss: 0.19899453961077224
Validation loss: 0.5979253647505844
validation accuracy: 0.8131622978248745
f1 score: 0.8131363182657215



### 4.2 Learning Rate: 3e-5, Batch size = 32

In [26]:
epochs = 3
lr = 3e-5
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dloader)*epochs)

In [27]:
training_loss_dict = {}
validation_loss_dict = {}

for epoch in tqdm(range(1, epochs+1)):
    
    #---set the model to training mode
    model.train()         
    
    #counter to keep a sum of loss values across all epochs
    total_loss = 0

    # Setting up the Progress bar to Moniter the progress of training
    pbar = tqdm(train_dloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in pbar:
        
        #---set zero_grad so that gradient values are not accumulated across batches
        model.zero_grad()
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
         
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}       

        #---Predict outputs using the model
        model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]
        total_loss += loss.item()
        
        #---compute and clip gradients
        loss.backward()                                           #compute gradients via backpropogation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #clip gradients to prevent gradient explosion issue

        #---Use optimizer and scheduler to tune the model during epochs
        optimizer.step()
        scheduler.step()
        
        pbar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    #---save the model to disk        
    #torch.save(model.state_dict(), r'C:\Users\Karthik\Desktop\Dissertation\BERT\Model\BERT_trained.model')
        
    #---calculate metrics 
    loss_train_avg = total_loss/len(train_dloader)            
    val_loss, predictions, true_vals, val_accuracy, f1score = model_eval(val_dloader)

    training_loss_dict[epoch] = loss_train_avg
    validation_loss_dict[epoch] = val_loss
    
    tqdm.write("\u0332".join(f'\nEpoch {epoch}'))
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'validation accuracy: {val_accuracy}')
    tqdm.write(f'f1 score: {f1score}')
    
torch.save(model, f"C:\\Users\\Karthik\\Desktop\\Dissertation\\BERT\\Models\\BERT_TwitterUSAirline_Batch_{batch_size}_LR_{lr}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=166.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=57.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲1
Training loss: 0.2014769176582256
Validation loss: 0.708915053000837
validation accuracy: 0.8008923591745678
f1 score: 0.7994762140591439


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=166.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=57.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲2
Training loss: 0.10953409520695996
Validation loss: 0.7897362233253947
validation accuracy: 0.8053541550474066
f1 score: 0.8054065444594684


HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=166.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=57.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲3
Training loss: 0.058956572244572174
Validation loss: 0.8368163414364844
validation accuracy: 0.8081427774679308
f1 score: 0.8079200170633898



### 4.3 Learning Rate: 2e-5, Batch size = 32

In [28]:
epochs = 3
lr = 2e-5
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dloader)*epochs)

In [29]:
training_loss_dict = {}
validation_loss_dict = {}

for epoch in tqdm(range(1, epochs+1)):
    
    #---set the model to training mode
    model.train()         
    
    #counter to keep a sum of loss values across all epochs
    total_loss = 0

    # Setting up the Progress bar to Moniter the progress of training
    pbar = tqdm(train_dloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in pbar:
        
        #---set zero_grad so that gradient values are not accumulated across batches
        model.zero_grad()
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
         
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}       

        #---Predict outputs using the model
        model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]
        total_loss += loss.item()
        
        #---compute and clip gradients
        loss.backward()                                           #compute gradients via backpropogation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #clip gradients to prevent gradient explosion issue

        #---Use optimizer and scheduler to tune the model during epochs
        optimizer.step()
        scheduler.step()
        
        pbar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    #---save the model to disk        
    #torch.save(model.state_dict(), r'C:\Users\Karthik\Desktop\Dissertation\BERT\Model\BERT_trained.model')
        
    #---calculate metrics 
    loss_train_avg = total_loss/len(train_dloader)            
    val_loss, predictions, true_vals, val_accuracy, f1score = model_eval(val_dloader)

    training_loss_dict[epoch] = loss_train_avg
    validation_loss_dict[epoch] = val_loss
    
    tqdm.write("\u0332".join(f'\nEpoch {epoch}'))
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'validation accuracy: {val_accuracy}')
    tqdm.write(f'f1 score: {f1score}')
    
torch.save(model, f"C:\\Users\\Karthik\\Desktop\\Dissertation\\BERT\\Models\\BERT_TwitterUSAirline_Batch_{batch_size}_LR_{lr}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=166.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=57.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲1
Training loss: 0.07028504623049668
Validation loss: 0.9442426318179187
validation accuracy: 0.7986614612381484
f1 score: 0.7978682975830883


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=166.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=57.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲2
Training loss: 0.04333450746686052
Validation loss: 1.044968852348495
validation accuracy: 0.8064696040156163
f1 score: 0.8061379658628852


HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=166.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=57.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲3
Training loss: 0.026527366777657563
Validation loss: 1.1613282686785649
validation accuracy: 0.8064696040156163
f1 score: 0.8061431531587261



### 4.4 Learning Rate: 1e-5, Batch size = 32

In [30]:
epochs = 3
lr = 1e-5
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dloader)*epochs)

In [31]:
training_loss_dict = {}
validation_loss_dict = {}

for epoch in tqdm(range(1, epochs+1)):
    
    #---set the model to training mode
    model.train()         
    
    #counter to keep a sum of loss values across all epochs
    total_loss = 0

    # Setting up the Progress bar to Moniter the progress of training
    pbar = tqdm(train_dloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in pbar:
        
        #---set zero_grad so that gradient values are not accumulated across batches
        model.zero_grad()
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
         
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}       

        #---Predict outputs using the model
        model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]
        total_loss += loss.item()
        
        #---compute and clip gradients
        loss.backward()                                           #compute gradients via backpropogation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #clip gradients to prevent gradient explosion issue

        #---Use optimizer and scheduler to tune the model during epochs
        optimizer.step()
        scheduler.step()
        
        pbar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    #---save the model to disk        
    #torch.save(model.state_dict(), r'C:\Users\Karthik\Desktop\Dissertation\BERT\Model\BERT_trained.model')
        
    #---calculate metrics 
    loss_train_avg = total_loss/len(train_dloader)            
    val_loss, predictions, true_vals, val_accuracy, f1score = model_eval(val_dloader)

    training_loss_dict[epoch] = loss_train_avg
    validation_loss_dict[epoch] = val_loss
    
    tqdm.write("\u0332".join(f'\nEpoch {epoch}'))
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'validation accuracy: {val_accuracy}')
    tqdm.write(f'f1 score: {f1score}')
    
torch.save(model, f"C:\\Users\\Karthik\\Desktop\\Dissertation\\BERT\\Models\\BERT_TwitterUSAirline_Batch_{batch_size}_LR_{lr}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=166.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=57.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲1
Training loss: 0.03130397986493028
Validation loss: 1.1883599575151478
validation accuracy: 0.8031232571109872
f1 score: 0.8026091751191283


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=166.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=57.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲2
Training loss: 0.021463047406925114
Validation loss: 1.200392300995749
validation accuracy: 0.8025655326268823
f1 score: 0.8031848292031342


HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=166.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=57.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲3
Training loss: 0.01627674711365337
Validation loss: 1.3181786202547843
validation accuracy: 0.8014500836586727
f1 score: 0.8013837129904956



### 4.5 Learning Rate: 4e-5, Batch size = 16

In [32]:
batch_size = 16

# We Need two different dataloder
train_dloader = DataLoader(train_tensor, sampler=RandomSampler(train_tensor), batch_size=batch_size)
val_dloader = DataLoader(val_tensor, sampler=RandomSampler(val_tensor), batch_size=batch_size)

epochs = 3
lr = 4e-5
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dloader)*epochs)

In [33]:
training_loss_dict = {}
validation_loss_dict = {}

for epoch in tqdm(range(1, epochs+1)):
    
    #---set the model to training mode
    model.train()         
    
    #counter to keep a sum of loss values across all epochs
    total_loss = 0

    # Setting up the Progress bar to Moniter the progress of training
    pbar = tqdm(train_dloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in pbar:
        
        #---set zero_grad so that gradient values are not accumulated across batches
        model.zero_grad()
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
         
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}       

        #---Predict outputs using the model
        model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]
        total_loss += loss.item()
        
        #---compute and clip gradients
        loss.backward()                                           #compute gradients via backpropogation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #clip gradients to prevent gradient explosion issue

        #---Use optimizer and scheduler to tune the model during epochs
        optimizer.step()
        scheduler.step()
        
        pbar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    #---save the model to disk        
    #torch.save(model.state_dict(), r'C:\Users\Karthik\Desktop\Dissertation\BERT\Model\BERT_trained.model')
        
    #---calculate metrics 
    loss_train_avg = total_loss/len(train_dloader)            
    val_loss, predictions, true_vals, val_accuracy, f1score = model_eval(val_dloader)

    training_loss_dict[epoch] = loss_train_avg
    validation_loss_dict[epoch] = val_loss
    
    tqdm.write("\u0332".join(f'\nEpoch {epoch}'))
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'validation accuracy: {val_accuracy}')
    tqdm.write(f'f1 score: {f1score}')
    
torch.save(model, f"C:\\Users\\Karthik\\Desktop\\Dissertation\\BERT\\Models\\BERT_TwitterUSAirline_Batch_{batch_size}_LR_{lr}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=331.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=113.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲1
Training loss: 0.11336933263613395
Validation loss: 1.40177505878336
validation accuracy: 0.7796988287785834
f1 score: 0.7808960139235134


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=331.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=113.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲2
Training loss: 0.0668226255220361
Validation loss: 1.3368073928614705
validation accuracy: 0.7986614612381484
f1 score: 0.798238938888277


HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=331.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=113.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲3
Training loss: 0.031194788838871768
Validation loss: 1.4059543909228909
validation accuracy: 0.7975460122699386
f1 score: 0.7984183227610774



### 4.6 Learning Rate: 3e-5, Batch size = 16

In [34]:
batch_size = 16

# We Need two different dataloder
train_dloader = DataLoader(train_tensor, sampler=RandomSampler(train_tensor), batch_size=batch_size)
val_dloader = DataLoader(val_tensor, sampler=RandomSampler(val_tensor), batch_size=batch_size)

epochs = 3
lr = 3e-5
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dloader)*epochs)

In [35]:
training_loss_dict = {}
validation_loss_dict = {}

for epoch in tqdm(range(1, epochs+1)):
    
    #---set the model to training mode
    model.train()         
    
    #counter to keep a sum of loss values across all epochs
    total_loss = 0

    # Setting up the Progress bar to Moniter the progress of training
    pbar = tqdm(train_dloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in pbar:
        
        #---set zero_grad so that gradient values are not accumulated across batches
        model.zero_grad()
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
         
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}       

        #---Predict outputs using the model
        model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]
        total_loss += loss.item()
        
        #---compute and clip gradients
        loss.backward()                                           #compute gradients via backpropogation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #clip gradients to prevent gradient explosion issue

        #---Use optimizer and scheduler to tune the model during epochs
        optimizer.step()
        scheduler.step()
        
        pbar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    #---save the model to disk        
    #torch.save(model.state_dict(), r'C:\Users\Karthik\Desktop\Dissertation\BERT\Model\BERT_trained.model')
        
    #---calculate metrics 
    loss_train_avg = total_loss/len(train_dloader)            
    val_loss, predictions, true_vals, val_accuracy, f1score = model_eval(val_dloader)

    training_loss_dict[epoch] = loss_train_avg
    validation_loss_dict[epoch] = val_loss
    
    tqdm.write("\u0332".join(f'\nEpoch {epoch}'))
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'validation accuracy: {val_accuracy}')
    tqdm.write(f'f1 score: {f1score}')
    
torch.save(model, f"C:\\Users\\Karthik\\Desktop\\Dissertation\\BERT\\Models\\BERT_TwitterUSAirline_Batch_{batch_size}_LR_{lr}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=331.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=113.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲1
Training loss: 0.08165258794818622
Validation loss: 1.3088159010141873
validation accuracy: 0.7847183491355271
f1 score: 0.7853352503738621


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=331.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=113.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲2
Training loss: 0.0324793941697181
Validation loss: 1.4275403127892754
validation accuracy: 0.7947573898494144
f1 score: 0.7940687884577671


HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=331.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=113.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲3
Training loss: 0.017855956626546376
Validation loss: 1.5000386506237142
validation accuracy: 0.7975460122699386
f1 score: 0.7974403846671503



### 4.7 Learning Rate: 2e-5, Batch size = 16

In [36]:
batch_size = 16

# We Need two different dataloder
train_dloader = DataLoader(train_tensor, sampler=RandomSampler(train_tensor), batch_size=batch_size)
val_dloader = DataLoader(val_tensor, sampler=RandomSampler(val_tensor), batch_size=batch_size)

epochs = 3
lr = 2e-5
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dloader)*epochs)

In [37]:
training_loss_dict = {}
validation_loss_dict = {}

for epoch in tqdm(range(1, epochs+1)):
    
    #---set the model to training mode
    model.train()         
    
    #counter to keep a sum of loss values across all epochs
    total_loss = 0

    # Setting up the Progress bar to Moniter the progress of training
    pbar = tqdm(train_dloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in pbar:
        
        #---set zero_grad so that gradient values are not accumulated across batches
        model.zero_grad()
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
         
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}       

        #---Predict outputs using the model
        model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]
        total_loss += loss.item()
        
        #---compute and clip gradients
        loss.backward()                                           #compute gradients via backpropogation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #clip gradients to prevent gradient explosion issue

        #---Use optimizer and scheduler to tune the model during epochs
        optimizer.step()
        scheduler.step()
        
        pbar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    #---save the model to disk        
    #torch.save(model.state_dict(), r'C:\Users\Karthik\Desktop\Dissertation\BERT\Model\BERT_trained.model')
        
    #---calculate metrics 
    loss_train_avg = total_loss/len(train_dloader)            
    val_loss, predictions, true_vals, val_accuracy, f1score = model_eval(val_dloader)

    training_loss_dict[epoch] = loss_train_avg
    validation_loss_dict[epoch] = val_loss
    
    tqdm.write("\u0332".join(f'\nEpoch {epoch}'))
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'validation accuracy: {val_accuracy}')
    tqdm.write(f'f1 score: {f1score}')
    
torch.save(model, f"C:\\Users\\Karthik\\Desktop\\Dissertation\\BERT\\Models\\BERT_TwitterUSAirline_Batch_{batch_size}_LR_{lr}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=331.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=113.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲1
Training loss: 0.03474160280563831
Validation loss: 1.4872182012655781
validation accuracy: 0.7941996653653095
f1 score: 0.7946660240760588


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=331.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=113.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲2
Training loss: 0.019447657142438204
Validation loss: 1.5265365434514073
validation accuracy: 0.7908533184606804
f1 score: 0.7897338729018218


HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=331.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=113.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲3
Training loss: 0.016787038144827512
Validation loss: 1.5583263466802253
validation accuracy: 0.7919687674288901
f1 score: 0.7911522671160824



### 4.8 Learning Rate: 1e-5, Batch size = 16

In [40]:
batch_size = 16

# We Need two different dataloder
train_dloader = DataLoader(train_tensor, sampler=RandomSampler(train_tensor), batch_size=batch_size)
val_dloader = DataLoader(val_tensor, sampler=RandomSampler(val_tensor), batch_size=batch_size)

epochs = 3
lr = 1e-5
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dloader)*epochs)

In [41]:
training_loss_dict = {}
validation_loss_dict = {}

for epoch in tqdm(range(1, epochs+1)):
    
    #---set the model to training mode
    model.train()         
    
    #counter to keep a sum of loss values across all epochs
    total_loss = 0

    # Setting up the Progress bar to Moniter the progress of training
    pbar = tqdm(train_dloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in pbar:
        
        #---set zero_grad so that gradient values are not accumulated across batches
        model.zero_grad()
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
         
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}       

        #---Predict outputs using the model
        model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]
        total_loss += loss.item()
        
        #---compute and clip gradients
        loss.backward()                                           #compute gradients via backpropogation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #clip gradients to prevent gradient explosion issue

        #---Use optimizer and scheduler to tune the model during epochs
        optimizer.step()
        scheduler.step()
        
        pbar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    #---save the model to disk        
    #torch.save(model.state_dict(), r'C:\Users\Karthik\Desktop\Dissertation\BERT\Model\BERT_trained.model')
        
    #---calculate metrics 
    loss_train_avg = total_loss/len(train_dloader)            
    val_loss, predictions, true_vals, val_accuracy, f1score = model_eval(val_dloader)

    training_loss_dict[epoch] = loss_train_avg
    validation_loss_dict[epoch] = val_loss
    
    tqdm.write("\u0332".join(f'\nEpoch {epoch}'))
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'validation accuracy: {val_accuracy}')
    tqdm.write(f'f1 score: {f1score}')
    
torch.save(model, f"C:\\Users\\Karthik\\Desktop\\Dissertation\\BERT\\Models\\BERT_TwitterUSAirline_Batch_{batch_size}_LR_{lr}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=331.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=113.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲1
Training loss: 0.017547980702383774
Validation loss: 1.6495554934025782
validation accuracy: 0.7930842163970998
f1 score: 0.7924047169152861


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=331.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=113.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲2
Training loss: 0.011069880382806235
Validation loss: 1.6537227041228701
validation accuracy: 0.7969882877858337
f1 score: 0.7966815209923267


HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=331.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=113.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲3
Training loss: 0.009143708631279527
Validation loss: 1.6660743541143404
validation accuracy: 0.7975460122699386
f1 score: 0.7973060814107703



## 5. Predict with the best model
- *Batch size of 32 and learning rate of 4e-4 gave the best results*

### 5.1 Load the selected model

In [10]:
reqd_model = torch.load(r"C:\Users\Karthik\Desktop\Dissertation\BERT\Models\BERT_TwitterUSAirline_Batch_32_LR_4e-05")

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
reqd_model.to(device) 
print(device)

cuda


### 5.2 Prepare the datasets to be used in the BERT model for prediction

#### 5.2.1 Clean the text data

In [12]:
for tick, tick_df in reddit_df_dict.items():
#     print(tick, tick_df)
    tick_df['cleaned_body'] = tick_df['body'].apply(lambda x: text_preprocessing(x))
    tick_df['LEN'] = tick_df.cleaned_body.str.len()    

In [13]:
for tick, tick_df in twitter_df_dict.items():
    tick_df['cleaned_body'] = tick_df['cleaned_content'].apply(lambda x: text_preprocessing(x))
    tick_df['LEN'] = tick_df.cleaned_body.str.len()    

#### 5.2.2 Handle records where text is greater than a length of 512
- *BERT can only handle a max length of 512*
- *For each comment > length of 512, break the comment into multiple sets of 512*
- *After breaking, create a new record for each of the broken parts and append to the original dataframe. Since we take the average sentiment scores, appending new rows with the same dates will not affect the data*

In [14]:
n=512
def ffn(xseries):
    list_of_lists = []
        
    if xseries['LEN']>512:
        parts = [xseries['cleaned_body'][i:i+n] for i in range(0, xseries['LEN'], n)]
    
        counter=0
        for i in parts:
            list_series = []
            list_series.append(xseries['body'])
            list_series.append(xseries['created_utc'])
            list_series.append(xseries['id'])
            list_series.append(xseries['top'])
            list_series.append(xseries['year'])
            list_series.append(xseries['month'])
            list_series.append(xseries['day'])
            list_series.append(xseries['ticker'])
            list_series.append(xseries['YearMonDay'])
            list_series.append(i)
            counter+=1
            list_series.append(counter)
            
            list_of_lists.append(list_series)
    
        return list_of_lists
    else:
        return 0

In [15]:
for tick, tick_df in reddit_df_dict.items():
    tick_df['mltp'] = tick_df.apply(ffn, axis=1)

In [16]:
for tick, tick_df in twitter_df_dict.items():
    tick_df['mltp'] = tick_df.apply(ffn, axis=1)

In [17]:
for tick, tick_df in reddit_df_dict.items():
    addition_list = tick_df.query('mltp != 0')['mltp']
    
    if len(addition_list)==0:
        continue
    
    ind_additions = []

    for list_of_sentences in addition_list:
        for sentence_part in list_of_sentences:
            ind_additions.append(sentence_part)
    
    
    df_additions = pd.DataFrame(ind_additions, columns=['body','created_utc','id','top','year','month','day','ticker','YearMonDay','cleaned_body','LEN'])
    
    tick_df.drop('mltp', axis=1, inplace=True)
    tick_df.drop(tick_df[tick_df.LEN > 512].index, inplace = True)
    tick_df = tick_df.append(df_additions, ignore_index=True)

In [18]:
for tick, tick_df in twitter_df_dict.items():
    addition_list = tick_df.query('mltp != 0')['mltp']
    
    if len(addition_list)==0:
        continue
    
    ind_additions = []

    for list_of_sentences in addition_list:
        for sentence_part in list_of_sentences:
            ind_additions.append(sentence_part)
    
    
    df_additions = pd.DataFrame(ind_additions, columns=['body','created_utc','id','top','year','month','day','ticker','YearMonDay','cleaned_body','LEN'])
    
    tick_df.drop('mltp', axis=1, inplace=True)
    tick_df.drop(tick_df[tick_df.LEN > 512].index, inplace = True)
    tick_df = tick_df.append(df_additions, ignore_index=True)

- *Make sure there is no data where comment length is greater than 512*

In [19]:
for tick, tick_df in reddit_df_dict.items():
    print(tick_df.query('LEN > 512'))

Empty DataFrame
Columns: [body, created_utc, id, top, year, month, day, ticker, YearMonDay, cleaned_body, LEN]
Index: []
Empty DataFrame
Columns: [body, created_utc, id, top, year, month, day, ticker, YearMonDay, cleaned_body, LEN]
Index: []
Empty DataFrame
Columns: [body, created_utc, id, top, year, month, day, ticker, YearMonDay, cleaned_body, LEN]
Index: []
Empty DataFrame
Columns: [body, created_utc, id, top, year, month, day, ticker, YearMonDay, cleaned_body, LEN]
Index: []
Empty DataFrame
Columns: [body, created_utc, id, top, year, month, day, ticker, YearMonDay, cleaned_body, LEN]
Index: []


In [20]:
for tick, tick_df in twitter_df_dict.items():
    print(tick_df.query('LEN > 512'))

Empty DataFrame
Columns: [date, content, likeCount, lang, cleaned_content, year, month, day, ticker, YearMonDay, cleaned_body, LEN, mltp]
Index: []
Empty DataFrame
Columns: [date, content, likeCount, lang, cleaned_content, year, month, day, ticker, YearMonDay, cleaned_body, LEN, mltp]
Index: []
Empty DataFrame
Columns: [date, content, likeCount, lang, cleaned_content, year, month, day, ticker, YearMonDay, cleaned_body, LEN, mltp]
Index: []
Empty DataFrame
Columns: [date, content, likeCount, lang, cleaned_content, year, month, day, ticker, YearMonDay, cleaned_body, LEN, mltp]
Index: []
Empty DataFrame
Columns: [date, content, likeCount, lang, cleaned_content, year, month, day, ticker, YearMonDay, cleaned_body, LEN, mltp]
Index: []


#### 5.2.3 Define a funtion to prepare reddit and twitter data for model prediction
- *The data should be tokenized and encoded and then tensor datasets must be created from them to be passed to the BERT model*
- *We will create a dictionary for reddit and twitter to hold the tensor datasets respectively*

In [21]:
#---Initialize bert-base-uncased tokenizer and select the option to convert all text to lowercase
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def dataloader_fnc(text, MAX_LEN):
    input_id_list = []
    attn_mask_list = []
    
    for line in text:
        #--Tokenize and encode the cleaned text (we do not return_tensors here since our tensor should have all the lines)
        data_enc = tokenizer.encode_plus(line, 
                                         add_special_tokens=True, 
                                         return_attention_mask=True, 
                                         pad_to_max_length=True, 
                                         max_length=MAX_LEN
                                         )
        
        #--get the input_ids and attention_mask values
        input_id_list.append(data_enc.get("input_ids"))
        attn_mask_list.append(data_enc.get("attention_mask"))
    
    tensor_dataset = TensorDataset(torch.tensor(input_id_list), torch.tensor(attn_mask_list) )
    dataloader = DataLoader(tensor_dataset, sampler=SequentialSampler(tensor_dataset), batch_size=32)
    
    return dataloader

In [22]:
reddit_dataloader_dict = {tick: dataloader_fnc(tick_df.cleaned_body, tick_df.cleaned_body.str.len().max())  for tick, tick_df in reddit_df_dict.items()}

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [24]:
reddit_dataloader_dict

{'AMC': <torch.utils.data.dataloader.DataLoader at 0x1b88e9e7700>,
 'DKNG': <torch.utils.data.dataloader.DataLoader at 0x1b88e9cdaf0>,
 'TSLA': <torch.utils.data.dataloader.DataLoader at 0x1b8647ae2e0>,
 'AMD': <torch.utils.data.dataloader.DataLoader at 0x1b88e9cd400>,
 'BABA': <torch.utils.data.dataloader.DataLoader at 0x1b91da9a580>}

In [25]:
twitter_dataloader_dict = {tick: dataloader_fnc(tick_df.cleaned_body, tick_df.cleaned_body.str.len().max())  for tick, tick_df in twitter_df_dict.items()}

In [26]:
twitter_dataloader_dict

{'AMC': <torch.utils.data.dataloader.DataLoader at 0x1b88e9cdc40>,
 'DKNG': <torch.utils.data.dataloader.DataLoader at 0x1b91da9a9a0>,
 'TSLA': <torch.utils.data.dataloader.DataLoader at 0x1b91da9adf0>,
 'AMD': <torch.utils.data.dataloader.DataLoader at 0x1b91da9a3d0>,
 'BABA': <torch.utils.data.dataloader.DataLoader at 0x1b86a7752e0>}

## 6. Prediction

### 6.1 Define a prediction function

In [27]:
def prediction(bert_model, data):
    #--put the model in eval mode
    bert_model.eval()
    
    #--predict and collect output logits in a list
    output_logit_list = []
    
    for batch in data:
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
        input_id_tensor = temp_batch[0]
        attn_mask_tensor = temp_batch[1]
#         print(input_id_tensor)
#         print(attn_mask_tensor)
        
        with torch.no_grad():
            output_logits = bert_model(input_id_tensor, attn_mask_tensor)
            output_logit_list.append(output_logits[0])
    
    #--concatenate logits across batches and apply softmax
    complete_logits = torch.cat(output_logit_list, dim=0)
    softmax_probs = Func.softmax(complete_logits, dim=1).cpu().numpy() #convert to numpy
    
    return softmax_probs

### 6.2 Predict sentiments for reddit data

#### 6.2.1 Prediction for reddit comments
- *Use the model to get positive and negative probabilities for each reddit comment*
- *Group by date and compute average positive, negative and neutral scores for each day*

In [28]:
for tick, tick_df in reddit_df_dict.items():
    pred_df = pd.DataFrame(prediction(reqd_model, reddit_dataloader_dict[tick]), columns=['neutral', 'positive', 'negative'])
    reddit_df_dict[tick] = pd.concat([tick_df, pred_df], axis=1)

In [29]:
for tick, tick_df in reddit_df_dict.items():
    tick_grouped = tick_df.groupby(['ticker', 'YearMonDay'])[['neutral', 'positive', 'negative']].mean().reset_index()
    tick_grouped.to_pickle(f"C:\\Users\\Karthik\\Desktop\\Dissertation\\Final_dfs\\reddit_{tick}_BERT_TwitterAirline_Even.pkl")
    print(tick_grouped)

    ticker YearMonDay   neutral  positive  negative
0      AMC    20Aug05  0.091782  0.225708  0.682509
1      AMC    20Aug13  0.973933  0.003902  0.022165
2      AMC    20Aug17  0.639910  0.011118  0.348972
3      AMC    20Aug19  0.876112  0.099524  0.024364
4      AMC    20Aug25  0.659872  0.315126  0.025002
..     ...        ...       ...       ...       ...
175    AMC    21May26  0.605147  0.153711  0.241142
176    AMC    21May27  0.557445  0.174559  0.267996
177    AMC    21May28  0.559392  0.165391  0.275217
178    AMC    21May29  0.324422  0.259156  0.416423
179    AMC    21May30  0.982039  0.008089  0.009872

[180 rows x 5 columns]
    ticker YearMonDay   neutral  positive  negative
0     DKNG    20Aug03  0.841356  0.044665  0.113979
1     DKNG    20Aug05  0.756529  0.040376  0.203095
2     DKNG    20Aug06  0.495894  0.013155  0.490951
3     DKNG    20Aug10  0.292190  0.031255  0.676555
4     DKNG    20Aug11  0.556901  0.009579  0.433521
..     ...        ...       ...       ..

#### 6.2.2 Prediction for twitter comments
- *Use the model to get positive and negative probabilities for each twitter comment*
- *Group by date and compute average positive, negative and neutral scores for each day*

In [30]:
for tick, tick_df in twitter_df_dict.items():
    pred_df = pd.DataFrame(prediction(reqd_model, twitter_dataloader_dict[tick]), columns=['neutral', 'positive', 'negative'])
    twitter_df_dict[tick] = pd.concat([tick_df, pred_df], axis=1)

In [31]:
for tick, tick_df in twitter_df_dict.items():
    tick_grouped = tick_df.groupby(['ticker', 'YearMonDay'])[['neutral', 'positive', 'negative']].mean().reset_index()
    tick_grouped.to_pickle(f"C:\\Users\\Karthik\\Desktop\\Dissertation\\Final_dfs\\twitter_{tick}_BERT_TwitterAirline_Even.pkl")
    print(tick_grouped)

    ticker YearMonDay   neutral  positive  negative
0      AMC    20Aug01  0.450005  0.311928  0.238067
1      AMC    20Aug02  0.599266  0.081191  0.319544
2      AMC    20Aug03  0.734386  0.050787  0.214826
3      AMC    20Aug04  0.790502  0.089713  0.119785
4      AMC    20Aug05  0.677792  0.116865  0.205342
..     ...        ...       ...       ...       ...
482    AMC    21Sep26  0.496433  0.215175  0.288392
483    AMC    21Sep27  0.465371  0.196891  0.337738
484    AMC    21Sep28  0.417595  0.130639  0.451766
485    AMC    21Sep29  0.458247  0.188060  0.353693
486    AMC    21Sep30  0.453878  0.223223  0.322899

[487 rows x 5 columns]
    ticker YearMonDay   neutral  positive  negative
0     DKNG    20Aug01  0.583589  0.219671  0.196740
1     DKNG    20Aug02  0.449766  0.191134  0.359100
2     DKNG    20Aug03  0.511297  0.093189  0.395514
3     DKNG    20Aug04  0.597515  0.155424  0.247060
4     DKNG    20Aug05  0.750923  0.073731  0.175345
..     ...        ...       ...       ..

#### 6.2.3 Calculate combined predictions by combining both reddit and twitter sentiments

In [32]:
for ticker in tick_list:
    print(ticker)
    reddit_sentis = reddit_df_dict[ticker][['ticker', 'YearMonDay', 'neutral', 'positive', 'negative']]
    twitter_sentis = twitter_df_dict[ticker][['ticker', 'YearMonDay', 'neutral', 'positive', 'negative']]
    
#     print(reddit_sentis,twitter_sentis,pd.concat([reddit_sentis,twitter_sentis], axis=0,ignore_index=True))
    combined_sentis = pd.concat([reddit_sentis,twitter_sentis], axis=0,ignore_index=True)
    combined_sentis_grouped = combined_sentis.groupby(['ticker', 'YearMonDay'])[['neutral', 'positive', 'negative']].mean().reset_index()
    combined_sentis_grouped.to_pickle(f"C:\\Users\\Karthik\\Desktop\\Dissertation\\Final_dfs\\combined_{ticker}_BERT_TwitterAirline_Even.pkl")
    print(combined_sentis_grouped)

AMC
    ticker YearMonDay   neutral  positive  negative
0      AMC    20Aug01  0.450005  0.311928  0.238067
1      AMC    20Aug02  0.599266  0.081191  0.319544
2      AMC    20Aug03  0.734386  0.050787  0.214826
3      AMC    20Aug04  0.790502  0.089713  0.119785
4      AMC    20Aug05  0.624519  0.126760  0.248721
..     ...        ...       ...       ...       ...
488    AMC    21Sep26  0.496433  0.215175  0.288392
489    AMC    21Sep27  0.465371  0.196891  0.337738
490    AMC    21Sep28  0.417595  0.130639  0.451766
491    AMC    21Sep29  0.458247  0.188060  0.353693
492    AMC    21Sep30  0.453878  0.223223  0.322899

[493 rows x 5 columns]
DKNG
    ticker YearMonDay   neutral  positive  negative
0     DKNG    20Aug01  0.583589  0.219671  0.196740
1     DKNG    20Aug02  0.449766  0.191134  0.359100
2     DKNG    20Aug03  0.519760  0.091945  0.388295
3     DKNG    20Aug04  0.597515  0.155424  0.247060
4     DKNG    20Aug05  0.751022  0.073146  0.175832
..     ...        ...       ...

## References
- https://machinelearningmastery.com/exploding-gradients-in-neural-networks/
- https://neptune.ai/blog/understanding-gradient-clipping-and-how-it-can-fix-exploding-gradients-problem