# Step 3C: BERT Model 3 - BERT sentiment analysis using sentiment140 dataset

# using sentiment140 twitter dataset

## 1. Required imports

### 1.1 Import required libraries

In [1]:
#---Neural Network libraries
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup            #huggingface transformers library
import tensorflow as tf                                                                                                  #tensorflow library         
import torch                                                                                                             #pytorch library
import torch.nn.functional as Func
import gc
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler                        #for loading data into our model

#---Data processing
from sklearn.model_selection import train_test_split                                                                     #for splitting data into training, testing and validation
from sklearn.metrics import f1_score
import pandas as pd                                                                                                      #for using data in the form of dataframes
import numpy as np
import re                                                                                                                #for data manipulation when cleaning datasets

#---Visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# from transformers import InputExample, InputFeatures

In [2]:
import ipywidgets as widgets
widgets.IntSlider()

IntSlider(value=0)

### 1.2 Import training data to train the BERT model
- *This dataset is from Sentiment140 - http://help.sentiment140.com/for-students*
- *We only pick the required columns from the dataset*

In [31]:
training_data = pd.read_csv(r"C:\Users\Karthik\Desktop\Dissertation\BERT\Training dataset\Sentiment140.csv", engine='python', names=['label','tweet_id','date', 'query_type', 'user', 'text'], index_col=False)
training_data = pd.concat([training_data.query('label==0').sample(n=25000),training_data.query('label==4').sample(n=25000)])
training_data = training_data[['text', 'label']]
training_data.label = training_data.label.apply(lambda x: 1 if x==4 else 0)
training_data.reset_index(drop=True, inplace=True)
training_data.head()

Unnamed: 0,text,label
0,really sad,0
1,soo...my studio fix is nearly done. this make...,0
2,@cindyjonas i know! this sucks!,0
3,@KodeRED1911 Lazy get ur azz in here! Grab a c...,0
4,"@fatpuppy Yeah, that's where I got my hat. I ...",0


### 1.3 Import reddit and twitter data for which we have to populate the sentiments
- *we will use them on the model after the model is trained with the training dataset from kaggle*
- *we save data in two dictionaries; one for reddit and one for twitter to make it more organized*### 1.3 Import reddit and twitter data for which we have to populate the sentiments
- *we will use them on the model after the model is trained with the training dataset from kaggle*

In [3]:
tick_list =  ['AAPL', 'AMC', 'DKNG', 'TSLA', 'AMD', 'BABA']

In [4]:
reddit_df_dict = {tick: pd.read_pickle(f"C:\\Users\\Karthik\\Desktop\\Dissertation\\Reddit\\consolidated_pickle_files\\reddit_{tick}_df_for_BERT.pkl") for tick in tick_list}
# reddit_df_dict  

In [5]:
reddit_df_dict['TSLA']

Unnamed: 0,body,created_utc,id,top,year,month,day,ticker,YearMonDay
0,Non-troll post. I started 2 weeks ago and have...,1609500199,ghp72zs,top,21,Jan,01,TSLA,21Jan01
1,we eat cornbread on new years day to ensure a ...,1609524808,ghqo6qb,top,21,Jan,01,TSLA,21Jan01
2,TSLA 850 EOD,1612203941,glmp0gv,top,21,Feb,01,TSLA,21Feb01
3,joe weisenthal is the ultimate chad( who else ...,1612207510,glmyg66,top,21,Feb,01,TSLA,21Feb01
4,TSLA 🚀🚀🚀🚀🚀,1612211972,gln9v98,top,21,Feb,01,TSLA,21Feb01
...,...,...,...,...,...,...,...,...,...
15125,TSLA Drill Team 6 reporting for duty,1609359381,ghj65jr,top,20,Dec,30,TSLA,20Dec30
15126,TSLA’s still having them TSLA days I see. Join...,1609359393,ghj66f1,top,20,Dec,30,TSLA,20Dec30
15127,TSLA 🎰 EOY coming,1609360555,ghj8jam,top,20,Dec,30,TSLA,20Dec30
15128,Thank fuck I didn't dump my TSLA calls yesterday,1609361591,ghjamup,top,20,Dec,30,TSLA,20Dec30


In [6]:
twitter_df_dict = {tick: pd.read_pickle(f"C:\\Users\\Karthik\\Desktop\\Dissertation\\Twitter\\consolidated_pickle_files\\twitter_{tick}_df_for_BERT.pkl") for tick in tick_list}

In [7]:
twitter_df_dict['TSLA']

Unnamed: 0,date,content,likeCount,lang,cleaned_content,year,month,day,ticker,YearMonDay
0,2021-09-18 19:56:59+00:00,@The_RockTrading Bullish on $TSLA this week &a...,1,en,Bullish on $TSLA this week & $Aapl,21,Sep,18,TSLA,21Sep18
1,2021-09-18 04:16:52+00:00,$TSLA now is the same as $aapl in the 80s ! @e...,1,en,$TSLA now is the same as $aapl in the 80s !,21,Sep,18,TSLA,21Sep18
2,2021-09-16 19:01:28+00:00,"Added more $TSLA and $aapl to long, because I ...",0,en,"Added more $TSLA and $aapl to long, because I ...",21,Sep,16,TSLA,21Sep16
3,2021-09-16 15:23:53+00:00,@NeilRog49855230 @Gays4Tesla @TheMaverickWS Th...,3,en,There is plenty of information available on-li...,21,Sep,16,TSLA,21Sep16
4,2021-09-16 15:23:11+00:00,There is plenty of information available on-li...,0,en,There is plenty of information available on-li...,21,Sep,16,TSLA,21Sep16
...,...,...,...,...,...,...,...,...,...,...
179489,2020-06-01 18:42:22+00:00,Normally volatile Tesla $TSLA is the IBD Stock...,1,en,Normally volatile Tesla $TSLA is the IBD Stock...,20,Jun,01,TSLA,20Jun01
179490,2020-06-01 18:42:10+00:00,Normally volatile Tesla $TSLA is the IBD Stock...,1,en,Normally volatile Tesla $TSLA is the IBD Stock...,20,Jun,01,TSLA,20Jun01
179491,2020-06-01 18:41:35+00:00,$TSLA up $51.00 from next suggested buy entry ...,1,en,$TSLA up $51.00 from next suggested buy entry ...,20,Jun,01,TSLA,20Jun01
179492,2020-06-01 18:41:32+00:00,@Desert_Trader81 $TSLA on the move crossing HO...,0,en,$TSLA on the move crossing HOD $885 !!!! 900 w...,20,Jun,01,TSLA,20Jun01


## 2. Prepare training dataset

### 2.1 One-Hot encode the labels

In [32]:
training_data.label.value_counts()

1    25000
0    25000
Name: label, dtype: int64

In [33]:
training_data['sentiment'] = training_data['label'].apply(lambda x: 'positive' if x==1 else 'negative')
training_data

Unnamed: 0,text,label,sentiment
0,really sad,0,negative
1,soo...my studio fix is nearly done. this make...,0,negative
2,@cindyjonas i know! this sucks!,0,negative
3,@KodeRED1911 Lazy get ur azz in here! Grab a c...,0,negative
4,"@fatpuppy Yeah, that's where I got my hat. I ...",0,negative
...,...,...,...
49995,@Spoonsie sending advil your way,1,positive
49996,"Meh, bye bye Rove. Ima go back to the computer...",1,positive
49997,Special thanks to Jose for helping me fix the ...,1,positive
49998,@aureliom Thanks for the #followfriday shout o...,1,positive


### 2.2 Data preprocessing

#### 2.2.1 Define data preprocessing function

In [8]:
def text_preprocessing(text):
    text = re.sub(r'(@.*?)[\s]', ' ', text) #remove hashtags
    text = re.sub(r'http\S+', '', text)    #remove urls
    text = re.sub(r'&amp;amp', '&', text)  #remove double amps
    text = re.sub(r'\&amp;', '&', text)    #remove single amps
    text = re.sub(r'\s+', ' ', text)       #reduce multiple spaces into a single space
    text = re.sub(r'\s+-\s+', ' ', text)
    text = text.strip()
    return text

#### 2.2.2 Preprocess the data

In [34]:
#---Make a copy of the original raw text for future reference if required
training_data['original_text'] = training_data['text']
#---Use the above function to clean the tweet data
training_data['text'] = training_data.text.apply(lambda x: text_preprocessing(x))
training_data.head()

Unnamed: 0,text,label,sentiment,original_text
0,really sad,0,negative,really sad
1,soo...my studio fix is nearly done. this makes...,0,negative,soo...my studio fix is nearly done. this make...
2,i know! this sucks!,0,negative,@cindyjonas i know! this sucks!
3,Lazy get ur azz in here! Grab a computer and g...,0,negative,@KodeRED1911 Lazy get ur azz in here! Grab a c...
4,"Yeah, that's where I got my hat. I had so many...",0,negative,"@fatpuppy Yeah, that's where I got my hat. I ..."


### 2.3 Split the cleaned training data into train and validation datasets
- *we do not need a test dataset. This is because we are using the training dataset only to train the BERT model*
- *since we are going to use the BERT model to classify a different corpus (reddit and twitter data), splitting the data into training and validation datasets serves our purpose*
- *this will also help in having a bigger training dataset*

In [35]:
#---we will use scikit-learn's train_test_split to classify records into train, test or validation dataset
x_train, x_val, y_train, y_val =  train_test_split(training_data.index.values,          #x values/ input values 
                                                   training_data.label.values,          #y values/ output values/ lables
                                                   test_size=0.25,                      #percentage of data to be used for test dataset
                                                   random_state=17,                     #shuffles the data prior to splitting
                                                   stratify=training_data.label.values  #stratification helps when the distribution of data is uneven like in our case where,
                                                                                        #we have a lot of negative compared to neutral and positive labels                                                   
)

#---add a new column to training data to classify records into train and val datasets
training_data['dataset'] = 'NA'
training_data.loc[x_train, 'dataset'] = 'train'
training_data.loc[x_val, 'dataset'] = 'val'

training_data.groupby(['sentiment', 'label', 'dataset']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text,original_text
sentiment,label,dataset,Unnamed: 3_level_1,Unnamed: 4_level_1
negative,0,train,18750,18750
negative,0,val,6250,6250
positive,1,train,18750,18750
positive,1,val,6250,6250


### 2.4 Tokenize and encode data

#### 2.4.1 Find the max tweet length in the training data
- *This is required to be input into the tokenizer*
- *BERT expects all its inputs to be of the same length so shorter sentences will be padded to maintain the length*

In [13]:
MAX_LEN = training_data['text'].str.len().max()
MAX_LEN

220

#### 2.4.2 Initialize huggingface tokenizer and tokenize and encode train and val datasets

- *In this step we will be tokenizing and encoding the input side of our datasets (i.e) the tweet comments*
- *We will be using batch_encode_plus() method in the tokenizer as our input to the tokenizer will be an array of the tweet column in training_data [training_data[training_data.dataset=='train'].text.values]*
- Ref:*https://huggingface.co/transformers/internal/tokenization_utils.html#pretrainedtokenizerbase*

In [14]:
#---Initialize bert-base-uncased tokenizer and select the option to convert all text to lowercase
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [15]:
#---Tokenize and encode train dataset
train_enc = tokenizer.batch_encode_plus(training_data[training_data.dataset=='train'].text.values, 
                                           add_special_tokens=True, 
                                           return_attention_mask=True, 
                                           pad_to_max_length=True, 
                                           max_length=MAX_LEN, 
                                           return_tensors='pt'
                                           )

#---Tokenize and encode val dataset
val_enc = tokenizer.batch_encode_plus(training_data[training_data.dataset=='val'].text.values, 
                                         add_special_tokens=True, 
                                         return_attention_mask=True, 
                                         pad_to_max_length=True, 
                                         max_length=MAX_LEN, 
                                         return_tensors='pt'
                                         )

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


### 2.5 Create tensor datasets from the encoded datasets from previous step
- *We must split the input_ids(encoded tokens) and attention masks(says which values a model should work on since we padded tweets shorter than 172) from the previous step*
- *we must create a tensor dataset with both input(encoded tweets from last step) and also outputs (the one-hot encoded lables)*
- *we create a tensor with the outputs so that the TensorDataset method can combine both inputs and outputs*

In [16]:
#---Get the input ids and attention masks from the tokenizer outputs
train_input_ids = train_enc['input_ids']
train_attn_masks = train_enc['attention_mask']
val_input_ids = val_enc['input_ids']
val_attn_masks = val_enc['attention_mask']

#---Create tensors with the output data
train_labels = torch.tensor(training_data[training_data.dataset=='train'].label.values)
val_labels = torch.tensor(training_data[training_data.dataset=='val'].label.values)

#---Create the required tensor datasets which will be used in the dataloader
train_tensor = TensorDataset(train_input_ids, train_attn_masks, train_labels)
val_tensor = TensorDataset(val_input_ids, val_attn_masks, val_labels)

len(train_tensor), len(val_tensor)

(37500, 12500)

### 2.6 Create dataloaders
- *We create two dataloaders - one for training data and one for validation data*
- *We use random sampling*
- *We use a  batch size of 32 in consideration of the low computing power at hand. It is also known that smaller batch sizes converge quicker - this seems like a good choice considering our limited training data*


In [17]:
batch_size = 16

# We Need two different dataloder
train_dloader = DataLoader(train_tensor, sampler=RandomSampler(train_tensor), batch_size=batch_size)
val_dloader = DataLoader(val_tensor, sampler=RandomSampler(val_tensor), batch_size=batch_size)

In [18]:
# for batch in val_dloader:
#     display(batch)
#     display("----=-=-=-=--=-=")
#     for b in batch:
#         display(b)

## 3. BERT model and performance definitions
- *Define BERT model, optimizer, scheduler and performance metrics*
- *Ignore the warning because we are going to train the model before using it to classify reddit and twitter data

### 3.1 Define the BERT model

In [19]:
len(training_data.label.unique())

2

In [20]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(training_data.label.unique()),    #Used for better code reproducability instead of initializing a constant value
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

### 3.2 Move the model to GPU for better performance

In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device) 
print(device)

cuda


### 3.3 Define the optimizer and scheduler
- *We prefer AdamW optimizer function since it is the most widely used general purpose optimizer*
- *The scheduler will help bring the learning rate down if it senses overfitting*

In [21]:
epochs = 3
lr = 4e-5
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dloader)*epochs)

### 3.4 Define evaluation function with performance metrics
- *We prefer AdamW optimizer function since it is the most widely used general purpose optimizer*
- *The scheduler will help bring the learning rate down if it senses overfitting*

https://huggingface.co/transformers/main_classes/output.html

In [22]:
def model_eval(validation_dataloader):
    #---set the model in evaluation mode to disable dropout layers, batch normalization, etc
    model.eval()
    
    #---Initialize some variables to calculate model performance post evaluation
    total_loss = 0                          #counter to keep a sum of loss values across all batches
    predicted_class, true_class = [], []    #counters to keep track of all the predicted and true classes across all batches
    
    #---Use model to predict with validation dataset
    for batch in tqdm(validation_dataloader):
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                                                         #move the batch elements to GPU for faster performance
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}
        
        #---enable no_grad option in pytorch as there will be no back-propogation during evaluation
        with torch.no_grad():
            #---Predict outputs using the model
            model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]             #Example output in every iteration: tensor(0.2658, device='cuda:0')
        total_loss += loss.item()
        #print('model op 0:', loss)
        
        #---get the predicted outputs (logits) for current batch and append to predicted_class list
        logits = model_output[1]                          #List of lists [[0.1 0.2 0.3][0.1 0.2 0.3]...32 lists inside a list]
        #move the logits to cpu and convert to numpy since we will compute the simple math error calculations in cpu and not GPU
        logits = logits.detach().cpu().numpy()            #array of list of lists array([[0.1 0.2 0.3][0.1 0.2 0.3]...32 lists inside a list])
        predicted_class.append(logits)                    #list containing array of list of lists [array, array, ...]
        #print('model op 1:', logits)
        #print('pred_class:', predicted_class)
        
        #---get the original true labels and append to true_class list
        true_labels = input_dict['labels'].cpu().numpy()   #array of list of labels [0,1,2,2,1,0,...32 labels]
        true_class.append(true_labels)                     #list of array of list of labels [array([32 labels]), array([32 labels]),...]
        #print('true_class:', true_class)
    
    #---Compute average loss
    average_loss = total_loss/len(validation_dataloader) 
    
    #---Compute prediction accuracy
    predictions = np.concatenate(predicted_class, axis=0)  #list of lists [[0.1 0.2 0.3][0.1 0.2 0.3]....]
    true_vals = np.concatenate(true_class, axis=0)         #List of lists
    correct_predictions = np.sum(np.argmax(predictions, axis=1).flatten() == true_vals)
    prediction_accuracy = correct_predictions / len(training_data[training_data.dataset=='val'])
    #print('predicaitons', predictions)
    #print('true_vals', true_vals)
    
    #---Compute f1 score
    flattened_predictions = np.argmax(predictions, axis=1).flatten()    #List of values in predictions
    flattened_labels = true_vals.flatten()                              #List of values in true_vals
    f1score = f1_score(flattened_labels, flattened_predictions, average='weighted')
    #print('preds_flat:', preds_flat)
    #print('labels_flat:', labels_flat)
    
    return average_loss, predictions, true_vals, prediction_accuracy, f1score

### 3.5 Clear GPU cache

In [23]:
gc.collect()
torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=True)



- https://machinelearningmastery.com/exploding-gradients-in-neural-networks/
- https://neptune.ai/blog/understanding-gradient-clipping-and-how-it-can-fix-exploding-gradients-problem

## 4. Trial and error to find the best parameters
- *The original paper for BERT worked on batch sizes of (16 and 32) and learning rates of (5e-5, 4e-5, 3e-5, 2e-5, 1e-5)*
- REF: *https://arxiv.org/abs/1810.04805*
- *We prefer AdamW optimizer function since it is the most widely used general purpose optimizer and Adam was the optimization function used in the original paper on BERT*
- *The scheduler will help bring the learning rate down if it senses overfitting*

### 4.1 Learning Rate: 4e-5, Batch size = 16

In [26]:
training_loss_dict = {}
validation_loss_dict = {}

for epoch in tqdm(range(1, epochs+1)):
    
    #---set the model to training mode
    model.train()         
    
    #counter to keep a sum of loss values across all epochs
    total_loss = 0

    # Setting up the Progress bar to Moniter the progress of training
    pbar = tqdm(train_dloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in pbar:
        
        #---set zero_grad so that gradient values are not accumulated across batches
        model.zero_grad()
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
         
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}       

        #---Predict outputs using the model
        model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]
        total_loss += loss.item()
        
        #---compute and clip gradients
        loss.backward()                                           #compute gradients via backpropogation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #clip gradients to prevent gradient explosion issue

        #---Use optimizer and scheduler to tune the model during epochs
        optimizer.step()
        scheduler.step()
        
        pbar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    #---save the model to disk        
    #torch.save(model.state_dict(), r'C:\Users\Karthik\Desktop\Dissertation\BERT\Model\BERT_trained.model')
        
    #---calculate metrics 
    loss_train_avg = total_loss/len(train_dloader)            
    val_loss, predictions, true_vals, val_accuracy, f1score = model_eval(val_dloader)

    training_loss_dict[epoch] = loss_train_avg
    validation_loss_dict[epoch] = val_loss
    
    tqdm.write("\u0332".join(f'\nEpoch {epoch}'))
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'validation accuracy: {val_accuracy}')
    tqdm.write(f'f1 score: {f1score}')
    
torch.save(model, f"C:\\Users\\Karthik\\Desktop\\Dissertation\\BERT\\Models\\BERT_Sentiment140_Batch_{batch_size}_LR_{lr}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲1
Training loss: 0.4208461363663191
Validation loss: 0.3851662040652369
validation accuracy: 0.83096
f1 score: 0.8307587941398925


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲2
Training loss: 0.25873052553550274
Validation loss: 0.4250749346640561
validation accuracy: 0.83128
f1 score: 0.8310865695283144


HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲3
Training loss: 0.13917963950887421
Validation loss: 0.6944000666694301
validation accuracy: 0.82888
f1 score: 0.8288153069763945



### 4.2 Learning Rate: 3e-5, Batch size = 16

In [27]:
epochs = 3
lr = 3e-5
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dloader)*epochs)

In [28]:
training_loss_dict = {}
validation_loss_dict = {}

for epoch in tqdm(range(1, epochs+1)):
    
    #---set the model to training mode
    model.train()         
    
    #counter to keep a sum of loss values across all epochs
    total_loss = 0

    # Setting up the Progress bar to Moniter the progress of training
    pbar = tqdm(train_dloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in pbar:
        
        #---set zero_grad so that gradient values are not accumulated across batches
        model.zero_grad()
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
         
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}       

        #---Predict outputs using the model
        model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]
        total_loss += loss.item()
        
        #---compute and clip gradients
        loss.backward()                                           #compute gradients via backpropogation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #clip gradients to prevent gradient explosion issue

        #---Use optimizer and scheduler to tune the model during epochs
        optimizer.step()
        scheduler.step()
        
        pbar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    #---save the model to disk        
    #torch.save(model.state_dict(), r'C:\Users\Karthik\Desktop\Dissertation\BERT\Model\BERT_trained.model')
        
    #---calculate metrics 
    loss_train_avg = total_loss/len(train_dloader)            
    val_loss, predictions, true_vals, val_accuracy, f1score = model_eval(val_dloader)

    training_loss_dict[epoch] = loss_train_avg
    validation_loss_dict[epoch] = val_loss
    
    tqdm.write("\u0332".join(f'\nEpoch {epoch}'))
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'validation accuracy: {val_accuracy}')
    tqdm.write(f'f1 score: {f1score}')
    
torch.save(model, f"C:\\Users\\Karthik\\Desktop\\Dissertation\\BERT\\Models\\BERT_Sentiment140_Batch_{batch_size}_LR_{lr}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲1
Training loss: 0.1564887284863233
Validation loss: 0.671937946092023
validation accuracy: 0.82384
f1 score: 0.8238118504898282


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲2
Training loss: 0.0867322064487212
Validation loss: 0.9431607150313411
validation accuracy: 0.81776
f1 score: 0.8175842457772563


HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲3
Training loss: 0.03696078970871783
Validation loss: 1.1663305137046378
validation accuracy: 0.82504
f1 score: 0.8249956457764376



### 4.3 Learning Rate: 2e-5, Batch size = 16

In [29]:
epochs = 3
lr = 2e-5
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dloader)*epochs)

In [30]:
training_loss_dict = {}
validation_loss_dict = {}

for epoch in tqdm(range(1, epochs+1)):
    
    #---set the model to training mode
    model.train()         
    
    #counter to keep a sum of loss values across all epochs
    total_loss = 0

    # Setting up the Progress bar to Moniter the progress of training
    pbar = tqdm(train_dloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in pbar:
        
        #---set zero_grad so that gradient values are not accumulated across batches
        model.zero_grad()
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
         
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}       

        #---Predict outputs using the model
        model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]
        total_loss += loss.item()
        
        #---compute and clip gradients
        loss.backward()                                           #compute gradients via backpropogation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #clip gradients to prevent gradient explosion issue

        #---Use optimizer and scheduler to tune the model during epochs
        optimizer.step()
        scheduler.step()
        
        pbar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    #---save the model to disk        
    #torch.save(model.state_dict(), r'C:\Users\Karthik\Desktop\Dissertation\BERT\Model\BERT_trained.model')
        
    #---calculate metrics 
    loss_train_avg = total_loss/len(train_dloader)            
    val_loss, predictions, true_vals, val_accuracy, f1score = model_eval(val_dloader)

    training_loss_dict[epoch] = loss_train_avg
    validation_loss_dict[epoch] = val_loss
    
    tqdm.write("\u0332".join(f'\nEpoch {epoch}'))
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'validation accuracy: {val_accuracy}')
    tqdm.write(f'f1 score: {f1score}')
    
torch.save(model, f"C:\\Users\\Karthik\\Desktop\\Dissertation\\BERT\\Models\\BERT_Sentiment140_Batch_{batch_size}_LR_{lr}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲1
Training loss: 0.06091343062879982
Validation loss: 1.276457139588644
validation accuracy: 0.82248
f1 score: 0.8224693095278935


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲2
Training loss: 0.03680679552722884
Validation loss: 1.1925956902970352
validation accuracy: 0.82416
f1 score: 0.8241524326461255


HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲3
Training loss: 0.016153724763539174
Validation loss: 1.417942157954003
validation accuracy: 0.824
f1 score: 0.8239464526797898



### 4.4 Learning Rate: 1e-5, Batch size = 16

In [31]:
epochs = 3
lr = 1e-5
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dloader)*epochs)

In [32]:
training_loss_dict = {}
validation_loss_dict = {}

for epoch in tqdm(range(1, epochs+1)):
    
    #---set the model to training mode
    model.train()         
    
    #counter to keep a sum of loss values across all epochs
    total_loss = 0

    # Setting up the Progress bar to Moniter the progress of training
    pbar = tqdm(train_dloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in pbar:
        
        #---set zero_grad so that gradient values are not accumulated across batches
        model.zero_grad()
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
         
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}       

        #---Predict outputs using the model
        model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]
        total_loss += loss.item()
        
        #---compute and clip gradients
        loss.backward()                                           #compute gradients via backpropogation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #clip gradients to prevent gradient explosion issue

        #---Use optimizer and scheduler to tune the model during epochs
        optimizer.step()
        scheduler.step()
        
        pbar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    #---save the model to disk        
    #torch.save(model.state_dict(), r'C:\Users\Karthik\Desktop\Dissertation\BERT\Model\BERT_trained.model')
        
    #---calculate metrics 
    loss_train_avg = total_loss/len(train_dloader)            
    val_loss, predictions, true_vals, val_accuracy, f1score = model_eval(val_dloader)

    training_loss_dict[epoch] = loss_train_avg
    validation_loss_dict[epoch] = val_loss
    
    tqdm.write("\u0332".join(f'\nEpoch {epoch}'))
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'validation accuracy: {val_accuracy}')
    tqdm.write(f'f1 score: {f1score}')
    
torch.save(model, f"C:\\Users\\Karthik\\Desktop\\Dissertation\\BERT\\Models\\BERT_Sentiment140_Batch_{batch_size}_LR_{lr}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲1
Training loss: 0.018606522796964686
Validation loss: 1.5379125961290017
validation accuracy: 0.82104
f1 score: 0.8210317245069411


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲2
Training loss: 0.014482215968010638
Validation loss: 1.58442658079775
validation accuracy: 0.8256
f1 score: 0.8255654189722758


HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲3
Training loss: 0.008653285596655527
Validation loss: 1.6233788173296053
validation accuracy: 0.82608
f1 score: 0.8260587997682098



### 4.5 Learning Rate: 5e-5, Batch size = 16

In [33]:
epochs = 3
lr = 5e-5
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dloader)*epochs)

In [34]:
training_loss_dict = {}
validation_loss_dict = {}

for epoch in tqdm(range(1, epochs+1)):
    
    #---set the model to training mode
    model.train()         
    
    #counter to keep a sum of loss values across all epochs
    total_loss = 0

    # Setting up the Progress bar to Moniter the progress of training
    pbar = tqdm(train_dloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in pbar:
        
        #---set zero_grad so that gradient values are not accumulated across batches
        model.zero_grad()
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
         
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}       

        #---Predict outputs using the model
        model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]
        total_loss += loss.item()
        
        #---compute and clip gradients
        loss.backward()                                           #compute gradients via backpropogation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #clip gradients to prevent gradient explosion issue

        #---Use optimizer and scheduler to tune the model during epochs
        optimizer.step()
        scheduler.step()
        
        pbar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    #---save the model to disk        
    #torch.save(model.state_dict(), r'C:\Users\Karthik\Desktop\Dissertation\BERT\Model\BERT_trained.model')
        
    #---calculate metrics 
    loss_train_avg = total_loss/len(train_dloader)            
    val_loss, predictions, true_vals, val_accuracy, f1score = model_eval(val_dloader)

    training_loss_dict[epoch] = loss_train_avg
    validation_loss_dict[epoch] = val_loss
    
    tqdm.write("\u0332".join(f'\nEpoch {epoch}'))
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'validation accuracy: {val_accuracy}')
    tqdm.write(f'f1 score: {f1score}')
    
torch.save(model, f"C:\\Users\\Karthik\\Desktop\\Dissertation\\BERT\\Models\\BERT_Sentiment140_Batch_{batch_size}_LR_{lr}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲1
Training loss: 0.13055027880797854
Validation loss: 0.9773190225857278
validation accuracy: 0.8136
f1 score: 0.8135655170780388


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲2
Training loss: 0.06981834219545088
Validation loss: 1.064036731321491
validation accuracy: 0.8224
f1 score: 0.8223231298789109


HBox(children=(HTML(value='Epoch 3'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲3
Training loss: 0.027265351066210163
Validation loss: 1.307236851887018
validation accuracy: 0.82272
f1 score: 0.8226872042252935



### 4.6 Learning Rate: 4e-5, Batch size = 16, 2 epochs

In [24]:
epochs = 2
lr = 4e-5
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dloader)*epochs)

In [25]:
training_loss_dict = {}
validation_loss_dict = {}

for epoch in tqdm(range(1, epochs+1)):
    
    #---set the model to training mode
    model.train()         
    
    #counter to keep a sum of loss values across all epochs
    total_loss = 0

    # Setting up the Progress bar to Moniter the progress of training
    pbar = tqdm(train_dloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in pbar:
        
        #---set zero_grad so that gradient values are not accumulated across batches
        model.zero_grad()
        
        #---get the input_ids, attention_masks and labels from each batch int a dictionary to pass as input to the model
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
         
        input_dict = {'input_ids': temp_batch[0], 'attention_mask': temp_batch[1], 'labels': temp_batch[2]}       

        #---Predict outputs using the model
        model_output = model(**input_dict)

        #---get loss for current batch and add to the total_loss counter
        loss = model_output[0]
        total_loss += loss.item()
        
        #---compute and clip gradients
        loss.backward()                                           #compute gradients via backpropogation
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)   #clip gradients to prevent gradient explosion issue

        #---Use optimizer and scheduler to tune the model during epochs
        optimizer.step()
        scheduler.step()
        
        pbar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    #---save the model to disk        
    #torch.save(model.state_dict(), r'C:\Users\Karthik\Desktop\Dissertation\BERT\Model\BERT_trained.model')
        
    #---calculate metrics 
    loss_train_avg = total_loss/len(train_dloader)            
    val_loss, predictions, true_vals, val_accuracy, f1score = model_eval(val_dloader)

    training_loss_dict[epoch] = loss_train_avg
    validation_loss_dict[epoch] = val_loss
    
    tqdm.write("\u0332".join(f'\nEpoch {epoch}'))
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'validation accuracy: {val_accuracy}')
    tqdm.write(f'f1 score: {f1score}')
    
torch.save(model, f"C:\\Users\\Karthik\\Desktop\\Dissertation\\BERT\\Models\\BERT_Sentiment140_Batch_{batch_size}_LR_{lr}_epoch_{epochs}")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value='Epoch 1'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲1
Training loss: 0.41865895168836714
Validation loss: 0.37467033030165126
validation accuracy: 0.84
f1 score: 0.8397960503754014


HBox(children=(HTML(value='Epoch 2'), FloatProgress(value=0.0, max=2344.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=782.0), HTML(value='')))



̲E̲p̲o̲c̲h̲ ̲2
Training loss: 0.24322100275485387
Validation loss: 0.4319172023135759
validation accuracy: 0.842
f1 score: 0.8419690259290822



## 5. Predict with the best model
- *Batch size of 32 and learning rate of 4e-4 gave the best results*

### 5.1 Load the selected model

In [9]:
reqd_model = torch.load(r"C:\Users\Karthik\Desktop\Dissertation\BERT\Models\BERT_Sentiment140_Batch_16_LR_4e-05_epoch_2")

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
reqd_model.to(device) 
print(device)

cuda


### 5.2 Prepare the datasets to be used in the BERT model for prediction

#### 5.2.1 Clean the text data

In [11]:
for tick, tick_df in reddit_df_dict.items():
#     print(tick, tick_df)
    tick_df['cleaned_body'] = tick_df['body'].apply(lambda x: text_preprocessing(x))
    tick_df['LEN'] = tick_df.cleaned_body.str.len()    

In [12]:
for tick, tick_df in twitter_df_dict.items():
    tick_df['cleaned_body'] = tick_df['cleaned_content'].apply(lambda x: text_preprocessing(x))
    tick_df['LEN'] = tick_df.cleaned_body.str.len()    

#### 5.2.2 Handle records where text is greater than a length of 512
- *BERT can only handle a max length of 512*
- *For each comment > length of 512, break the comment into multiple sets of 512*
- *After breaking, create a new record for each of the broken parts and append to the original dataframe. Since we take the average sentiment scores, appending new rows with the same dates will not affect the data*

In [13]:
n=512
def ffn(xseries):
    list_of_lists = []
        
    if xseries['LEN']>512:
        parts = [xseries['cleaned_body'][i:i+n] for i in range(0, xseries['LEN'], n)]
    
        counter=0
        for i in parts:
            list_series = []
            list_series.append(xseries['body'])
            list_series.append(xseries['created_utc'])
            list_series.append(xseries['id'])
            list_series.append(xseries['top'])
            list_series.append(xseries['year'])
            list_series.append(xseries['month'])
            list_series.append(xseries['day'])
            list_series.append(xseries['ticker'])
            list_series.append(xseries['YearMonDay'])
            list_series.append(i)
            counter+=1
            list_series.append(counter)
            
            list_of_lists.append(list_series)
    
        return list_of_lists
    else:
        return 0

In [14]:
for tick, tick_df in reddit_df_dict.items():
    tick_df['mltp'] = tick_df.apply(ffn, axis=1)

In [15]:
for tick, tick_df in twitter_df_dict.items():
    tick_df['mltp'] = tick_df.apply(ffn, axis=1)

In [16]:
for tick, tick_df in reddit_df_dict.items():
    addition_list = tick_df.query('mltp != 0')['mltp']
    
    if len(addition_list)==0:
        continue
    
    ind_additions = []

    for list_of_sentences in addition_list:
        for sentence_part in list_of_sentences:
            ind_additions.append(sentence_part)
    
    
    df_additions = pd.DataFrame(ind_additions, columns=['body','created_utc','id','top','year','month','day','ticker','YearMonDay','cleaned_body','LEN'])
    
    tick_df.drop('mltp', axis=1, inplace=True)
    tick_df.drop(tick_df[tick_df.LEN > 512].index, inplace = True)
    tick_df = tick_df.append(df_additions, ignore_index=True)

In [17]:
for tick, tick_df in twitter_df_dict.items():
    addition_list = tick_df.query('mltp != 0')['mltp']
    
    if len(addition_list)==0:
        continue
    
    ind_additions = []

    for list_of_sentences in addition_list:
        for sentence_part in list_of_sentences:
            ind_additions.append(sentence_part)
    
    
    df_additions = pd.DataFrame(ind_additions, columns=['body','created_utc','id','top','year','month','day','ticker','YearMonDay','cleaned_body','LEN'])
    
    tick_df.drop('mltp', axis=1, inplace=True)
    tick_df.drop(tick_df[tick_df.LEN > 512].index, inplace = True)
    tick_df = tick_df.append(df_additions, ignore_index=True)

- *Make sure there is no data where comment length is greater than 512*

In [18]:
for tick, tick_df in reddit_df_dict.items():
    print(tick_df.query('LEN > 512'))

Empty DataFrame
Columns: [body, created_utc, id, top, year, month, day, ticker, YearMonDay, cleaned_body, LEN]
Index: []
Empty DataFrame
Columns: [body, created_utc, id, top, year, month, day, ticker, YearMonDay, cleaned_body, LEN]
Index: []
Empty DataFrame
Columns: [body, created_utc, id, top, year, month, day, ticker, YearMonDay, cleaned_body, LEN]
Index: []
Empty DataFrame
Columns: [body, created_utc, id, top, year, month, day, ticker, YearMonDay, cleaned_body, LEN]
Index: []
Empty DataFrame
Columns: [body, created_utc, id, top, year, month, day, ticker, YearMonDay, cleaned_body, LEN]
Index: []
Empty DataFrame
Columns: [body, created_utc, id, top, year, month, day, ticker, YearMonDay, cleaned_body, LEN]
Index: []


In [19]:
for tick, tick_df in twitter_df_dict.items():
    print(tick_df.query('LEN > 512'))

Empty DataFrame
Columns: [date, content, likeCount, lang, cleaned_content, year, month, day, ticker, YearMonDay, cleaned_body, LEN, mltp]
Index: []
Empty DataFrame
Columns: [date, content, likeCount, lang, cleaned_content, year, month, day, ticker, YearMonDay, cleaned_body, LEN, mltp]
Index: []
Empty DataFrame
Columns: [date, content, likeCount, lang, cleaned_content, year, month, day, ticker, YearMonDay, cleaned_body, LEN, mltp]
Index: []
Empty DataFrame
Columns: [date, content, likeCount, lang, cleaned_content, year, month, day, ticker, YearMonDay, cleaned_body, LEN, mltp]
Index: []
Empty DataFrame
Columns: [date, content, likeCount, lang, cleaned_content, year, month, day, ticker, YearMonDay, cleaned_body, LEN, mltp]
Index: []
Empty DataFrame
Columns: [date, content, likeCount, lang, cleaned_content, year, month, day, ticker, YearMonDay, cleaned_body, LEN, mltp]
Index: []


#### 5.2.3 Define a funtion to prepare reddit and twitter data for model prediction
- *The data should be tokenized and encoded and then tensor datasets must be created from them to be passed to the BERT model*
- *We will create a dictionary for reddit and twitter to hold the tensor datasets respectively*

In [20]:
#---Initialize bert-base-uncased tokenizer and select the option to convert all text to lowercase
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def dataloader_fnc(text, MAX_LEN):
    input_id_list = []
    attn_mask_list = []
    
    for line in text:
        #--Tokenize and encode the cleaned text (we do not return_tensors here since our tensor should have all the lines)
        data_enc = tokenizer.encode_plus(line, 
                                         add_special_tokens=True, 
                                         return_attention_mask=True, 
                                         pad_to_max_length=True, 
                                         max_length=MAX_LEN
                                         )
        
        #--get the input_ids and attention_mask values
        input_id_list.append(data_enc.get("input_ids"))
        attn_mask_list.append(data_enc.get("attention_mask"))
    
    tensor_dataset = TensorDataset(torch.tensor(input_id_list), torch.tensor(attn_mask_list) )
    dataloader = DataLoader(tensor_dataset, sampler=SequentialSampler(tensor_dataset), batch_size=16)
    
    return dataloader

In [21]:
reddit_dataloader_dict = {tick: dataloader_fnc(tick_df.cleaned_body, tick_df.cleaned_body.str.len().max())  for tick, tick_df in reddit_df_dict.items()}

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [22]:
reddit_dataloader_dict

{'AAPL': <torch.utils.data.dataloader.DataLoader at 0x23215174820>,
 'AMC': <torch.utils.data.dataloader.DataLoader at 0x232a62ced30>,
 'DKNG': <torch.utils.data.dataloader.DataLoader at 0x232a62cef10>,
 'TSLA': <torch.utils.data.dataloader.DataLoader at 0x232a62ceb50>,
 'AMD': <torch.utils.data.dataloader.DataLoader at 0x232150ba9d0>,
 'BABA': <torch.utils.data.dataloader.DataLoader at 0x232150baaf0>}

In [23]:
twitter_dataloader_dict = {tick: dataloader_fnc(tick_df.cleaned_body, tick_df.cleaned_body.str.len().max())  for tick, tick_df in twitter_df_dict.items()}

In [24]:
twitter_dataloader_dict

{'AAPL': <torch.utils.data.dataloader.DataLoader at 0x23253328580>,
 'AMC': <torch.utils.data.dataloader.DataLoader at 0x231c7d38580>,
 'DKNG': <torch.utils.data.dataloader.DataLoader at 0x231c7d38490>,
 'TSLA': <torch.utils.data.dataloader.DataLoader at 0x231c7d382e0>,
 'AMD': <torch.utils.data.dataloader.DataLoader at 0x231c7d382b0>,
 'BABA': <torch.utils.data.dataloader.DataLoader at 0x231c7d26970>}

## 6. Prediction

### 6.1 Define a prediction function

In [25]:
def prediction(bert_model, data):
    #--put the model in eval mode
    bert_model.eval()
    
    #--predict and collect output logits in a list
    output_logit_list = []
    
    for batch in data:
        temp_batch = tuple(b.to(device) for b in batch)                 #move the batch elements to GPU for faster performance
        input_id_tensor = temp_batch[0]
        attn_mask_tensor = temp_batch[1]
#         print(input_id_tensor)
#         print(attn_mask_tensor)
        
        with torch.no_grad():
            output_logits = bert_model(input_id_tensor, attn_mask_tensor)
            output_logit_list.append(output_logits[0])
    
    #--concatenate logits across batches and apply softmax
    complete_logits = torch.cat(output_logit_list, dim=0)
    softmax_probs = Func.softmax(complete_logits, dim=1).cpu().numpy() #convert to numpy
    
    return softmax_probs

### 6.2 Predict sentiments for reddit data

#### 6.2.1 Prediction for reddit comments
- *Use the model to get positive and negative probabilities for each reddit comment*
- *Group by date and compute average positive, negative and neutral scores for each day*

In [26]:
for tick, tick_df in reddit_df_dict.items():
    pred_df = pd.DataFrame(prediction(reqd_model, reddit_dataloader_dict[tick]), columns=['negative', 'positive'])
    reddit_df_dict[tick] = pd.concat([tick_df, pred_df], axis=1)

In [27]:
for tick, tick_df in reddit_df_dict.items():
    tick_grouped = tick_df.groupby(['ticker', 'YearMonDay'])[['negative', 'positive']].mean().reset_index()
    tick_grouped.to_pickle(f"C:\\Users\\Karthik\\Desktop\\Dissertation\\Final_dfs\\reddit_{tick}_BERT_Sentiment140.pkl")
    print(tick_grouped)

    ticker YearMonDay  negative  positive
0     AAPL    20Aug03  0.525778  0.474222
1     AAPL    20Aug05  0.473751  0.526249
2     AAPL    20Aug06  0.566665  0.433335
3     AAPL    20Aug10  0.563220  0.436780
4     AAPL    20Aug11  0.605449  0.394551
..     ...        ...       ...       ...
253   AAPL    21May24  0.382096  0.617904
254   AAPL    21May25  0.458121  0.541879
255   AAPL    21May26  0.561172  0.438828
256   AAPL    21May27  0.677567  0.322433
257   AAPL    21May28  0.515061  0.484939

[258 rows x 4 columns]
    ticker YearMonDay  negative  positive
0      AMC    20Aug05  0.469485  0.530515
1      AMC    20Aug13  0.982924  0.017076
2      AMC    20Aug17  0.733672  0.266328
3      AMC    20Aug19  0.430416  0.569584
4      AMC    20Aug25  0.104475  0.895525
..     ...        ...       ...       ...
175    AMC    21May26  0.404465  0.595535
176    AMC    21May27  0.382352  0.617648
177    AMC    21May28  0.393951  0.606049
178    AMC    21May29  0.659310  0.340690
179    AMC

#### 6.2.2 Prediction for twitter comments
- *Use the model to get positive and negative probabilities for each twitter comment*
- *Group by date and compute average positive, negative and neutral scores for each day*

In [28]:
for tick, tick_df in twitter_df_dict.items():
    pred_df = pd.DataFrame(prediction(reqd_model, twitter_dataloader_dict[tick]), columns=['negative', 'positive'])
    twitter_df_dict[tick] = pd.concat([tick_df, pred_df], axis=1)

In [29]:
for tick, tick_df in twitter_df_dict.items():
    tick_grouped = tick_df.groupby(['ticker', 'YearMonDay'])[['negative', 'positive']].mean().reset_index()
    tick_grouped.to_pickle(f"C:\\Users\\Karthik\\Desktop\\Dissertation\\Final_dfs\\twitter_{tick}_BERT_Sentiment140.pkl")
    print(tick_grouped)

    ticker YearMonDay  negative  positive
0     AAPL    20Aug01  0.410255  0.589745
1     AAPL    20Aug02  0.385161  0.614839
2     AAPL    20Aug03  0.403561  0.596439
3     AAPL    20Aug04  0.398259  0.601741
4     AAPL    20Aug05  0.437953  0.562047
..     ...        ...       ...       ...
482   AAPL    21Sep26  0.408619  0.591381
483   AAPL    21Sep27  0.384897  0.615103
484   AAPL    21Sep28  0.423793  0.576207
485   AAPL    21Sep29  0.398828  0.601171
486   AAPL    21Sep30  0.466274  0.533726

[487 rows x 4 columns]
    ticker YearMonDay  negative  positive
0      AMC    20Aug01  0.424519  0.575481
1      AMC    20Aug02  0.571125  0.428875
2      AMC    20Aug03  0.320163  0.679837
3      AMC    20Aug04  0.355008  0.644992
4      AMC    20Aug05  0.412835  0.587165
..     ...        ...       ...       ...
482    AMC    21Sep26  0.269218  0.730782
483    AMC    21Sep27  0.370985  0.629015
484    AMC    21Sep28  0.378215  0.621785
485    AMC    21Sep29  0.368406  0.631594
486    AMC

#### 6.2.3 Calculate combined predictions by combining both reddit and twitter sentiments

In [30]:
for ticker in tick_list:
    print(ticker)
    reddit_sentis = reddit_df_dict[ticker][['ticker', 'YearMonDay', 'negative', 'positive']]
    twitter_sentis = twitter_df_dict[ticker][['ticker', 'YearMonDay', 'negative', 'positive']]
    
#     print(reddit_sentis,twitter_sentis,pd.concat([reddit_sentis,twitter_sentis], axis=0,ignore_index=True))
    combined_sentis = pd.concat([reddit_sentis,twitter_sentis], axis=0,ignore_index=True)
    combined_sentis_grouped = combined_sentis.groupby(['ticker', 'YearMonDay'])[['negative', 'positive']].mean().reset_index()
    combined_sentis_grouped.to_pickle(f"C:\\Users\\Karthik\\Desktop\\Dissertation\\Final_dfs\\combined_{ticker}_BERT_Sentiment140.pkl")
    print(combined_sentis_grouped)

AAPL
    ticker YearMonDay  negative  positive
0     AAPL    20Aug01  0.410255  0.589745
1     AAPL    20Aug02  0.385161  0.614839
2     AAPL    20Aug03  0.418962  0.581038
3     AAPL    20Aug04  0.398259  0.601741
4     AAPL    20Aug05  0.439466  0.560534
..     ...        ...       ...       ...
499   AAPL    21Sep26  0.408619  0.591381
500   AAPL    21Sep27  0.384897  0.615103
501   AAPL    21Sep28  0.423793  0.576207
502   AAPL    21Sep29  0.398828  0.601171
503   AAPL    21Sep30  0.466274  0.533726

[504 rows x 4 columns]
AMC
    ticker YearMonDay  negative  positive
0      AMC    20Aug01  0.424519  0.575481
1      AMC    20Aug02  0.571125  0.428875
2      AMC    20Aug03  0.320163  0.679837
3      AMC    20Aug04  0.355008  0.644992
4      AMC    20Aug05  0.417985  0.582015
..     ...        ...       ...       ...
488    AMC    21Sep26  0.269218  0.730782
489    AMC    21Sep27  0.370985  0.629015
490    AMC    21Sep28  0.378215  0.621785
491    AMC    21Sep29  0.368406  0.631594
4

## References
- https://machinelearningmastery.com/exploding-gradients-in-neural-networks/
- https://neptune.ai/blog/understanding-gradient-clipping-and-how-it-can-fix-exploding-gradients-problem