# Sentiment Analysis Using BERT

In [None]:
#libraries and packages
!pip install torch
import torch
import pandas as pd
from tqdm.notebook import tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Load the annotated data set. Column names are supplied explicitly through
# `names`, so the file is read as if it carried no header row.
df = pd.read_csv('/content/smile-annotations-final.csv', 
                 names = ['id', 'text', 'category'])

# Use the `id` column as the row index (this sets the index, it does not reset it).
# NOTE(review): df.info() below reports a Float64Index, i.e. the ids are parsed as
# floats -- confirm they are still unique before relying on them as row labels.
df.set_index('id', inplace = True)

In [None]:
#preview
df.head()

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
6.11857e+17,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
6.14485e+17,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
6.14747e+17,@SelectShowcase @Tate_StIves ... Replace with ...,happy
6.14878e+17,@Sofabsports thank you for following me back. ...,happy
6.11932e+17,@britishmuseum @TudorHistory What a beautiful ...,happy


In [None]:
#info
df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 3085 entries, 6.11857e+17 to 6.11567e+17
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      3085 non-null   object
 1   category  3085 non-null   object
dtypes: object(2)
memory usage: 72.3+ KB


In [None]:
#check for null
df.isnull().sum()

text        0
category    0
dtype: int64

In [None]:
#look at an example
df.text.iloc[10]

'"1...2..." "non arrête mon brush!". l.Alma|A favourite custom|1909 @NationalGallery #bonlundi http://t.co/HpjvSJHGhP'

In [None]:
#count for each class
df.category.value_counts()

nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|disgust             2
sad|angry               2
sad|disgust|angry       1
Name: category, dtype: int64

In [None]:
# Drop the rows annotated with more than one emotion (e.g. 'happy|sad').
# The pipe is matched literally (regex = False). The previous pattern '\|' relied on
# an invalid escape sequence in a normal string, which newer Python versions warn about.
df = df[~df.category.str.contains('|', regex = False)]

In [None]:
# Remove the rows whose category is 'nocode'.
df = df.loc[df['category'].ne('nocode')]

In [None]:
#final classes
df.category.value_counts()

happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: category, dtype: int64

In [None]:
# Distinct category names that remain after filtering, as a NumPy array.
possible_labels = df['category'].unique()
possible_labels

array(['happy', 'not-relevant', 'angry', 'disgust', 'sad', 'surprise'],
      dtype=object)

In [None]:
# Assign every category name an integer id, following the order of `possible_labels`.
label_dict = {name: class_id for class_id, name in enumerate(possible_labels)}

In [None]:
label_dict

{'angry': 2,
 'disgust': 3,
 'happy': 0,
 'not-relevant': 1,
 'sad': 4,
 'surprise': 5}

In [None]:
# Encode the category strings as integer class ids.
# Series.map performs a plain dictionary lookup per row. Series.replace did the same
# job but depends on dtype downcasting that recent pandas versions deprecate.
# Every category is a key of label_dict (it was built from the unique categories),
# so the lookup cannot produce missing values.
df['label'] = df.category.map(label_dict)
df.head(10)

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6.14485e+17,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0
6.14747e+17,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0
6.14878e+17,@Sofabsports thank you for following me back. ...,happy,0
6.11932e+17,@britishmuseum @TudorHistory What a beautiful ...,happy,0
6.1157e+17,@NationalGallery @ThePoldarkian I have always ...,happy,0
6.145e+17,Lucky @FitzMuseum_UK! Good luck @MirandaStearn...,happy,0
6.13602e+17,Yr 9 art students are off to the @britishmuseu...,happy,0
6.13697e+17,@RAMMuseum Please vote for us as @sainsbury #s...,not-relevant,1
6.10747e+17,#AskTheGallery Have you got plans to privatise...,not-relevant,1
6.12648e+17,@BarbyWT @britishmuseum so beautiful,happy,0


## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# The split is made on row labels and applied afterwards with df.loc, so the labels
# have to be unique: a duplicated label pulls every row sharing it into whichever
# split is assigned last. The id index is parsed as float and is not unique in the
# recorded run (1154 / 327 rows instead of an 85 / 15 share of 1481), so fall back
# to a fresh positional index and keep `id` as an ordinary column.
# The guard makes the cell safe to re-run.
if not df.index.is_unique:
    df = df.reset_index()

#train test split, stratified so that each class keeps its share in both parts
X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                   df.label.values,
                                                   test_size = 0.15,
                                                   random_state = 17,
                                                   stratify = df.label.values)

In [None]:
# Placeholder column that will record which split each row belongs to;
# the scalar is broadcast to every row.
df['data_type'] = 'not_set'
df.head()

Unnamed: 0_level_0,text,category,label,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6.14485e+17,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0,not_set
6.14747e+17,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0,not_set
6.14878e+17,@Sofabsports thank you for following me back. ...,happy,0,not_set
6.11932e+17,@britishmuseum @TudorHistory What a beautiful ...,happy,0,not_set
6.1157e+17,@NationalGallery @ThePoldarkian I have always ...,happy,0,not_set


In [None]:
# Tag every row with its split, looking the rows up by index label.
# NOTE(review): .loc assigns to every row that carries a given label, so this is
# only correct when the index labels are unique -- TODO confirm.
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [None]:
df.groupby(['category', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
category,label,data_type,Unnamed: 3_level_1
angry,2,train,41
angry,2,val,16
disgust,3,train,5
disgust,3,val,1
happy,0,train,888
happy,0,val,249
not-relevant,1,train,169
not-relevant,1,val,45
sad,4,train,24
sad,4,val,8


## Tokenization

In [None]:
!pip install transformers

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint.
# do_lower_case = True lower-cases the text before it is tokenized.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                         do_lower_case = True)

In [None]:
#tokenize train set
# padding = 'max_length' replaces the deprecated pad_to_max_length flag, and
# truncation = True states the cut-off at max_length explicitly instead of relying
# on the fallback the library warns about. The texts are handed over as a plain list.
train_texts = df[df.data_type == 'train'].text.values.tolist()
encoded_data_train = tokenizer.batch_encode_plus(train_texts,
                                                add_special_tokens = True,
                                                return_attention_mask = True,
                                                padding = 'max_length',
                                                truncation = True,
                                                max_length = 150,
                                                return_tensors = 'pt')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
#tokenize val set
# Same settings as for the training set: special tokens added, every sequence padded
# to max_length and explicitly truncated to it. padding / truncation replace the
# deprecated pad_to_max_length flag.
val_texts = df[df.data_type == 'val'].text.values.tolist()
encoded_data_val = tokenizer.batch_encode_plus(val_texts,
                                                add_special_tokens = True,
                                                return_attention_mask = True,
                                                padding = 'max_length',
                                                truncation = True,
                                                max_length = 150,
                                                return_tensors = 'pt')



In [None]:
encoded_data_train

{'input_ids': tensor([[  101, 16092,  3897,  ...,     0,     0,     0],
        [  101,  1030, 27034,  ...,     0,     0,     0],
        [  101,  1030, 10682,  ...,     0,     0,     0],
        ...,
        [  101, 11047,  1030,  ...,     0,     0,     0],
        [  101,  1030,  3680,  ...,     0,     0,     0],
        [  101,  1030,  2120,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

## Encoding

In [None]:
# Model inputs for the training split, taken from the tokenizer output.
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

# Integer class ids of the training rows as a tensor.
labels_train = torch.tensor(df.loc[df['data_type'] == 'train', 'label'].values)

In [None]:
# Model inputs for the validation split, taken from the tokenizer output.
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

# Integer class ids of the validation rows as a tensor.
labels_val = torch.tensor(df.loc[df['data_type'] == 'val', 'label'].values)

In [None]:
input_ids_train

tensor([[  101, 16092,  3897,  ...,     0,     0,     0],
        [  101,  1030, 27034,  ...,     0,     0,     0],
        [  101,  1030, 10682,  ...,     0,     0,     0],
        ...,
        [  101, 11047,  1030,  ...,     0,     0,     0],
        [  101,  1030,  3680,  ...,     0,     0,     0],
        [  101,  1030,  2120,  ...,     0,     0,     0]])

In [None]:
attention_masks_train

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [None]:
labels_train

tensor([0, 0, 0,  ..., 0, 0, 1])

In [None]:
# Bundle token ids, attention masks and labels into one dataset per split.
# These are datasets; the DataLoader objects are built from them in a later cell.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)

dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
print(len(dataset_train))
print(len(dataset_val))

1154
327


In [None]:
dataset_train

<torch.utils.data.dataset.TensorDataset at 0x7f7591bd3450>

In [None]:
dataset_train.tensors

(tensor([[  101, 16092,  3897,  ...,     0,     0,     0],
         [  101,  1030, 27034,  ...,     0,     0,     0],
         [  101,  1030, 10682,  ...,     0,     0,     0],
         ...,
         [  101, 11047,  1030,  ...,     0,     0,     0],
         [  101,  1030,  3680,  ...,     0,     0,     0],
         [  101,  1030,  2120,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([0, 0, 0,  ..., 0, 0, 1]))

## Set Up BERT Pretrained Model

In [None]:
from transformers import BertForSequenceClassification

# Pre-trained 'bert-base-uncased' weights with a sequence-classification head that
# has one output per entry of label_dict. Attention maps and hidden states are
# not requested in the model output.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = len(label_dict),
    output_attentions = False,
    output_hidden_states = False,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:


# Freeze the parameters of encoder block 11 and print the trainable flag of every
# parameter. The model config reports 12 hidden layers, and the printed names start
# at layer.0, so block 11 is the last one.
# The trailing dot keeps the prefix from matching any other block number.
# The former nested check for "bert.encoder.layer.12" was removed: it sat inside the
# layer-11 branch, so it could never be true.
# NOTE(review): if more blocks were meant to be frozen, add their prefixes here.
frozen_prefixes = ("bert.encoder.layer.11.",)

for name, param in model.named_parameters():

  if name.startswith(frozen_prefixes):
    param.requires_grad = False

  print(name, param.requires_grad)

bert.embeddings.word_embeddings.weight True
bert.embeddings.position_embeddings.weight True
bert.embeddings.token_type_embeddings.weight True
bert.embeddings.LayerNorm.weight True
bert.embeddings.LayerNorm.bias True
bert.encoder.layer.0.attention.self.query.weight True
bert.encoder.layer.0.attention.self.query.bias True
bert.encoder.layer.0.attention.self.key.weight True
bert.encoder.layer.0.attention.self.key.bias True
bert.encoder.layer.0.attention.self.value.weight True
bert.encoder.layer.0.attention.self.value.bias True
bert.encoder.layer.0.attention.output.dense.weight True
bert.encoder.layer.0.attention.output.dense.bias True
bert.encoder.layer.0.attention.output.LayerNorm.weight True
bert.encoder.layer.0.attention.output.LayerNorm.bias True
bert.encoder.layer.0.intermediate.dense.weight True
bert.encoder.layer.0.intermediate.dense.bias True
bert.encoder.layer.0.output.dense.weight True
bert.encoder.layer.0.output.dense.bias True
bert.encoder.layer.0.output.LayerNorm.weight True


In [None]:
#model summary
model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

## Create Data Loaders

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 4 #since we have limited resource

#load train set: drawn in random order
dataloader_train = DataLoader(dataset_train,
                              sampler = RandomSampler(dataset_train),
                              batch_size = batch_size)

#load val set: visited in dataset order, so the predictions returned by the
#evaluation line up with the rows of the validation set and are repeatable
dataloader_val = DataLoader(dataset_val,
                              sampler = SequentialSampler(dataset_val),
                              batch_size = 32) #since we don't have to do backpropagation for this step

## Set Up Optimizer and Scheduler

In [None]:
from transformers import get_linear_schedule_with_warmup

# number of passes over the training set
epochs = 10

#load optimizer
# torch.optim.AdamW is used instead of transformers.AdamW, which the library
# deprecated in favour of the PyTorch implementation. weight_decay = 0.0 keeps the
# default of the transformers class (the PyTorch default is 0.01).
# NOTE(review): the original carried the remark "2e-5 > 5e-5" next to eps --
# presumably learning rates that were tried; confirm with the author.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 1e-5,
                              eps = 1e-8,
                              weight_decay = 0.0)



In [None]:
#load scheduler
# Linear learning-rate schedule without a warmup phase. The total number of steps
# is the number of training batches per epoch times the number of epochs, which
# matches the one scheduler.step() call per batch in the training loop.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps = 0,
                                           num_training_steps = len(dataloader_train)*epochs)

## Define Performance Metrics

In [None]:
import numpy as np
from sklearn.metrics import f1_score

#weighted f1 score
def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max predictions.

    preds  -- array of per-class scores; the predicted class of each row is
              the arg-max along axis 1.
    labels -- array of true class ids; flattened before scoring.
    """
    predicted_ids = np.argmax(preds, axis=1).flatten()
    true_ids = labels.flatten()
    return f1_score(true_ids, predicted_ids, average = 'weighted')

In [None]:
#per-class hit counts
def accuracy_per_class(preds, labels, label_names=None):
    """Print, for every class present in `labels`, how many examples were predicted correctly.

    preds       -- array of per-class scores, one row per example; the predicted
                   class is the arg-max along axis 1.
    labels      -- array of true class ids.
    label_names -- optional {class name: class id} mapping used for display.
                   Defaults to the notebook-level `label_dict`, so existing
                   two-argument calls behave as before.

    Returns None; one "Class: <name>" / "Accuracy:<correct>/<total>" pair is
    printed per class.
    """
    if label_names is None:
        label_names = label_dict
    id_to_name = {class_id: name for name, class_id in label_names.items()}

    #make prediction
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        # predictions made for the examples whose true class is `label`
        y_preds = preds_flat[labels_flat==label]
        n_correct = int((y_preds == label).sum())
        print(f'Class: {id_to_name[label]}')
        print(f'Accuracy:{n_correct}/{len(y_preds)}\n')

In [None]:
def evaluate(dataloader_val):
    """Run the model over `dataloader_val` without gradient tracking.

    Uses the notebook-level `model` and `device`. Each batch is expected to hold
    input ids, attention mask and labels, in that order.

    Returns a tuple (average loss per batch, logits, true labels); the last two
    are NumPy arrays concatenated over all batches along the first axis.
    """
    #evaluation mode disables the dropout layer 
    model.eval()

    #running totals
    running_loss = 0
    all_logits = []
    all_labels = []

    for batch in tqdm(dataloader_val):

        #move every tensor of the batch to the target device
        moved = [tensor.to(device) for tensor in batch]

        #forward pass only: no gradients are recorded
        with torch.no_grad():
            outputs = model(input_ids=moved[0],
                            attention_mask=moved[1],
                            labels=moved[2])

        #first output is the loss, second the logits
        running_loss += outputs[0].item()

        #collect results on the CPU as NumPy arrays
        all_logits.append(outputs[1].detach().cpu().numpy())
        all_labels.append(moved[2].cpu().numpy())

    #average over the number of batches
    mean_loss = running_loss/len(dataloader_val)

    stacked_logits = np.concatenate(all_logits, axis=0)
    stacked_labels = np.concatenate(all_labels, axis=0)

    return mean_loss, stacked_logits, stacked_labels

## Train Model

In [None]:
import random

# Seed Python's random module, NumPy and PyTorch (CPU and all CUDA devices).
# NOTE(review): this cell runs after the model and the data loaders were created,
# so anything that was randomly initialised earlier is not covered by the seed.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Pick the GPU when CUDA is available, otherwise the CPU, and move the model there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


In [None]:
# Fine-tuning loop: one pass over the training set per epoch, followed by an
# evaluation on the validation set. Uses the notebook-level model, optimizer,
# scheduler, device and data loaders.
for epoch in tqdm(range(1, epochs+1)):

    #set model in train mode (evaluate() switches it to eval mode)
    model.train()

    #tracking variable: sum of the batch losses of this epoch
    loss_train_total = 0
    
    #set up progress bar
    progress_bar = tqdm(dataloader_train, 
                        desc='Epoch {:1d}'.format(epoch), 
                        leave=False, 
                        disable=False)
    
    for batch in progress_bar:
        #set gradient to 0
        model.zero_grad()

        #load into GPU
        batch = tuple(b.to(device) for b in batch)

        #define inputs
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}
        
        outputs = model(**inputs)
        loss = outputs[0] #output.loss
        loss_train_total +=loss.item()

        #backward pass to get gradients
        loss.backward()
        
        #clip the norm of the gradients to 1.0 to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        #update optimizer
        optimizer.step()

        #update scheduler
        scheduler.step()
        
        #show the loss of the current batch as returned by the model; it used to be
        #divided by len(batch), which is the number of tensors in the batch tuple (3),
        #not the batch size
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})     
    
    #f-string so that the epoch number is printed instead of the literal "{epoch}"
    tqdm.write(f'\nEpoch {epoch}')
    
    #print training result: mean of the batch losses
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    #evaluate
    val_loss, predictions, true_vals = evaluate(dataloader_val)
    #f1 score
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/289 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.7581313140432521


  0%|          | 0/11 [00:00<?, ?it/s]

Validation loss: 0.6107063916596499
F1 Score (weighted): 0.7705914027655013


Epoch 2:   0%|          | 0/289 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.4799741613050233


  0%|          | 0/11 [00:00<?, ?it/s]

Validation loss: 0.6768115508285436
F1 Score (weighted): 0.7724786918331804


Epoch 3:   0%|          | 0/289 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.3470367567362361


  0%|          | 0/11 [00:00<?, ?it/s]

Validation loss: 0.8041173680262133
F1 Score (weighted): 0.7860058221501673


Epoch 4:   0%|          | 0/289 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.24155534030974338


  0%|          | 0/11 [00:00<?, ?it/s]

Validation loss: 0.7146957218647003
F1 Score (weighted): 0.8356248588965571


Epoch 5:   0%|          | 0/289 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.14398458247817408


  0%|          | 0/11 [00:00<?, ?it/s]

Validation loss: 0.7207399931820956
F1 Score (weighted): 0.8379268151356024


Epoch 6:   0%|          | 0/289 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.09908067972611179


  0%|          | 0/11 [00:00<?, ?it/s]

Validation loss: 0.7453244991431182
F1 Score (weighted): 0.8380896667032945


Epoch 7:   0%|          | 0/289 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.06485182517304836


  0%|          | 0/11 [00:00<?, ?it/s]

Validation loss: 0.789553399790417
F1 Score (weighted): 0.8389825867530157


Epoch 8:   0%|          | 0/289 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.04430063783753483


  0%|          | 0/11 [00:00<?, ?it/s]

Validation loss: 0.9106014357371763
F1 Score (weighted): 0.8534072136700483


Epoch 9:   0%|          | 0/289 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.039099649956190446


  0%|          | 0/11 [00:00<?, ?it/s]

Validation loss: 0.790932492099025
F1 Score (weighted): 0.8484044245762906


Epoch 10:   0%|          | 0/289 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.03278438268599155


  0%|          | 0/11 [00:00<?, ?it/s]

Validation loss: 0.8003569949756969
F1 Score (weighted): 0.8495590482897603


## Model Evaluation


In [None]:
# Loss of the last training batch: `outputs` is the variable left over from the
# final iteration of the training loop, not a validation result.
outputs.loss

tensor(0.0005, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
# Logits of the last training batch (one row per example, one column per class):
# `outputs` is left over from the final iteration of the training loop.
outputs.logits

tensor([[ 7.6258, -1.8816, -2.2501, -1.6907, -1.0389, -1.2780],
        [ 7.6321, -1.5414, -2.2762, -1.8715, -1.2669, -1.2519]],
       device='cuda:0', grad_fn=<AddmmBackward0>)

In [None]:
#evaluate
_, predictions, true_vals = evaluate(dataloader_val)

  0%|          | 0/11 [00:00<?, ?it/s]

In [None]:
#get accuracy score
accuracy_per_class(predictions, true_vals)

Class: happy
Accuracy:234/249

Class: not-relevant
Accuracy:23/45

Class: angry
Accuracy:13/16

Class: disgust
Accuracy:0/1

Class: sad
Accuracy:6/8

Class: surprise
Accuracy:3/8

