In [0]:
from tqdm.notebook import tqdm
import pandas as pd
import transformers
from transformers import BertForSequenceClassification,BertTokenizer
from transformers import AdamW,get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import TensorDataset,DataLoader,RandomSampler,SequentialSampler
from sklearn.metrics import f1_score,accuracy_score
import random

In [2]:
# Read the annotation CSV with explicit column names and use the `id` column
# as the row index, then preview the first rows.
df = pd.read_csv(
    "/content/smile-annotations-final.csv",
    names=['id', 'text', 'category'],
).set_index('id')
df.head()

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
614877582664835073,@Sofabsports thank you for following me back. ...,happy
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy


In [3]:
# Number of rows per category value currently in `df`.
df['category'].value_counts()

nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|disgust             2
sad|angry               2
sad|disgust|angry       1
Name: category, dtype: int64

In [0]:
# Keep only rows with a single, meaningful category: drop rows whose category
# contains a '|' separator and rows labelled 'nocode'.
# regex=False matches the pipe literally; the previous pattern '\|' relied on
# an invalid escape sequence in a non-raw string literal.
has_pipe = df['category'].str.contains('|', regex=False)
is_nocode = df['category'] == 'nocode'
# .copy() gives an independent frame, so columns added later are not written
# into a slice of the unfiltered data.
df = df[~has_pipe & ~is_nocode].copy()

In [5]:
# Number of rows per category value currently in `df`.
df['category'].value_counts()

happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: category, dtype: int64

In [0]:
# Map each category name to an integer id, numbered in the order the
# categories first appear in `df`, and store that id in a `label` column.
label_dict = {
    category: index
    for index, category in enumerate(df['category'].unique())
}
df['label'] = df['category'].map(label_dict)

In [7]:
# Preview the first five rows, now including the `label` column.
df.head(5)

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0


In [0]:
# Split the row ids (the frame's index) into train and validation parts,
# holding out 15% for validation, stratified on the label, with a fixed
# random_state.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=10,
    stratify=df.label.values,
)

In [0]:
# Tag every row with the split it belongs to. Rows start as 'na' and are then
# overwritten by id with 'train' or 'val'.
df['data_type'] = 'na'
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [10]:
# Preview the first five rows, now including the `data_type` column.
df.head(5)

Unnamed: 0_level_0,text,category,label,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0,train
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0,train
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0,train
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0,train
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0,val


In [0]:
# Load the tokenizer for the 'bert-base-uncased' checkpoint, with
# lower-casing enabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [0]:
def _encode_texts(texts):
  """Tokenize `texts` with the module-level `tokenizer`.

  Args:
    texts: iterable of strings (e.g. the `.values` of a text column).

  Returns:
    Whatever `tokenizer.batch_encode_plus` returns; this cell reads its
    'input_ids' and 'attention_mask' entries.
  """
  # NOTE(review): newer transformers releases deprecate `pad_to_max_length`
  # in favour of padding='max_length' with truncation=True. Confirm against
  # the installed version before switching.
  return tokenizer.batch_encode_plus(
      # Pass a plain list of str rather than a NumPy object array.
      list(texts),
      add_special_tokens=True,
      return_attention_mask=True,
      pad_to_max_length=True,
      max_length=256,
      return_tensors='pt',
  )


# Both splits go through the same helper so their tokenizer settings cannot
# drift apart.
encoded_data_train = _encode_texts(df[df.data_type == 'train'].text.values)
encoded_data_val = _encode_texts(df[df.data_type == 'val'].text.values)

input_ids_train = encoded_data_train['input_ids']
attention_mask_train = encoded_data_train['attention_mask']
label_train = torch.tensor(df[df.data_type == 'train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_mask_val = encoded_data_val['attention_mask']
label_val = torch.tensor(df[df.data_type == 'val'].label.values)

In [0]:
# Bundle the per-split tensors so that each dataset item is one
# (input_ids, attention_mask, label) triple.
dataset_train = TensorDataset(
    input_ids_train,
    attention_mask_train,
    label_train,
)
dataset_val = TensorDataset(
    input_ids_val,
    attention_mask_val,
    label_val,
)


In [0]:
# Sequence-classification model loaded from the 'bert-base-uncased'
# checkpoint, with one output label per entry in `label_dict`. Attention maps
# and hidden states are not requested.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)

In [0]:
# Batch size shared by both loaders.
bs = 32

# Training uses a RandomSampler; validation uses a SequentialSampler.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=bs,
)
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=bs,
)

In [0]:
# Number of passes over the training data.
epochs = 10

optimizer = AdamW(
    model.parameters(),
    lr=3e-05,
    eps=1e-07,
)

# Linear schedule with no warm-up steps, sized to one scheduler step per
# training batch per epoch.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train) * epochs,
)

In [0]:
def f_score(preds,labels):
  """Return the weighted F1 score of arg-max predictions against `labels`.

  Args:
    preds: array of per-class scores; the predicted class is the arg-max
      along axis 1.
    labels: array of true class ids (flattened before scoring).
  """
  predicted = np.argmax(preds, axis=1).flatten()
  expected = labels.flatten()
  return f1_score(expected, predicted, average='weighted')

In [0]:
def accuracy(preds,labels):
  """Return the accuracy of arg-max predictions against `labels`.

  Args:
    preds: array of per-class scores; the predicted class is the arg-max
      along axis 1.
    labels: array of true class ids (flattened before scoring).
  """
  predicted = np.argmax(preds, axis=1).flatten()
  expected = labels.flatten()
  return accuracy_score(expected, predicted)

In [0]:
def accuracy_per_class(preds,labels):
  """Print correct/total prediction counts for each class found in `labels`.

  Class names are looked up through the module-level `label_dict`
  (name -> id), inverted here to id -> name.

  Args:
    preds: array of per-class scores; the predicted class is the arg-max
      along axis 1.
    labels: array of true class ids.
  """
  id_to_name = dict((idx, name) for name, idx in label_dict.items())

  predicted = np.argmax(preds, axis=1).flatten()
  expected = labels.flatten()

  for class_id in np.unique(expected):
    in_class = expected == class_id
    # Predictions made for samples whose true label is `class_id`.
    hits = int(np.sum(predicted[in_class] == class_id))
    total = int(np.sum(in_class))
    print(f'Class: {id_to_name[class_id]}')
    print(f'Accuracy: {hits}/{total}\n')

In [0]:
# Seed Python's, NumPy's and PyTorch's (CPU and all CUDA devices) random
# number generators with the same value.
# NOTE(review): `model` and the data loaders are created in earlier cells, so
# any randomness consumed while constructing them is not covered by this seed.
seed_val = 10
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
np.random.seed(seed_val)
random.seed(seed_val)

In [21]:
# Use CUDA when it is available, otherwise the CPU, and move the model there.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
model.to(device)
print(device)

cuda


In [0]:
def evaluate(dataloader):
  """Run the module-level `model` over `dataloader` without gradient tracking.

  Batches are moved to the module-level `device` before the forward pass.

  Args:
    dataloader: iterable of batches; each batch is indexed as
      (input_ids, attention_mask, labels) tensors.

  Returns:
    (loss_avg, preds, true_labels): the mean of the per-batch losses, and the
    logits and label ids of all batches concatenated along axis 0 as NumPy
    arrays.

  Raises:
    ValueError: if `dataloader` yields no batches.
  """
  model.eval()
  loss_total = 0.0
  preds, true_labels = [], []

  for batch in dataloader:
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
    with torch.no_grad():
      outputs = model(**inputs)

    # With labels supplied, index 0 is read as the loss and index 1 as the logits.
    loss_total += outputs[0].item()
    preds.append(outputs[1].detach().cpu().numpy())
    true_labels.append(inputs['labels'].detach().cpu().numpy())

  if not preds:
    raise ValueError('dataloader yielded no batches')

  # Average over the batches actually evaluated. Dividing by the length of the
  # global `dataloader_val` gave a wrong mean for any other loader.
  loss_avg = loss_total / len(preds)

  preds = np.concatenate(preds, axis=0)
  true_labels = np.concatenate(true_labels, axis=0)

  return loss_avg, preds, true_labels


In [23]:
# Fine-tuning loop: one optimisation pass over `dataloader_train` per epoch,
# followed by a checkpoint and a validation report.
for epoch in tqdm(range(1, epochs+1)):

  model.train()

  loss_train_total = 0

  progress_bar = tqdm(dataloader_train,desc='Epoch {:1d}'.format(epoch),leave=False,disable=False)

  for batch in progress_bar:
    model.zero_grad()
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
    outputs = model(**inputs)

    # With labels supplied, index 0 of the output is read as the loss.
    loss = outputs[0]
    loss_train_total += loss.item()
    loss.backward()

    # Clip the gradient norm to 1.0 before the parameter update.
    nn.utils.clip_grad_norm_(model.parameters(),1.0)

    optimizer.step()
    scheduler.step()

    # Show the batch loss as-is. `batch` is the 3-tuple (ids, mask, labels),
    # so dividing by len(batch) always divided by 3, not by the batch size.
    # The transformers docs describe this loss as already averaged over the
    # batch.
    progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

  # Checkpoint the weights after every epoch.
  torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')
  tqdm.write(f'\nEpoch {epoch}')

  loss_train_avg = loss_train_total/len(dataloader_train)
  tqdm.write(f'Training loss: {loss_train_avg}')

  val_loss, preds, true_labels = evaluate(dataloader_val)
  val_f1 = f_score(preds, true_labels)
  val_acc = accuracy(preds,true_labels)
  tqdm.write(f'Validation loss: {val_loss}')
  tqdm.write(f'F1 Score (Weighted): {val_f1}')
  tqdm.write(f'Accuracy Score : {val_acc}')


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=40.0, style=ProgressStyle(description_width…


Epoch 1
Training loss: 0.9006500475108623
Validation loss: 0.6599450835159847
F1 Score (Weighted): 0.688382967306734
Accuracy Score : 0.7757847533632287


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=40.0, style=ProgressStyle(description_width…


Epoch 2
Training loss: 0.5068012749776244
Validation loss: 0.5267009224210467
F1 Score (Weighted): 0.7821736044302879
Accuracy Score : 0.7937219730941704


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=40.0, style=ProgressStyle(description_width…


Epoch 3
Training loss: 0.31518045980483295
Validation loss: 0.510186323097774
F1 Score (Weighted): 0.8165304268846503
Accuracy Score : 0.8475336322869955


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=40.0, style=ProgressStyle(description_width…


Epoch 4
Training loss: 0.20601371321827172
Validation loss: 0.4439525944846017
F1 Score (Weighted): 0.8528192883349834
Accuracy Score : 0.8654708520179372


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=40.0, style=ProgressStyle(description_width…


Epoch 5
Training loss: 0.13615759387612342
Validation loss: 0.4822454814399992
F1 Score (Weighted): 0.8426568019887953
Accuracy Score : 0.8565022421524664


HBox(children=(FloatProgress(value=0.0, description='Epoch 6', max=40.0, style=ProgressStyle(description_width…


Epoch 6
Training loss: 0.08965511238202453
Validation loss: 0.4946765665497099
F1 Score (Weighted): 0.8440656062195447
Accuracy Score : 0.852017937219731


HBox(children=(FloatProgress(value=0.0, description='Epoch 7', max=40.0, style=ProgressStyle(description_width…


Epoch 7
Training loss: 0.05504520802060142
Validation loss: 0.5143599254744393
F1 Score (Weighted): 0.8722074815509614
Accuracy Score : 0.8789237668161435


HBox(children=(FloatProgress(value=0.0, description='Epoch 8', max=40.0, style=ProgressStyle(description_width…


Epoch 8
Training loss: 0.03475100873038173
Validation loss: 0.5584342862878527
F1 Score (Weighted): 0.8702284158123916
Accuracy Score : 0.8789237668161435


HBox(children=(FloatProgress(value=0.0, description='Epoch 9', max=40.0, style=ProgressStyle(description_width…


Epoch 9
Training loss: 0.02462871700990945
Validation loss: 0.5674384172473635
F1 Score (Weighted): 0.8500530397172951
Accuracy Score : 0.8565022421524664


HBox(children=(FloatProgress(value=0.0, description='Epoch 10', max=40.0, style=ProgressStyle(description_widt…


Epoch 10
Training loss: 0.02104468559846282
Validation loss: 0.5636159607342311
F1 Score (Weighted): 0.8474586141987526
Accuracy Score : 0.8609865470852018



In [26]:
# Re-run validation with the model in its current state (the loss is
# discarded) and print the per-class hit counts.
_, preds, true_labels = evaluate(dataloader_val)
accuracy_per_class(preds, true_labels)

Class: happy
Accuracy: 166/171

Class: not-relevant
Accuracy: 15/32

Class: angry
Accuracy: 7/9

Class: disgust
Accuracy: 0/1

Class: sad
Accuracy: 1/5

Class: surprise
Accuracy: 3/5

