In [1]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
df = pd.read_csv('/content/smile-annotations-final.csv',
                 names=['id', 'text','category'])
df.set_index('id', inplace=True)

In [3]:
df.head()

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
614877582664835073,@Sofabsports thank you for following me back. ...,happy
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy


In [4]:
df.text.iloc[0]

'@aandraous @britishmuseum @AndrewsAntonio Merci pour le partage! @openwinemap'

In [5]:
df.category.value_counts()
#nocode is simply no clear emotions in this tweet

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
nocode,1572
happy,1137
not-relevant,214
angry,57
surprise,35
sad,32
happy|surprise,11
happy|sad,9
disgust|angry,7
disgust,6


In [6]:
#we want to remove nocode and also those with multiple emotions, with |
df = df[~df.category.str.contains('\|')]   #we need to choose the | by backslash
df = df[df.category != 'nocode']

In [7]:
df.category.value_counts()
#class imbalance

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
happy,1137
not-relevant,214
angry,57
surprise,35
sad,32
disgust,6


In [8]:
#build dictionary, key: emotion, value: numbers
possible_labels = df.category.unique()

In [9]:
label_dict = {}
#loop over index
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [10]:
label_dict

{'happy': 0,
 'not-relevant': 1,
 'angry': 2,
 'disgust': 3,
 'sad': 4,
 'surprise': 5}

In [11]:
#build new column for these values
df['label'] = df.category.replace(label_dict)
df.head()

  df['label'] = df.category.replace(label_dict)


Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
#stratified split
X_train, X_val, y_train, y_val = train_test_split(df.index.values,
                                                 df.label.values,
                                                 test_size = 0.15,
                                                 random_state=17,
                                                 stratify = df.label.values
                                                 )

In [14]:
df['data_type'] = ['not_set']*df.shape[0]

In [15]:
df.head()

Unnamed: 0_level_0,text,category,label,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0,not_set
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0,not_set
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0,not_set
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0,not_set
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0,not_set


In [16]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [17]:
df.groupby(['category', 'label', 'data_type']).count()
#group by using count

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
category,label,data_type,Unnamed: 3_level_1
angry,2,train,48
angry,2,val,9
disgust,3,train,5
disgust,3,val,1
happy,0,train,966
happy,0,val,171
not-relevant,1,train,182
not-relevant,1,val,32
sad,4,train,27
sad,4,val,5


In [18]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [19]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                         #all lower case
                                         do_lower_case = True,
                                         )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [20]:
#batch using multiple strings and convert them into tokens
encoded_data_train = tokenizer.batch_encode_plus(
        df[df.data_type == 'train'].text.values,
        add_special_tokens = True,
        #to know when sentence begins and ends
        return_attention_mask = True,
        #set max length to large values for big sentences
        pad_to_max_length = True,
        max_length = 256,
        return_tensors = 'pt'
        #pt: pytorch
        )

encoded_data_val = tokenizer.batch_encode_plus(
        df[df.data_type == 'val'].text.values,
        add_special_tokens = True,
        #to know when sentence begins and ends
        return_attention_mask = True,
        #set max length to large values for big sentences
        pad_to_max_length = True,
        max_length = 256,
        return_tensors = 'pt'
        #pt: pytorch
        )

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type == 'train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type == 'val'].label.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [21]:
dataset_train = TensorDataset(input_ids_train,
                              attention_masks_train,
                              labels_train)

dataset_val = TensorDataset(input_ids_val,
                              attention_masks_val,
                              labels_val)

In [22]:
len(dataset_train)

1258

In [23]:
len(dataset_val)

223

In [24]:
from transformers import BertForSequenceClassification

In [25]:
#each sequence will be dealt separate classification
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    #the other cased one is larger and takes more computation power
    #we want to fine tune the parts we need
    num_labels = len(label_dict),
    output_attentions = False,
    output_hidden_states = False
                                     )
#450 MB needs to be fetched and loaded into memory
#bert takes into text and encodes into meaningful way according to the huge corpus it was intitially exposed to
#we are just lying on top of it to get our 6 classes classifier

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
from torch.utils.data import DataLoader,RandomSampler,SequentialSampler

In [101]:
batch_size = 32   #very small due to machine low specs but can increase to 32

dataloader_train = DataLoader(
            dataset_train,
            sampler = RandomSampler(dataset_train),
            #to avoid it learning from any sequences
            batch_size = batch_size
            )

dataloader_val = DataLoader(
            dataset_val,
            sampler = RandomSampler(dataset_val),
            #to avoid it learning from any sequences
            batch_size = 32    #here no many computation, no backpropagation
            )

In [102]:
#Optimizer defines our learning rate and how it changed throught each epoch
from transformers import AdamW, get_linear_schedule_with_warmup
#Adam with weight decay, stochastic optimizer

In [103]:
optimizer = AdamW(
                model.parameters(),
                lr = 5e-5,         #recommended: 2e-5 > 5e-5
                eps = 1e-8,
                )



In [112]:
epoch = 10

schedular = get_linear_schedule_with_warmup(
        optimizer,     #Adam
        num_warmup_steps = 0,
        num_training_steps = len(dataloader_train)*epoch
        )

In [113]:
import numpy as np

In [114]:
from sklearn.metrics import f1_score

In [115]:
#f1-score is good bec. of class imbalance
#accuracy alone will give me skewed results,
    #based on f1-score not actually representing what we want

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis = 1).flatten()
    #flatten to get single list and not array
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average = 'weighted')
#can changed weighted to macro

In [116]:
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
#here we are using numpy indexing to index 2 array of the same shape by each other
        y_true = labels_flat[labels_flat == label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy:  {len(y_preds[y_preds == label])}/{len(y_true)}\n')

In [117]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [118]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#send model to device we are using
model.to(device)
print(device)

cuda


In [119]:
def evaluate(dataloader_val):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals


In [120]:
for epoch in tqdm(range(1, epoch+1)):

    model.train()

    loss_train_total = 0
    #we set it initially as 0

    progress_bar = tqdm(dataloader_train,
                        desc = 'Epoch {:1d}'.format(epoch),
                        leave = False,   #overwrite after each epoch
                        disable = False
                       )
    #to see where are we, has it crashed

    for batch in progress_bar:
        model.zero_grad()
        #gradient set to zero

        batch = tuple(b.to(device) for b in batch)
        #this is imp for cuda gpu use

        inputs = {
            'input_ids':         batch[0],
            'attention_mask':    batch[1],
            'labels' :           batch[2]
        }

        outputs = model(**inputs)
        #outputs dictionary directly into inputs

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        #clip our gradient
        #take gradient and give it normal value that we provide as 1
        #stop gradients from slipping into becoming exceptionally small or too big
        #promote generalization

        optimizer.step()
        schedular.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
        #append small dictionary

    torch.save(model.state_dict(), f'/content/model/Bert_ft_epoch{epoch}.model')

    tqdm.write('\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write('Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)
        #this is imp if over training
        #model will have no generalization abilities
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')



  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: {loss_train_avg}
Validation loss: 0.47170497689928326
F1 Score (weighted): 0.842540699107976


Epoch 2:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: {loss_train_avg}
Validation loss: 0.591927924326488
F1 Score (weighted): 0.8489024757861542


Epoch 3:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: {loss_train_avg}
Validation loss: 0.549673753125327
F1 Score (weighted): 0.8703180795773051


Epoch 4:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: {loss_train_avg}
Validation loss: 0.5639691480568477
F1 Score (weighted): 0.8706985378840569


Epoch 5:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: {loss_train_avg}
Validation loss: 0.6859284937381744
F1 Score (weighted): 0.8636490141647092


Epoch 6:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: {loss_train_avg}
Validation loss: 0.6115182978766305
F1 Score (weighted): 0.8794385235151739


Epoch 7:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: {loss_train_avg}
Validation loss: 0.7019630138363157
F1 Score (weighted): 0.8695882569839816


Epoch 8:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: {loss_train_avg}
Validation loss: 0.6917361489364079
F1 Score (weighted): 0.8733698266848338


Epoch 9:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: {loss_train_avg}
Validation loss: 0.7200451408113752
F1 Score (weighted): 0.8695882569839816


Epoch 10:   0%|          | 0/40 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: {loss_train_avg}
Validation loss: 0.7257078758307866
F1 Score (weighted): 0.8695882569839816


In [121]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [122]:
model.to(device)
pass   #to not get alot of text output

In [123]:
model.load_state_dict(
    torch.load('/content/model/Bert_ft_epoch10.model'))

  torch.load('/content/model/Bert_ft_epoch10.model'))


<All keys matched successfully>

In [124]:
_, prediction, true_vals = evaluate(dataloader_val)
#7 batches
#will take almost 2 minutes

In [125]:
accuracy_per_class(prediction, true_vals)

Class: happy
Accuracy:  163/171

Class: not-relevant
Accuracy:  19/32

Class: angry
Accuracy:  7/9

Class: disgust
Accuracy:  0/1

Class: sad
Accuracy:  2/5

Class: surprise
Accuracy:  3/5

