In [1]:
# Import necessary libraries that we'll use along this project

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from tqdm.notebook import tqdm

# Data Exploration

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Reading the csv file that we'll work with and transform it into a dataframe, then take a look at the first lines

sentiment_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/smile-annotations-final.csv', names=['id', 'text', 'category'])
sentiment_df.set_index('id', inplace=True)
sentiment_df.head()

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
614877582664835073,@Sofabsports thank you for following me back. ...,happy
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy


In [4]:
# Calculating the number of rows and columns of our dataset

sentiment_df.shape

(3085, 2)

In [5]:
# Extracting the datatypes of each variable

sentiment_df.dtypes

Unnamed: 0,0
text,object
category,object


In [6]:
# Getting more detailed information on our dataset

sentiment_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3085 entries, 611857364396965889 to 611566876762640384
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      3085 non-null   object
 1   category  3085 non-null   object
dtypes: object(2)
memory usage: 72.3+ KB


In [7]:
# Calculating the number of null values of each column

sentiment_df.isnull().sum()

Unnamed: 0,0
text,0
category,0


# EDA & Data Preprocessing

## category variable :

In [8]:
# Let's first explore the target variable and see its different categories

sentiment_df['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
nocode,1572
happy,1137
not-relevant,214
angry,57
surprise,35
sad,32
happy|surprise,11
happy|sad,9
disgust|angry,7
disgust,6


In [9]:
# In here, we'll drop categories that have two different sentiments or more at the same time

sentiment_df = sentiment_df[~sentiment_df['category'].str.contains('\|')]

In [10]:
# Now, let's discard nocode category as it makes no sense

sentiment_df = sentiment_df[sentiment_df['category'] != 'nocode']

In [11]:
# Let's explore once again category column and check if the transformations have been made the right way or not

sentiment_df['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
happy,1137
not-relevant,214
angry,57
surprise,35
sad,32
disgust,6


In [12]:
# We'll create a variable called label that has the sentiments encoded so we could feed our data to a model in the end

categories_encoded = {'happy' : 0, 'not-relevant' : 1, 'angry' : 2, 'surprise' : 3, 'sad' : 4, 'disgust' : 5}
sentiment_df['label'] = sentiment_df['category'].replace(categories_encoded)
sentiment_df.head()

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0
611570404268883969,@NationalGallery @ThePoldarkian I have always ...,happy,0


# Training/Test Split

In [13]:
# Let's extract predictors and labels & split our data into training and test data to use in the rest of the project

X = sentiment_df.index.values
y = sentiment_df.label.values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y, random_state=0)

In [14]:
# We'll create a new variable that will hold if a sample is part of the training or test data

sentiment_df.loc[X_train, 'data_type'] = 'train_data'
sentiment_df.loc[X_test, 'data_type'] = 'test_data'

In [15]:
# Let's check if our transformations have been made successfully or not

sentiment_df.groupby(['category', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
category,label,data_type,Unnamed: 3_level_1
angry,2,test_data,9
angry,2,train_data,48
disgust,5,test_data,1
disgust,5,train_data,5
happy,0,test_data,171
happy,0,train_data,966
not-relevant,1,test_data,32
not-relevant,1,train_data,182
sad,4,test_data,5
sad,4,train_data,27


# Loading Tokenizer and Encoding our data

In [16]:
# Import necessary libraries for text tokenization and load a pretrained bert tokenizer from HuggingFace

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [17]:
# Tokenize and encode our data using the already loaded pretrained bert tokenizer and transform the result into a tensor dataset by adding the labels to the transformed data

encoded_train_data = tokenizer.batch_encode_plus(
    sentiment_df[sentiment_df['data_type'] == 'train_data'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

encoded_test_data = tokenizer.batch_encode_plus(
    sentiment_df[sentiment_df['data_type'] == 'test_data'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

input_ids_train = encoded_train_data['input_ids']
attention_masks_train = encoded_train_data['attention_mask']
labels_train = torch.tensor(sentiment_df[sentiment_df['data_type'] == 'train_data'].label.values)
train_dataset = TensorDataset(input_ids_train, attention_masks_train, labels_train)

input_ids_test = encoded_test_data['input_ids']
attention_masks_test = encoded_test_data['attention_mask']
labels_test = torch.tensor(sentiment_df[sentiment_df['data_type'] == 'test_data'].label.values)
test_dataset = TensorDataset(input_ids_test, attention_masks_test, labels_test)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [18]:
# Let's check the number of samples to verify if all transformations have been made successfully

print('Train dataset length :', len(train_dataset))
print('Test dataset length :', len(test_dataset))

Train dataset length : 1258
Test dataset length : 223


# Setting up Bert Pretrained Model

In [19]:
# Load a pretrained Bert model from HuggingFace

from transformers import BertForSequenceClassification

bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                          num_labels=len(categories_encoded),
                                                          output_attentions=False,
                                                          output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Creating Data Loaders

In [20]:
# Creating data loaders for training & test datasets so we could train our model with a batch size not with the whole data

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 32
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

# Setting up Optimizer & Scheduler

In [21]:
# Define an AdamW optimizer with learning rate equals to 1e-5 & number of epochs equals to 1e-8 and a linear scheduler with warmup

from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(bert_model.parameters(), lr=1e-5, eps=1e-8)

epochs = 3
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)

# Defining Performance Metrics

In [22]:
# Make a function that calculates f1_score between true and predicted values

from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(preds_flat, labels_flat, average='weighted')

In [23]:
# Define a function that calculates the accuracy score for every class category

def accuracy_per_class(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label, value in categories_encoded.items():
        y_pred = preds_flat[labels_flat == value]
        y_true = labels_flat[labels_flat == value]
        print('Class :', label)
        print('Accuracy :', len(y_pred[y_pred == value]), '/', len(y_true))

# Creating Training Loop

In [24]:
# Verifying whether our machine is running our code using a gpu or cpu

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model.to(device)

print(device)

cpu


In [25]:
# Define a function that makes predictions on the test data

def evaluate(test_dataloader):

    bert_model.eval()

    test_loss_total = 0
    predictions, true_vals = [], []

    for batch in test_dataloader:
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids' : batch[0], 'attention_mask' : batch[1], 'labels' : batch[2]}

        with torch.no_grad():
            outputs = bert_model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        test_loss_total += loss

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    test_loss_avg = test_loss_total / len(test_dataloader)
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return test_loss_avg, predictions, true_vals

In [26]:
# Let's train our pretrained bert model on the training data using the already defined optimizer and scheduler and evaluate its performance by calculating f1-score on the test data

for epoch in tqdm(range(1, epochs + 1)):

    bert_model.train()

    train_loss_total = 0
    progress_bar = tqdm(train_dataloader, desc='Epoch {:1}'.format(epoch), leave=False, disable=False)

    for batch in progress_bar:
        bert_model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids' : batch[0], 'attention_mask' : batch[1], 'labels' : batch[2]}
        outputs = bert_model(**inputs)

        loss = outputs[0]
        train_loss_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(bert_model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss :' : '{:3f}'.format(loss.item() / len(batch))})

    torch.save(bert_model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')
    tqdm.write(f'\n Epoch : {epoch}')

    train_loss_avg = train_loss_total / len(train_dataloader)
    tqdm.write(f'Training loss : {train_loss_avg}')

    test_loss, predictions, true_vals = evaluate(test_dataloader)
    test_f1_score = f1_score_func(predictions, true_vals)
    tqdm.write(f'Testing loss : {test_loss}')
    tqdm.write(f'Testing f1 score : {test_f1_score}')

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/40 [00:00<?, ?it/s]


 Epoch : 1
Training loss : 1.0873677477240562
Testing loss : 0.7573949098587036
Testing f1 score : 0.8662239135353987


Epoch 2:   0%|          | 0/40 [00:00<?, ?it/s]


 Epoch : 2
Training loss : 0.737107054144144
Testing loss : 0.6817365288734436
Testing f1 score : 0.8701790170892216


Epoch 3:   0%|          | 0/40 [00:00<?, ?it/s]


 Epoch : 3
Training loss : 0.6547585383057595
Testing loss : 0.6546007394790649
Testing f1 score : 0.8715708727212965


In [27]:
# We make predictions on the test data and evaluate our model using accuracy metric

test_loss, predictions, true_vals = evaluate(test_dataloader)
accuracy_per_class(predictions, true_vals)

Class : happy
Accuracy : 171 / 171
Class : not-relevant
Accuracy : 10 / 32
Class : angry
Accuracy : 0 / 9
Class : surprise
Accuracy : 0 / 5
Class : sad
Accuracy : 0 / 5
Class : disgust
Accuracy : 0 / 1
