# Natural Language Processing - Emotion Prediction

## Importing the libraries

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
import numpy as np

## Importing the dataset

In [None]:
# Load the raw data; the first CSV column is used as the row index.
# quoting=3 is csv.QUOTE_NONE, i.e. quote characters are read as plain text.
dataset = pd.read_csv(
    'text.csv',
    index_col=0,
    quoting=3,
)

In [None]:
# Preview the first ten rows of the loaded frame.
dataset.head(10)

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4
5,i was beginning to feel quite disheartened,0
6,i would think that whomever would be lucky eno...,2
7,i fear that they won t ever feel that deliciou...,1
8,im forever taking some time out to have a lie ...,5
9,i can still lose the weight without feeling de...,0


In [None]:
# Work with only the first 2000 rows. The explicit copy detaches the subset
# from the full frame, so the columns assigned to it later are written to an
# independent DataFrame and cannot raise pandas' SettingWithCopyWarning.
dataset = dataset.iloc[:2000].copy()

## Tokenizing the texts

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def preprocess_text(text):
    """Tokenize ``text`` with the module-level BERT tokenizer.

    The text is padded to, and truncated at, 128 tokens and the result is
    requested as PyTorch tensors (``return_tensors="pt"``).

    Args:
        text: The raw string to encode.

    Returns:
        The tokenizer output; the code below reads its ``'input_ids'`` and
        ``'attention_mask'`` entries.
    """
    return tokenizer(text, padding='max_length', max_length=128, truncation=True, return_tensors="pt")


# Tokenize every text exactly once and reuse the encoding for both columns
# (previously each row was run through the tokenizer twice).
_encodings = [preprocess_text(text) for text in dataset['text']]

# squeeze(0) drops the leading dimension of each tensor before it is stored
# as a plain Python list; building the Series on dataset.index keeps every
# row aligned with its source text.
dataset['input_ids'] = pd.Series(
    [enc['input_ids'].squeeze(0).tolist() for enc in _encodings],
    index=dataset.index,
)
dataset['attention_mask'] = pd.Series(
    [enc['attention_mask'].squeeze(0).tolist() for enc in _encodings],
    index=dataset.index,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Map the raw labels onto consecutive integer class ids.
label_encoder = LabelEncoder()
dataset['label'] = label_encoder.fit_transform(dataset['label'])


def _frame_to_tensors(frame):
    """Turn the token columns of ``frame`` into a dict of stacked tensors."""
    return {
        column: torch.tensor(frame[column].tolist())
        for column in ('input_ids', 'attention_mask')
    }


# Hold out 20% of the rows for validation (fixed seed for reproducibility).
_train_frame, _val_frame, _train_targets, _val_targets = train_test_split(
    dataset[['input_ids', 'attention_mask']],
    dataset['label'],
    test_size=0.2,
    random_state=42,
)

# Model inputs as tensors keyed by name, plus the matching label tensors.
train_texts = _frame_to_tensors(_train_frame)
val_texts = _frame_to_tensors(_val_frame)
train_labels = torch.tensor(_train_targets.tolist())
val_labels = torch.tensor(_val_targets.tolist())


In [None]:
class EmotionDataset(Dataset):
    """Map-style dataset pairing tokenized inputs with their labels.

    Args:
        texts: Mapping from field name (e.g. ``'input_ids'``) to an indexable
            collection holding one entry per sample.
        labels: Indexable collection of labels, one per sample.
    """

    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every field of sample ``idx`` plus its label under ``'labels'``."""
        sample = {}
        for field, values in self.texts.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split in an EmotionDataset.
train_dataset = EmotionDataset(texts=train_texts, labels=train_labels)
val_dataset = EmotionDataset(texts=val_texts, labels=val_labels)

# Batch both splits with the same batch size; only the training split is
# shuffled.
_BATCH_SIZE = 16
train_dataloader = DataLoader(train_dataset, batch_size=_BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=_BATCH_SIZE, shuffle=False)


In [None]:
class EmotionClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        n_classes: Number of output classes (size of the final linear layer).
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return one row of ``n_classes`` logits per input sequence."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder output is used as the sequence-level
        # representation (named ``pooled_output`` in the original code).
        pooled = encoded[1]
        return self.out(self.drop(pooled))

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the classifier with one output per encoded label and move it to the
# selected device.
model = EmotionClassifier(n_classes=len(label_encoder.classes_))
model = model.to(device)

# Optimizer: use PyTorch's own AdamW. The AdamW class exported by
# `transformers` is deprecated by Hugging Face in favour of this one.
# eps and weight_decay are pinned to the defaults of the transformers
# implementation so those hyper-parameters stay as before; torch's AdamW
# always applies bias correction, so there is no `correct_bias` switch.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)

# One scheduler step is taken per batch, so the schedule spans
# (batches per epoch) * (number of epochs).
# NOTE: the 3 here must be kept equal to `epochs` in the training loop.
total_steps = len(train_dataloader) * 3
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Loss function: cross-entropy over the class logits.
loss_fn = nn.CrossEntropyLoss().to(device)

# Training function
def train_epoch(model, dataloader, loss_fn, optimizer, device, scheduler):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class scores.
        dataloader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` entries; it must expose a
            sized ``dataset`` attribute.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        ``(accuracy, mean_loss)``: ``accuracy`` is a double-precision tensor
        (correct predictions / ``len(dataloader.dataset)``) and ``mean_loss``
        is the NumPy mean of the per-batch loss values.
    """
    model = model.train()
    batch_losses = []
    n_correct = 0

    for batch in dataloader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device)

        logits = model(input_ids=ids, attention_mask=mask)
        loss = loss_fn(logits, targets)
        batch_losses.append(loss.item())

        # Predicted class = index of the highest score in each row.
        predicted = torch.max(logits, dim=1).indices
        n_correct += (predicted == targets).sum()

        loss.backward()
        # Cap the global gradient norm at 1.0 before the update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    accuracy = n_correct.double() / len(dataloader.dataset)
    return accuracy, np.mean(batch_losses)

# Evaluation function
def eval_model(model, dataloader, loss_fn, device):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class scores.
        dataloader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` entries; it must expose a
            sized ``dataset`` attribute.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        device: Device every batch tensor is moved to.

    Returns:
        ``(accuracy, mean_loss)``: ``accuracy`` is a double-precision tensor
        (correct predictions / ``len(dataloader.dataset)``) and ``mean_loss``
        is the NumPy mean of the per-batch loss values.
    """
    model = model.eval()
    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for batch in dataloader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            batch_losses.append(loss_fn(logits, targets).item())

            # Predicted class = index of the highest score in each row.
            predicted = torch.max(logits, dim=1).indices
            n_correct += (predicted == targets).sum()

    accuracy = n_correct.double() / len(dataloader.dataset)
    return accuracy, np.mean(batch_losses)

# Training loop
# Number of full passes over the training data.
epochs = 3

for epoch in range(epochs):
    # Header for this epoch (1-based) followed by a separator rule.
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    # One optimisation pass over the training batches.
    train_acc, train_loss = train_epoch(
        model, train_dataloader, loss_fn, optimizer, device, scheduler
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Score the updated model on the held-out validation batches.
    val_acc, val_loss = eval_model(model, val_dataloader, loss_fn, device)
    print(f'Val loss {val_loss} accuracy {val_acc}')

    # Blank line between epochs.
    print()




Epoch 1/3
----------
Train loss 1.209025104343891 accuracy 0.54875
Val loss 0.6517348456382751 accuracy 0.7875

Epoch 2/3
----------
