In [11]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import torchtext
from torchtext.data import get_tokenizer

from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

from transformers import BertTokenizer, BertModel
import joblib
import requests

# Dataset: GoEmotions

Source: https://www.kaggle.com/datasets/debarshichanda/goemotions/data

In [3]:
# Read the three GoEmotions CSV shards.
df1 = pd.read_csv('dataset/goemotions_1.csv')
df2 = pd.read_csv('dataset/goemotions_2.csv')
df3 = pd.read_csv('dataset/goemotions_3.csv')

# read_csv gives every shard its own 0..n-1 index, so a plain concat would
# leave duplicate row labels. ignore_index=True renumbers the rows 0..N-1 in
# a single step instead of the reset_index / overwrite / set_index round-trip.
df = pd.concat([df1, df2, df3], ignore_index=True)
# Keep the index name the previous implementation produced.
df.index.name = 'index'

# Preprocessing

In [9]:
# Pick the best available accelerator: CUDA first, then Apple-silicon MPS,
# and fall back to the CPU when neither is present.
_mps_backend = getattr(torch.backends, 'mps', None)  # absent on older torch builds

if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [5]:
# init BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Keep the encoder under its own name: a later cell rebinds `model` to the
# classifier, which would otherwise silently change what
# preprocess_with_bert() calls.
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
bert_model.eval()  # embeddings are extracted in inference mode (no dropout)
model = bert_model  # backward-compatible alias for code that still expects `model`


@torch.no_grad()  # pure inference: do not build an autograd graph per text
def preprocess_with_bert(text):
    """Embed `text` with BERT and return the [CLS] vector as a NumPy array.

    The text is tokenized (truncated to the tokenizer's maximum length),
    moved to `device`, and passed through `bert_model`. The hidden state at
    sequence position 0 (the [CLS] token) is used as the sentence embedding.

    Args:
        text: the text to embed. A single string is what the notebook passes;
            other inputs accepted by the tokenizer are forwarded unchanged.

    Returns:
        numpy.ndarray on the CPU holding the [CLS] hidden state with all
        size-1 dimensions squeezed out (a 1-D vector for a single string).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(device)
    outputs = bert_model(**inputs)
    # use the output of the [CLS] token as the sentence embedding
    cls_state = outputs.last_hidden_state[:, 0, :]
    sentence_embedding = cls_state.squeeze().cpu().numpy()
    return sentence_embedding


In [54]:
# use BERT to embed each text in the df['text'] column
# This step may take a long time, depending on your hardware performance,
# so the result is cached on disk and reused on later runs.
EMBEDDINGS_PATH = 'bert_embeddings.joblib'

# NOTE: joblib files are pickle-based; only load a cache you created yourself.
cached_embeddings = joblib.load(EMBEDDINGS_PATH) if os.path.exists(EMBEDDINGS_PATH) else None

if cached_embeddings is not None and len(cached_embeddings) == len(df):
    # the cache has one entry per row of the current frame: skip the BERT pass
    df['bert_embedding'] = cached_embeddings
else:
    # no cache yet, or it was built for a different number of rows: recompute
    df['bert_embedding'] = df['text'].apply(preprocess_with_bert)
    # save the embeddings to the local environment
    joblib.dump(df['bert_embedding'].tolist(), EMBEDDINGS_PATH)

In [6]:
# Reload the cached sentence embeddings and attach them to the frame.
bert_embeddings = joblib.load('bert_embeddings.joblib')
df['bert_embedding'] = bert_embeddings

# Features: one row per text, one column per embedding dimension.
X = pd.DataFrame(df['bert_embedding'].tolist())

# Labels: the columns at positions 9..36 of df, as a NumPy array.
label_columns = slice(9, 37)
y = df.iloc[:, label_columns].to_numpy()

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# transform BERT embeddings to tensors(used in PyTorch)

X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
# labels are stored as float32 because nn.BCELoss expects floating-point targets
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

train_data = TensorDataset(X_train_tensor, y_train_tensor)
valid_data = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 100
# Training: reshuffle every epoch and drop the short final batch.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
# Validation: every held-out sample must be scored exactly once, so neither
# shuffle nor drop the final (possibly shorter) batch.
valid_loader = DataLoader(valid_data, shuffle=False, batch_size=batch_size, drop_last=False)

In [8]:
# define the SimpleNN model

class SimpleNN(nn.Module):
    """Two-layer feed-forward network with a sigmoid output.

    Pipeline: Linear(input_dim -> hidden_dim) -> ReLU -> Dropout ->
    Linear(hidden_dim -> output_dim) -> Sigmoid.

    Args:
        input_dim: number of input features.
        hidden_dim: width of the hidden layer (also stored as ``hidden_dim``).
        output_dim: number of outputs; each one is squashed into [0, 1].
        drop_prob: dropout probability applied after the hidden activation.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, drop_prob=0.1):
        super().__init__()
        self.hidden_dim = hidden_dim

        # trainable layers
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

        # regularisation and activations
        self.dropout = nn.Dropout(drop_prob)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid outputs of the network for the input batch `x`."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.sigmoid(self.fc2(hidden))


# Layer sizes taken from the data
input_dim = X_train.shape[1]  # BERT embedding size
output_dim = y_train.shape[1]  # one output per label column

# Hand-picked hyper-parameters
hidden_dim = 64
drop_prob = 0.25

# Build the classifier and move it to the selected device.
model = SimpleNN(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    drop_prob=drop_prob,
)
model = model.to(device)
print(model)



SimpleNN(
  (fc1): Linear(in_features=768, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=28, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
  (relu): ReLU()
  (sigmoid): Sigmoid()
)


In [9]:
# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

# calculate the accuracy
def acc(pred, label):
    """Return the fraction of elements where the rounded prediction equals the label.

    Args:
        pred: tensor of predicted scores; each value is rounded with
            ``torch.round`` before the comparison.
        label: tensor of targets, compared element-wise against the rounded
            predictions.

    Returns:
        0-dim float tensor: number of matching elements divided by the total
        number of elements.
    """
    matches = torch.eq(torch.round(pred), label).float()
    return matches.sum() / matches.numel()

In [10]:
# train the model
epochs = 5
clip = 5  # maximum gradient norm handed to clip_grad_norm_
# The np.Inf alias was removed in NumPy 2.0; the built-in float infinity
# behaves the same and works with every NumPy version.
valid_loss_min = float('inf')

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss_sum = 0.0  # sum over samples of the per-batch mean loss
    train_acc_sum = 0.0   # sum over samples of the per-batch accuracy
    train_seen = 0        # number of samples processed so far

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()

        # clip between backward() and step() so the update uses clipped gradients
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        # .item() turns the 0-dim tensors into Python floats, so nothing is
        # accumulated on the device; weighting by the batch size keeps the
        # epoch averages correct even when batches differ in size.
        batch_count = labels.size(0)
        train_loss_sum += loss.item() * batch_count
        train_acc_sum += acc(output, labels).item() * batch_count
        train_seen += batch_count

    # ---- validation pass ----
    model.eval()
    val_loss_sum = 0.0
    val_acc_sum = 0.0
    val_seen = 0
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            output = model(inputs)
            val_loss = criterion(output, labels)

            batch_count = labels.size(0)
            val_loss_sum += val_loss.item() * batch_count
            val_acc_sum += acc(output, labels).item() * batch_count
            val_seen += batch_count

    # per-sample averages for the epoch
    epoch_train_loss = train_loss_sum / train_seen
    epoch_val_loss = val_loss_sum / val_seen
    epoch_train_acc = train_acc_sum / train_seen
    epoch_val_acc = val_acc_sum / val_seen

    print(f'Epoch {epoch + 1}')
    print(f'Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}')
    print(f'Train Accuracy: {epoch_train_acc * 100:.2f}%, Val Accuracy: {epoch_val_acc * 100:.2f}%')

    # checkpoint the weights whenever the validation loss does not get worse
    if epoch_val_loss <= valid_loss_min:
        print(f'Validation loss decreased ({valid_loss_min:.6f} --> {epoch_val_loss:.6f}). Saving model ...')
        torch.save(model.state_dict(), 'model.pt')
        valid_loss_min = epoch_val_loss

print('Training complete.')


Epoch 1
Train Loss: 0.1392, Val Loss: 0.1395
Train Accuracy: 95.79%, Val Accuracy: 95.95%
Validation loss decreased (inf --> 0.139521). Saving model ...
Epoch 2
Train Loss: 0.1283, Val Loss: 0.1353
Train Accuracy: 95.95%, Val Accuracy: 95.98%
Validation loss decreased (0.139521 --> 0.135252). Saving model ...
Epoch 3
Train Loss: 0.1263, Val Loss: 0.1362
Train Accuracy: 95.97%, Val Accuracy: 95.99%
Epoch 4
Train Loss: 0.1251, Val Loss: 0.1332
Train Accuracy: 95.99%, Val Accuracy: 96.00%
Validation loss decreased (0.135252 --> 0.133200). Saving model ...
Epoch 5
Train Loss: 0.1244, Val Loss: 0.1350
Train Accuracy: 96.00%, Val Accuracy: 96.00%
Training complete.
