In [1]:
import pandas as pd
import numpy as np
from transformers import RobertaTokenizer, RobertaModel
import torch
from sklearn.metrics import f1_score, accuracy_score, classification_report
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.init as init




In [2]:
# Loading the training file
file_path = '/Users/diana/Desktop/isear-train.xlsx'

# Define custom headers
custom_headers = ['Emotions', 'Text']
df = pd.read_excel(file_path, skiprows=1, header=None, names=custom_headers)

# Display the first few rows
print(len(df['Text']))

5366


In [3]:
# Label preprocessing

label_encoding = {'anger': 0, 'disgust': 1, 'fear': 2, 'guilt': 3, 'joy': 4, 'sadness': 5, 'shame': 6}
y = df['Emotions'].values
y_train_encoded = np.array([label_encoding[label] for label in y])
y_train_tensor = torch.tensor (y_train_encoded)

#print("Encoded labels:", y, y_train_encoded[:20])
#print(len(y))


In [10]:
# Creating RoBERTa embeddings for the trainset
# Load pre-trained RoBERTa model and tokenizer 
roberta_model = RobertaModel.from_pretrained('roberta-base', output_hidden_states=True)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Use GPU if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
roberta_model.to(device)

# Function to get sentence embeddings for a list of text strings
def get_sentence_embeddings(text_list, batch_size=32):
    all_embeddings = []

    for i in range(0, len(text_list), batch_size): # Process the text list in batches
        batch = text_list[i:i + batch_size]

        # Tokenize the batch of text strings and convert to tensors
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

        # No gradient calculation in forward pass
        with torch.no_grad():
            outputs = roberta_model(**inputs)

        # Extract the hidden state of the CLS token (first token) from the last layer
        cls_embeddings = outputs.hidden_states[-1][:, 0, :] 
        all_embeddings.append(cls_embeddings)   # Append the embeddings to the embedding list
 
    # Concatenate all batch embeddings
    sentence_embeddings = torch.cat(all_embeddings, dim=0)
    
    return sentence_embeddings

# Example usage with strings_dev
# sentence_embeddings_dev = get_sentence_embeddings(strings_dev)
# print("Sentence embeddings shape:", sentence_embeddings_dev.shape)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
# Initialize embeddings for trainset
sentence_embeddings = get_sentence_embeddings(list(df['Text']))


In [53]:
# Load the devset
file_path_dev = '/Users/diana/Desktop/isear-validation.xlsx'
custom_headers_dev = ['Emotions_dev', 'Text_dev']
df_dev = pd.read_excel(file_path_dev, skiprows=1, header=None, names=custom_headers_dev)
#print (type(df_dev['Text_dev']))

# Data and label processing for devset
strings_dev = []
for index, sentence in enumerate(df_dev['Text_dev']):
    emotion_label_dev = df_dev['Emotions_dev'][index]
    strings_dev.append (sentence)
#print (strings_dev[0:3])

# Label encoding
y_dev = df_dev['Emotions_dev'].values
y_dev_encoded = np.array([label_encoding[label] for label in y_dev])
y_dev_tensor = torch.tensor (y_dev_encoded)
#print (len (strings_dev))
#print (y_dev_tensor.shape)


In [55]:
# Load the test set
file_path_test = '/Users/diana/Desktop/isear-test.xlsx'
custom_headers_test = ['Emotions_test', 'Text_test']
df_test = pd.read_excel(file_path_test, skiprows=1, header=None, names=custom_headers_test)

# Data and label processing for testset
strings_test = []
for index, sentence in enumerate(df_test['Text_test']):
    emotion_label_test = df_test['Emotions_test'][index]
    strings_test.append (sentence)

strings_test = df_test['Text_test'].tolist()
#print (strings_test[0:3])

# Label encoding
y_test = df_test['Emotions_test'].values
y_test_encoded = np.array([label_encoding[label] for label in y_test])
y_test_tensor = torch.tensor (y_test_encoded)
print (len (df_test['Text_test']))


1150


In [57]:
# Creating devset embeddings
sentence_embeddings_dev = get_sentence_embeddings(list(df_dev['Text_dev']))
# Print the shape of the embeddings
print("Sentence embeddings shape:", sentence_embeddings_dev.shape)

Sentence embeddings shape: torch.Size([1150, 768])


In [58]:
# Creating testset embeddings
sentence_embeddings_test = get_sentence_embeddings(list(df_test['Text_test']))
#print("Sentence embeddings shape:", sentence_embeddings_test.shape)

In [59]:
# Checking dimensions
#print (type (sentence_embeddings))
y_train_tensor = torch.tensor(y_train_encoded)
y_test_tensor = torch.tensor(y_test_encoded)
print (y_dev_tensor.shape, y_test_tensor.shape,y_train_tensor.shape)

torch.Size([1150]) torch.Size([1150]) torch.Size([5366])


In [93]:
# Convert to PyTorch tensors
embeddings = torch.tensor(sentence_embeddings, dtype=torch.float32).to(device)
labels = torch.tensor(y_train_encoded, dtype=torch.long).to(device)

embeddings_dev = torch.tensor(sentence_embeddings_dev, dtype=torch.float32).to(device)
labels_dev = torch.tensor(y_dev_encoded, dtype=torch.long)

embeddings_test = torch.tensor(sentence_embeddings_test, dtype=torch.float32).to(device)
labels_test = torch.tensor(y_test_encoded, dtype=torch.long)

# Create datasets and dataloaders for training, validation, and testing
dataset = TensorDataset(embeddings, labels)
dataset_dev = TensorDataset(embeddings_dev,labels_dev)
dataset_test = TensorDataset(embeddings_test,labels_test)


train_loader = DataLoader(dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset_dev, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset_test, batch_size=32, shuffle=False)

class SimpleNN(nn.Module): # Defining the model
    def __init__(self, input_dim, hidden_dim1, output_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.bn1 = nn.BatchNorm1d(hidden_dim1)
        self.relu1 = nn.LeakyReLU()
        self.dropout1 = nn.Dropout(0.4)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.bn2 = nn.BatchNorm1d(hidden_dim2)
        self.relu2 = nn.LeakyReLU()
        self.dropout2 = nn.Dropout(0.4)
        self.fc3 = nn.Linear(hidden_dim2, hidden_dim3)
        self.bn3 = nn.BatchNorm1d(hidden_dim3)
        self.relu3 = nn.LeakyReLU()
        self.dropout3 = nn.Dropout(0.4) 

        self.fc4 = nn.Linear(hidden_dim3, output_dim)


        self.init_weights()

    def init_weights(self):
        # Initialize weights and biases using Kaiming initialization and constant zero for biases
        init.kaiming_uniform_(self.fc1.weight, nonlinearity='leaky_relu')
        init.constant_(self.fc1.bias, 0)
        init.kaiming_uniform_(self.fc2.weight, nonlinearity='leaky_relu')
        init.constant_(self.fc2.bias, 0)
        init.kaiming_uniform_(self.fc3.weight, nonlinearity='leaky_relu')
        init.constant_(self.fc3.bias, 0)
        init.kaiming_uniform_(self.fc4.weight, nonlinearity='leaky_relu')
        init.constant_(self.fc4.bias, 0)

    
    def forward(self, x):
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.dropout3(x)

        x = self.fc4(x)
        return x
    
# Define dimensions for the neural network
input_dim = sentence_embeddings.shape[1]
hidden_dim1 = 568  
hidden_dim2 = 284   
hidden_dim3 = 16
output_dim = 7

  embeddings = torch.tensor(sentence_embeddings, dtype=torch.float32).to(device)
  embeddings_dev = torch.tensor(sentence_embeddings_dev, dtype=torch.float32).to(device)
  embeddings_test = torch.tensor(sentence_embeddings_test, dtype=torch.float32).to(device)


In [94]:
# Create an instant of the model
simp_model = SimpleNN(input_dim, hidden_dim1, output_dim).to(device)
# Define Loss and Optimisation function
lossf = nn.CrossEntropyLoss()
optimizer = optim.AdamW (simp_model.parameters(), lr=0.0001, weight_decay=1e-9) # L2 regularization 
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)


In [105]:
num_epochs = 30

# Initialize lists to store losses
train_losses = []
val_losses = []
test_losses = []

for epoch in range(num_epochs):
    # Training phase
    simp_model.train()
    train_loss = 0.0
    
    for batch_embeddings, batch_labels in train_loader:
        # Forward pass
        outputs = simp_model(batch_embeddings)
        loss = lossf(outputs, batch_labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    # Calculate average training loss
    train_loss /= len(train_loader)
    train_losses.append(train_loss)
    
    # Validation phase
    simp_model.eval()
    val_loss = 0.0

    
    with torch.no_grad():
        for batch_embeddings, batch_labels in val_loader:
            outputs = simp_model(batch_embeddings)
            loss = lossf(outputs, batch_labels)
            val_loss += loss.item()
    
    # Calculate average validation loss
    val_loss /= len(val_loader)
    val_losses.append(val_loss)

    scheduler.step(val_loss)
    
    # Test phase
    simp_model.eval()
    test_loss = 0.0
    
    with torch.no_grad():
        for batch_embeddings, batch_labels in test_loader:
            outputs = simp_model(batch_embeddings)
            loss = lossf(outputs, batch_labels)
            test_loss += loss.item()
    
    # Calculate average test loss
    test_loss /= len(test_loader)
    test_losses.append(test_loss)
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Test Loss: {test_loss:.4f}')


Epoch [1/30], Train Loss: 0.9036, Val Loss: 1.0371, Test Loss: 1.0286
Epoch [2/30], Train Loss: 0.8748, Val Loss: 1.0390, Test Loss: 1.0254
Epoch [3/30], Train Loss: 0.8862, Val Loss: 1.0413, Test Loss: 1.0257
Epoch [4/30], Train Loss: 0.8838, Val Loss: 1.0407, Test Loss: 1.0290
Epoch [5/30], Train Loss: 0.9031, Val Loss: 1.0395, Test Loss: 1.0282
Epoch [6/30], Train Loss: 0.8863, Val Loss: 1.0407, Test Loss: 1.0270
Epoch [7/30], Train Loss: 0.8819, Val Loss: 1.0411, Test Loss: 1.0260
Epoch [8/30], Train Loss: 0.8874, Val Loss: 1.0467, Test Loss: 1.0296
Epoch [9/30], Train Loss: 0.8900, Val Loss: 1.0371, Test Loss: 1.0311
Epoch [10/30], Train Loss: 0.8675, Val Loss: 1.0436, Test Loss: 1.0267
Epoch [11/30], Train Loss: 0.8846, Val Loss: 1.0434, Test Loss: 1.0291
Epoch [12/30], Train Loss: 0.8959, Val Loss: 1.0444, Test Loss: 1.0273
Epoch [13/30], Train Loss: 0.8819, Val Loss: 1.0417, Test Loss: 1.0295
Epoch [14/30], Train Loss: 0.8865, Val Loss: 1.0421, Test Loss: 1.0258
Epoch [15/30], 

In [107]:
# Evaluate the model on the testset
from sklearn.metrics import f1_score

simp_model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch_embeddings, batch_labels in test_loader:
        outputs = simp_model(batch_embeddings)
        _, predicted = torch.max(outputs.data, 1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()
    f1 = f1_score(predicted , batch_labels, average='weighted')
    f1_macro = f1_score(predicted, batch_labels, average='macro')
    print (f1, f1_macro)
    print(f'Accuracy of the model on the data: {100 * correct / total:.2f}%')

0.5536892736892737 0.5480107194392909
Accuracy of the model on the data: 62.87%


In [108]:
# Evaluate the model
from sklearn.metrics import f1_score

simp_model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch_embeddings, batch_labels in val_loader:
        outputs = simp_model(batch_embeddings)
        _, predicted = torch.max(outputs.data, 1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()
    f1 = f1_score(predicted , batch_labels, average='weighted')
    f1_macro = f1_score(predicted, batch_labels, average='macro')
    print (f1, f1_macro)
    print(f'Accuracy of the model on the data: {100 * correct / total:.2f}%')

0.6059259259259259 0.5000566893424037
Accuracy of the model on the data: 63.04%


In [111]:
# Make predictions on the test set
simp_model.eval()

predicted_labels = []
true_labels = []

with torch.no_grad():
    for batch_embeddings, batch_labels in test_loader:
        outputs = simp_model(batch_embeddings)
        _, preds = torch.max(outputs, 1)
        predicted_labels.extend(preds.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())

predicted_labels = np.array(predicted_labels)
true_labels = np.array(true_labels)

# Calculate F1 score for each class
f1_scores = f1_score(true_labels, predicted_labels, average=None)

# Print F1 score for each class
for label, f1 in enumerate(f1_scores):
    print(f"F1 score for class {label}: {f1:.4f}")

F1 score for class 0: 0.5110
F1 score for class 1: 0.6118
F1 score for class 2: 0.7702
F1 score for class 3: 0.5034
F1 score for class 4: 0.8580
F1 score for class 5: 0.6841
F1 score for class 6: 0.4709
