In [53]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
import torch

In [54]:
#Loading the training file
file_path = '/Users/diana/Desktop/isear-train.xlsx'
custom_headers = ['Emotions', 'Text']
df = pd.read_excel(file_path, skiprows=1, header=None, names=custom_headers)

In [55]:
label_encoding = {'anger': 0, 'disgust': 1, 'fear': 2, 'guilt': 3, 'joy': 4, 'sadness': 5, 'shame': 6}
y = df['Emotions'].values
y_train_encoded = np.array([label_encoding[label] for label in y])
print("Encoded labels:", y, y_train_encoded[:20])

Encoded labels: ['sadness' 'disgust' 'fear' ... 'sadness' 'disgust' 'fear'] [5 1 2 6 0 6 1 4 6 4 1 0 0 4 2 4 6 2 1 5]


In [56]:
#conda install torch transformers openpyxl


In [57]:
pip  install torch

Note: you may need to restart the kernel to use updated packages.


In [58]:
print (type(df['Text']))
strings = []
for index, sentence in enumerate(df['Text']):
    emotion_label = df['Emotions'][index]
    strings.append (sentence)
print (strings[0:3])

<class 'pandas.core.series.Series'>
['Losing my girlfriend who made an end to our relationship. By this I lost an important source of support, certainty and joyful moments. I cried very intensly when that happened.', '[ No response.]', 'Staying alone in a dark place.']


In [59]:
#Training Data BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

def get_sentence_embeddings(text_list, batch_size=32):
    all_embeddings = []

    # Process each sublist separately
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        
        with torch.no_grad():
            outputs = bert_model(**inputs)
        
        # Extract the sentence embeddings for the batch
        batch_embeddings = outputs.pooler_output
        all_embeddings.append(batch_embeddings.cpu())  # Move to CPU to save GPU memory
    
    # Concatenate all batch embeddings
    sentence_embeddings = torch.cat(all_embeddings, dim=0)
    
    return sentence_embeddings

# Get embeddings
sentence_embeddings = get_sentence_embeddings(strings)

# Print the shape of the embeddings
print("Sentence embeddings shape:", sentence_embeddings.shape)

Sentence embeddings shape: torch.Size([5366, 768])


In [130]:
#Devset BERT
file_path_dev = '/Users/diana/Desktop/isear-validation.xlsx'
custom_headers_dev = ['Emotions_dev', 'Text_dev']
df = pd.read_excel(file_path_dev, skiprows=1, header=None, names=custom_headers_dev)

print (type(df['Text_dev']))
strings_dev = []
for index, sentence in enumerate(df['Text_dev']):
    emotion_label_dev = df['Emotions_dev'][index]
    strings_dev.append (sentence)
print (strings_dev[0:3])

strings_dev = df['Text_dev'].tolist()

y_dev = df['Emotions_dev'].values
y_dev_encoded = np.array([label_encoding[label] for label in y_dev])
y_dev_encoded = torch.tensor (y_dev_encoded)


<class 'pandas.core.series.Series'>
['[ Laziness makes one sad.]', 'I had to leave my girlfriend for 4 weeks because of my studies. This happens frequently. The last 24 hours before my departure are always very depressing.', 'I promised to join a meeting but did not turn up as I did not want to do the work.']


In [61]:
sentence_embeddings_dev = get_sentence_embeddings(strings_dev)
# Print the shape of the embeddings
print("Sentence embeddings shape:", sentence_embeddings_dev.shape)

Sentence embeddings shape: torch.Size([1150, 768])


In [62]:
torch.save(sentence_embeddings, "sentence_embeddings.pt")
# To load the embeddings later
loaded_embeddings = torch.load("sentence_embeddings.pt")

In [63]:
print (sentence_embeddings)

tensor([[-0.8667, -0.5060, -0.9691,  ..., -0.8864, -0.6726,  0.8883],
        [-0.9396, -0.5441, -0.9673,  ..., -0.8054, -0.6967,  0.9153],
        [-0.8893, -0.4814, -0.9798,  ..., -0.9239, -0.7393,  0.8747],
        ...,
        [-0.8967, -0.5091, -0.9448,  ..., -0.8054, -0.7148,  0.8998],
        [-0.8920, -0.4614, -0.8404,  ..., -0.7408, -0.6178,  0.9071],
        [-0.8762, -0.5444, -0.9822,  ..., -0.9704, -0.6663,  0.9188]])


In [64]:
class LogisticRegressionMulticlass:
    def __init__(self, learning_rate=0.3, num_iterations=10000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None
    
    def softmax(self, z):
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)
    
    def xavier_init(self, shape):
        fan_in = shape[0]
        fan_out = shape[1]
        limit = np.sqrt(6 / (fan_in + fan_out))
        return np.random.uniform(-limit, limit, size=shape)
    
    def fit(self, X, y):
        # Convert PyTorch tensors to NumPy arrays
        if isinstance(X, torch.Tensor):
            X = X.detach().cpu().numpy()
        if isinstance(y, torch.Tensor):
            y = y.detach().cpu().numpy()

        num_samples, num_features = X.shape
        num_classes = len(np.unique(y))
        self.weights = self.xavier_init((num_features, num_classes))
        self.bias = np.zeros((1, num_classes))
        y_one_hot = np.eye(num_classes)[y]
        
        for _ in range(self.num_iterations):
            linear_model = X.dot(self.weights) + self.bias
            y_pred = self.softmax(linear_model)
            
            dw = (1 / num_samples) * X.T.dot(y_pred - y_one_hot)
            db = (1 / num_samples) * np.sum(y_pred - y_one_hot, axis=0, keepdims=True)
            
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
    
    def predict(self, X):
        if isinstance(X, torch.Tensor):
            X = X.detach().cpu().numpy()

        num_samples, num_features = X.shape
        _, num_classes = self.weights.shape
    
        if num_features != self.weights.shape[0]:
            self.weights = self.xavier_init((num_features, num_classes))
            
        linear_predictions = X.dot(self.weights) + self.bias
        y_pred = self.softmax(linear_predictions)
        print(self.weights.shape)
        class_pred = np.argmax(y_pred, axis=1)  # Choose the class with the highest probability
        return class_pred

In [65]:
print (type (sentence_embeddings))
y_train_tensor = torch.tensor(y_train_encoded)
print (type (y_train_tensor)) 

<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [66]:
model_emotions = LogisticRegressionMulticlass()
model_emotions.fit (sentence_embeddings, y_train_tensor)

In [67]:
pred = model_emotions.predict (sentence_embeddings)

print(sentence_embeddings.shape)

from sklearn.metrics import f1_score

micro_average_f1 = f1_score(pred, y_train_encoded, average='micro')
print("Micro-average F1-score:", micro_average_f1)

f1_external = f1_score(pred, y_train_encoded, average='weighted')
print("F1-score on training data:", f1_external)

#print (pred [0:300],y_train_encoded [0:300])


(768, 7)
torch.Size([5366, 768])
Micro-average F1-score: 0.3438315318673127
F1-score on training data: 0.4214195922893096


In [235]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Convert to PyTorch tensors
embeddings = torch.tensor(sentence_embeddings, dtype=torch.float32).to(device)
labels = torch.tensor(y_train_tensor, dtype=torch.long).to(device)

embeddings_dev = torch.tensor(sentence_embeddings_dev, dtype=torch.float32)
labels_dev = torch.tensor(y_dev_encoded, dtype=torch.long)

# Create a dataset and dataloader
dataset = TensorDataset(embeddings, labels)
dataset_dev = TensorDataset(embeddings_dev,labels_dev)


train_loader = DataLoader(dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset_dev, batch_size=32, shuffle=False)

class SimpleNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x
    
input_dim = 768  # Dimension of BERT embeddings
hidden_dim = 128
output_dim = 7  # Number of emotion classes

  embeddings = torch.tensor(sentence_embeddings, dtype=torch.float32).to(device)
  labels = torch.tensor(y_train_tensor, dtype=torch.long).to(device)
  embeddings_dev = torch.tensor(sentence_embeddings_dev, dtype=torch.float32)
  labels_dev = torch.tensor(y_dev_encoded, dtype=torch.long)


In [255]:
simp_model = SimpleNN(input_dim, hidden_dim, output_dim).to(device)
lossf = nn.CrossEntropyLoss()
optimizer = optim.Adam(simp_model.parameters(), lr=0.0005, weight_decay=1e-6) # L2 regularization

In [256]:
num_epochs = 300

for epoch in range(num_epochs):
    simp_model.train()
    for batch_embeddings, batch_labels in train_loader:
        # Forward pass
        outputs = simp_model(batch_embeddings)
        loss = lossf(outputs, batch_labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/300], Loss: 1.9123
Epoch [2/300], Loss: 1.8617
Epoch [3/300], Loss: 1.8971
Epoch [4/300], Loss: 1.7225
Epoch [5/300], Loss: 1.5977
Epoch [6/300], Loss: 1.6715
Epoch [7/300], Loss: 1.8330
Epoch [8/300], Loss: 1.7793
Epoch [9/300], Loss: 1.5355
Epoch [10/300], Loss: 1.5455
Epoch [11/300], Loss: 1.4638
Epoch [12/300], Loss: 1.1661
Epoch [13/300], Loss: 1.3389
Epoch [14/300], Loss: 1.5576
Epoch [15/300], Loss: 1.5666
Epoch [16/300], Loss: 1.2362
Epoch [17/300], Loss: 1.2819
Epoch [18/300], Loss: 1.4841
Epoch [19/300], Loss: 1.5547
Epoch [20/300], Loss: 1.0336
Epoch [21/300], Loss: 1.2330
Epoch [22/300], Loss: 1.4050
Epoch [23/300], Loss: 1.2227
Epoch [24/300], Loss: 1.3338
Epoch [25/300], Loss: 1.4951
Epoch [26/300], Loss: 1.2680
Epoch [27/300], Loss: 1.4546
Epoch [28/300], Loss: 1.3512
Epoch [29/300], Loss: 1.0410
Epoch [30/300], Loss: 1.4225
Epoch [31/300], Loss: 1.1768
Epoch [32/300], Loss: 1.3895
Epoch [33/300], Loss: 1.0576
Epoch [34/300], Loss: 1.3563
Epoch [35/300], Loss: 1

In [261]:
simp_model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch_embeddings, batch_labels in test_loader:
        outputs = simp_model(batch_embeddings)
        _, predicted = torch.max(outputs.data, 1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()
    f1 = f1_score(predicted , batch_labels, average='weighted')
    f1_micro = f1_score(predicted , batch_labels, average='micro')
    print (f1, f1_micro)
    print(f'Accuracy of the model on the data: {100 * correct / total:.2f}%')

0.6320965309200605 0.6333333333333333
Accuracy of the model on the data: 56.17%
