##Loading word vectors (elmo.py)

In [1]:
import torch
import json
import csv
from tqdm import tqdm
import numpy as np
import gensim.downloader as api

word_vectors = api.load("word2vec-google-news-300")
word_vectors.save_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)



In [2]:
import re
import random

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    if words:
        rand_idx = random.randint(0, len(words) - 1)
        words[rand_idx] = "unk"
    text = ' '.join(words)
    return text

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
file_path = "drive/MyDrive/NLP_A4/train.csv"
file1 = open(file_path, "r")
data = csv.reader(file1)

In [5]:
next(data)

['Class Index', 'Description']

In [6]:
max_sentences = 20000

with open(file_path, "r") as file1:
    data = csv.reader(file1)
    corp_list = []
    corp_str = "sos "
    for i, row in enumerate(data):
        if i >= max_sentences:
            break
        string = preprocess(row[1])
        corp_list.append(string)
        corp_str += string + " sos eos "

final_list = []
corp_words = corp_str.split()
# make sentences of 32 words
for i in range(0, len(corp_words), 32):
    final_list.append(corp_words[i:i+32])
print(len(final_list))


word2idx = {}
idx2word = {}
word_set = set()
for i, word in enumerate(corp_words):
    word_set.add(word)
i = 0
for word in word_set:
    if word not in word2idx.keys():
        word2idx[word] = i
        idx2word[i] = word
        i+=1

from torch import nn, optim
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Making Dataset...")
class dual_dataset(torch.utils.data.Dataset):
    def __init__(self, sent_list, word2idx, idx2word):
        self.sent_list = sent_list
        self.word2idx = word2idx
        self.idx2word = idx2word
        self.forward_labels, self.forward_targets = self.forward()
        self.backward_labels, self.backward_targets = self.backward()
    def forward(self):
        labels = []
        targets = []
        for sent in self.sent_list:
            loc_labels = []
            loc_targets = []
            for i in range(1, len(sent)-1):
                loc_labels.append(self.word2idx[sent[i]])
                loc_targets.append(self.word2idx[sent[i+1]])
            labels.append(loc_labels)
            targets.append(loc_targets)
        return labels, targets
    def backward(self):
        labels = []
        targets = []
        for_labels = self.forward_labels
        for_targets = self.forward_targets
        for label in for_labels:
            rev_label = label[::-1]
            labels.append(rev_label)
        for target in for_targets:
            rev_target = target[::-1]
            targets.append(rev_target)
        return labels, targets
    def __len__(self):
        return len(self.sent_list)

    def __getitem__(self, idx):
        if len(self.forward_labels[idx]) != len(self.forward_labels[0]):
            idx = idx -1
        forward_labels = torch.LongTensor(self.forward_labels[idx])
        backward_labels = torch.LongTensor(self.backward_labels[idx])
        forward_targets = torch.LongTensor(self.forward_targets[idx])
        backward_targets = torch.LongTensor(self.backward_targets[idx])
        return forward_labels, backward_labels, forward_targets, backward_targets

my_dataset = dual_dataset(final_list, word2idx, idx2word)
class lstm(nn.Module):
    def __init__(self, vocab_size):
        super(lstm, self).__init__()
        self.vocab_size = vocab_size
        # lstm with 2 stacks
        num_stacks = 2
        self.embeddings = nn.Embedding(vocab_size, 300)
        self.lstm = nn.LSTM(300, 300, 1, batch_first=True)
        self.lstm1 = nn.LSTM(300, 300, 1, batch_first=True)
        self.linear = nn.Linear(300, vocab_size)
    def forward(self, x):
        embed = self.embeddings(x)
        x1, _ = self.lstm(embed)
        x2, _ = self.lstm1(x1)
        x = self.linear(x2)
        return x, (embed, x1, x2)

forward_model = lstm(len(word2idx)).to(device)
backward_model = lstm(len(word2idx)).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer1 = optim.Adam(forward_model.parameters(), lr=0.001)
optimizer2 = optim.Adam(backward_model.parameters(), lr=0.001)
dataloader = torch.utils.data.DataLoader(my_dataset, batch_size=50, shuffle=True)

torch.save(word2idx, "word2idx.pt")
torch.save(idx2word, "idx2word.pt")
print("Done making the dictionaries")


20452
Making Dataset...
Done making the dictionaries


In [7]:
n_epochs = 15

In [8]:
from tqdm import tqdm

print("Starting Training...")
for epoch in range(n_epochs):
    forward_model.train()
    backward_model.train()
    total_forward_loss = 0.0
    total_backward_loss = 0.0
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc="Epoch {}".format(epoch))
    for batch, (forward_labels, backward_labels, forward_targets, backward_targets) in progress_bar:
        forward_targets = forward_targets.to(device)
        backward_targets = backward_targets.to(device)
        forward_labels = forward_labels.to(device)
        backward_labels = backward_labels.to(device)
        # change the shape of forward_vcts and backward_vcts to batch-size, seq_len, embedding_dim
        forward_targets = forward_targets.view(forward_targets.shape[1], forward_targets.shape[0])
        backward_targets = backward_targets.view(backward_targets.shape[1], backward_targets.shape[0])
        forward_labels = forward_labels.view(forward_labels.shape[1], forward_labels.shape[0])
        backward_labels = backward_labels.view(backward_labels.shape[1], backward_labels.shape[0])
        # forward pass
        optimizer1.zero_grad()
        forward_out,_  = forward_model(forward_labels)
        forward_out = forward_out.view(forward_out.shape[0]*forward_out.shape[1], forward_out.shape[2])
        forward_targets = forward_targets.view(forward_targets.shape[0]*forward_targets.shape[1])
        forward_loss = loss_fn(forward_out, forward_targets)
        forward_loss.backward()
        optimizer1.step()
        total_forward_loss += forward_loss.item()
        # backward pass
        optimizer2.zero_grad()
        backward_out,_ = backward_model(backward_labels)
        backward_out = backward_out.view(backward_out.shape[0]*backward_out.shape[1], backward_out.shape[2])
        backward_targets = backward_targets.view(backward_targets.shape[0]*backward_targets.shape[1])
        backward_loss = loss_fn(backward_out, backward_targets)
        backward_loss.backward()
        optimizer2.step()
        total_backward_loss += backward_loss.item()
        forward_peplexity = torch.exp(forward_loss)
        backward_peplexity = torch.exp(backward_loss)
        progress_bar.set_postfix(forward_loss=forward_loss.item(), backward_loss=backward_loss.item(), avg_forward_loss=total_forward_loss/(batch+1), avg_backward_loss=total_backward_loss/(batch+1))

    # Calculate average losses
    average_forward_loss = total_forward_loss / len(dataloader)
    average_backward_loss = total_backward_loss / len(dataloader)

    # Print epoch-level loss
    print("Epoch: {}, Average Forward Loss: {:.4f}, Average Backward Loss: {:.4f}".format(epoch, average_forward_loss, average_backward_loss))




Starting Training...


Epoch 0: 100%|██████████| 410/410 [00:40<00:00, 10.12it/s, avg_backward_loss=6.02, avg_forward_loss=7.43, backward_loss=5.48, forward_loss=6.89]


Epoch: 0, Average Forward Loss: 7.4291, Average Backward Loss: 6.0187


Epoch 1: 100%|██████████| 410/410 [00:42<00:00,  9.72it/s, avg_backward_loss=3.4, avg_forward_loss=6.85, backward_loss=4.54, forward_loss=6.52]


Epoch: 1, Average Forward Loss: 6.8535, Average Backward Loss: 3.4019


Epoch 2: 100%|██████████| 410/410 [00:41<00:00,  9.76it/s, avg_backward_loss=2.2, avg_forward_loss=6.5, backward_loss=4.56, forward_loss=6.79]


Epoch: 2, Average Forward Loss: 6.5022, Average Backward Loss: 2.2035


Epoch 3: 100%|██████████| 410/410 [00:41<00:00,  9.88it/s, avg_backward_loss=1.62, avg_forward_loss=6.22, backward_loss=3.58, forward_loss=5.72]


Epoch: 3, Average Forward Loss: 6.2177, Average Backward Loss: 1.6156


Epoch 4: 100%|██████████| 410/410 [00:41<00:00,  9.85it/s, avg_backward_loss=1.28, avg_forward_loss=5.98, backward_loss=3.53, forward_loss=5.48]


Epoch: 4, Average Forward Loss: 5.9804, Average Backward Loss: 1.2766


Epoch 5: 100%|██████████| 410/410 [00:41<00:00,  9.87it/s, avg_backward_loss=1.05, avg_forward_loss=5.78, backward_loss=3.33, forward_loss=5.43]


Epoch: 5, Average Forward Loss: 5.7770, Average Backward Loss: 1.0503


Epoch 6: 100%|██████████| 410/410 [00:41<00:00,  9.87it/s, avg_backward_loss=0.886, avg_forward_loss=5.6, backward_loss=3.51, forward_loss=5.71]


Epoch: 6, Average Forward Loss: 5.5977, Average Backward Loss: 0.8861


Epoch 7: 100%|██████████| 410/410 [00:41<00:00,  9.86it/s, avg_backward_loss=0.762, avg_forward_loss=5.43, backward_loss=3.22, forward_loss=5.84]


Epoch: 7, Average Forward Loss: 5.4332, Average Backward Loss: 0.7619


Epoch 8: 100%|██████████| 410/410 [00:41<00:00,  9.86it/s, avg_backward_loss=0.666, avg_forward_loss=5.27, backward_loss=3.25, forward_loss=5.21]


Epoch: 8, Average Forward Loss: 5.2732, Average Backward Loss: 0.6663


Epoch 9: 100%|██████████| 410/410 [00:41<00:00,  9.84it/s, avg_backward_loss=0.591, avg_forward_loss=5.11, backward_loss=2.84, forward_loss=5.09]


Epoch: 9, Average Forward Loss: 5.1118, Average Backward Loss: 0.5914


Epoch 10: 100%|██████████| 410/410 [00:41<00:00,  9.82it/s, avg_backward_loss=0.538, avg_forward_loss=4.95, backward_loss=3.78, forward_loss=5.83]


Epoch: 10, Average Forward Loss: 4.9540, Average Backward Loss: 0.5377


Epoch 11: 100%|██████████| 410/410 [00:41<00:00,  9.83it/s, avg_backward_loss=0.496, avg_forward_loss=4.8, backward_loss=3.31, forward_loss=5.78]


Epoch: 11, Average Forward Loss: 4.7997, Average Backward Loss: 0.4961


Epoch 12: 100%|██████████| 410/410 [00:41<00:00,  9.82it/s, avg_backward_loss=0.466, avg_forward_loss=4.65, backward_loss=3.48, forward_loss=5.76]


Epoch: 12, Average Forward Loss: 4.6507, Average Backward Loss: 0.4661


Epoch 13: 100%|██████████| 410/410 [00:41<00:00,  9.78it/s, avg_backward_loss=0.446, avg_forward_loss=4.51, backward_loss=3.13, forward_loss=4.98]


Epoch: 13, Average Forward Loss: 4.5061, Average Backward Loss: 0.4464


Epoch 14: 100%|██████████| 410/410 [00:41<00:00,  9.81it/s, avg_backward_loss=0.432, avg_forward_loss=4.37, backward_loss=3.21, forward_loss=5.02]

Epoch: 14, Average Forward Loss: 4.3719, Average Backward Loss: 0.4317





In [9]:
torch.save(forward_model, f"forward_model_final.pt")
torch.save(backward_model, f"backward_model_final.pt")

##Downstream classification task (classification.py)

In [11]:
import torch
import json
import csv
from tqdm import tqdm
from pprint import pprint
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import os

from torch import nn, optim
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

import regex as re
print("Done importing...")

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = text.strip()
    text = re.sub('\s+', ' ', text)
    return text

class lstm(nn.Module):
    def __init__(self, vocab_size):
        super(lstm, self).__init__()
        self.vocab_size = vocab_size
        # lstm with 2 stacks
        num_stacks = 2
        self.embeddings = nn.Embedding(vocab_size, 300)
        self.lstm = nn.LSTM(300, 300, 1, batch_first=True)
        self.lstm1 = nn.LSTM(300, 300, 1, batch_first=True)
        self.linear = nn.Linear(300, vocab_size)

    def forward(self, x):
        embed = self.embeddings(x)
        x1, _ = self.lstm(embed)
        x2, _ = self.lstm1(x1)
        x = self.linear(x2)
        return x, (embed, x1, x2)

def make_dataset(data, word2idx, idx2word, max_sentences=20000):
    seq_list = []
    label_list = []
    length_list = []
    data = data[1:max_sentences+1]  # Select only the first max_sentences
    for item in data:
        idx_seq = []
        item = item.strip().split(',')  # Split the line into comma-separated values
        label = item[0]
        seq = preprocess(item[1])
        seq = seq.split()
        for word in seq:
            if word in word2idx.keys():
                idx_seq.append(word2idx[word])
            else:
                idx_seq.append(word2idx["unk"])
        length = len(idx_seq)
        length_list.append(length)
        seq_list.append(idx_seq)
        label_list.append(label)
    return seq_list, label_list, length_list


import os
import torch

word2idx_path = "word2idx.pt"
idx2word_path = "idx2word.pt"

word2idx = torch.load(word2idx_path)
idx2word = torch.load(idx2word_path)

def load_dataset(filename):
    file_path = os.path.join("drive/MyDrive/NLP_A4", filename)  # Update the dataset folder path
    with open(file_path, "r") as file:
        data = file.readlines()
    return data

# Load train and test datasets
train_data = load_dataset("train.csv")
test_data = load_dataset("test.csv")

# Preprocess and create sequences, labels, and lengths
train_seqs, train_labels, train_lengths = make_dataset(train_data, word2idx, idx2word)
test_seqs, test_labels, test_lengths = make_dataset(test_data, word2idx, idx2word)

from torch.nn.utils.rnn import pad_sequence

train_seqs = pad_sequence([torch.LongTensor(i) for i in train_seqs], batch_first=True)
test_seqs = pad_sequence([torch.LongTensor(i) for i in test_seqs], batch_first=True)

train_labels = torch.LongTensor([int(i) for i in train_labels])
test_labels = torch.LongTensor([int(i) for i in test_labels])
train_lengths = torch.LongTensor(train_lengths)
test_lengths = torch.LongTensor(test_lengths)

print("train, test input shapes: ", train_seqs.shape, test_seqs.shape)
print("train, test label shapes: ", train_labels.shape, test_labels.shape)
print("train, test length shapes: ", train_lengths.shape, test_lengths.shape)

class Classifier(nn.Module):
    def __init__(self, num_classes, in_dim):
        super(Classifier, self).__init__()
        self.linear_layer = nn.Linear(in_dim, 600)
        self.lstm = nn.LSTM(600, 300, 1, batch_first=True)
        self.linear_layer2 = nn.Linear(300, num_classes)

        # Initialize learnable lambda parameters
        self.lambda_0 = nn.Parameter(torch.rand(1))
        self.lambda_1 = nn.Parameter(torch.rand(1))
        self.lambda_2 = nn.Parameter(torch.rand(1))

    def forward(self, x0, x1, x2, length):
        x = self.linear_layer(self.lambda_0 * x0 + self.lambda_1 * x1 + self.lambda_2 * x2)
        x, _ = self.lstm(x)

        final_preds = []
        loc_cnt = 0
        for loc_len in length:
            loc_x = x[loc_cnt, loc_len-1, :]
            loc_out = self.linear_layer2(loc_x)
            loc_cnt += 1
            final_preds.append(loc_out)
        final_preds = torch.stack(final_preds)
        return final_preds

forward_lstm = torch.load("backward_model_final.pt").to(device)
backward_lstm = torch.load("forward_model_final.pt").to(device)

# Freeze the ELMo model parameters
for param in list(forward_lstm.parameters()) + list(backward_lstm.parameters()):
    param.requires_grad = False

train_dataset = torch.utils.data.TensorDataset(train_seqs, train_labels, train_lengths)

# split train set into train and validation set 20 percent
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

test_dataset = torch.utils.data.TensorDataset(test_seqs, test_labels, test_lengths)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=50, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=50, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=50, shuffle=True)

in_num = 600
num_classes = 5
downstream_model = Classifier(num_classes, in_num).to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(downstream_model.parameters(), lr=0.0001)

from tqdm import tqdm

n_epochs = 15
best_val_f1 = 0.0

for epoch in range(n_epochs):
    train_accurs = []
    train_f1s = []
    val_accurs = []
    val_f1s = []

    print("Epoch {}:".format(epoch + 1))

    # Training
    downstream_model.train()
    train_loader_tqdm = tqdm(train_loader, desc="Training")
    for data, label, length in train_loader_tqdm:
        data = data.to(device)
        data_reverse = data.flip(1)
        _, (xf0, xf1, xf2) = forward_lstm(data)
        _, (xb0, xb1, xb2) = backward_lstm(data_reverse)
        xf0, xf1, xf2 = xf0.detach(), xf1.detach(), xf2.detach()
        xb0, xb1, xb2 = xb0.detach(), xb1.detach(), xb2.detach()
        x0 = torch.cat((xf0, xb0), dim=2)
        x1 = torch.cat((xf1, xb1), dim=2)
        x2 = torch.cat((xf2, xb2), dim=2)
        output = downstream_model(x0, x1, x2, length)
        loss = loss_func(output, label.to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, predicted = torch.max(output, 1)
        accuracy = accuracy_score(label.cpu(), predicted.cpu())
        micro_f1 = f1_score(label.cpu(), predicted.cpu(), average="micro")
        train_accurs.append(accuracy)
        train_f1s.append(micro_f1)
        train_loader_tqdm.set_postfix(loss=loss.item(), accuracy=accuracy, micro_f1=micro_f1)

    train_accuracy = sum(train_accurs) / len(train_accurs)
    train_micro_f1 = sum(train_f1s) / len(train_f1s)
    print("Training - Accuracy: {:.4f}, Micro F1: {:.4f}".format(train_accuracy, train_micro_f1))

    # Validation
    downstream_model.eval()
    with torch.no_grad():
        val_loader_tqdm = tqdm(val_loader, desc="Validation")
        for data, label, length in val_loader_tqdm:
            data = data.to(device)
            data_reverse = data.flip(1)
            _, (xf0, xf1, xf2) = forward_lstm(data)
            _, (xb0, xb1, xb2) = backward_lstm(data_reverse)
            xf0, xf1, xf2 = xf0.detach(), xf1.detach(), xf2.detach()
            xb0, xb1, xb2 = xb0.detach(), xb1.detach(), xb2.detach()
            x0 = torch.cat((xf0, xb0), dim=2)
            x1 = torch.cat((xf1, xb1), dim=2)
            x2 = torch.cat((xf2, xb2), dim=2)
            output = downstream_model(x0, x1, x2, length)

            _, predicted = torch.max(output, 1)
            accuracy = accuracy_score(label.cpu(), predicted.cpu())
            micro_f1 = f1_score(label.cpu(), predicted.cpu(), average="micro")
            val_accurs.append(accuracy)
            val_f1s.append(micro_f1)
            val_loader_tqdm.set_postfix(accuracy=accuracy, micro_f1=micro_f1)

    val_accuracy = sum(val_accurs) / len(val_accurs)
    val_micro_f1 = sum(val_f1s) / len(val_f1s)
    print("Validation - Accuracy: {:.4f}, Micro F1: {:.4f}".format(val_accuracy, val_micro_f1))

    if val_micro_f1 > best_val_f1:
        best_val_f1 = val_micro_f1
        # Save the model
        torch.save(downstream_model, "best_downstream_model.pt")

print("Training complete.")

# Testing the best model
best_downstream_model = torch.load("best_downstream_model.pt")
best_downstream_model.eval()

test_accurs = []
test_f1s = []

with torch.no_grad():
    for data, label, length in test_loader:
        data = data.to(device)
        data_reverse = data.flip(1)
        _, (xf0, xf1, xf2) = forward_lstm(data)
        _, (xb0, xb1, xb2) = backward_lstm(data_reverse)
        xf0, xf1, xf2 = xf0.detach(), xf1.detach(), xf2.detach()
        xb0, xb1, xb2 = xb0.detach(), xb1.detach(), xb2.detach()
        x0 = torch.cat((xf0, xb0), dim=2)
        x1 = torch.cat((xf1, xb1), dim=2)
        x2 = torch.cat((xf2, xb2), dim=2)
        output = best_downstream_model(x0, x1, x2, length)

        _, predicted = torch.max(output, 1)
        accuracy = accuracy_score(label.cpu(), predicted.cpu())
        micro_f1 = f1_score(label.cpu(), predicted.cpu(), average="micro")
        test_accurs.append(accuracy)
        test_f1s.append(micro_f1)

test_accuracy = sum(test_accurs) / len(test_accurs)
test_micro_f1 = sum(test_f1s) / len(test_f1s)
print("Test - Accuracy: {:.4f}, Micro F1: {:.4f}".format(test_accuracy, test_micro_f1))

Done importing...
train, test input shapes:  torch.Size([20000, 117]) torch.Size([7600, 104])
train, test label shapes:  torch.Size([20000]) torch.Size([7600])
train, test length shapes:  torch.Size([20000]) torch.Size([7600])
Epoch 1:


Training: 100%|██████████| 320/320 [01:01<00:00,  5.20it/s, accuracy=0.5, loss=1.25, micro_f1=0.5]


Training - Accuracy: 0.3551, Micro F1: 0.3551


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.45it/s, accuracy=0.36, micro_f1=0.36]


Validation - Accuracy: 0.4108, Micro F1: 0.4107
Epoch 2:


Training: 100%|██████████| 320/320 [01:00<00:00,  5.33it/s, accuracy=0.3, loss=1.23, micro_f1=0.3]


Training - Accuracy: 0.4296, Micro F1: 0.4296


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.37it/s, accuracy=0.46, micro_f1=0.46]


Validation - Accuracy: 0.4678, Micro F1: 0.4678
Epoch 3:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.49it/s, accuracy=0.38, loss=1.2, micro_f1=0.38]


Training - Accuracy: 0.4910, Micro F1: 0.4910


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.38it/s, accuracy=0.52, micro_f1=0.52]


Validation - Accuracy: 0.5065, Micro F1: 0.5065
Epoch 4:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.48it/s, accuracy=0.7, loss=0.85, micro_f1=0.7]


Training - Accuracy: 0.5404, Micro F1: 0.5404


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.41it/s, accuracy=0.52, micro_f1=0.52]


Validation - Accuracy: 0.5455, Micro F1: 0.5455
Epoch 5:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.49it/s, accuracy=0.6, loss=0.93, micro_f1=0.6]


Training - Accuracy: 0.5767, Micro F1: 0.5767


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.40it/s, accuracy=0.7, micro_f1=0.7]


Validation - Accuracy: 0.5827, Micro F1: 0.5827
Epoch 6:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.50it/s, accuracy=0.6, loss=0.953, micro_f1=0.6]


Training - Accuracy: 0.5997, Micro F1: 0.5997


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.37it/s, accuracy=0.62, micro_f1=0.62]


Validation - Accuracy: 0.5885, Micro F1: 0.5885
Epoch 7:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.51it/s, accuracy=0.52, loss=1, micro_f1=0.52]


Training - Accuracy: 0.6216, Micro F1: 0.6216


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.37it/s, accuracy=0.54, micro_f1=0.54]


Validation - Accuracy: 0.6015, Micro F1: 0.6015
Epoch 8:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.49it/s, accuracy=0.64, loss=0.88, micro_f1=0.64]


Training - Accuracy: 0.6387, Micro F1: 0.6387


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.45it/s, accuracy=0.48, micro_f1=0.48]


Validation - Accuracy: 0.6095, Micro F1: 0.6095
Epoch 9:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.49it/s, accuracy=0.6, loss=0.854, micro_f1=0.6]


Training - Accuracy: 0.6602, Micro F1: 0.6602


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.35it/s, accuracy=0.54, micro_f1=0.54]


Validation - Accuracy: 0.6222, Micro F1: 0.6222
Epoch 10:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.51it/s, accuracy=0.72, loss=0.693, micro_f1=0.72]


Training - Accuracy: 0.6684, Micro F1: 0.6684


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.41it/s, accuracy=0.56, micro_f1=0.56]


Validation - Accuracy: 0.6182, Micro F1: 0.6182
Epoch 11:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.48it/s, accuracy=0.8, loss=0.618, micro_f1=0.8]


Training - Accuracy: 0.6859, Micro F1: 0.6859


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.39it/s, accuracy=0.62, micro_f1=0.62]


Validation - Accuracy: 0.6385, Micro F1: 0.6385
Epoch 12:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.51it/s, accuracy=0.72, loss=0.719, micro_f1=0.72]


Training - Accuracy: 0.7062, Micro F1: 0.7062


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.35it/s, accuracy=0.62, micro_f1=0.62]


Validation - Accuracy: 0.6400, Micro F1: 0.6400
Epoch 13:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.47it/s, accuracy=0.64, loss=0.851, micro_f1=0.64]


Training - Accuracy: 0.7154, Micro F1: 0.7154


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.42it/s, accuracy=0.68, micro_f1=0.68]


Validation - Accuracy: 0.6570, Micro F1: 0.6570
Epoch 14:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.49it/s, accuracy=0.78, loss=0.636, micro_f1=0.78]


Training - Accuracy: 0.7352, Micro F1: 0.7352


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.40it/s, accuracy=0.76, micro_f1=0.76]


Validation - Accuracy: 0.6660, Micro F1: 0.6660
Epoch 15:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.50it/s, accuracy=0.74, loss=0.624, micro_f1=0.74]


Training - Accuracy: 0.7471, Micro F1: 0.7471


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.40it/s, accuracy=0.82, micro_f1=0.82]


Validation - Accuracy: 0.6660, Micro F1: 0.6660
Training complete.
Test - Accuracy: 0.6150, Micro F1: 0.6150


##Hyperparameter Tuning##

Trainable λs

In [12]:
import torch
import json
import csv
from tqdm import tqdm
from pprint import pprint
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import gensim
import os

from torch import nn, optim
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

import regex as re
print("Done importing...")

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = text.strip()
    text = re.sub('\s+', ' ', text)
    return text

class lstm(nn.Module):
    def __init__(self, vocab_size):
        super(lstm, self).__init__()
        self.vocab_size = vocab_size
        # lstm with 2 stacks
        num_stacks = 2
        self.embeddings = nn.Embedding(vocab_size, 300)
        self.lstm = nn.LSTM(300, 300, 1, batch_first=True)
        self.lstm1 = nn.LSTM(300, 300, 1, batch_first=True)
        self.linear = nn.Linear(300, vocab_size)

    def forward(self, x):
        embed = self.embeddings(x)
        x1, _ = self.lstm(embed)
        x2, _ = self.lstm1(x1)
        x = self.linear(x2)
        return x, (embed, x1, x2)

def make_dataset(data, word2idx, idx2word, max_sentences=20000):
    seq_list = []
    label_list = []
    length_list = []
    data = data[1:max_sentences+1]  # Select only the first max_sentences
    for item in data:
        idx_seq = []
        item = item.strip().split(',')
        label = item[0]
        seq = preprocess(item[1])
        seq = seq.split()
        for word in seq:
            if word in word2idx.keys():
                idx_seq.append(word2idx[word])
            else:
                idx_seq.append(word2idx["unk"])
        length = len(idx_seq)
        length_list.append(length)
        seq_list.append(idx_seq)
        label_list.append(label)
    return seq_list, label_list, length_list

import os
import torch

word2idx_path = "word2idx.pt"
idx2word_path = "idx2word.pt"

word2idx = torch.load(word2idx_path)
idx2word = torch.load(idx2word_path)

def load_dataset(filename):
    file_path = os.path.join("drive/MyDrive/NLP_A4", filename)
    with open(file_path, "r") as file:
        data = file.readlines()
    return data

train_data = load_dataset("train.csv")
test_data = load_dataset("test.csv")

#preprocess and create sequences, labels, and lengths
train_seqs, train_labels, train_lengths = make_dataset(train_data, word2idx, idx2word)
test_seqs, test_labels, test_lengths = make_dataset(test_data, word2idx, idx2word)

from torch.nn.utils.rnn import pad_sequence

train_seqs = pad_sequence([torch.LongTensor(i) for i in train_seqs], batch_first=True)
test_seqs = pad_sequence([torch.LongTensor(i) for i in test_seqs], batch_first=True)

train_labels = torch.LongTensor([int(i) for i in train_labels])
test_labels = torch.LongTensor([int(i) for i in test_labels])
train_lengths = torch.LongTensor(train_lengths)
test_lengths = torch.LongTensor(test_lengths)

print("train, test input shapes: ", train_seqs.shape, test_seqs.shape)
print("train, test label shapes: ", train_labels.shape, test_labels.shape)
print("train, test length shapes: ", train_lengths.shape, test_lengths.shape)

class Classifier(nn.Module):
    def __init__(self, num_classes, in_dim):
        super(Classifier, self).__init__()
        self.linear_layer = nn.Linear(in_dim, 600)
        self.lstm = nn.LSTM(600, 300, 1, batch_first=True)
        self.linear_layer2 = nn.Linear(300, num_classes)

        # Initialize learnable lambda parameters
        self.lambda_0 = nn.Parameter(torch.rand(1))
        self.lambda_1 = nn.Parameter(torch.rand(1))
        self.lambda_2 = nn.Parameter(torch.rand(1))

    def forward(self, x0, x1, x2, length):
        x = self.linear_layer(self.lambda_0 * x0 + self.lambda_1 * x1 + self.lambda_2 * x2)
        x, _ = self.lstm(x)

        final_preds = []
        loc_cnt = 0
        for loc_len in length:
            loc_x = x[loc_cnt, loc_len-1, :]
            loc_out = self.linear_layer2(loc_x)
            loc_cnt += 1
            final_preds.append(loc_out)
        final_preds = torch.stack(final_preds)
        return final_preds

forward_lstm = torch.load("backward_model_final.pt").to(device)
backward_lstm = torch.load("forward_model_final.pt").to(device)

for param in list(forward_lstm.parameters()) + list(backward_lstm.parameters()):
    param.requires_grad = False

train_dataset = torch.utils.data.TensorDataset(train_seqs, train_labels, train_lengths)

# split train set into train and validation set 20 percent
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

test_dataset = torch.utils.data.TensorDataset(test_seqs, test_labels, test_lengths)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=50, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=50, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=50, shuffle=True)

in_num = 600
num_classes = 5
downstream_model = Classifier(num_classes, in_num).to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(downstream_model.parameters(), lr=0.0001)

from tqdm import tqdm

n_epochs = 10

best_val_f1 = 0.0

for epoch in range(n_epochs):
    train_accurs = []
    train_f1s = []
    val_accurs = []
    val_f1s = []

    print("Epoch {}:".format(epoch + 1))

    # Training
    downstream_model.train()
    train_loader_tqdm = tqdm(train_loader, desc="Training")
    for data, label, length in train_loader_tqdm:
        data = data.to(device)
        data_reverse = data.flip(1)
        _, (xf0, xf1, xf2) = forward_lstm(data)
        _, (xb0, xb1, xb2) = backward_lstm(data_reverse)
        xf0, xf1, xf2 = xf0.detach(), xf1.detach(), xf2.detach()
        xb0, xb1, xb2 = xb0.detach(), xb1.detach(), xb2.detach()
        x0 = torch.cat((xf0, xb0), dim=2)
        x1 = torch.cat((xf1, xb1), dim=2)
        x2 = torch.cat((xf2, xb2), dim=2)
        output = downstream_model(x0, x1, x2, length)
        loss = loss_func(output, label.to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, predicted = torch.max(output, 1)
        accuracy = accuracy_score(label.cpu(), predicted.cpu())
        micro_f1 = f1_score(label.cpu(), predicted.cpu(), average="micro")
        train_accurs.append(accuracy)
        train_f1s.append(micro_f1)
        train_loader_tqdm.set_postfix(loss=loss.item(), accuracy=accuracy, micro_f1=micro_f1)

    train_accuracy = sum(train_accurs) / len(train_accurs)
    train_micro_f1 = sum(train_f1s) / len(train_f1s)
    print("Training - Accuracy: {:.4f}, Micro F1: {:.4f}".format(train_accuracy, train_micro_f1))

    # Validation
    downstream_model.eval()
    with torch.no_grad():
        val_loader_tqdm = tqdm(val_loader, desc="Validation")
        for data, label, length in val_loader_tqdm:
            data = data.to(device)
            data_reverse = data.flip(1)
            _, (xf0, xf1, xf2) = forward_lstm(data)
            _, (xb0, xb1, xb2) = backward_lstm(data_reverse)
            xf0, xf1, xf2 = xf0.detach(), xf1.detach(), xf2.detach()
            xb0, xb1, xb2 = xb0.detach(), xb1.detach(), xb2.detach()
            x0 = torch.cat((xf0, xb0), dim=2)
            x1 = torch.cat((xf1, xb1), dim=2)
            x2 = torch.cat((xf2, xb2), dim=2)
            output = downstream_model(x0, x1, x2, length)

            _, predicted = torch.max(output, 1)
            accuracy = accuracy_score(label.cpu(), predicted.cpu())
            micro_f1 = f1_score(label.cpu(), predicted.cpu(), average="micro")
            val_accurs.append(accuracy)
            val_f1s.append(micro_f1)
            val_loader_tqdm.set_postfix(accuracy=accuracy, micro_f1=micro_f1)

    val_accuracy = sum(val_accurs) / len(val_accurs)
    val_micro_f1 = sum(val_f1s) / len(val_f1s)
    print("Validation - Accuracy: {:.4f}, Micro F1: {:.4f}".format(val_accuracy, val_micro_f1))

    # Check if this is the best validation F1 score so far
    if val_micro_f1 > best_val_f1:
        best_val_f1 = val_micro_f1
        # Save the model
        torch.save(downstream_model, "best_downstream_model.pt")

print("Training complete.")

best_downstream_model = torch.load("best_downstream_model.pt")
best_downstream_model.eval()

test_accurs = []
test_f1s = []

with torch.no_grad():
    for data, label, length in test_loader:
        data = data.to(device)
        data_reverse = data.flip(1)
        _, (xf0, xf1, xf2) = forward_lstm(data)
        _, (xb0, xb1, xb2) = backward_lstm(data_reverse)
        xf0, xf1, xf2 = xf0.detach(), xf1.detach(), xf2.detach()
        xb0, xb1, xb2 = xb0.detach(), xb1.detach(), xb2.detach()
        x0 = torch.cat((xf0, xb0), dim=2)
        x1 = torch.cat((xf1, xb1), dim=2)
        x2 = torch.cat((xf2, xb2), dim=2)
        output = best_downstream_model(x0, x1, x2, length)

        _, predicted = torch.max(output, 1)
        accuracy = accuracy_score(label.cpu(), predicted.cpu())
        micro_f1 = f1_score(label.cpu(), predicted.cpu(), average="micro")
        test_accurs.append(accuracy)
        test_f1s.append(micro_f1)

test_accuracy = sum(test_accurs) / len(test_accurs)
test_micro_f1 = sum(test_f1s) / len(test_f1s)
print("Test - Accuracy: {:.4f}, Micro F1: {:.4f}".format(test_accuracy, test_micro_f1))


Done importing...
train, test input shapes:  torch.Size([20000, 117]) torch.Size([7600, 104])
train, test label shapes:  torch.Size([20000]) torch.Size([7600])
train, test length shapes:  torch.Size([20000]) torch.Size([7600])
Epoch 1:


Training: 100%|██████████| 320/320 [01:04<00:00,  4.93it/s, accuracy=0.42, loss=1.24, micro_f1=0.42]


Training - Accuracy: 0.3716, Micro F1: 0.3716


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.17it/s, accuracy=0.32, micro_f1=0.32]


Validation - Accuracy: 0.3615, Micro F1: 0.3615
Epoch 2:


Training: 100%|██████████| 320/320 [01:01<00:00,  5.19it/s, accuracy=0.56, loss=1.12, micro_f1=0.56]


Training - Accuracy: 0.4777, Micro F1: 0.4777


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.38it/s, accuracy=0.46, micro_f1=0.46]


Validation - Accuracy: 0.5268, Micro F1: 0.5268
Epoch 3:


Training: 100%|██████████| 320/320 [01:08<00:00,  4.70it/s, accuracy=0.58, loss=0.965, micro_f1=0.58]


Training - Accuracy: 0.5546, Micro F1: 0.5546


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.40it/s, accuracy=0.62, micro_f1=0.62]


Validation - Accuracy: 0.5670, Micro F1: 0.5670
Epoch 4:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.51it/s, accuracy=0.64, loss=0.86, micro_f1=0.64]


Training - Accuracy: 0.5950, Micro F1: 0.5950


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.39it/s, accuracy=0.58, micro_f1=0.58]


Validation - Accuracy: 0.6192, Micro F1: 0.6192
Epoch 5:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.50it/s, accuracy=0.68, loss=0.884, micro_f1=0.68]


Training - Accuracy: 0.6315, Micro F1: 0.6315


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.40it/s, accuracy=0.54, micro_f1=0.54]


Validation - Accuracy: 0.5737, Micro F1: 0.5737
Epoch 6:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.50it/s, accuracy=0.58, loss=0.869, micro_f1=0.58]


Training - Accuracy: 0.6474, Micro F1: 0.6474


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.41it/s, accuracy=0.6, micro_f1=0.6]


Validation - Accuracy: 0.6485, Micro F1: 0.6485
Epoch 7:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.51it/s, accuracy=0.64, loss=0.789, micro_f1=0.64]


Training - Accuracy: 0.6766, Micro F1: 0.6766


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.40it/s, accuracy=0.6, micro_f1=0.6]


Validation - Accuracy: 0.6482, Micro F1: 0.6482
Epoch 8:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.50it/s, accuracy=0.82, loss=0.566, micro_f1=0.82]


Training - Accuracy: 0.6951, Micro F1: 0.6951


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.38it/s, accuracy=0.8, micro_f1=0.8]


Validation - Accuracy: 0.6685, Micro F1: 0.6685
Epoch 9:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.49it/s, accuracy=0.7, loss=0.713, micro_f1=0.7]


Training - Accuracy: 0.7169, Micro F1: 0.7169


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.40it/s, accuracy=0.62, micro_f1=0.62]


Validation - Accuracy: 0.6755, Micro F1: 0.6755
Epoch 10:


Training: 100%|██████████| 320/320 [00:57<00:00,  5.52it/s, accuracy=0.66, loss=0.725, micro_f1=0.66]


Training - Accuracy: 0.7298, Micro F1: 0.7298


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.40it/s, accuracy=0.78, micro_f1=0.78]


Validation - Accuracy: 0.6900, Micro F1: 0.6900
Training complete.
Test - Accuracy: 0.6204, Micro F1: 0.6204


In [13]:
from sklearn.metrics import confusion_matrix

#testing the best model
best_downstream_model = torch.load("best_downstream_model.pt")
best_downstream_model.eval()

test_accurs = []
test_f1s = []
true_labels = []
predicted_labels = []

with torch.no_grad():
    for data, label, length in test_loader:
        data = data.to(device)
        data_reverse = data.flip(1)
        _, (xf0, xf1, xf2) = forward_lstm(data)
        _, (xb0, xb1, xb2) = backward_lstm(data_reverse)
        xf0, xf1, xf2 = xf0.detach(), xf1.detach(), xf2.detach()
        xb0, xb1, xb2 = xb0.detach(), xb1.detach(), xb2.detach()
        x0 = torch.cat((xf0, xb0), dim=2)
        x1 = torch.cat((xf1, xb1), dim=2)
        x2 = torch.cat((xf2, xb2), dim=2)
        output = best_downstream_model(x0, x1, x2, length)

        _, predicted = torch.max(output, 1)
        true_labels.extend(label.cpu().numpy())
        predicted_labels.extend(predicted.cpu().numpy())

        accuracy = accuracy_score(label.cpu(), predicted.cpu())
        micro_f1 = f1_score(label.cpu(), predicted.cpu(), average="micro")
        test_accurs.append(accuracy)
        test_f1s.append(micro_f1)

test_accuracy = sum(test_accurs) / len(test_accurs)
test_micro_f1 = sum(test_f1s) / len(test_f1s)
print("Test - Accuracy: {:.4f}, Micro F1: {:.4f}".format(test_accuracy, test_micro_f1))

conf_matrix = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)


Test - Accuracy: 0.6204, Micro F1: 0.6204
Confusion Matrix:
[[1216  177  160  347]
 [ 276 1084  132  408]
 [ 189  101 1016  594]
 [ 206   97  198 1399]]


Frozen λs

In [14]:
import torch
import json
import csv
from tqdm import tqdm
from pprint import pprint
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import gensim
import os

from torch import nn, optim
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

import regex as re
print("Done importing...")

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = text.strip()
    text = re.sub('\s+', ' ', text)
    return text

class lstm(nn.Module):
    def __init__(self, vocab_size):
        super(lstm, self).__init__()
        self.vocab_size = vocab_size
        # lstm with 2 stacks
        num_stacks = 2
        self.embeddings = nn.Embedding(vocab_size, 300)
        self.lstm = nn.LSTM(300, 300, 1, batch_first=True)
        self.lstm1 = nn.LSTM(300, 300, 1, batch_first=True)
        self.linear = nn.Linear(300, vocab_size)

    def forward(self, x):
        embed = self.embeddings(x)
        x1, _ = self.lstm(embed)
        x2, _ = self.lstm1(x1)
        x = self.linear(x2)
        return x, (embed, x1, x2)

def make_dataset(data, word2idx, idx2word, max_sentences=20000):
    seq_list = []
    label_list = []
    length_list = []
    data = data[1:max_sentences+1]
    for item in data:
        idx_seq = []
        item = item.strip().split(',')
        label = item[0]
        seq = preprocess(item[1])
        seq = seq.split()
        for word in seq:
            if word in word2idx.keys():
                idx_seq.append(word2idx[word])
            else:
                idx_seq.append(word2idx["unk"])
        length = len(idx_seq)
        length_list.append(length)
        seq_list.append(idx_seq)
        label_list.append(label)
    return seq_list, label_list, length_list


import os
import torch

word2idx_path = "word2idx.pt"
idx2word_path = "idx2word.pt"

word2idx = torch.load(word2idx_path)
idx2word = torch.load(idx2word_path)

def load_dataset(filename):
    file_path = os.path.join("drive/MyDrive/NLP_A4", filename)
    with open(file_path, "r") as file:
        data = file.readlines()
    return data

train_data = load_dataset("train.csv")
test_data = load_dataset("test.csv")

#preprocess and create sequences, labels, and lengths
train_seqs, train_labels, train_lengths = make_dataset(train_data, word2idx, idx2word)
test_seqs, test_labels, test_lengths = make_dataset(test_data, word2idx, idx2word)

from torch.nn.utils.rnn import pad_sequence

train_seqs = pad_sequence([torch.LongTensor(i) for i in train_seqs], batch_first=True)
test_seqs = pad_sequence([torch.LongTensor(i) for i in test_seqs], batch_first=True)

train_labels = torch.LongTensor([int(i) for i in train_labels])
test_labels = torch.LongTensor([int(i) for i in test_labels])
train_lengths = torch.LongTensor(train_lengths)
test_lengths = torch.LongTensor(test_lengths)

print("train, test input shapes: ", train_seqs.shape, test_seqs.shape)
print("train, test label shapes: ", train_labels.shape, test_labels.shape)
print("train, test length shapes: ", train_lengths.shape, test_lengths.shape)

class Classifier(nn.Module):
    def __init__(self, num_classes, in_dim):
        super(Classifier, self).__init__()
        self.linear_layer = nn.Linear(in_dim, 600)
        self.lstm = nn.LSTM(600, 300, 1, batch_first=True)
        self.linear_layer2 = nn.Linear(300, num_classes)

        # Initialize learnable lambda parameters
        self.lambda_0 = nn.Parameter(torch.rand(1))
        self.lambda_1 = nn.Parameter(torch.rand(1))
        self.lambda_2 = nn.Parameter(torch.rand(1))

    def forward(self, x0, x1, x2, length):
        x = self.linear_layer(self.lambda_0 * x0 + self.lambda_1 * x1 + self.lambda_2 * x2)
        x, _ = self.lstm(x)

        final_preds = []
        loc_cnt = 0
        for loc_len in length:
            loc_x = x[loc_cnt, loc_len-1, :]
            loc_out = self.linear_layer2(loc_x)
            loc_cnt += 1
            final_preds.append(loc_out)
        final_preds = torch.stack(final_preds)
        return final_preds

forward_lstm = torch.load("backward_model_final.pt").to(device)
backward_lstm = torch.load("forward_model_final.pt").to(device)

# Freeze the ELMo model parameters
for param in list(forward_lstm.parameters()) + list(backward_lstm.parameters()):
    param.requires_grad = False

train_dataset = torch.utils.data.TensorDataset(train_seqs, train_labels, train_lengths)

# split train set into train and validation set 20 percent
train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

test_dataset = torch.utils.data.TensorDataset(test_seqs, test_labels, test_lengths)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=50, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=50, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=50, shuffle=True)

in_num = 600
num_classes = 5
downstream_model = Classifier(num_classes, in_num).to(device)

# Freeze the lambda parameters for the frozen setting
for param in [downstream_model.lambda_0, downstream_model.lambda_1, downstream_model.lambda_2]:
    param.requires_grad = False

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(downstream_model.parameters(), lr=0.0001)

from tqdm import tqdm

n_epochs = 10

best_val_f1 = 0.0

for epoch in range(n_epochs):
    train_accurs = []
    train_f1s = []
    val_accurs = []
    val_f1s = []

    print("Epoch {}:".format(epoch + 1))

    # Training
    downstream_model.train()
    train_loader_tqdm = tqdm(train_loader, desc="Training")
    for data, label, length in train_loader_tqdm:
        data = data.to(device)
        data_reverse = data.flip(1)
        _, (xf0, xf1, xf2) = forward_lstm(data)
        _, (xb0, xb1, xb2) = backward_lstm(data_reverse)
        xf0, xf1, xf2 = xf0.detach(), xf1.detach(), xf2.detach()
        xb0, xb1, xb2 = xb0.detach(), xb1.detach(), xb2.detach()
        x0 = torch.cat((xf0, xb0), dim=2)
        x1 = torch.cat((xf1, xb1), dim=2)
        x2 = torch.cat((xf2, xb2), dim=2)
        output = downstream_model(x0, x1, x2, length)
        loss = loss_func(output, label.to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, predicted = torch.max(output, 1)
        accuracy = accuracy_score(label.cpu(), predicted.cpu())
        micro_f1 = f1_score(label.cpu(), predicted.cpu(), average="micro")
        train_accurs.append(accuracy)
        train_f1s.append(micro_f1)
        train_loader_tqdm.set_postfix(loss=loss.item(), accuracy=accuracy, micro_f1=micro_f1)

    train_accuracy = sum(train_accurs) / len(train_accurs)
    train_micro_f1 = sum(train_f1s) / len(train_f1s)
    print("Training - Accuracy: {:.4f}, Micro F1: {:.4f}".format(train_accuracy, train_micro_f1))

    # Validation
    downstream_model.eval()
    with torch.no_grad():
        val_loader_tqdm = tqdm(val_loader, desc="Validation")
        for data, label, length in val_loader_tqdm:
            data = data.to(device)
            data_reverse = data.flip(1)
            _, (xf0, xf1, xf2) = forward_lstm(data)
            _, (xb0, xb1, xb2) = backward_lstm(data_reverse)
            xf0, xf1, xf2 = xf0.detach(), xf1.detach(), xf2.detach()
            xb0, xb1, xb2 = xb0.detach(), xb1.detach(), xb2.detach()
            x0 = torch.cat((xf0, xb0), dim=2)
            x1 = torch.cat((xf1, xb1), dim=2)
            x2 = torch.cat((xf2, xb2), dim=2)
            output = downstream_model(x0, x1, x2, length)

            _, predicted = torch.max(output, 1)
            accuracy = accuracy_score(label.cpu(), predicted.cpu())
            micro_f1 = f1_score(label.cpu(), predicted.cpu(), average="micro")
            val_accurs.append(accuracy)
            val_f1s.append(micro_f1)
            val_loader_tqdm.set_postfix(accuracy=accuracy, micro_f1=micro_f1)

    val_accuracy = sum(val_accurs) / len(val_accurs)
    val_micro_f1 = sum(val_f1s) / len(val_f1s)
    print("Validation - Accuracy: {:.4f}, Micro F1: {:.4f}".format(val_accuracy, val_micro_f1))

    # Check if this is the best validation F1 score so far
    if val_micro_f1 > best_val_f1:
        best_val_f1 = val_micro_f1
        # Save the model
        torch.save(downstream_model, "best_downstream_model_frozen.pt")

print("Training complete.")

# Testing the best model
best_downstream_model = torch.load("best_downstream_model_frozen.pt")
best_downstream_model.eval()

test_accurs = []
test_f1s = []
true_labels = []
predicted_labels = []

with torch.no_grad():
    for data, label, length in test_loader:
        data = data.to(device)
        data_reverse = data.flip(1)
        _, (xf0, xf1, xf2) = forward_lstm(data)
        _, (xb0, xb1, xb2) = backward_lstm(data_reverse)
        xf0, xf1, xf2 = xf0.detach(), xf1.detach(), xf2.detach()
        xb0, xb1, xb2 = xb0.detach(), xb1.detach(), xb2.detach()
        x0 = torch.cat((xf0, xb0), dim=2)
        x1 = torch.cat((xf1, xb1), dim=2)
        x2 = torch.cat((xf2, xb2), dim=2)
        output = best_downstream_model(x0, x1, x2, length)

        _, predicted = torch.max(output, 1)
        true_labels.extend(label.cpu().numpy())
        predicted_labels.extend(predicted.cpu().numpy())

        accuracy = accuracy_score(label.cpu(), predicted.cpu())
        micro_f1 = f1_score(label.cpu(), predicted.cpu(), average="micro")
        test_accurs.append(accuracy)
        test_f1s.append(micro_f1)

test_accuracy = sum(test_accurs) / len(test_accurs)
test_micro_f1 = sum(test_f1s) / len(test_f1s)
print("Test - Accuracy: {:.4f}, Micro F1: {:.4f}".format(test_accuracy, test_micro_f1))

# Print confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)


Done importing...
train, test input shapes:  torch.Size([20000, 117]) torch.Size([7600, 104])
train, test label shapes:  torch.Size([20000]) torch.Size([7600])
train, test length shapes:  torch.Size([20000]) torch.Size([7600])
Epoch 1:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.46it/s, accuracy=0.4, loss=1.22, micro_f1=0.4]


Training - Accuracy: 0.3690, Micro F1: 0.3690


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.51it/s, accuracy=0.48, micro_f1=0.48]


Validation - Accuracy: 0.4493, Micro F1: 0.4493
Epoch 2:


Training: 100%|██████████| 320/320 [00:57<00:00,  5.55it/s, accuracy=0.46, loss=1.06, micro_f1=0.46]


Training - Accuracy: 0.5026, Micro F1: 0.5026


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.41it/s, accuracy=0.54, micro_f1=0.54]


Validation - Accuracy: 0.5222, Micro F1: 0.5222
Epoch 3:


Training: 100%|██████████| 320/320 [00:57<00:00,  5.55it/s, accuracy=0.58, loss=0.952, micro_f1=0.58]


Training - Accuracy: 0.5851, Micro F1: 0.5851


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.37it/s, accuracy=0.7, micro_f1=0.7]


Validation - Accuracy: 0.5853, Micro F1: 0.5853
Epoch 4:


Training: 100%|██████████| 320/320 [00:57<00:00,  5.58it/s, accuracy=0.68, loss=0.8, micro_f1=0.68]


Training - Accuracy: 0.6296, Micro F1: 0.6296


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.38it/s, accuracy=0.64, micro_f1=0.64]


Validation - Accuracy: 0.6082, Micro F1: 0.6082
Epoch 5:


Training: 100%|██████████| 320/320 [00:57<00:00,  5.56it/s, accuracy=0.6, loss=0.949, micro_f1=0.6]


Training - Accuracy: 0.6597, Micro F1: 0.6597


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.40it/s, accuracy=0.64, micro_f1=0.64]


Validation - Accuracy: 0.5940, Micro F1: 0.5940
Epoch 6:


Training: 100%|██████████| 320/320 [00:57<00:00,  5.58it/s, accuracy=0.6, loss=0.903, micro_f1=0.6]


Training - Accuracy: 0.6796, Micro F1: 0.6796


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.39it/s, accuracy=0.68, micro_f1=0.68]


Validation - Accuracy: 0.6552, Micro F1: 0.6552
Epoch 7:


Training: 100%|██████████| 320/320 [00:57<00:00,  5.57it/s, accuracy=0.72, loss=0.701, micro_f1=0.72]


Training - Accuracy: 0.7009, Micro F1: 0.7009


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.40it/s, accuracy=0.64, micro_f1=0.64]


Validation - Accuracy: 0.6660, Micro F1: 0.6660
Epoch 8:


Training: 100%|██████████| 320/320 [00:57<00:00,  5.57it/s, accuracy=0.6, loss=0.87, micro_f1=0.6]


Training - Accuracy: 0.7142, Micro F1: 0.7142


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.40it/s, accuracy=0.7, micro_f1=0.7]


Validation - Accuracy: 0.6737, Micro F1: 0.6737
Epoch 9:


Training: 100%|██████████| 320/320 [00:57<00:00,  5.56it/s, accuracy=0.86, loss=0.521, micro_f1=0.86]


Training - Accuracy: 0.7334, Micro F1: 0.7334


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.40it/s, accuracy=0.64, micro_f1=0.64]


Validation - Accuracy: 0.6762, Micro F1: 0.6762
Epoch 10:


Training: 100%|██████████| 320/320 [00:57<00:00,  5.58it/s, accuracy=0.7, loss=0.584, micro_f1=0.7]


Training - Accuracy: 0.7470, Micro F1: 0.7470


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.41it/s, accuracy=0.54, micro_f1=0.54]


Validation - Accuracy: 0.6660, Micro F1: 0.6660
Training complete.
Test - Accuracy: 0.6353, Micro F1: 0.6353
Confusion Matrix:
[[1130  324  158  288]
 [ 170 1382  116  232]
 [ 189  180 1055  476]
 [ 169  210  260 1261]]


Learnable Function

In [15]:
import torch
import json
import csv
from tqdm import tqdm
from pprint import pprint
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import gensim
import os

from torch import nn, optim
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

import regex as re
print("Done importing...")

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = text.strip()
    text = re.sub('\s+', ' ', text)
    return text

class lstm(nn.Module):
    def __init__(self, vocab_size):
        super(lstm, self).__init__()
        self.vocab_size = vocab_size
        # lstm with 2 stacks
        num_stacks = 2
        self.embeddings = nn.Embedding(vocab_size, 300)
        self.lstm = nn.LSTM(300, 300, 1, batch_first=True)
        self.lstm1 = nn.LSTM(300, 300, 1, batch_first=True)
        self.linear = nn.Linear(300, vocab_size)

    def forward(self, x):
        embed = self.embeddings(x)
        x1, _ = self.lstm(embed)
        x2, _ = self.lstm1(x1)
        x = self.linear(x2)
        return x, (embed, x1, x2)

def make_dataset(data, word2idx, idx2word, max_sentences=20000):
    seq_list = []
    label_list = []
    length_list = []
    data = data[1:max_sentences+1]
    for item in data:
        idx_seq = []
        item = item.strip().split(',')
        label = item[0]
        seq = preprocess(item[1])
        seq = seq.split()
        for word in seq:
            if word in word2idx.keys():
                idx_seq.append(word2idx[word])
            else:
                idx_seq.append(word2idx["unk"])
        length = len(idx_seq)
        length_list.append(length)
        seq_list.append(idx_seq)
        label_list.append(label)
    return seq_list, label_list, length_list

import os
import torch

word2idx_path = "word2idx.pt"
idx2word_path = "idx2word.pt"

word2idx = torch.load(word2idx_path)
idx2word = torch.load(idx2word_path)

def load_dataset(filename):
    file_path = os.path.join("drive/MyDrive/NLP_A4", filename)
    with open(file_path, "r") as file:
        data = file.readlines()
    return data

train_data = load_dataset("train.csv")
test_data = load_dataset("test.csv")

#preprocess and create sequences, labels, and lengths
train_seqs, train_labels, train_lengths = make_dataset(train_data, word2idx, idx2word)
test_seqs, test_labels, test_lengths = make_dataset(test_data, word2idx, idx2word)

from torch.nn.utils.rnn import pad_sequence

train_seqs = pad_sequence([torch.LongTensor(i) for i in train_seqs], batch_first=True)
test_seqs = pad_sequence([torch.LongTensor(i) for i in test_seqs], batch_first=True)

train_labels = torch.LongTensor([int(i) for i in train_labels])
test_labels = torch.LongTensor([int(i) for i in test_labels])
train_lengths = torch.LongTensor(train_lengths)
test_lengths = torch.LongTensor(test_lengths)

print("train, test input shapes: ", train_seqs.shape, test_seqs.shape)
print("train, test label shapes: ", train_labels.shape, test_labels.shape)
print("train, test length shapes: ", train_lengths.shape, test_lengths.shape)


class CombinationFunction(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(CombinationFunction, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, e0, e1, e2):
        combined = torch.cat((e0, e1, e2), dim=2)
        output = F.relu(self.linear(combined))
        return output

class Classifier(nn.Module):
    def __init__(self, num_classes, in_dim):
        super(Classifier, self).__init__()
        self.combination_function = CombinationFunction(300,600)
        self.linear_layer = nn.Linear(in_dim, 600)
        self.lstm = nn.LSTM(600, 300, 1, batch_first=True)
        self.linear_layer2 = nn.Linear(300, num_classes)

        # Initialize learnable lambda parameters
        self.lambda_0 = nn.Parameter(torch.rand(1))
        self.lambda_1 = nn.Parameter(torch.rand(1))
        self.lambda_2 = nn.Parameter(torch.rand(1))

    def forward(self, x0, x1, x2, length):
        x = self.linear_layer(self.lambda_0 * x0 + self.lambda_1 * x1 + self.lambda_2 * x2)
        x, _ = self.lstm(x)

        final_preds = []
        loc_cnt = 0
        for loc_len in length:
            loc_x = x[loc_cnt, loc_len-1, :]
            loc_out = self.linear_layer2(loc_x)
            loc_cnt += 1
            final_preds.append(loc_out)
        final_preds = torch.stack(final_preds)
        return final_preds

forward_lstm = torch.load("backward_model_final.pt").to(device)
backward_lstm = torch.load("forward_model_final.pt").to(device)

# Freeze the ELMo model parameters
for param in list(forward_lstm.parameters()) + list(backward_lstm.parameters()):
    param.requires_grad = False

train_dataset = torch.utils.data.TensorDataset(train_seqs, train_labels, train_lengths)

train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

test_dataset = torch.utils.data.TensorDataset(test_seqs, test_labels, test_lengths)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=50, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=50, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=50, shuffle=True)

in_num = 600
num_classes = 5
downstream_model = Classifier(num_classes, in_num).to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(downstream_model.parameters(), lr=0.0001)

from tqdm import tqdm
n_epochs = 10
best_val_f1 = 0.0

for epoch in range(n_epochs):
    train_accurs = []
    train_f1s = []
    val_accurs = []
    val_f1s = []

    print("Epoch {}:".format(epoch + 1))

    # Training
    downstream_model.train()
    train_loader_tqdm = tqdm(train_loader, desc="Training")
    for data, label, length in train_loader_tqdm:
        data = data.to(device)
        data_reverse = data.flip(1)
        _, (xf0, xf1, xf2) = forward_lstm(data)
        _, (xb0, xb1, xb2) = backward_lstm(data_reverse)
        xf0, xf1, xf2 = xf0.detach(), xf1.detach(), xf2.detach()
        xb0, xb1, xb2 = xb0.detach(), xb1.detach(), xb2.detach()
        x0 = torch.cat((xf0, xb0), dim=2)
        x1 = torch.cat((xf1, xb1), dim=2)
        x2 = torch.cat((xf2, xb2), dim=2)
        output = downstream_model(x0, x1, x2, length)
        loss = loss_func(output, label.to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, predicted = torch.max(output, 1)
        accuracy = accuracy_score(label.cpu(), predicted.cpu())
        micro_f1 = f1_score(label.cpu(), predicted.cpu(), average="micro")
        train_accurs.append(accuracy)
        train_f1s.append(micro_f1)
        train_loader_tqdm.set_postfix(loss=loss.item(), accuracy=accuracy, micro_f1=micro_f1)

    train_accuracy = sum(train_accurs) / len(train_accurs)
    train_micro_f1 = sum(train_f1s) / len(train_f1s)
    print("Training - Accuracy: {:.4f}, Micro F1: {:.4f}".format(train_accuracy, train_micro_f1))

    # Validation
    downstream_model.eval()
    with torch.no_grad():
        val_loader_tqdm = tqdm(val_loader, desc="Validation")
        for data, label, length in val_loader_tqdm:
            data = data.to(device)
            data_reverse = data.flip(1)
            _, (xf0, xf1, xf2) = forward_lstm(data)
            _, (xb0, xb1, xb2) = backward_lstm(data_reverse)
            xf0, xf1, xf2 = xf0.detach(), xf1.detach(), xf2.detach()
            xb0, xb1, xb2 = xb0.detach(), xb1.detach(), xb2.detach()
            x0 = torch.cat((xf0, xb0), dim=2)
            x1 = torch.cat((xf1, xb1), dim=2)
            x2 = torch.cat((xf2, xb2), dim=2)
            output = downstream_model(x0, x1, x2, length)

            _, predicted = torch.max(output, 1)
            accuracy = accuracy_score(label.cpu(), predicted.cpu())
            micro_f1 = f1_score(label.cpu(), predicted.cpu(), average="micro")
            val_accurs.append(accuracy)
            val_f1s.append(micro_f1)
            val_loader_tqdm.set_postfix(accuracy=accuracy, micro_f1=micro_f1)

    val_accuracy = sum(val_accurs) / len(val_accurs)
    val_micro_f1 = sum(val_f1s) / len(val_f1s)
    print("Validation - Accuracy: {:.4f}, Micro F1: {:.4f}".format(val_accuracy, val_micro_f1))

    # Check if this is the best validation F1 score so far
    if val_micro_f1 > best_val_f1:
        best_val_f1 = val_micro_f1
        torch.save(downstream_model, "best_downstream_model_function.pt")

print("Training complete.")

# Testing the best model
best_downstream_model = torch.load("best_downstream_model_function.pt")
best_downstream_model.eval()

test_accurs = []
test_f1s = []
true_labels = []
predicted_labels = []

with torch.no_grad():
    for data, label, length in test_loader:
        data = data.to(device)
        data_reverse = data.flip(1)
        _, (xf0, xf1, xf2) = forward_lstm(data)
        _, (xb0, xb1, xb2) = backward_lstm(data_reverse)
        xf0, xf1, xf2 = xf0.detach(), xf1.detach(), xf2.detach()
        xb0, xb1, xb2 = xb0.detach(), xb1.detach(), xb2.detach()
        x0 = torch.cat((xf0, xb0), dim=2)
        x1 = torch.cat((xf1, xb1), dim=2)
        x2 = torch.cat((xf2, xb2), dim=2)
        output = best_downstream_model(x0, x1, x2, length)

        _, predicted = torch.max(output, 1)
        true_labels.extend(label.cpu().numpy())
        predicted_labels.extend(predicted.cpu().numpy())

        accuracy = accuracy_score(label.cpu(), predicted.cpu())
        micro_f1 = f1_score(label.cpu(), predicted.cpu(), average="micro")
        test_accurs.append(accuracy)
        test_f1s.append(micro_f1)

test_accuracy = sum(test_accurs) / len(test_accurs)
test_micro_f1 = sum(test_f1s) / len(test_f1s)
print("Test - Accuracy: {:.4f}, Micro F1: {:.4f}".format(test_accuracy, test_micro_f1))

# Print confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:")
print(conf_matrix)

Done importing...
train, test input shapes:  torch.Size([20000, 117]) torch.Size([7600, 104])
train, test label shapes:  torch.Size([20000]) torch.Size([7600])
train, test length shapes:  torch.Size([20000]) torch.Size([7600])
Epoch 1:


Training: 100%|██████████| 320/320 [01:00<00:00,  5.26it/s, accuracy=0.42, loss=1.24, micro_f1=0.42]


Training - Accuracy: 0.4046, Micro F1: 0.4046


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.44it/s, accuracy=0.38, micro_f1=0.38]


Validation - Accuracy: 0.4278, Micro F1: 0.4278
Epoch 2:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.51it/s, accuracy=0.56, loss=1.09, micro_f1=0.56]


Training - Accuracy: 0.5433, Micro F1: 0.5433


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.37it/s, accuracy=0.6, micro_f1=0.6]


Validation - Accuracy: 0.5410, Micro F1: 0.5410
Epoch 3:


Training: 100%|██████████| 320/320 [00:59<00:00,  5.36it/s, accuracy=0.64, loss=0.734, micro_f1=0.64]


Training - Accuracy: 0.6349, Micro F1: 0.6349


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.36it/s, accuracy=0.68, micro_f1=0.68]


Validation - Accuracy: 0.6410, Micro F1: 0.6410
Epoch 4:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.44it/s, accuracy=0.74, loss=0.692, micro_f1=0.74]


Training - Accuracy: 0.6852, Micro F1: 0.6852


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.39it/s, accuracy=0.54, micro_f1=0.54]


Validation - Accuracy: 0.6568, Micro F1: 0.6568
Epoch 5:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.51it/s, accuracy=0.8, loss=0.615, micro_f1=0.8]


Training - Accuracy: 0.7266, Micro F1: 0.7266


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.39it/s, accuracy=0.74, micro_f1=0.74]


Validation - Accuracy: 0.6872, Micro F1: 0.6872
Epoch 6:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.48it/s, accuracy=0.68, loss=0.724, micro_f1=0.68]


Training - Accuracy: 0.7509, Micro F1: 0.7509


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.41it/s, accuracy=0.76, micro_f1=0.76]


Validation - Accuracy: 0.7117, Micro F1: 0.7117
Epoch 7:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.50it/s, accuracy=0.76, loss=0.665, micro_f1=0.76]


Training - Accuracy: 0.7808, Micro F1: 0.7808


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.39it/s, accuracy=0.7, micro_f1=0.7]


Validation - Accuracy: 0.7243, Micro F1: 0.7243
Epoch 8:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.51it/s, accuracy=0.8, loss=0.433, micro_f1=0.8]


Training - Accuracy: 0.8028, Micro F1: 0.8028


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.38it/s, accuracy=0.62, micro_f1=0.62]


Validation - Accuracy: 0.7085, Micro F1: 0.7085
Epoch 9:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.51it/s, accuracy=0.84, loss=0.4, micro_f1=0.84]


Training - Accuracy: 0.8241, Micro F1: 0.8241


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.38it/s, accuracy=0.72, micro_f1=0.72]


Validation - Accuracy: 0.7105, Micro F1: 0.7105
Epoch 10:


Training: 100%|██████████| 320/320 [00:58<00:00,  5.51it/s, accuracy=0.8, loss=0.582, micro_f1=0.8]


Training - Accuracy: 0.8493, Micro F1: 0.8493


Validation: 100%|██████████| 80/80 [00:12<00:00,  6.39it/s, accuracy=0.82, micro_f1=0.82]


Validation - Accuracy: 0.7392, Micro F1: 0.7392
Training complete.
Test - Accuracy: 0.6958, Micro F1: 0.6958
Confusion Matrix:
[[1333  296  137  134]
 [ 174 1548   72  106]
 [ 165  235 1219  281]
 [ 207  244  261 1188]]
