# Task 1: Part 2

In [129]:
import gensim.downloader as api
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import simplejson as json

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset


In [130]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [131]:
# Define the list of entities
entities = ["COURT","PETITIONER","RESPONDENT","JUDGE","DATE","ORG","GPE","STATUTE","PROVISION","PRECEDENT","CASE_NUMBER","WITNESS","OTHER_PERSON"]

# Generate BIO encoding for each entity
bio_encoding = []
for entity in entities:
    bio_encoding.extend(["B_" + entity, "I_" + entity])

bio_encoding.append("O")

def data_formatter(data):
    formatted_data = []
    for i in data.keys():
        formatted_data.append({'text': data[i]['text'], 'labels': data[i]['labels']})
    return formatted_data

def label_encoder(labels):
    encoded_labels = []
    for label in labels:
        if label in bio_encoding:
            encoded_labels.append(bio_encoding.index(label))
        else:
            encoded_labels.append(bio_encoding.index("O"))
    return encoded_labels

def tokenize_text(text):
    word_to_index = {}
    encoded_texts = []
    
    max_len = 0
    
    for i in range(len(text)):
        text[i]['text'] = text[i]['text'].split(' ')
        temp = text[i]['text']
        encoded_text = [word_to_index.setdefault(word, len(word_to_index)) for word in temp]
        max_len = max(max_len, len(encoded_text))
        text[i]['text'] = encoded_text
        text[i]['labels'] = label_encoder(text[i]['labels'])
    
    padded_text = []
    padded_labels = []
    
    for i in range(len(text)):
        padded_text.append(text[i]['text'] + [0] * (max_len - len(text[i]['text'])))
        padded_labels.append(text[i]['labels'] + [0] * (max_len - len(text[i]['labels'])))
        
    return [padded_text, padded_labels]

def finalize(text, labels):
    input_tensor = torch.tensor(text, dtype=torch.long)
    label_tensor = torch.tensor(labels, dtype=torch.long)
    dataset = TensorDataset(input_tensor, label_tensor)
    return dataset

In [132]:
def load_dataset():
    with open('../data/NER_train.json', 'r') as f:
        train_data = json.load(f)
        train_data = data_formatter(train_data)
        train_data = tokenize_text(train_data)
        
    with open('../data/NER_test.json', 'r') as f:
        test_data = json.load(f)
        test_data = data_formatter(test_data)
        test_data = tokenize_text(test_data)
        
    with open('../data/NER_val.json', 'r') as f:
        val_data = json.load(f)
        val_data = data_formatter(val_data)
        val_data = tokenize_text(val_data)
        
    return train_data, test_data, val_data

In [133]:
batch_size=32
train_data, test_data, val_data = load_dataset()

train_dataset = finalize(train_data[0], train_data[1])
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = finalize(test_data[0], test_data[1])
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

val_dataset = finalize(val_data[0], val_data[1])
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

In [6]:
# Load pre-trained word embeddings
word2vec = api.load("word2vec-google-news-300")
glove = api.load("glove-wiki-gigaword-100")
fasttext = api.load("fasttext-wiki-news-subwords-300")

In [182]:
# Step 4: Define the RNN-based models
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, embedding_weights):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_weights))
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.rnn(embedded)
        output = self.fc(output)
        output = self.softmax(output)
        return output

In [135]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, embedding_weights):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_weights))
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.lstm(embedded)
        output = self.fc(output)
        return output

In [136]:
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, embedding_weights):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_weights))
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.gru(embedded)
        output = self.fc(output)
        return output

In [279]:
num_classes = 27
input_size = 300
hidden_size = 128
output_size = num_classes
num_epochs = 100

learning_rate1 = 0.001
learning_rate2 = 0.001
learning_rate3 = 0.001

In [280]:
# word2vec(Input_Size = 300, lr = 0.001, 0.01, 0.1)
model = RNNModel(input_size, hidden_size, output_size, word2vec.vectors)
model2 = LSTMModel(input_size, hidden_size, output_size, word2vec.vectors)
model3 = GRUModel(input_size, hidden_size, output_size, word2vec.vectors)

In [281]:
# golve(Input_Size = 100, lr = 0.001, 0.001, 0.001)
model4 = RNNModel(input_size, hidden_size, output_size, glove.vectors)
model5 = LSTMModel(input_size, hidden_size, output_size, glove.vectors)
model6 = GRUModel(input_size, hidden_size, output_size, glove.vectors)

In [282]:
# fasttext(Input_Size = 300, lr = 0.001, 0.001, 0.001)
model7 = RNNModel(input_size, hidden_size, output_size, fasttext.vectors)
model8 = LSTMModel(input_size, hidden_size, output_size, fasttext.vectors)
model9 = GRUModel(input_size, hidden_size, output_size, fasttext.vectors)

In [283]:
# Define loss function and optimizer
criterion1 = nn.CrossEntropyLoss()
optimizer1 = optim.Adam(model7.parameters(), lr=learning_rate1)

criterion2 = nn.CrossEntropyLoss()
optimizer2 = optim.Adam(model8.parameters(), lr=learning_rate2)

criterion3 = nn.CrossEntropyLoss()
optimizer3 = optim.Adam(model9.parameters(), lr=learning_rate3)

In [284]:
def evaluate(model, dataloader):
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, dim=2)
            y_true.extend(labels.cpu().numpy().flatten())
            y_pred.extend(predicted.cpu().numpy().flatten())
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1


In [285]:
# Training loop
def train(model, train_dataloader, num_epochs, criterion, optimizer, output_size, val_dataloader):
    for epoch in range(1, num_epochs+1):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss_mask = (inputs != 0)
            loss = criterion(outputs.view(-1, output_size), labels.view(-1))
            loss = torch.sum(loss * loss_mask.view(-1)) / torch.sum(loss_mask)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            
        if epoch % 5 == 0:
            f1_score = evaluate(model, val_dataloader)
            print(f'Epoch [{epoch}/{num_epochs}], Loss: {running_loss/len(train_dataloader)}, F1 Score: {f1_score}')
    return model


In [286]:
model = train(model, train_dataloader, num_epochs, criterion1, optimizer1, output_size, val_dataloader)

In [287]:
model2 = train(model2, train_dataloader, num_epochs, criterion2, optimizer2, output_size, val_dataloader)

In [288]:
model3 = train(model3, train_dataloader, num_epochs, criterion3, optimizer3, output_size, val_dataloader)

In [289]:
model4 = train(model4, train_dataloader, num_epochs, criterion1, optimizer1, output_size, val_dataloader)

In [290]:
model5 = train(model5, train_dataloader, num_epochs, criterion2, optimizer2, output_size, val_dataloader)

In [291]:
model6 = train(model6, train_dataloader, num_epochs, criterion3, optimizer3, output_size, val_dataloader)

In [None]:
model7 = train(model7, train_dataloader, num_epochs, criterion1, optimizer1, output_size, val_dataloader)

In [None]:
model8 = train(model8, train_dataloader, num_epochs, criterion2, optimizer2, output_size, val_dataloader)

In [296]:
model9 = train(model9, train_dataloader, num_epochs, criterion3, optimizer3, output_size, val_dataloader)

Epoch [5/100], Loss: 0.25081583548827, F1 Score: 0.07314371333907264
Epoch [10/100], Loss: 0.18768759796106482, F1 Score: 0.07160366280688156
Epoch [15/100], Loss: 0.15555515814112478, F1 Score: 0.07144988881800603
Epoch [20/100], Loss: 0.1311136958489855, F1 Score: 0.07312991177635292
Epoch [25/100], Loss: 0.10929623771592441, F1 Score: 0.07527247767567134
Epoch [30/100], Loss: 0.08981473647443897, F1 Score: 0.07527185442347349
Epoch [35/100], Loss: 0.07176251992939478, F1 Score: 0.07686956163169381
Epoch [40/100], Loss: 0.0577271441702705, F1 Score: 0.07518539177835812
Epoch [45/100], Loss: 0.04653218818227371, F1 Score: 0.07523585140140501
Epoch [50/100], Loss: 0.037475224844072445, F1 Score: 0.07563642686383287
Epoch [55/100], Loss: 0.029704654444587895, F1 Score: 0.07631801040911054
Epoch [60/100], Loss: 0.02466441742125973, F1 Score: 0.0760640506509891
Epoch [65/100], Loss: 0.022021214904419453, F1 Score: 0.07508011255616026
Epoch [70/100], Loss: 0.016340914920744194, F1 Score: 0

In [None]:
# Evaluate the model
macro_f1 = evaluate(model, test_dataloader)
print(f'Macro F1 score on validation set for model 1: {macro_f1}')

macro_f1 = evaluate(model2, test_dataloader)
print(f'Macro F1 score on validation set for model 2: {macro_f1}')

macro_f1 = evaluate(model3, test_dataloader)
print(f'Macro F1 score on validation set for model 3: {macro_f1}')

macro_f1 = evaluate(model4, test_dataloader)
print(f'Macro F1 score on validation set for model 4: {macro_f1}')

macro_f1 = evaluate(model5, test_dataloader)
print(f'Macro F1 score on validation set for model 5: {macro_f1}')

macro_f1 = evaluate(model6, test_dataloader)
print(f'Macro F1 score on validation set for model 6: {macro_f1}')

macro_f1 = evaluate(model7, test_dataloader)
print(f'Macro F1 score on validation set for model 7: {macro_f1}')

macro_f1 = evaluate(model8, test_dataloader)
print(f'Macro F1 score on validation set for model 8: {macro_f1}')

macro_f1 = evaluate(model9, test_dataloader)
print(f'Macro F1 score on validation set for model 9: {macro_f1}')

Macro F1 score on validation set for model 1: 0.0752215363537528
Macro F1 score on validation set for model 2: 0.07324899400924201
Macro F1 score on validation set for model 3: 0.07099872944756906


In [297]:
# Save the model
import os
os.makedirs('word2vec', exist_ok=True)
os.makedirs('glove', exist_ok=True)
os.makedirs('fasttext', exist_ok=True)

torch.save(model.state_dict(), 'word2vec/rnn_model.pth')
torch.save(model2.state_dict(), 'word2vec/lstm_model.pth')
torch.save(model3.state_dict(), 'word2vec/gru_model.pth')

torch.save(model4.state_dict(), 'glove/rnn_model.pth')
torch.save(model5.state_dict(), 'glove/lstm_model.pth')
torch.save(model6.state_dict(), 'glove/gru_model.pth')

torch.save(model7.state_dict(), 'fasttext/rnn_model.pth')
torch.save(model8.state_dict(), 'fasttext/lstm_model.pth')
torch.save(model9.state_dict(), 'fasttext/gru_model.pth')