In [106]:
from nltk.tokenize import RegexpTokenizer
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator,Vocab
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score,recall_score,precision_score,accuracy_score,confusion_matrix
import numpy as np
import torch
import warnings
import scipy as sp
from torch.utils.data import Dataset

In [107]:
UNK_CUTOFF=3
UNKNOWN_TOKEN='<unk>'
WINDOW_SIZE=5
BATCH_SIZE=128
EMBEDDING_SIZE_SVD=300
PAD_TOKEN='<pad>'
NUM_LABELS=4
HIDDEN_SIZE=128
lrate=1e-3
EPOCHS=10

In [108]:
df=pd.read_csv('data/train.csv')
train_labels=df['Class Index'].tolist()
df=df['Description']
warnings.filterwarnings("ignore")
sentences=[]
for sent in df:
    tokenizer=RegexpTokenizer(r'\w+')
    tokens=tokenizer.tokenize(sent)
    tokens=[token.lower() for token in tokens]
    sentences.append(tokens)

In [109]:
def replace_low_frequency_words(sentences, threshold=UNK_CUTOFF):
    word_counts = Counter(word for sentence in sentences for word in sentence)
    replaced_sentences = [
        [UNKNOWN_TOKEN if word_counts[word] < threshold else word for word in sentence]
        for sentence in sentences
    ]
    return replaced_sentences
sentences=replace_low_frequency_words(sentences)
vocab_svd=build_vocab_from_iterator(sentences, specials=[UNKNOWN_TOKEN,PAD_TOKEN])
vocab_svd.set_default_index(vocab_svd[UNKNOWN_TOKEN])
co_occurrence_matrix=sp.sparse.lil_matrix((len(vocab_svd),len(vocab_svd)))

In [110]:
for word_list in sentences:
        for i, word in enumerate(word_list):
            center_index = vocab_svd[word]
            context_indices = [vocab_svd[word_list[j]] for j in range(max(0, i - WINDOW_SIZE), min(len(word_list), i + WINDOW_SIZE + 1)) if i!=j]
            for context_index in context_indices:
                co_occurrence_matrix[center_index,context_index] += 1
U,_,_=sp.sparse.linalg.svds(co_occurrence_matrix, EMBEDDING_SIZE_SVD, return_singular_vectors='u', which='LM')
embeddings_SVD=U

In [111]:
torch.save(embeddings_SVD,'svd_word_vectors.pt')

In [112]:
# embeddings_svd=torch.load('svd_word_vectors.pt')

In [113]:
class Dataset_LSTM(Dataset):
  def __init__(self, sent, labs, embeddings, vocabulary):
    self.sentences = sent
    self.labels = labs
    self.vocabulary = vocabulary
    self.embeddings=embeddings
  def __len__(self) -> int:
    return len(self.sentences)
  def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
    word_embeddings=[self.embeddings[self.vocabulary[j]] for j in self.sentences[index]]
    return torch.tensor(word_embeddings,dtype=torch.float32), torch.tensor(torch.nn.functional.one_hot(torch.tensor(self.labels[index]-1), num_classes=NUM_LABELS)).float()
  def collate(self, batch: list[tuple[torch.Tensor, torch.Tensor]]) -> tuple[torch.Tensor, torch.Tensor]:
    sentences = [i[0] for i in batch]
    labels = [i[1] for i in batch]
    padded_sentences = pad_sequence(sentences, batch_first=True, padding_value=self.vocabulary[PAD_TOKEN])
    padded_labels=padded_labels = pad_sequence(labels, batch_first=True, padding_value=torch.tensor(0))
    return padded_sentences, padded_labels

In [114]:
df=pd.read_csv('data/test.csv')
test_labels=df['Class Index'].tolist()
df=df['Description']
test_sentences=[]
for sent in df:
    tokenizer=RegexpTokenizer(r'\w+')
    tokens=tokenizer.tokenize(sent)
    tokens=[token.lower() for token in tokens]
    test_sentences.append(tokens)

In [115]:
train_dataset=Dataset_LSTM(sentences,train_labels,embeddings_SVD,vocab_svd)
test_dataset=Dataset_LSTM(test_sentences,test_labels,embeddings_SVD,vocab_svd)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE,shuffle=True,collate_fn=train_dataset.collate)
test_dataloader=DataLoader(test_dataset,batch_size=BATCH_SIZE,collate_fn=test_dataset.collate)

In [116]:
class LSTMModel(torch.nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_classes):
        super(LSTMModel, self).__init__()
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2label = torch.nn.Linear(hidden_dim, num_classes)
    def forward(self, sentence):
        lstm_out, _ = self.lstm(sentence)
        tag_space = self.hidden2label(lstm_out[-1])
        tag_scores = torch.softmax(tag_space, dim=1)
        return tag_scores

In [117]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device("mps")
device

device(type='mps')

In [118]:
model = LSTMModel(EMBEDDING_SIZE_SVD, HIDDEN_SIZE, NUM_LABELS)
model=model.to(device)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=lrate)
for epoch in range(EPOCHS):
    total_loss = 0
    model.train()
    for batch_sentences, batch_labels in train_dataloader:
        (batch_sentences, batch_labels) = (batch_sentences.to(device), batch_labels.to(device))
        outputs = model(batch_sentences.permute(1,0,2))
        loss = loss_fn(outputs, batch_labels)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {total_loss}")

Epoch 1, Loss: 4764.482236921787
Epoch 2, Loss: 3594.3006317019463
Epoch 3, Loss: 3463.311684846878
Epoch 4, Loss: 3379.8793036341667
Epoch 5, Loss: 3352.7804537415504
Epoch 6, Loss: 3332.8128027915955
Epoch 7, Loss: 3311.2320789694786
Epoch 8, Loss: 3294.139129459858
Epoch 9, Loss: 3283.313927948475
Epoch 10, Loss: 3267.174511373043


In [119]:
model.eval()
predictions=[]
true_vals=[]
with torch.no_grad():
    for words, tags in train_dataloader:
        (words, tags) = (words.to(device), tags.to(device))
        pred = model(words.permute(1,0,2))
        pred_max_index = torch.argmax(pred, dim=1)+1
        true_vals.extend((torch.argmax(tags, dim=1)+1).cpu())
        predictions.extend(pred_max_index.cpu())
predictions=torch.stack(predictions).numpy()
true_vals=torch.stack(true_vals).numpy()
print('Evaluation Metrics for train set :')
print(f'Accuracy Score: {accuracy_score(true_vals,predictions)}')
print(f'F1_Score (Macro): {f1_score(true_vals,predictions, average='macro')}')
print(f'F1_Score (Micro): {f1_score(true_vals,predictions, average='micro')}')
print(f'Precision Score: {precision_score(true_vals,predictions, average='weighted')}')
print(f'Recall Score: {recall_score(true_vals,predictions, average='weighted')}')
print(f'Confusion Matrix:\n {confusion_matrix(true_vals,predictions)}')

Evaluation Metrics for train set :
Accuracy Score: 0.8759833333333333
F1_Score (Macro): 0.8757692587965115
F1_Score (Micro): 0.8759833333333333
Precision Score: 0.8777963602869505
Recall Score: 0.8759833333333333
Confusion Matrix:
 [[26022  1166  1415  1397]
 [  558 28523   191   728]
 [ 1272   472 23946  4310]
 [ 1361   496  1516 26627]]


In [120]:
model.eval()
predictions=[]
true_vals=[]
with torch.no_grad():
    for words, tags in test_dataloader:
        (words, tags) = (words.to(device), tags.to(device))
        pred = model(words.permute(1,0,2))
        pred_max_index = torch.argmax(pred, dim=1)+1
        true_vals.extend((torch.argmax(tags, dim=1)+1).cpu())
        predictions.extend(pred_max_index.cpu())
predictions=torch.stack(predictions).numpy()
true_vals=torch.stack(true_vals).numpy()
print('Evaluation Metrics for test set :')
print(f'Accuracy Score: {accuracy_score(true_vals,predictions)}')
print(f'F1_Score (Macro): {f1_score(true_vals,predictions, average='macro')}')
print(f'F1_Score (Micro): {f1_score(true_vals,predictions, average='micro')}')
print(f'Precision Score: {precision_score(true_vals,predictions, average='weighted')}')
print(f'Recall Score: {recall_score(true_vals,predictions, average='weighted')}')
print(f'Confusion Matrix:\n {confusion_matrix(true_vals,predictions)}')

Evaluation Metrics for test set :
Accuracy Score: 0.8602631578947368
F1_Score (Macro): 0.859842356834317
F1_Score (Micro): 0.8602631578947368
Precision Score: 0.8619504569432361
Recall Score: 0.8602631578947368
Confusion Matrix:
 [[1628   75   96  101]
 [  47 1791   21   41]
 [ 105   36 1457  302]
 [  85   34  119 1662]]
