In [8]:
from nltk.tokenize import RegexpTokenizer
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator,Vocab
from torch.utils.data import DataLoader
import warnings
from sklearn.metrics import f1_score,recall_score,precision_score,accuracy_score,confusion_matrix
import numpy as np
import torch
import scipy as sp
from torch.utils.data import Dataset
import random

In [9]:
UNK_CUTOFF=3
UNKNOWN_TOKEN='<unk>'
WINDOW_SIZE=5
BATCH_SIZE=128
EMBEDDING_SIZE=150
EMBEDDING_SIZE_SGNS=300
PAD_TOKEN='<pad>'
NUM_LABELS=4
HIDDEN_SIZE=128
lrate=1e-3
NEG_SAMPLES=4
EPOCHS=10
THRESHOLD=1e-5

In [10]:
df=pd.read_csv('../input/ass3-curr/train.csv')
train_labels=df['Class Index'].tolist()
df=df['Description']
warnings.filterwarnings("ignore")
sentences=[]
for sent in df:
    tokenizer=RegexpTokenizer(r'\w+')
    tokens=tokenizer.tokenize(sent)
    tokens=[token.lower() for token in tokens]
    sentences.append(tokens)

In [11]:
def replace_low_frequency_words(sentences, threshold=UNK_CUTOFF):
    word_counts = Counter(word for sentence in sentences for word in sentence)
    replaced_sentences = [
        [UNKNOWN_TOKEN if word_counts[word] < threshold else word for word in sentence]
        for sentence in sentences
    ]
    return replaced_sentences
sentences=replace_low_frequency_words(sentences)
vocab_sgns=build_vocab_from_iterator(sentences, specials=[UNKNOWN_TOKEN,PAD_TOKEN])
vocab_sgns.set_default_index(vocab_sgns[UNKNOWN_TOKEN])

In [12]:
def count_word_occurrences(list_of_lists):
    word_count = Counter()
    for inner_list in list_of_lists:
        word_count.update(inner_list)
    return word_count
word_counts=count_word_occurrences(sentences)
int_to_vocabword={}
for i,w in enumerate(vocab_sgns.get_itos()):
    if w!=PAD_TOKEN:
        int_to_vocabword[i]=w
words=[w for sen in sentences for w in sen]
word_counts = Counter(words)
sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True) # descending freq order
int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}
int_words = [vocab_to_int[word] for word in words]

In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [14]:
def subsample_sentences(words, sentences, threshold = THRESHOLD):
  word_counts = Counter(words)
  total_n_words = len(words)
  freq_ratios = {word: count/total_n_words for word, count in word_counts.items()}
  p_drop = {word: 1 - np.sqrt(threshold/freq_ratios[word]) for word in word_counts}
  return [[word for word in sentence if random.random() < (1 - p_drop[word])] for sentence in sentences]
train_sens = subsample_sentences(words,sentences)
# train_sens=sentences
tot=0
for x in train_sens:
  tot+=len(x) 
print(tot)
print(len(words))
print(tot/len(words))

971008
3897542
0.24913342819654027


In [15]:
training_pairs = set()
for sentence in train_sens:
    for i, target_word in enumerate(sentence):
        context = [sentence[j] for j in range(max(0, i - WINDOW_SIZE), min(len(sentence), i + WINDOW_SIZE + 1)) if j != i]
        for context_word in context:
            training_pairs.add((torch.tensor(vocab_sgns[target_word]), torch.tensor(vocab_sgns[context_word])))
training_pairs=list(training_pairs) # add

word_counts = Counter(words)
word_freqs = np.array([word_counts[word] / sum(word_counts.values()) for word in word_counts.keys()])
neg_sampling_weights = torch.from_numpy(word_freqs ** 0.75 / np.sum(word_freqs ** 0.75))
before = neg_sampling_weights[:1]
after =neg_sampling_weights[1:]
new_tensor = torch.cat((before, torch.tensor(0).unsqueeze(0), after))
neg_sampling_weights=new_tensor

arranged_values = [0]*len(vocab_sgns)
temp=list(word_counts.keys())
temp.insert(1,PAD_TOKEN)
for a,b in zip(temp,neg_sampling_weights):
    arranged_values[vocab_sgns[a]]=b
neg_sampling_weights=arranged_values

In [16]:
class NegativeSamplingLoss(torch.nn.Module):
  def __init__(self):
    super().__init__()
  def forward(self,input_vectors,output_vectors,noise_vectors):
    # losses=[]
    # for i in range(input_vector.size(0)):
    #   curr_loss=-(torch.log(torch.sigmoid(torch.dot(input_vector[i],output_vector[i])))+torch.sum(torch.log(torch.sigmoid(torch.mm(torch.neg(noise_vectors[i]), (input_vector[i]).unsqueeze(1))))))
    #   losses.append(curr_loss)
    # return sum(losses)/len(losses)
    batch_size, embed_size = input_vectors.shape
    input_vectors = input_vectors.view(batch_size, embed_size, 1)
    output_vectors = output_vectors.view(batch_size, 1, embed_size)
    out_loss = torch.bmm(output_vectors, input_vectors).sigmoid().log().squeeze()
    noise_loss = torch.bmm(noise_vectors.neg(), input_vectors).sigmoid().log()
    noise_loss = noise_loss.squeeze().sum(1)
    return -(out_loss + noise_loss).mean()

In [17]:
class SkipGramNeg(torch.nn.Module):
    def __init__(self, n_vocab, n_embed, noise_dist, vocab):
        super().__init__()
        self.n_vocab = n_vocab
        self.n_embed = n_embed
        self.noise_dist = noise_dist
        self.vocab=vocab

        self.in_embed = torch.nn.Embedding(n_vocab, n_embed)
        self.out_embed = torch.nn.Embedding(n_vocab, n_embed)
        
        # # Initialize both embedding tables with uniform distribution
        self.in_embed.weight.data.uniform_(-1, 1)
        self.out_embed.weight.data.uniform_(-1, 1)
        
    def forward_input(self, input_words):
        input_vectors = self.in_embed(input_words)
        return input_vectors  # input vector embeddings
    
    def forward_target(self, output_words):
        output_vectors = self.out_embed(output_words)
        return output_vectors  # output vector embeddings
    
    def forward_noise(self, inp, n_samples):
        """ Generate noise vectors with shape (batch_size, n_samples, n_embed)"""
        batch_size=inp.size(0)
        # tensor_lst=[]
        # for i in range(batch_size):
        #     a=target[i]
        #     b=inp[i]
        #     a_orig=self.noise_dist[a]
        #     b_orig=self.noise_dist[b]
        #     self.noise_dist[a]=0
        #     self.noise_dist[b]=0
        #     noise_words = torch.multinomial(input=self.noise_dist,num_samples=n_samples,replacement=False)
        #     noise_words = noise_words.to(device)
        #     noise_vectors = self.out_embed(noise_words)
        #     tensor_lst.append(noise_vectors)
        #     self.noise_dist[a]=a_orig
        #     self.noise_dist[b]=b_orig
        # return torch.stack(tensor_lst)
        noise_words = torch.multinomial(input=self.noise_dist,num_samples=batch_size*n_samples,replacement = False)
        noise_words = noise_words.to(device)
        noise_vectors = self.out_embed(noise_words).view(batch_size, n_samples, self.n_embed)
        return noise_vectors

In [18]:
class Dataset_SGNS(Dataset):
  def __init__(self, train_dt):
    self.training_pairs=train_dt
  def __len__(self) -> int:
    return len(self.training_pairs)
  def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
    return self.training_pairs[index][0],self.training_pairs[index][1]

In [19]:
model = SkipGramNeg(len(vocab_sgns),EMBEDDING_SIZE,torch.tensor(neg_sampling_weights), vocab_sgns)
model=model.to(device)
criterion = NegativeSamplingLoss()
optimizer = torch.optim.Adam(model.parameters(),lrate)
train_dataset=Dataset_SGNS(training_pairs)
train_dataloader=DataLoader(train_dataset,batch_size=BATCH_SIZE,shuffle=True)

In [20]:
for epoch in range(EPOCHS):
    total_loss=0
    model.train()
    for inputs, targets in train_dataloader:
      inputs = inputs.to(device)
      targets = targets.to(device)
      embedded_input_words = model.forward_input(inputs)
      embedded_target_words = model.forward_target(targets)
      embedded_noise_words = model.forward_noise(inputs,n_samples=NEG_SAMPLES)
      loss = criterion(embedded_input_words, embedded_target_words, embedded_noise_words)
      total_loss+=loss.item()
      optimizer.zero_grad()
      loss.backward() 
      optimizer.step()
    print(f'Epoch: {epoch+1}, Loss: {total_loss}')

Epoch: 1, Loss: 215335.09066295624
Epoch: 2, Loss: 113121.26427972317
Epoch: 3, Loss: 96926.61923456192
Epoch: 4, Loss: 89543.70216393471
Epoch: 5, Loss: 85067.5099543333
Epoch: 6, Loss: 82032.7832762003
Epoch: 7, Loss: 79883.54188108444
Epoch: 8, Loss: 78217.43746113777
Epoch: 9, Loss: 76960.06075644493
Epoch: 10, Loss: 75937.58561635017


In [21]:
word_embeddings=model.in_embed.weight.data
context_embeddings=model.out_embed.weight.data
embeddings_sgns=torch.cat((word_embeddings,context_embeddings),dim=1)

In [22]:
torch.save(embeddings_sgns,'skip-gram-word-vectors.pt')

In [None]:
# embeddings_sgns=torch.load('skip-gram-word-vectors.pt')
embeddings_sgns = torch.tensor(embeddings_sgns, device='cpu')

In [24]:
df=pd.read_csv('../input/ass3-curr/test.csv')
test_labels=df['Class Index'].tolist()
df=df['Description']
test_sentences=[]
for sent in df:
    tokenizer=RegexpTokenizer(r'\w+')
    tokens=tokenizer.tokenize(sent)
    tokens=[token.lower() for token in tokens]
    test_sentences.append(tokens)

In [25]:
class Dataset_LSTM(Dataset):
  def __init__(self, sent, labs, embeddings, vocabulary):
    self.sentences = sent
    self.labels = labs
    self.vocabulary = vocabulary
    self.embeddings=embeddings
  def __len__(self) -> int:
    return len(self.sentences)
  def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
    word_embeddings=[self.embeddings[self.vocabulary[j]] for j in self.sentences[index]]
    return torch.stack(word_embeddings), torch.tensor(torch.nn.functional.one_hot(torch.tensor(self.labels[index]-1), num_classes=NUM_LABELS)).float()
  def collate(self, batch: list[tuple[torch.Tensor, torch.Tensor]]) -> tuple[torch.Tensor, torch.Tensor]:
    sentences = [i[0] for i in batch]
    labels = [i[1] for i in batch]
    padded_sentences = pad_sequence(sentences, batch_first=True, padding_value=self.vocabulary[PAD_TOKEN])
    padded_labels=padded_labels = pad_sequence(labels, batch_first=True, padding_value=torch.tensor(0))
    return padded_sentences, padded_labels

In [26]:
train_dataset=Dataset_LSTM(sentences,train_labels,embeddings_sgns,vocab_sgns)
test_dataset=Dataset_LSTM(test_sentences,test_labels,embeddings_sgns,vocab_sgns)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE,shuffle=True,collate_fn=train_dataset.collate)
test_dataloader=DataLoader(test_dataset,batch_size=BATCH_SIZE,collate_fn=test_dataset.collate)

In [27]:
class LSTMModel(torch.nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_classes):
        super(LSTMModel, self).__init__()
        self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2label = torch.nn.Linear(hidden_dim, num_classes)
    def forward(self, sentence):
        lstm_out, _ = self.lstm(sentence)
        tag_space = self.hidden2label(lstm_out[-1])
        tag_scores = torch.softmax(tag_space, dim=1)
        return tag_scores

In [28]:
model = LSTMModel(EMBEDDING_SIZE_SGNS, HIDDEN_SIZE, NUM_LABELS)
model=model.to(device)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=lrate)
for epoch in range(EPOCHS):
    total_loss = 0
    model.train()
    for batch_sentences, batch_labels in train_dataloader:
        (batch_sentences, batch_labels) = (batch_sentences.to(device), batch_labels.to(device))
        outputs = model(batch_sentences.permute(1,0,2))
        loss = loss_fn(outputs, batch_labels)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {total_loss}")

Epoch 1, Loss: 1082.6663433909416
Epoch 2, Loss: 806.4324569702148
Epoch 3, Loss: 797.682878434658
Epoch 4, Loss: 792.9893666505814
Epoch 5, Loss: 786.983192563057
Epoch 6, Loss: 781.5390492081642
Epoch 7, Loss: 777.6512961983681
Epoch 8, Loss: 774.9042839407921
Epoch 9, Loss: 771.3663699030876
Epoch 10, Loss: 768.101092338562


In [30]:
model.eval()
predictions=[]
true_vals=[]
with torch.no_grad():
    for words, tags in train_dataloader:
        (words, tags) = (words.to(device), tags.to(device))
        pred = model(words.permute(1,0,2))
        pred_max_index = torch.argmax(pred, dim=1)+1
        true_vals.extend((torch.argmax(tags, dim=1)+1).cpu())
        predictions.extend(pred_max_index.cpu())
predictions=torch.stack(predictions).numpy()
true_vals=torch.stack(true_vals).numpy()
print('Evaluation Metrics for train set :')
print(f'Accuracy Score: {accuracy_score(true_vals,predictions)}')
print('F1_Score (Macro)',f1_score(true_vals,predictions, average='macro'))
print('F1_Score (Micro)', f1_score(true_vals,predictions, average='micro'))
print('Precision Score:', precision_score(true_vals,predictions, average='weighted'))
print('Recall Score:',recall_score(true_vals,predictions, average='weighted'))
print('Confusion Matrix:\n',confusion_matrix(true_vals,predictions))

Evaluation Metrics for train set :
Accuracy Score: 0.9244333333333333
F1_Score (Macro) 0.924246920056135
F1_Score (Micro) 0.9244333333333333
Precision Score: 0.9241405261810478
Recall Score: 0.9244333333333333
Confusion Matrix:
 [[27518   802  1008   672]
 [  293 29514    79   114]
 [ 1071   198 26872  1859]
 [ 1011   150  1811 27028]]


In [31]:
model.eval()
predictions=[]
true_vals=[]
with torch.no_grad():
    for words, tags in test_dataloader:
        (words, tags) = (words.to(device), tags.to(device))
        pred = model(words.permute(1,0,2))
        pred_max_index = torch.argmax(pred, dim=1)+1
        true_vals.extend((torch.argmax(tags, dim=1)+1).cpu())
        predictions.extend(pred_max_index.cpu())
predictions=torch.stack(predictions).numpy()
true_vals=torch.stack(true_vals).numpy()
print('Evaluation Metrics for test set :')
print(f'Accuracy Score: {accuracy_score(true_vals,predictions)}')
print('F1_Score (Macro)',f1_score(true_vals,predictions, average='macro'))
print('F1_Score (Micro)', f1_score(true_vals,predictions, average='micro'))
print('Precision Score:', precision_score(true_vals,predictions, average='weighted'))
print('Recall Score:',recall_score(true_vals,predictions, average='weighted'))
print('Confusion Matrix:\n',confusion_matrix(true_vals,predictions))

Evaluation Metrics for test set :
Accuracy Score: 0.9025
F1_Score (Macro) 0.9022434220605823
F1_Score (Micro) 0.9025
Precision Score: 0.9021187370108664
Recall Score: 0.9025
Confusion Matrix:
 [[1721   54   73   52]
 [  34 1846   14    6]
 [  91   17 1642  150]
 [  78   21  151 1650]]
