In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from nltk.corpus import reuters
from collections import Counter
from itertools import combinations_with_replacement
import math
import time

In [2]:
# Download NLTK Reuters Dataset
import nltk
nltk.download("reuters")
nltk.download("punkt_tab")

[nltk_data] Downloading package reuters to C:\Users\Arunya
[nltk_data]     Senadeera\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Arunya
[nltk_data]     Senadeera\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Load Reuters corpus
categories = reuters.categories()
corpus = [reuters.words(fileid) for fileid in reuters.fileids(categories)]

In [4]:
# Preprocess corpus: Lowercase and split sentences
corpus = [[word.lower() for word in sentence] for sentence in corpus]
flatten = lambda l: [item for sublist in l for item in sublist]

In [5]:
# Vocabulary and numericalization
vocab = list(set(flatten(corpus)))
word2index = {w: i for i, w in enumerate(vocab)}
vocab.append('<UNK>')
word2index['<UNK>'] = len(vocab) - 1
index2word = {v: k for k, v in word2index.items()}
voc_size = len(vocab)

In [6]:
# Set a frequency threshold
min_word_freq = 100
filtered_vocab = [word for word, count in Counter(flatten(corpus)).items() if count >= min_word_freq]
word2index = {w: i for i, w in enumerate(filtered_vocab)}
filtered_vocab.append('<UNK>')
word2index['<UNK>'] = len(filtered_vocab) - 1
index2word = {v: k for k, v in word2index.items()}
voc_size = len(filtered_vocab)

In [7]:
# Function to dynamically modify window size
def set_window_size(window_size=2):
    skip_grams = []
    for sent in corpus:
        for i, target in enumerate(sent):
            context_indices = range(max(0, i - window_size), min(len(sent), i + window_size + 1))
            for j in context_indices:
                if i != j:
                    skip_grams.append((target, sent[j]))
    return skip_grams

In [8]:
# Default window size
skip_grams = set_window_size(window_size=2)

In [9]:
# Co-occurrence Matrix and Weighting Function
X_ik_skipgram = Counter(skip_grams)

def weighting(w_i, w_j, X_ik):
    try:
        x_ij = X_ik[(w_i, w_j)]
    except:
        x_ij = 1
    x_max = 5
    alpha = 0.75
    return (x_ij / x_max) ** alpha if x_ij < x_max else 1



In [10]:
from itertools import combinations_with_replacement
X_ik = {}
weighting_dic = {}

for bigram in combinations_with_replacement(vocab, 2):
    if X_ik_skipgram.get(bigram) is not None:
        co_occur = X_ik_skipgram[bigram]
        X_ik[bigram] = co_occur + 1
        X_ik[(bigram[1], bigram[0])] = co_occur + 1
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

KeyboardInterrupt: 

In [12]:
# Random Batch Generator
def random_batch(batch_size, skip_grams, X_ik, weighting_dic):
    skip_grams_id = [(word2index.get(w1, word2index['<UNK>']), word2index.get(w2, word2index['<UNK>'])) for w1, w2 in skip_grams]
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    for i in random_index:
        random_inputs.append([skip_grams_id[i][0]])
        random_labels.append([skip_grams_id[i][1]])
        pair = skip_grams[i]
        cooc = X_ik.get(pair, 1)
        random_coocs.append([math.log(cooc)])
        weighting = weighting_dic.get(pair, 0.0)
        random_weightings.append([weighting])
    
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

In [13]:
# GloVe Model Class
class GloVe(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(GloVe, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, embed_size)
        self.embedding_u = nn.Embedding(vocab_size, embed_size)
        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)

    def forward(self, center_words, target_words, coocs, weighting):
        center_embeds = self.embedding_v(center_words)
        target_embeds = self.embedding_u(target_words)
        center_bias = self.v_bias(center_words).squeeze(1)
        target_bias = self.u_bias(target_words).squeeze(1)
        inner_product = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        return torch.sum(loss)

In [None]:
# Training Parameters
batch_size = 100
embedding_size = 50
model = GloVe(voc_size, embedding_size)
optimizer = optim.Adam(model.parameters(), lr=0.01)

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

# Training Loop
num_epochs = 1000
for epoch in range(num_epochs):
    start_time = time.time()
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, skip_grams, X_ik, weighting_dic)
    input_batch = torch.LongTensor(input_batch)
    target_batch = torch.LongTensor(target_batch)
    cooc_batch = torch.FloatTensor(cooc_batch)
    weighting_batch = torch.FloatTensor(weighting_batch)

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    loss.backward()
    optimizer.step()

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1}/{num_epochs} | Loss: {loss.item():.4f} | Time: {epoch_mins}m {epoch_secs:.4f}s")



Epoch: 10/1000 | Loss: 11.1475 | Time: 0m 1.0000s


KeyboardInterrupt: 

In [None]:
# Documentation
print("Dataset Source: NLTK Reuters Corpus.")