In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import time
import pandas as pd
import os
import requests
from collections import Counter, defaultdict
from scipy.stats import spearmanr
import nltk
from nltk.corpus import reuters
import gensim.downloader as api



In [2]:
# DATA LOADING & PREPROCESSING
# Download the 'reuters' and 'punkt' NLTK packages, in that order.
# Any exception raised while downloading is printed rather than propagated.
try:
    for resource in ('reuters', 'punkt'):
        nltk.download(resource)
except Exception as e:
    print(f"NLTK Download Error: {e}")

def load_news_dataset(category='grain', max_vocab=2000):
    """Load Reuters sentences and build a frequency-ranked vocabulary.

    Each sentence returned by ``reuters.sents(categories=category)`` is reduced
    to its purely alphabetic tokens, lower-cased.

    Args:
        category: value forwarded as ``categories`` to ``reuters.sents``.
        max_vocab: number of most frequent words kept in the vocabulary.

    Returns:
        A ``(clean_corpus, vocab, word2idx)`` tuple: ``clean_corpus`` is a list
        of token lists, ``vocab`` is the kept words (most frequent first)
        followed by ``'<UNK>'``, and ``word2idx`` maps each vocab entry to its
        position in ``vocab``.
    """
    clean_corpus = []
    counts = Counter()
    for sentence in reuters.sents(categories=category):
        tokens = [tok.lower() for tok in sentence if tok.isalpha()]
        clean_corpus.append(tokens)
        # Updating sentence by sentence keeps first-occurrence insertion order,
        # which decides how equal counts are ordered by the stable sort below.
        counts.update(tokens)

    ranked = sorted(counts, key=counts.get, reverse=True)
    vocab = ranked[:max_vocab] + ['<UNK>']
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    return clean_corpus, vocab, word2idx

def get_skipgrams(corpus, w2i, window_size):
    """Build (center_id, context_id) training pairs for skip-gram models.

    Args:
        corpus: iterable of token lists (one list per sentence).
        w2i: mapping from word to integer id; words missing from it are skipped
            both as centers and as context words, but still occupy a position
            when measuring the window.
        window_size: number of positions considered on each side of the center.

    Returns:
        A list of ``(center_id, context_id)`` tuples in sentence order.
    """
    pairs = []
    for tokens in corpus:
        n_tokens = len(tokens)
        for pos, center in enumerate(tokens):
            if center in w2i:
                center_id = w2i[center]
                lo = max(0, pos - window_size)
                hi = min(n_tokens, pos + window_size + 1)
                pairs.extend(
                    (center_id, w2i[tokens[k]])
                    for k in range(lo, hi)
                    if k != pos and tokens[k] in w2i
                )
    return pairs

def get_cooc_matrix(corpus, w2i, window_size, x_max=100, alpha=0.75):
    """Build distance-weighted co-occurrence records for GloVe training.

    Every in-vocabulary (center, context) pair inside the window contributes
    ``1 / distance`` to its co-occurrence count, where distance is the number
    of token positions between the two words.

    Args:
        corpus: iterable of token lists (one list per sentence).
        w2i: mapping from word to integer id; words missing from it are ignored
            but still occupy a position when measuring distance.
        window_size: number of positions considered on each side of the center.
        x_max: count at which the weighting function saturates at 1.0.
        alpha: exponent of the weighting function.

    Returns:
        A list of ``(i, j, count, weight)`` tuples where
        ``weight = min(1.0, (count / x_max) ** alpha)``.
    """
    cooc = defaultdict(float)
    for sentence in corpus:
        for i, word in enumerate(sentence):
            if word not in w2i:
                continue
            w_i = w2i[word]
            start = max(0, i - window_size)
            end = min(len(sentence), i + window_size + 1)
            for j in range(start, end):
                if i == j or sentence[j] not in w2i:
                    continue
                w_j = w2i[sentence[j]]
                # Closer neighbours count more: weight each hit by 1 / distance.
                cooc[(w_i, w_j)] += 1.0 / abs(i - j)
    return [(i, j, c, min(1.0, (c / x_max) ** alpha)) for (i, j), c in cooc.items()]

def download_analogy_data(url="http://download.tensorflow.org/data/questions-words.txt",
                          path="questions-words.txt", timeout=30):
    """Return the local path of the analogy dataset, downloading it if absent.

    Args:
        url: location the file is fetched from when ``path`` does not exist.
        path: local file name used as the cache.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        ``path``.

    Raises:
        requests.RequestException: on a connection problem, a timeout, or a
            non-2xx HTTP status. Nothing is left at ``path`` in that case.
    """
    if os.path.exists(path):
        return path

    print("Downloading Analogy Dataset...")
    r = requests.get(url, timeout=timeout)
    # Without this check an HTTP error page would be saved and then treated as
    # the cached dataset on every later run.
    r.raise_for_status()

    # Write to a sibling file first so an interrupted write never leaves a
    # truncated file at `path`.
    tmp_path = path + ".part"
    with open(tmp_path, 'wb') as f:
        f.write(r.content)
    os.replace(tmp_path, path)
    return path



[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\alsto\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alsto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# MODEL ARCHITECTURES

class SkipgramSoftmax(nn.Module):
    """Skip-gram model that scores every vocabulary word for a given center word.

    Args:
        v_size: vocabulary size (number of embedding rows and output logits).
        emb_dim: embedding dimensionality.
    """

    def __init__(self, v_size, emb_dim):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=v_size, embedding_dim=emb_dim)
        self.output = nn.Linear(in_features=emb_dim, out_features=v_size)

    def forward(self, x):
        """Return unnormalised scores (one per vocabulary word) for word ids ``x``."""
        hidden = self.embeddings(x)
        logits = self.output(hidden)
        return logits

class Word2VecNeg(nn.Module):
    """Skip-gram model trained with negative sampling.

    ``v_embeddings`` holds center-word vectors and ``u_embeddings`` holds
    context-word vectors (also used for the negative samples).

    Args:
        v_size: vocabulary size.
        emb_dim: embedding dimensionality.
    """

    def __init__(self, v_size, emb_dim):
        super().__init__()
        self.v_embeddings = nn.Embedding(v_size, emb_dim)
        self.u_embeddings = nn.Embedding(v_size, emb_dim)
        self.log_sigmoid = nn.LogSigmoid()

    def forward(self, center, target, negative):
        """Return the mean negative-sampling loss over the batch.

        Args:
            center: LongTensor of center word ids, one per example.
            target: LongTensor of true context word ids, one per example.
            negative: LongTensor of sampled negative ids, shape (batch, k).

        Returns:
            A scalar tensor:
            ``-mean(log_sigmoid(u.v) + sum_k log_sigmoid(-n_k.v))``.
        """
        batch = center.size(0)
        v = self.v_embeddings(center).view(batch, -1)      # (batch, dim)
        u = self.u_embeddings(target).view(batch, -1)      # (batch, dim)
        neg_vecs = -self.u_embeddings(negative)            # (batch, k, dim)

        # Keep the positive score 1-D. With a (batch, 1) score, adding the
        # (batch,) negative term broadcasts to a (batch, batch) matrix that
        # pairs every positive with every example's negatives.
        pos = (u * v).sum(dim=1)                           # (batch,)
        neg = torch.bmm(neg_vecs, v.unsqueeze(2)).squeeze(2)  # (batch, k)

        per_example = self.log_sigmoid(pos) + self.log_sigmoid(neg).sum(dim=1)
        return -per_example.mean()

class GloVeModel(nn.Module):
    """GloVe model with separate center/context embeddings and per-word biases.

    Args:
        v_size: vocabulary size.
        emb_dim: embedding dimensionality.
    """

    def __init__(self, v_size, emb_dim):
        super().__init__()
        self.v_emb = nn.Embedding(v_size, emb_dim)
        self.u_emb = nn.Embedding(v_size, emb_dim)
        self.v_bias = nn.Embedding(v_size, 1)
        self.u_bias = nn.Embedding(v_size, 1)

    def forward(self, i, j, cooc, weights):
        """Return the weighted mean squared error against ``log(cooc)``.

        Args:
            i: LongTensor of center word ids.
            j: LongTensor of context word ids.
            cooc: co-occurrence counts for each (i, j) pair.
            weights: per-pair weights applied to the squared error.
        """
        center_vecs = self.v_emb(i)
        context_vecs = self.u_emb(j)
        center_bias = self.v_bias(i).squeeze()
        context_bias = self.u_bias(j).squeeze()

        prediction = (center_vecs * context_vecs).sum(1) + center_bias + context_bias
        squared_error = torch.pow(prediction - torch.log(cooc), 2)
        return torch.mean(weights * squared_error)




In [4]:
# EVALUATION LOGIC

def evaluate_analogy(vectors, w2i, i2w, analogy_path, category_name):
    """Score word-analogy accuracy (a : b :: c : ?) for one section of a questions file.

    The file is read line by line. Lines starting with ':' open a new section;
    only sections whose header contains ``category_name`` are evaluated. A
    question counts only when all four (lower-cased) words are in ``w2i``. The
    prediction is the word whose vector has the highest cosine similarity to
    ``b - a + c``, excluding a, b and c themselves.

    Args:
        vectors: 2-D array of embeddings, one row per word id.
        w2i: mapping from word to row index in ``vectors``.
        i2w: mapping (or sequence) from row index back to word.
        analogy_path: path to the analogy questions file.
        category_name: substring identifying the section header to evaluate.

    Returns:
        Accuracy in percent (0-100); 0.0 if the file is missing or no question
        in the section is fully in-vocabulary.
    """
    if not os.path.exists(analogy_path):
        return 0.0

    # Row norms are the same for every question, so compute them once instead
    # of once per question (costly for large pre-trained matrices).
    row_norms = np.linalg.norm(vectors, axis=1)

    correct, total = 0, 0
    with open(analogy_path, 'r', encoding='utf-8') as f:
        target_section = False
        for line in f:
            if line.startswith(':'):
                target_section = category_name in line
                continue
            if not target_section:
                continue
            words = line.strip().lower().split()
            if len(words) != 4 or not all(w in w2i for w in words):
                continue
            a, b, c, d = words
            total += 1
            target_vec = vectors[w2i[b]] - vectors[w2i[a]] + vectors[w2i[c]]
            sims = np.dot(vectors, target_vec) / (row_norms * np.linalg.norm(target_vec) + 1e-10)
            # Mask the query words with -inf so they can never be selected.
            sims[[w2i[a], w2i[b], w2i[c]]] = -np.inf
            if i2w[int(np.argmax(sims))] == d:
                correct += 1
    return (correct / total) * 100 if total > 0 else 0.0

def evaluate_spearman(model, w2i, csv_path, is_gensim=False):
    """Correlate model similarity with human similarity judgments.

    Reads a CSV with columns 'Word 1', 'Word 2' and 'Human (mean)'. For each row
    whose two (lower-cased) words are known to the model, the dot product of
    their vectors is paired with the human score, and the Spearman rank
    correlation of the two lists is returned.

    Args:
        model: when ``is_gensim`` is true, an object supporting ``word in model``
            and ``model[word]``; otherwise a module exposing ``v_embeddings``,
            or ``v_emb`` and ``u_emb`` (summed), or ``embeddings``.
        w2i: mapping from word to row index (unused when ``is_gensim`` is true).
        csv_path: path to the similarity CSV.
        is_gensim: selects the lookup style described above.

    Returns:
        The Spearman correlation as a float. 0.0 is returned when the file is
        missing, fewer than two pairs are covered, or the correlation is
        undefined (NaN, e.g. because one of the rankings is constant).
    """
    if not os.path.exists(csv_path):
        return 0.0
    df = pd.read_csv(csv_path)

    if is_gensim:
        vecs = model
    elif hasattr(model, 'v_embeddings'):
        vecs = model.v_embeddings.weight.detach().cpu().numpy()
    elif hasattr(model, 'v_emb'):
        vecs = (model.v_emb.weight + model.u_emb.weight).detach().cpu().numpy()
    else:
        vecs = model.embeddings.weight.detach().cpu().numpy()

    m_sims, h_scores = [], []
    for _, row in df.iterrows():
        w1, w2, h_score = str(row['Word 1']).lower(), str(row['Word 2']).lower(), float(row['Human (mean)'])
        if is_gensim:
            if w1 not in vecs or w2 not in vecs:
                continue
            v1, v2 = vecs[w1], vecs[w2]
        else:
            if w1 not in w2i or w2 not in w2i:
                continue
            v1, v2 = vecs[w2i[w1]], vecs[w2i[w2]]
        m_sims.append(np.dot(v1, v2))
        h_scores.append(h_score)

    if len(m_sims) < 2:
        return 0.0
    corr, _ = spearmanr(m_sims, h_scores)
    # A NaN here would make every later `corr > best` comparison False and
    # print as 'nan' in the report, so treat it as "no correlation".
    return 0.0 if np.isnan(corr) else float(corr)




In [5]:
# EXECUTION LOOP FOR MULTIPLE WINDOW SIZES

# Hyper-parameters, gathered here instead of being inlined in the loops below.
WINDOW_SIZES = [2, 5, 10]
SEED = 42
EMB_DIM = 100
EPOCHS = 20
BATCH_SIZE = 128
NUM_NEGATIVES = 5
LEARNING_RATE = 0.005

# Seed every RNG that influences weight initialisation and negative sampling so
# that Restart Kernel -> Run All reproduces the reported numbers.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

corpus, vocab, word2idx = load_news_dataset()
i2word = {i: w for w, i in word2idx.items()}
analogy_path = download_analogy_data()
performance_results = []   # one dict per (model, window) run, shown in the summary table
judgment_results = {}      # best Spearman correlation seen so far for each model name

for window in WINDOW_SIZES:
    print(f"\n TESTING WINDOW SIZE: {window} ")

    # Fresh, untrained models for every window size.
    models = [
        ("Skipgram Softmax", SkipgramSoftmax(len(vocab), EMB_DIM)),
        ("Skipgram NEG", Word2VecNeg(len(vocab), EMB_DIM)),
        ("GloVe Scratch", GloVeModel(len(vocab), EMB_DIM))
    ]

    for name, model in models:
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
        start_time = time.time()  # timing includes training-data generation

        # Dynamic Data Generation and Training with Epoch Logging
        if "Skipgram" in name:
            pairs = get_skipgrams(corpus, word2idx, window)
            criterion = nn.CrossEntropyLoss() if "Softmax" in name else None

            # Convert the pair list to tensors once; batches are then cheap
            # slices instead of per-batch list-to-tensor conversions.
            centers = torch.LongTensor([p[0] for p in pairs])
            contexts = torch.LongTensor([p[1] for p in pairs])
            # Only full batches are used; a trailing partial batch is dropped.
            n_batches = len(pairs) // BATCH_SIZE

            for epoch in range(EPOCHS):
                total_loss = 0
                for b in range(n_batches):
                    batch_slice = slice(b * BATCH_SIZE, (b + 1) * BATCH_SIZE)
                    c, t = centers[batch_slice], contexts[batch_slice]
                    optimizer.zero_grad()
                    if "NEG" in name:
                        # Negatives are drawn uniformly over the whole vocabulary.
                        negatives = torch.LongTensor(
                            np.random.randint(0, len(vocab), (BATCH_SIZE, NUM_NEGATIVES))
                        )
                        loss = model(c, t, negatives)
                    else:
                        loss = criterion(model(c), t)
                    loss.backward()
                    optimizer.step()
                    total_loss += loss.item()
                # max(1, ...) avoids a ZeroDivisionError when there is no full batch.
                avg_loss = total_loss / max(1, n_batches)
                print(f"  {name} (W={window}) Epoch {epoch+1}: Loss {avg_loss:.4f}")
        else: # GloVe
            cooc = get_cooc_matrix(corpus, word2idx, window)

            # Same one-off tensor conversion as above for the GloVe records.
            all_i = torch.LongTensor([p[0] for p in cooc])
            all_j = torch.LongTensor([p[1] for p in cooc])
            all_cnt = torch.FloatTensor([p[2] for p in cooc])
            all_wgt = torch.FloatTensor([p[3] for p in cooc])
            n_batches = len(cooc) // BATCH_SIZE

            for epoch in range(EPOCHS):
                total_loss = 0
                for b in range(n_batches):
                    batch_slice = slice(b * BATCH_SIZE, (b + 1) * BATCH_SIZE)
                    optimizer.zero_grad()
                    loss = model(all_i[batch_slice], all_j[batch_slice],
                                 all_cnt[batch_slice], all_wgt[batch_slice])
                    loss.backward()
                    optimizer.step()
                    total_loss += loss.item()
                avg_loss = total_loss / max(1, n_batches)
                print(f"  {name} (W={window}) Epoch {epoch+1}: Loss {avg_loss:.4f}")

        duration = time.time() - start_time
        corr = evaluate_spearman(model, word2idx, 'combined.csv')

        # Extract Vectors for Analogy Task
        if "NEG" in name: vecs = model.v_embeddings.weight.detach().numpy()
        elif "GloVe" in name: vecs = (model.v_emb.weight + model.u_emb.weight).detach().numpy()
        else: vecs = model.embeddings.weight.detach().numpy()

        syntactic = evaluate_analogy(vecs, word2idx, i2word, analogy_path, "past-tense")
        semantic = evaluate_analogy(vecs, word2idx, i2word, analogy_path, "capital-common-countries")

        performance_results.append({
            "Model": name, "Window": window,
            "Training Time": f"{duration:.2f}s", "Spearman Rho": f"{corr:.4f}",
            "Syntactic Accuracy": f"{syntactic:.2f}%", "Semantic Accuracy": f"{semantic:.2f}%"
        })

        # Keep the best correlation across window sizes for each model.
        if name not in judgment_results or corr > judgment_results[name]:
            judgment_results[name] = corr




 TESTING WINDOW SIZE: 2 
  Skipgram Softmax (W=2) Epoch 1: Loss 6.0906
  Skipgram Softmax (W=2) Epoch 2: Loss 5.6272
  Skipgram Softmax (W=2) Epoch 3: Loss 5.4717
  Skipgram Softmax (W=2) Epoch 4: Loss 5.3762
  Skipgram Softmax (W=2) Epoch 5: Loss 5.3095
  Skipgram Softmax (W=2) Epoch 6: Loss 5.2631
  Skipgram Softmax (W=2) Epoch 7: Loss 5.2283
  Skipgram Softmax (W=2) Epoch 8: Loss 5.2012
  Skipgram Softmax (W=2) Epoch 9: Loss 5.1792
  Skipgram Softmax (W=2) Epoch 10: Loss 5.1637
  Skipgram Softmax (W=2) Epoch 11: Loss 5.1494
  Skipgram Softmax (W=2) Epoch 12: Loss 5.1381
  Skipgram Softmax (W=2) Epoch 13: Loss 5.1288
  Skipgram Softmax (W=2) Epoch 14: Loss 5.1221
  Skipgram Softmax (W=2) Epoch 15: Loss 5.1150
  Skipgram Softmax (W=2) Epoch 16: Loss 5.1111
  Skipgram Softmax (W=2) Epoch 17: Loss 5.1047
  Skipgram Softmax (W=2) Epoch 18: Loss 5.1017
  Skipgram Softmax (W=2) Epoch 19: Loss 5.0983
  Skipgram Softmax (W=2) Epoch 20: Loss 5.0954
  Skipgram NEG (W=2) Epoch 1: Loss 9.8150
 

In [6]:
# Pre-trained Gensim Benchmark
print("\nEvaluating GloVe (Gensim)")
g_g = api.load("glove-wiki-gigaword-100")
g_corr = evaluate_spearman(g_g, None, 'combined.csv', is_gensim=True)

# Build the index -> word lookup once and share it between both analogy runs
# instead of rebuilding the full-vocabulary dict for each call.
g_i2w = {idx: word for word, idx in g_g.key_to_index.items()}
g_syn = evaluate_analogy(g_g.vectors, g_g.key_to_index, g_i2w, analogy_path, "past-tense")
g_sem = evaluate_analogy(g_g.vectors, g_g.key_to_index, g_i2w, analogy_path, "capital-common-countries")

# Drop any row left by a previous run of this cell so re-running it does not
# duplicate the benchmark entry in the summary table.
performance_results = [row for row in performance_results if row["Model"] != "GloVe (gensim)"]
performance_results.append({
    "Model": "GloVe (gensim)", "Window": "N/A", "Training Time": "N/A", "Spearman Rho": f"{g_corr:.4f}",
    "Syntactic Accuracy": f"{g_syn:.2f}%", "Semantic Accuracy": f"{g_sem:.2f}%"
})
judgment_results["GloVe (gensim)"] = g_corr




Evaluating GloVe (Gensim)


In [7]:
# DISPLAY RESULTS
print("\n PERFORMANCE SUMMARY TABLE ")
summary_table = pd.DataFrame(performance_results)
print(summary_table.to_string(index=False))

# (column label, key in judgment_results) in display order; a model with no
# recorded correlation is shown as 0.
column_sources = [
    ("Skipgram", "Skipgram Softmax"),
    ("NEG", "Skipgram NEG"),
    ("GloVe", "GloVe Scratch"),
    ("GloVe (gensim)", "GloVe (gensim)"),
]
# Single row of correlations, followed by the constant Y_true benchmark of 1.0.
swapped_data = [[judgment_results.get(key, 0) for _, key in column_sources] + [1.0]]
judgment_columns = [label for label, _ in column_sources] + ["Y_true"]

print("\n TABLE 1: ASSESSMENT OF HUMAN JUDGMENT CORRELATION")
print(pd.DataFrame(swapped_data, columns=judgment_columns, index=["Spearman Rho"]))


 PERFORMANCE SUMMARY TABLE 
           Model Window Training Time Spearman Rho Syntactic Accuracy Semantic Accuracy
Skipgram Softmax      2       182.89s      -0.1652              2.38%             0.00%
    Skipgram NEG      2       152.94s      -0.0452              2.38%             0.00%
   GloVe Scratch      2        35.95s      -0.1397              0.00%             0.00%
Skipgram Softmax      5       428.92s       0.1843              0.00%             0.00%
    Skipgram NEG      5       351.88s       0.0136              0.00%             0.00%
   GloVe Scratch      5        78.62s      -0.1365              0.00%             0.00%
Skipgram Softmax     10       732.30s       0.3189              0.00%             0.00%
    Skipgram NEG     10       637.84s      -0.2813              0.00%             0.00%
   GloVe Scratch     10       128.10s       0.0319              0.00%             0.00%
  GloVe (gensim)    N/A           N/A       0.4924             53.40%            94.07%

 T