In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import random
import re
from collections import Counter
dataset_root = "../Datasets"
#maxLen = 18387
#vocab_size = 24787

In [2]:
df = pd.read_csv("../Datasets/TrainData.csv")

In [3]:
def _clean_data(sent):
    sent = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", sent)
    sent = re.sub(r"\'s", " \'s", sent)
    sent = re.sub(r"\'ve", " \'ve", sent)
    sent = re.sub(r"n\'t", " n\'t", sent)
    sent = re.sub(r"\'re", " \'re", sent)
    sent = re.sub(r"\'d", " \'d", sent)
    sent = re.sub(r"\'ll", " \'ll", sent)
    sent = re.sub(r",", " , ", sent)
    sent = re.sub(r"!", " ! ", sent)
    sent = re.sub(r"\(", " \( ", sent)
    sent = re.sub(r"\)", " \) ", sent)
    sent = re.sub(r"\?", " \? ", sent)
    sent = re.sub(r"\s{2,}", " ", sent)
    return sent

def tokenize(text):
    text = text.lower()
    text = _clean_data(text)
    return text.split()

In [4]:
df['tokens'] = df['Text'].apply(tokenize)

In [5]:
min_freq = 1

all_tokens = [token for tokens in df['tokens'] for token in tokens]
word_counts = Counter(all_tokens)

In [6]:
vocab = {word: i for i, (word, count) in enumerate(word_counts.items()) if count >= min_freq}
vocab_size = len(vocab)

In [7]:
freqs = np.array([word_counts[word] for word in vocab.keys()], dtype=np.float32)
# Raise counts to the 0.75 power (as in the original word2vec paper).
freqs = freqs ** 0.75
neg_sampling_prob = freqs / freqs.sum()
# Convert to a torch tensor for sampling.
neg_sampling_prob_tensor = torch.tensor(neg_sampling_prob)


In [8]:
window_size = 2  # context window on each side

def generate_skipgram_pairs(token_list, vocab, window_size):
    """Generate (target, context) pairs from a tokenized sentence."""
    pairs = []
    indices = [vocab[token] for token in token_list if token in vocab]
    for i, target in enumerate(indices):
        start = max(0, i - window_size)
        end = min(len(indices), i + window_size + 1)
        for j in range(start, end):
            if i != j:
                pairs.append((target, indices[j]))
    return pairs

In [9]:
all_pairs = []
for tokens in df['tokens']:
    pairs = generate_skipgram_pairs(tokens, vocab, window_size)
    all_pairs.extend(pairs)

print("Total positive pairs:", len(all_pairs))

Total positive pairs: 2318548


In [10]:
class SkipGramDataset(Dataset):
    def __init__(self, pairs, neg_sampling_prob_tensor, num_negative):
        self.pairs = pairs
        self.num_negative = num_negative
        self.neg_sampling_prob_tensor = neg_sampling_prob_tensor
        self.vocab_size = len(neg_sampling_prob_tensor)
    
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, idx):
        target, context = self.pairs[idx]
        # Vectorized negative sampling using torch.multinomial.
        negatives = torch.multinomial(
            self.neg_sampling_prob_tensor, self.num_negative, replacement=True
        )
        return (torch.tensor(target, dtype=torch.long),
                torch.tensor(context, dtype=torch.long),
                negatives)

num_negative = 5
dataset = SkipGramDataset(all_pairs, neg_sampling_prob_tensor, num_negative)
trainloader = DataLoader(dataset, batch_size=1024, shuffle=True, num_workers=2)

In [11]:
# -------------------------
# 6. Define the Skip-Gram Model with Negative Sampling
# -------------------------
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out_embeddings = nn.Embedding(vocab_size, embedding_dim)
    
    def forward(self, target, context, negatives):
        # Get embeddings for target, positive context, and negatives.
        v_target = self.embeddings(target)        # shape: [batch_size, embedding_dim]
        v_context = self.out_embeddings(context)    # shape: [batch_size, embedding_dim]
        v_negatives = self.out_embeddings(negatives)  # shape: [batch_size, num_negative, embedding_dim]
        
        # Positive score: dot product between target and context embeddings.
        pos_score = torch.sum(v_target * v_context, dim=1)  # shape: [batch_size]
        pos_loss = -torch.log(torch.sigmoid(pos_score) + 1e-10)
        
        # Negative score: dot product between target and negative embeddings.
        # Using batch matrix multiplication.
        neg_score = torch.bmm(v_negatives, v_target.unsqueeze(2)).squeeze(2)  # shape: [batch_size, num_negative]
        neg_loss = -torch.sum(torch.log(1 - torch.sigmoid(neg_score) + 1e-10), dim=1)
        
        loss = torch.mean(pos_loss + neg_loss)
        return loss

embedding_dim = 256
model = SkipGramModel(vocab_size, embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [12]:
num_epochs = 5
for epoch in range(num_epochs):
    total_loss = 0
    for target, context, negatives in trainloader:
        optimizer.zero_grad()
        loss = model(target, context, negatives)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * target.size(0)
    avg_loss = total_loss / len(dataset)
    print(f"Epoch {epoch+1}, Avg Loss: {avg_loss:.4f}")

Epoch 1, Avg Loss: 12.5794
Epoch 2, Avg Loss: 4.0337
Epoch 3, Avg Loss: 2.9767
Epoch 4, Avg Loss: 2.6732
Epoch 5, Avg Loss: 2.5252


In [16]:
import json
torch.save(model.embeddings.weight, 'skipgram_embeddings.pt')

with open('vocab.json', 'w') as f:
    json.dump(vocab, f)