# Data Collection and Preprocessing

In [None]:
# Collecting data
file=open("/kaggle/input/wikipedia-dump/enwik8","r")
text=file.read()

In [45]:
# removing all the unnecessary data like tags...
import re
text=re.sub(r"<[^>]+>"," ",text)
text=text.lower()
text=re.sub(r"[^a-z\s]"," ",text)
tokens=text.split()
print(tokens[:10])

['wikipedia', 'http', 'en', 'wikipedia', 'org', 'wiki', 'main', 'page', 'mediawiki', 'alpha']


# Vocabulary Construction 
* Neural Networks cannot work on strings and rare words adds noise
* So we will apply minimum frequency cutoff i.e rare words are removed from the vocabulary

In [46]:
freq={}
min_count=5
word2id={}
id2word={}

for token in tokens:
    if token in freq.keys():
        freq[token]+=1
    else:
        freq[token]=1
sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
idx=0
for word,count in sorted_freq:
    if count>=min_count:
        word2id[word]=idx
        id2word[idx]=word
        idx+=1
vocab_size = len(word2id)
print("Vocab size:", vocab_size)


Vocab size: 71993


In [47]:
window_size=4
word_indices = [word2id[token] for token in tokens if token in word2id]

#Decreasing the size for time constraints
word_indices = word_indices[:20000]


# Getting the Unigram Probabilites for Negative Sampling

In [48]:
# Unigram^0.75 distribution over VOCAB ONLY
unigram_words = []
unigram_probs = []

for word, idx in word2id.items():
    unigram_words.append(idx)
    unigram_probs.append(freq[word] ** 0.75)

# Normalize
total = sum(unigram_probs)
unigram_probs = [p / total for p in unigram_probs]

print("unigram prob sum:", sum(unigram_probs))


unigram prob sum: 1.0


# Skipgram Model

In [49]:
import torch
import torch.nn as nn

class SkipGramNegSampling(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()

        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)

        init_range = 0.5 / embedding_dim
        self.target_embeddings.weight.data.uniform_(-init_range, init_range)
        self.context_embeddings.weight.data.uniform_(-init_range, init_range)

    def forward(self, center_word, pos_context, neg_contexts):
        w = self.target_embeddings(center_word)        # (1, d)
        c_pos = self.context_embeddings(pos_context)   # (1, d)
        c_neg = self.context_embeddings(neg_contexts)  # (k, d)

        # Positive loss
        pos_score = torch.sum(w * c_pos, dim=1)
        pos_loss = -torch.log(torch.sigmoid(pos_score))

        # Negative loss
        neg_score = torch.matmul(c_neg, w.t()).squeeze()
        neg_loss = -torch.sum(torch.log(torch.sigmoid(-neg_score)))

        return pos_loss + neg_loss


In [50]:
import random

def sample_negatives(k, unigram_words, unigram_probs, forbidden):
    negatives = []
    while len(negatives) < k:
        neg_id = random.choices(unigram_words, unigram_probs)[0]
        if neg_id not in forbidden:
            negatives.append(neg_id)
    return negatives


In [51]:
embedding_dim = 100
num_negatives = 5
learning_rate = 0.025
epochs = 1

model = SkipGramNegSampling(vocab_size, embedding_dim)
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)


In [None]:
from tqdm import tqdm

window_size = 4
total_loss = 0

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}")

    for i in tqdm(range(len(word_indices))):
        center_word = word_indices[i]

        for j in range(1, window_size // 2 + 1):

            # Left context
            if i - j >= 0:
                context_word = word_indices[i - j]

                negatives = sample_negatives(
                    num_negatives,
                    unigram_words,
                    unigram_probs,
                    forbidden={center_word, context_word}
                )

                center = torch.tensor([center_word])
                pos = torch.tensor([context_word])
                neg = torch.tensor(negatives)

                optimizer.zero_grad()
                loss = model(center, pos, neg)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

            # Right context
            if i + j < len(word_indices):
                context_word = word_indices[i + j]

                negatives = sample_negatives(
                    num_negatives,
                    unigram_words,
                    unigram_probs,
                    forbidden={center_word, context_word}
                )

                center = torch.tensor([center_word])
                pos = torch.tensor([context_word])
                neg = torch.tensor(negatives)

                optimizer.zero_grad()
                loss = model(center, pos, neg)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()

    print("Total loss:", total_loss)


In [52]:
import numpy as np

# Use target embeddings (W)
embeddings = model.target_embeddings.weight.data.cpu().numpy()

np.save("my_word2vec_embeddings.npy", embeddings)


In [53]:
import pickle

with open("word2id.pkl", "wb") as f:
    pickle.dump(word2id, f)


In [54]:
from numpy.linalg import norm


def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (norm(vec1) * norm(vec2) + 1e-9)
def get_vector(word):
    if word not in word2id:
        return None
    return embeddings[word2id[word]]
pairs = [
    ("king", "queen"),
    ("man", "woman"),
    ("paris", "france"),
    ("delhi", "india"),
    ("computer", "laptop")
]

for w1, w2 in pairs:
    v1 = get_vector(w1)
    v2 = get_vector(w2)
    if v1 is not None and v2 is not None:
        print(f"{w1} - {w2} : {cosine_similarity(v1, v2):.4f}")


king - queen : -0.0263
man - woman : 0.0451
paris - france : 0.0788
delhi - india : 0.1913
computer - laptop : -0.0204


In [55]:
def analogy(a, b, c, top_k=5):
    if a not in word2id or b not in word2id or c not in word2id:
        return None

    target_vec = embeddings[word2id[a]] - embeddings[word2id[b]] + embeddings[word2id[c]]

    scores = []
    for i in range(len(embeddings)):
        score = cosine_similarity(target_vec, embeddings[i])
        scores.append((id2word[i], score))

    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:top_k]

print("king - man + woman:")
print(analogy("king", "man", "woman")[0][0])

print("\nparis - france + india:")
print(analogy("paris", "france", "india")[0][0])


king - man + woman:
woman

paris - france + india:
india


In [56]:
male_words = ["he", "man", "male"]
female_words = ["she", "woman", "female"]

def mean_vector(words):
    vecs = [get_vector(w) for w in words if get_vector(w) is not None]
    return np.mean(vecs, axis=0)

gender_direction = mean_vector(male_words) - mean_vector(female_words)
occupations = ["doctor", "nurse", "engineer", "teacher", "scientist"]

for word in occupations:
    vec = get_vector(word)
    if vec is not None:
        score = cosine_similarity(vec, gender_direction)
        print(f"{word:10s} -> {score:.4f}")


doctor     -> -0.0773
nurse      -> 0.0402
engineer   -> -0.1015
teacher    -> 0.0137
scientist  -> -0.2043


In [None]:
import gensim.downloader as api

gensim_model = api.load("word2vec-google-news-300")
for w1, w2 in pairs:
    if w1 in gensim_model and w2 in gensim_model:
        gensim_sim = gensim_model.similarity(w1, w2)
        my_sim = cosine_similarity(get_vector(w1), get_vector(w2))
        print(f"{w1:10s} {w2:10s} | My: {my_sim:.4f} | Gensim: {gensim_sim:.4f}")
