# Embeddings Demo using PyTorch

## Word2Vec

### Import Necessary Libraries

In [117]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

### Prepare Corpus and **Vocabulary**

In [118]:
# Step 1: Prepare a simple corpus
corpus = "He is the king . The king is royal. She is the royal queen . He is a prince, and she is a princess ."

# Step 2: Tokenize and build vocabulary
tokens = corpus.lower().split()  # Tokenize and lower-case words
vocab = set(tokens)  # Create a set of unique words
word2idx = {word: idx for idx, word in enumerate(vocab)}  # Create word to index mapping
idx2word = {idx: word for word, idx in word2idx.items()}  # Create index to word mapping
vocab_size = len(vocab)  # Calculate the size of vocabulary

In [119]:
print("Tokens: ", tokens)
print("Vocabulary: ", vocab)
print("Word to index mapping: ", word2idx)
print("Index to word mapping: ", idx2word)
print("Vocabulary size: ", vocab_size)

Tokens:  ['he', 'is', 'the', 'king', '.', 'the', 'king', 'is', 'royal.', 'she', 'is', 'the', 'royal', 'queen', '.', 'he', 'is', 'a', 'prince,', 'and', 'she', 'is', 'a', 'princess', '.']
Vocabulary:  {'king', 'and', 'a', 'he', 'is', 'royal.', 'she', 'royal', '.', 'princess', 'queen', 'prince,', 'the'}
Word to index mapping:  {'king': 0, 'and': 1, 'a': 2, 'he': 3, 'is': 4, 'royal.': 5, 'she': 6, 'royal': 7, '.': 8, 'princess': 9, 'queen': 10, 'prince,': 11, 'the': 12}
Index to word mapping:  {0: 'king', 1: 'and', 2: 'a', 3: 'he', 4: 'is', 5: 'royal.', 6: 'she', 7: 'royal', 8: '.', 9: 'princess', 10: 'queen', 11: 'prince,', 12: 'the'}
Vocabulary size:  13


### Create Dataset

In [120]:
context_window = 2  # Define the size of context window
data = []  # Initialize empty list to hold data

# Loop through each token and extract its context and target word
for i in range(context_window, len(tokens) - context_window):
    context = [tokens[i - t] for t in range(context_window, 0, -1)] + [tokens[i + t] for t in range(1, context_window + 1)]
    target = tokens[i]
    data.append((context, target))  # Append the context and target word as a tuple to the data list

### Define Skip-gram Model

In [121]:
# Step 4: Define the Skip-gram model architecture
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(SkipGram, self).__init__()
        self.in_embed = nn.Embedding(vocab_size, embed_dim)
        self.out_embed = nn.Embedding(vocab_size, embed_dim)
        
        
    def forward(self, target, context):
        in_embeds = self.in_embed(target)
        out_embeds = self.out_embed(context)
        scores = torch.matmul(out_embeds, in_embeds.t())
        return scores.squeeze()

### Initialize Model and Train

In [122]:
# Step 5: Initialize the model, loss, and optimizer
embed_dim = 50
model = SkipGram(vocab_size, embed_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [123]:
print(model)

SkipGram(
  (in_embed): Embedding(13, 50)
  (out_embed): Embedding(13, 50)
)


In [124]:
# Step 6: Training loop
epochs = 10
for epoch in range(epochs):
    total_loss = 0
    for context, target in data:
        print(context, target)
        context_idx = torch.tensor([word2idx[w] for w in context], dtype=torch.long)
        target_idx = torch.tensor([word2idx[target]], dtype=torch.long)
        
        print("Max context_idx:", torch.max(context_idx).item())  # Debug line
        print("Min context_idx:", torch.min(context_idx).item())  # Debug line

        print("context_idx shape:", context_idx.shape)  # Debug line
        print(context_idx)  # Debug line
        optimizer.zero_grad()
        scores = model(target_idx, context_idx)
        # change scores to be a float tensor

        print("Scores", scores)
        print("Scores shape:", scores.shape)  # Debug line
        print("Score dtype: ", scores.dtype)  # Debug line

        print("context_idx dtype: ", context_idx.dtype)  # Debug line

        context_idx = context_idx.float()
        print("context_idx: ", context_idx)
        loss = criterion(scores, context_idx)
        print(loss)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()
        print("Loss: ", loss.item()) # Debug line
        
        total_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(data):.4f}')

['he', 'is', 'king', '.'] the
Max context_idx: 8
Min context_idx: 0
context_idx shape: torch.Size([4])
tensor([3, 4, 0, 8])
Scores tensor([-10.7856,  -5.3515,   3.3014, -10.2490], grad_fn=<SqueezeBackward0>)
Scores shape: torch.Size([4])
Score dtype:  torch.float32
context_idx dtype:  torch.int64
context_idx:  tensor([3., 4., 0., 8.])
tensor(185.2784, grad_fn=<DivBackward1>)
Loss:  185.27841186523438
['is', 'the', '.', 'the'] king
Max context_idx: 12
Min context_idx: 4
context_idx shape: torch.Size([4])
tensor([ 4, 12,  8, 12])
Scores tensor([-5.3809, -8.7242,  8.7315, -8.7242], grad_fn=<SqueezeBackward0>)
Scores shape: torch.Size([4])
Score dtype:  torch.float32
context_idx dtype:  torch.int64
context_idx:  tensor([ 4., 12.,  8., 12.])
tensor(475.3839, grad_fn=<DivBackward1>)
Loss:  475.3839416503906
['the', 'king', 'the', 'king'] .
Max context_idx: 12
Min context_idx: 0
context_idx shape: torch.Size([4])
tensor([12,  0, 12,  0])
Scores tensor([-5.7535,  1.1268, -5.7535,  1.1268], gra

tensor([-4.8035, -3.1767,  1.6315, -3.1767], grad_fn=<SqueezeBackward0>)
Scores shape: torch.Size([4])
Score dtype:  torch.float32
context_idx dtype:  torch.int64
context_idx:  tensor([ 4., 12.,  8., 12.])
tensor(141.7785, grad_fn=<DivBackward1>)
Loss:  141.77850341796875
['the', 'king', 'the', 'king'] .
Max context_idx: 12
Min context_idx: 0
context_idx shape: torch.Size([4])
tensor([12,  0, 12,  0])
Scores tensor([-1.3274, -1.7162, -1.3274, -1.7162], grad_fn=<SqueezeBackward0>)
Scores shape: torch.Size([4])
Score dtype:  torch.float32
context_idx dtype:  torch.int64
context_idx:  tensor([12.,  0., 12.,  0.])
tensor(29.0564, grad_fn=<DivBackward1>)
Loss:  29.056377410888672
['king', '.', 'king', 'is'] the
Max context_idx: 8
Min context_idx: 0
context_idx shape: torch.Size([4])
tensor([0, 8, 0, 4])
Scores tensor([-3.7142, -2.8052, -3.7142, -2.6439], grad_fn=<SqueezeBackward0>)
Scores shape: torch.Size([4])
Score dtype:  torch.float32
context_idx dtype:  torch.int64
context_idx:  tensor

### Evaluate Model

In [125]:
# Step 7: Evaluation
def most_similar(word, word2idx, idx2word, embedding_matrix, topk=5):
    word_embedding = embedding_matrix[word2idx[word]]
    similarities = cosine_similarity([word_embedding], embedding_matrix)[0]
    print([tup for tup in zip(idx2word.values(), similarities)])
    similar_words = [(idx2word[idx], similarities[idx]) for idx in np.argsort(similarities, axis=-1)[-topk-1:-1][::-1]]
    return similar_words

In [126]:
embedding_matrix = model.in_embed.weight.data.numpy()
# print(embedding_matrix) # Debug line
most_similar_words = most_similar('king', word2idx, idx2word, embedding_matrix, topk=5)
print(most_similar_words)

[('king', 0.99999994), ('and', 0.011666652), ('a', 0.17769969), ('he', -0.1712333), ('is', -0.0024271682), ('royal.', 0.015272625), ('she', -0.22830305), ('royal', -0.032777235), ('.', 0.09018909), ('princess', -0.1588235), ('queen', -0.20705464), ('prince,', 0.0042718425), ('the', 0.031071626)]
[('a', 0.17769969), ('.', 0.09018909), ('the', 0.031071626), ('royal.', 0.015272625), ('and', 0.011666652)]


## Using word2vec for Text Classification

### Installing Required Libraries

In [127]:
# !pip install gensim torch

### Loading Pre-trained Word2Vec Embeddings

TODO: make sure to download the embeddings file from [kaggle](https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300) and place it in the same directory as this notebook

In [None]:
from gensim.models import KeyedVectors

# Load vectors directly from the file
model = KeyedVectors.load_word2vec_format('path/to/GoogleNews-vectors-negative300.bin', binary=True)

### Converting Embeddings to PyTorch Tensors


In [None]:
vocab_size = len(model.index_to_key)
embedding_dim = model.vector_size

# Initialize a tensor to store the embeddings
embedding_matrix = torch.zeros((vocab_size, embedding_dim))

# Fill the tensor with the Word2Vec embeddings
for i, word in enumerate(model.index_to_key):
    embedding_matrix[i] = torch.tensor(model[word])

# Create an nn.Embedding layer and load the pre-trained embeddings
embedding_layer = torch.nn.Embedding.from_pretrained(embedding_matrix)

### Building a Simple Text Classification Model

In [None]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix)
        self.fc1 = nn.Linear(embed_dim, 128)
        self.fc2 = nn.Linear(128, num_class)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return x



### Training the Model

In [None]:
# Training code here

### Evaluating the Model

In [None]:
# Evaluation code here

## GloVe

### Loading Pre-trained GloVe Embeddings

In [7]:
from torchtext.vocab import GloVe
# Load GloVe vectors using torchtext
glove = GloVe(name='6B', dim=100)

.vector_cache\glove.6B.zip: 862MB [05:35, 2.57MB/s]                              
100%|█████████▉| 399999/400000 [01:26<00:00, 4607.26it/s]


### Converting Embeddings to PyTorch Tensors

In [None]:
# Fetch the indices for the words in your vocabulary
word_indices = [glove.stoi[word] for word in ['hello', 'world']]

# Create a tensor with the GloVe embeddings
embedding_tensor = torch.stack([glove.vectors[i] for i in word_indices])

### Building a Text Classification Model


In [None]:
import torch.nn as nn

class TextClassifier(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, num_class):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_tensor)
        self.fc1 = nn.Linear(embedding_dim, 128)
        self.fc2 = nn.Linear(128, num_class)
        
    def forward(self, x):
        x = self.embedding(x)
        x = self.fc1(x)
        x = self.fc2(x)
        return x

### Training the Model

In [1]:
# Training code here

### Evaluating the Model

In [None]:
# Evaluation code here