# Demo: Skip-Gram

Steps:

1. Data Preparation: 
- The corpus is tokenized into words, and a vocabulary is built.
- (target, context) word pairs are generated using a window size of 2: each word is paired with every other word at most 2 positions away in the same sentence.

2. Skip-gram Model: The SkipGramModel class defines a neural network with an embedding layer to learn word vectors and an output layer to predict context words.

3. Training Loop: The model is trained on the (target, context) pairs, one pair at a time. For each target word, it scores every vocabulary word as a candidate context word, and the model parameters (the embeddings and the output layer) are updated to minimize the prediction loss.

### Step 1: Data Preparation

This step prepares a small corpus and generates context-target pairs for training.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import numpy as np

# Toy corpus: four short, lower-case sentences
corpus = [
    "we are learning nlp",
    "nlp is fun",
    "we love deep learning",
    "deep learning is powerful"
]

# Whitespace tokenization: one list of tokens per sentence
tokenized_corpus = list(map(str.split, corpus))

# Word frequencies. Counter keeps keys in first-seen order, and that order
# determines the integer index each word receives below.
vocabulary = Counter(
    token for tokens in tokenized_corpus for token in tokens
)

# Lookup tables in both directions (index -> word, word -> index)
idx_to_word = dict(enumerate(vocabulary))
word_to_idx = {word: i for i, word in idx_to_word.items()}

vocab_size = len(word_to_idx)

# Generate context-target pairs
def generate_skipgram_pairs(tokenized_corpus, window_size=2):
    """Build (word, neighbor) training pairs from tokenized sentences.

    Every word is paired with each other word of the same sentence that lies
    at most ``window_size`` positions to its left or right. Windows never
    cross sentence boundaries.

    Args:
        tokenized_corpus: Iterable of sentences, each an indexable sequence
            of tokens.
        window_size: Maximum distance between a word and a paired neighbor.

    Returns:
        List of ``(word, neighbor)`` tuples ordered by sentence, then by word
        position, then by neighbor position.
    """
    pairs = []
    for tokens in tokenized_corpus:
        n_tokens = len(tokens)
        for center_pos, center in enumerate(tokens):
            # Clamp the window to the sentence; the upper bound is exclusive.
            lo = max(center_pos - window_size, 0)
            hi = min(center_pos + window_size + 1, n_tokens)
            pairs.extend(
                (center, tokens[pos])
                for pos in range(lo, hi)
                if pos != center_pos  # a word is not its own neighbor
            )
    return pairs

# Each tuple is (word, neighbor); the window size is spelled out to match the
# "window size of 2" described in the text above.
pairs = generate_skipgram_pairs(tokenized_corpus, window_size=2)


### Step 2: Define the Skip-gram Model

This step creates a simple neural network with an embedding layer and a linear layer to predict context words.

In [2]:
class SkipGramModel(nn.Module):
    """Embedding lookup followed by a linear projection onto the vocabulary.

    Attributes:
        embeddings: ``nn.Embedding`` holding one ``embedding_dim``-sized
            vector per vocabulary index.
        output_layer: ``nn.Linear`` mapping an embedding to ``vocab_size``
            scores.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: Number of distinct words (rows in the embedding
                table and number of output scores).
            embedding_dim: Length of each word vector.
        """
        super().__init__()
        # Layers are created in this order (embeddings first) so parameter
        # initialisation under a fixed RNG seed stays the same.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output_layer = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target_word):
        """Score every vocabulary word for the given word indices.

        Args:
            target_word: Tensor of vocabulary indices to look up.

        Returns:
            Tensor of raw scores whose last dimension has size
            ``vocab_size``; no softmax is applied here.
        """
        return self.output_layer(self.embeddings(target_word))


### Step 3: Training the Model

This step trains the Skip-gram model using the context-target pairs.

In [3]:
# Hyperparameters
embedding_dim = 10    # length of each learned word vector
learning_rate = 0.01  # SGD step size
epochs = 100          # number of passes over all training pairs

# Model, loss function, and optimizer.
# CrossEntropyLoss takes the model's raw scores plus the index of the
# expected word; SGD updates every parameter the model exposes.
model = SkipGramModel(vocab_size=vocab_size, embedding_dim=embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Prepare data for training
def prepare_data(pairs, word_to_idx):
    """Convert word pairs into two parallel tensors of vocabulary indices.

    Args:
        pairs: Sequence of two-item ``(target, context)`` word tuples.
        word_to_idx: Mapping from word to integer index.

    Returns:
        Tuple ``(inputs, targets)`` of ``torch.LongTensor``. ``inputs[k]`` is
        the index of the first word of the k-th pair and ``targets[k]`` the
        index of its second word.
    """
    # Two separate passes keep the tensors aligned position by position.
    first_word_ids = [word_to_idx[first] for first, _ in pairs]
    second_word_ids = [word_to_idx[second] for _, second in pairs]
    return torch.LongTensor(first_word_ids), torch.LongTensor(second_word_ids)

# Index tensors for every training pair
inputs, targets = prepare_data(pairs, word_to_idx)

# Add the batch dimension once, up front. Iterating an (N, 1) tensor yields
# shape-(1,) rows, i.e. one single-sample batch per training pair.
input_batches = inputs.unsqueeze(1)
target_batches = targets.unsqueeze(1)

# Training loop: one SGD update per pair
for epoch in range(epochs):
    total_loss = 0
    for input_word, target_word in zip(input_batches, target_batches):
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Scores over the vocabulary, then loss against the expected word
        output = model(input_word)
        loss = criterion(output, target_word)

        # Backpropagate and apply the update
        loss.backward()
        optimizer.step()

        # Accumulate the summed (not averaged) loss for this epoch
        total_loss += loss.item()

    # Report progress every tenth epoch
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

# Display learned embeddings
# Detach the whole weight table once and index rows of the NumPy array.
learned_vectors = model.embeddings.weight.detach().numpy()
for word, idx in word_to_idx.items():
    print(f"Word: {word}, Embedding: {learned_vectors[idx]}")


Epoch 10/100, Loss: 70.2636
Epoch 20/100, Loss: 65.7007
Epoch 30/100, Loss: 63.5598
Epoch 40/100, Loss: 62.0821
Epoch 50/100, Loss: 60.8883
Epoch 60/100, Loss: 59.8661
Epoch 70/100, Loss: 58.9722
Epoch 80/100, Loss: 58.1843
Epoch 90/100, Loss: 57.4870
Epoch 100/100, Loss: 56.8684
Word: we, Embedding: [-0.69457585  0.03122669  1.0407579  -0.17249197 -0.36623123 -1.0612456
  0.35565415  0.60919106 -0.66884327  0.57878953]
Word: are, Embedding: [-0.8537248   0.83230984  1.4542491  -0.03443678 -1.3387407  -0.09828038
 -0.6228846   0.8793163   0.9466001   0.28804305]
Word: learning, Embedding: [ 0.64245325 -0.40728158 -0.6611613  -0.70274705  1.3546456  -1.6175278
 -0.38449246 -0.3137371   0.51765305  0.869688  ]
Word: nlp, Embedding: [-1.4155036   1.2556876  -0.45933115  0.7096862   0.6858076   0.09723409
 -2.1826375  -0.08341801  1.3511233   0.847898  ]
Word: is, Embedding: [ 0.88156915 -2.7319615   1.063199   -0.93589664 -0.5708867   0.0468301
 -0.2517836   1.1327626   0.41310048  1.3730