In [2]:
# a. Data Preparation 
import numpy as np 
from collections import defaultdict 


In [4]:
# Sample corpus 
corpus = [ 
    "the quick brown fox jumped over the lazy dog", 
    "I love playing with my dog", 
    "the dog is quick and smart" 
] 


In [6]:
# Tokenize 
words = [] 
for sentence in corpus: 
    for word in sentence.lower().split(): 
        words.append(word) 
vocab = set(words) 
word2idx = {w: idx for idx, w in enumerate(vocab)} 
idx2word = {idx: w for w, idx in word2idx.items()} 
vocab_size = len(vocab) 
print("Vocabulary:", word2idx) 


Vocabulary: {'the': 0, 'fox': 1, 'over': 2, 'i': 3, 'playing': 4, 'my': 5, 'quick': 6, 'and': 7, 'love': 8, 'with': 9, 'is': 10, 'smart': 11, 'brown': 12, 'jumped': 13, 'dog': 14, 'lazy': 15}


In [8]:
# b. Generate training data (CBOW: context -> center word) 
window_size = 2 


In [10]:
def generate_training_data(words, window_size):
    data = []
    for i in range(window_size, len(words) - window_size):
        context = []
        for j in range(-window_size, window_size + 1):
            if j != 0:
                context.append(words[i + j])
        target = words[i]
        data.append((context, target))
    return data


In [12]:
training_data = generate_training_data(words, window_size)
print("\nSample training data (context -> target):") 
for context, target in training_data[:5]: 
    print(context, "->", target) 



Sample training data (context -> target):
['the', 'quick', 'fox', 'jumped'] -> brown
['quick', 'brown', 'jumped', 'over'] -> fox
['brown', 'fox', 'over', 'the'] -> jumped
['fox', 'jumped', 'the', 'lazy'] -> over
['jumped', 'over', 'lazy', 'dog'] -> the


In [14]:
# One-hot encoding 
def one_hot_vector(word): 
    vec = np.zeros(vocab_size) 
    vec[word2idx[word]] = 1 
    return vec 


In [16]:
# Prepare training sets 
X_train = [] 
y_train = [] 


In [18]:
for context, target in training_data:
    context_vec = np.zeros(vocab_size)
    for w in context:
        context_vec += one_hot_vector(w)  
    X_train.append(context_vec)
    y_train.append(one_hot_vector(target))


In [20]:
X_train = np.array(X_train)
y_train = np.array(y_train)

In [22]:
# c. Train Model (CBOW using simple neural network) 
embedding_dim = 10 # size of hidden layer 


In [24]:
# Initialize weights 
W1 = np.random.randn(vocab_size, embedding_dim) 
W2 = np.random.randn(embedding_dim, vocab_size) 


In [28]:
# Training parameters 
lr = 0.01 
epochs = 2000 

def softmax(x): 
    e_x = np.exp(x - np.max(x)) 
    return e_x / e_x.sum(axis=0) 


In [30]:
# Training loop 
for epoch in range(epochs): 
    loss = 0 
    for x, y in zip(X_train, y_train):
        
        # Forward pass 
        h = np.dot(x, W1) # hidden layer 
        u = np.dot(h, W2) # output scores 
        y_pred = softmax(u) # prediction 
           
        # Loss (cross-entropy) 
        loss -= np.sum(y * np.log(y_pred + 1e-9)) 
           
        # Backpropagation 
        e = y_pred - y
        dW2 = np.outer(h, e) 
        dW1 = np.outer(x, np.dot(W2, e)) 
           
        # Update weights 
        W1 -= lr * dW1 
        W2 -= lr * dW2 
        
    if epoch % 200 == 0: 
        print(f"Epoch {epoch}, Loss: {loss:.4f}") 
        
# d. Output: Word embeddings 
print("\nWord embeddings (rows = words):") 
for word in word2idx: 
    print(word, ":", W1[word2idx[word]]) 



Epoch 0, Loss: 142.0672
Epoch 200, Loss: 0.5007
Epoch 400, Loss: 0.2176
Epoch 600, Loss: 0.1356
Epoch 800, Loss: 0.0974
Epoch 1000, Loss: 0.0756
Epoch 1200, Loss: 0.0615
Epoch 1400, Loss: 0.0516
Epoch 1600, Loss: 0.0444
Epoch 1800, Loss: 0.0389

Word embeddings (rows = words):
the : [-0.38863192 -0.95496354  0.25487471 -0.69653746  2.04599293  0.61005768
 -2.24678005 -0.72163723  1.35788491  0.52597301]
fox : [ 0.37073786  1.44496748 -0.29078907  0.41768502  0.79790655 -1.5149945
 -0.04489068 -0.50071754 -1.75035752  2.4596813 ]
over : [-0.01709991 -0.31965164 -0.85496474  0.4451881  -0.96375938  0.06823534
  1.70847518  0.66545708  1.76049585  1.37519817]
i : [ 1.84515532 -1.47099442 -0.93772927 -0.97575407 -0.58789594  0.70305567
 -0.75233154  0.86052084  0.53661963  0.31534405]
playing : [ 0.41699327 -0.3338303   0.20126558  1.96154822 -0.11252112 -1.29557228
  2.27922366 -0.75395952  0.19021353 -0.47387182]
my : [-1.41612498 -0.08617071 -0.38318465  0.03567377  1.2147519  -0.232826