In [1]:
import numpy as np
import re
import random

# -------------------------------
# Step 1: Define a multi-line corpus
# -------------------------------
# Toy corpus: one sentence per line. The text deliberately starts and ends
# with a newline so it is byte-identical to a triple-quoted block whose
# content begins on the line after the opening quotes.
corpus = (
    "\n"
    "the quick brown fox jumps over the lazy dog\n"
    "a fast brown fox leaps over sleeping dogs\n"
    "the dog chased the fox through the field\n"
    "foxes are quick and clever animals\n"
    "dogs are loyal and friendly companions\n"
)

# -------------------------------
# Step 2: Preprocess text
# -------------------------------
def preprocess_text(corpus):
    """Split raw text into lower-cased, letters-only token lists.

    The text is lower-cased, every character that is not ``a``-``z`` or
    whitespace is removed (so digits, punctuation and non-ASCII letters are
    dropped), and the result is split on newlines into sentences and on
    whitespace into words.

    Args:
        corpus: Raw text with one sentence per line.

    Returns:
        A list of sentences, each a non-empty list of word strings. Lines
        that are blank, or that contain no letters once cleaned, are
        omitted.
    """
    cleaned = re.sub(r'[^a-z\s]', '', corpus.lower())
    tokenized = (line.split() for line in cleaned.split('\n'))
    # Filter on the token list rather than the raw line: a whitespace-only
    # line is a truthy string but tokenizes to [], which would otherwise
    # leak an empty "sentence" into the result.
    return [tokens for tokens in tokenized if tokens]

# Tokenize the corpus into a list of word lists.
sentences = preprocess_text(corpus)

# The header's leading character was mojibake (UTF-8 bytes of U+2705
# decoded as cp1252); restored to the intended check mark.
print("✅ Sentences after preprocessing:")
for s in sentences:
    print(s)

# Build vocabulary: sorted unique words, so index assignment is deterministic.
vocab = sorted({word for sentence in sentences for word in sentence})
word2idx = {w: i for i, w in enumerate(vocab)}
# Inverse mapping (index -> word), used to decode predictions.
idx2word = dict(enumerate(vocab))
vocab_size = len(vocab)
print(f"\nVocabulary size: {vocab_size}")

# -------------------------------
# Step 3: Generate training data (context, target)
# -------------------------------
def generate_training_data(sentences, window_size):
    """Build (context, target) pairs for CBOW training.

    For each position in a sentence that has ``window_size`` words on both
    sides, the target is the word at that position and the context is the
    ``window_size`` words before it followed by the ``window_size`` words
    after it. Positions too close to either end of the sentence are
    skipped, so sentences shorter than ``2 * window_size + 1`` words
    contribute no pairs.

    Args:
        sentences: Iterable of tokenized sentences (sequences of words).
        window_size: Number of context words taken from each side.

    Returns:
        A list of ``(context_words, target_word)`` tuples, where
        ``context_words`` is a list of ``2 * window_size`` words.
    """
    # Relative positions of the context words around the centre word.
    offsets = [k for k in range(-window_size, window_size + 1) if k != 0]

    pairs = []
    for tokens in sentences:
        stop = len(tokens) - window_size
        for center in range(window_size, stop):
            neighbours = [tokens[center + k] for k in offsets]
            pairs.append((neighbours, tokens[center]))
    return pairs

# Number of context words taken from each side of the target word.
window_size = 2

# Materialize every (context, target) pair once, up front.
training_data = generate_training_data(sentences, window_size=window_size)

# Show the first pair as a sanity check of the windowing.
print("\nSample training pair:", training_data[0])

# -------------------------------
# Step 4: One-hot encoding
# -------------------------------
def one_hot(word):
    """Return the one-hot encoding of ``word`` over the vocabulary.

    Args:
        word: A word present in ``word2idx``.

    Returns:
        A float array of length ``vocab_size`` that is zero everywhere
        except for a 1 at the word's vocabulary index.

    Raises:
        KeyError: If ``word`` is not in the vocabulary.
    """
    position = word2idx[word]
    encoded = np.zeros(vocab_size)
    encoded[position] = 1
    return encoded

# -------------------------------
# Step 5: Initialize parameters
# -------------------------------
# Size of the hidden (embedding) layer.
embedding_dim = 10

# Input->hidden weights, shape (vocab_size, embedding_dim), and
# hidden->output weights, shape (embedding_dim, vocab_size), both drawn
# from a standard normal distribution.
W1 = np.random.standard_normal((vocab_size, embedding_dim))
W2 = np.random.standard_normal((embedding_dim, vocab_size))

def softmax(x):
    """Return the softmax of ``x``, normalized over all of its elements.

    The global maximum is subtracted before exponentiating so that large
    inputs do not overflow; this shift does not change the result.

    Args:
        x: Array-like of scores.

    Returns:
        An array of the same shape whose entries are positive and sum to 1.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / exps.sum()

# -------------------------------
# Step 6: Train CBOW
# -------------------------------
learning_rate = 0.01
epochs = 1000

# The network inputs and targets depend only on the training pairs, not on
# the weights, so encode them once instead of rebuilding the same one-hot
# vectors on every epoch.
encoded_pairs = []
for context_words, target_word in training_data:
    # Average of the context words' one-hot vectors, shape (vocab_size,).
    x = np.zeros(vocab_size)
    for w in context_words:
        x += one_hot(w)
    x = x / len(context_words)
    encoded_pairs.append((x, one_hot(target_word)))

# Plain per-sample SGD over all pairs, in the same order every epoch.
for epoch in range(epochs):
    total_loss = 0
    for x, y_true in encoded_pairs:
        # Forward pass: hidden vector, output scores, output distribution.
        h = np.dot(W1.T, x)
        u = np.dot(W2.T, h)
        y_pred = softmax(u)

        # Cross-entropy loss; the 1e-9 guards against log(0).
        total_loss += -np.sum(y_true * np.log(y_pred + 1e-9))

        # Backpropagation: e is the gradient of the loss w.r.t. the scores u.
        e = y_pred - y_true
        dW2 = np.outer(h, e)
        dW1 = np.outer(x, np.dot(W2, e))

        # Both gradients are computed before either weight matrix changes.
        W1 -= learning_rate * dW1
        W2 -= learning_rate * dW2

    # Report the summed loss every 200 epochs (epochs 0, 200, ..., 800).
    if epoch % 200 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

# The leading character was mojibake (UTF-8 bytes of U+2705 decoded as
# cp1252); restored to the intended check mark.
print("\n✅ Training Complete")

# -------------------------------
# Step 7: Predict next word
# -------------------------------
def predict_next_word(context_words):
    """Return the vocabulary word the model scores highest for a context.

    Words missing from ``word2idx`` are ignored. The remaining words'
    one-hot vectors are averaged and passed through the network, and the
    word with the largest output probability is returned.

    Note: the training pairs in this file use context words from both
    sides of the target, so the model learns to fill in a centre word.
    Feeding it only preceding words, as a "next word" predictor, is an
    approximation.

    Args:
        context_words: Sequence of context word strings.

    Returns:
        The predicted word. If no context word is in the vocabulary
        (including an empty context), the input is the zero vector, all
        output scores are equal, and ``idx2word[0]`` is returned.
    """
    known_words = [w for w in context_words if w in word2idx]

    x = np.zeros(vocab_size)
    for w in known_words:
        x += one_hot(w)
    # Average over the words actually summed. Dividing by
    # len(context_words) would shrink the input whenever unknown words were
    # skipped, and would compute 0/0 (NaN) for an empty context.
    if known_words:
        x = x / len(known_words)

    h = np.dot(W1.T, x)
    u = np.dot(W2.T, h)
    y_pred = softmax(u)

    predicted_idx = np.argmax(y_pred)
    return idx2word[predicted_idx]

# -------------------------------
# Step 8: Try predictions
# -------------------------------
# Example contexts; only the last `window_size` words of each are used.
test_contexts = [
    ["the", "quick", "brown", "fox"],
    ["dogs", "are", "loyal", "and"],
    ["the", "dog", "chased", "the"],
    ["a", "fast", "brown", "fox"],
]

# The emoji and arrow below were mojibake (UTF-8 bytes decoded as cp1252);
# restored to the intended U+1F9E9 and U+2192 characters.
print("\n🧩 Next-word predictions:")
for ctx in test_contexts:
    # Slice once so the printed context is exactly what the model received.
    recent_words = ctx[-window_size:]
    pred = predict_next_word(recent_words)
    print(f"Context: {recent_words} → Predicted next word: {pred}")


✅ Sentences after preprocessing:
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
['a', 'fast', 'brown', 'fox', 'leaps', 'over', 'sleeping', 'dogs']
['the', 'dog', 'chased', 'the', 'fox', 'through', 'the', 'field']
['foxes', 'are', 'quick', 'and', 'clever', 'animals']
['dogs', 'are', 'loyal', 'and', 'friendly', 'companions']

Vocabulary size: 24

Sample training pair: (['the', 'quick', 'fox', 'jumps'], 'brown')
Epoch 0, Loss: 96.1723
Epoch 200, Loss: 8.0458
Epoch 400, Loss: 2.4323
Epoch 600, Loss: 1.2636
Epoch 800, Loss: 0.8127

✅ Training Complete

🧩 Next-word predictions:
Context: ['brown', 'fox'] → Predicted next word: jumps
Context: ['loyal', 'and'] → Predicted next word: loyal
Context: ['chased', 'the'] → Predicted next word: fox
Context: ['brown', 'fox'] → Predicted next word: jumps
