In [55]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from collections import Counter
import numpy as np
import re # For tokenization

# For reproducibility: seed PyTorch's global RNG and NumPy's legacy global RNG
# with the same fixed value so repeated runs draw the same random numbers.
torch.manual_seed(42)
np.random.seed(42)

# Record the PyTorch build this notebook was executed with.
print("PyTorch Version:", torch.__version__)

PyTorch Version: 2.5.1+cu124


In [56]:
# --- Hyperparameters ---
# Every tunable setting is defined in this cell; later cells read these constants.

# Data and Vocabulary
CONTEXT_SIZE = 2        # N: Number of previous words to consider
MIN_WORD_FREQ = 1       # Keep all words for this small example

# Model Architecture
EMBEDDING_DIM = 30      # D: Dimension of word embeddings (keep small for demo)
HIDDEN_DIM = 50         # H: Dimension of the hidden layer

# Training
BATCH_SIZE = 8          # Examples per optimisation step
NUM_EPOCHS = 100        # Full passes over the training examples
LEARNING_RATE = 0.01    # Step size handed to the optimizer

# Device configuration: use CUDA when it is available, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

Using device: cuda


In [57]:
# --- Sample Data ---
# Slightly larger corpus for NNLM context
# Each entry is one sentence: lowercase words separated by single spaces, no punctuation.
corpus = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "a quick brown fox jumps over the lazy dog",
    "the cat chased the mouse",
    "the dog sat on the couch"
]

print(f"Corpus size: {len(corpus)} sentences")

Corpus size: 5 sentences


In [58]:
def tokenize(corpus):
    """Lowercase each sentence, strip punctuation and split it on whitespace.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        A pair ``(per_sentence, flat)`` where ``per_sentence`` holds one token
        list per input sentence and ``flat`` is the same tokens concatenated
        in order (used for vocabulary building).
    """
    # Anything that is neither a word character nor whitespace is removed
    # before splitting, so "dog's" becomes the single token "dogs".
    per_sentence = [
        re.sub(r'[^\w\s]', '', text.lower()).split()
        for text in corpus
    ]
    flat = [token for words in per_sentence for token in words]
    return per_sentence, flat

# Tokenize the sample corpus, keeping both the per-sentence token lists and the flat token stream.
tokenized_corpus, all_tokens = tokenize(corpus)
print("\nSample tokenized sentence:", tokenized_corpus[0])
print(f"Total tokens: {len(all_tokens)}")
# Output: Sample tokenized sentence: ['the', 'cat', 'sat', 'on', 'the', 'mat']
# Output: Total tokens: 31


Sample tokenized sentence: ['the', 'cat', 'sat', 'on', 'the', 'mat']
Total tokens: 31


In [59]:
def build_vocab(all_tokens, min_freq):
    """Build the vocabulary and the word <-> id lookup tables.

    Args:
        all_tokens: Iterable of tokens (typically the flat token stream of the corpus).
        min_freq: Minimum number of occurrences a token needs to be kept.
            ``1`` (or ``None``) keeps every token.

    Returns:
        A tuple ``(vocab, word_to_id, id_to_word)``:
        ``vocab`` is the list of kept words in order of first appearance,
        ``word_to_id`` maps each word to its index in ``vocab`` and
        ``id_to_word`` is the inverse mapping.

    Note:
        When ``min_freq`` is greater than 1, dropped tokens are absent from
        ``word_to_id``; code that indexes ``word_to_id`` directly must handle
        such tokens itself.
    """
    word_counts = Counter(all_tokens)

    # Honour min_freq instead of silently ignoring it. Counter keeps
    # first-seen order, so ids are stable and unchanged when min_freq is 1.
    threshold = 1 if min_freq is None else min_freq
    vocab = [word for word, count in word_counts.items() if count >= threshold]

    word_to_id = {word: i for i, word in enumerate(vocab)}
    id_to_word = {i: word for i, word in enumerate(vocab)}

    print(f"Vocabulary size: {len(vocab)}")
    return vocab, word_to_id, id_to_word

# Build the lookup tables once; VOCAB_SIZE is the number of distinct words kept.
vocab, word_to_id, id_to_word = build_vocab(all_tokens, MIN_WORD_FREQ)
VOCAB_SIZE = len(vocab)

# Peek at the first ten entries (ids follow order of first appearance in the corpus).
print("Sample vocab words:", vocab[:10])
print("Sample word_to_id:", list(word_to_id.items())[:10])
# Output: Sample vocab words: ['the', 'cat', 'sat', 'on', 'mat', 'dog', 'chased', 'a', 'quick', 'brown']
# Output: Sample word_to_id: [('the', 0), ('cat', 1), ('sat', 2), ('on', 3), ('mat', 4), ('dog', 5), ('chased', 6), ('a', 7), ('quick', 8), ('brown', 9)]

Vocabulary size: 16
Sample vocab words: ['the', 'cat', 'sat', 'on', 'mat', 'dog', 'chased', 'a', 'quick', 'brown']
Sample word_to_id: [('the', 0), ('cat', 1), ('sat', 2), ('on', 3), ('mat', 4), ('dog', 5), ('chased', 6), ('a', 7), ('quick', 8), ('brown', 9)]


In [60]:
def create_nnlm_examples(all_tokens, word_to_id, context_size):
    """Slide a window over the token stream to build (context_ids, target_id) pairs.

    Args:
        all_tokens: Sequence of tokens; every token must be a key of ``word_to_id``.
        word_to_id: Mapping from token to integer id.
        context_size: Number of preceding ids that form the context of each target.

    Returns:
        A list of ``(context_ids, target_id)`` tuples, one per position from
        ``context_size`` to the end of the stream, where ``context_ids`` is the
        list of the ``context_size`` ids immediately before the target.
    """
    # Map the whole stream to ids up front so the window below only slices integers.
    ids = [word_to_id[tok] for tok in all_tokens]

    # The first usable target sits at index `context_size`; earlier positions
    # do not have enough history to fill the context.
    return [
        (ids[pos - context_size:pos], ids[pos])
        for pos in range(context_size, len(ids))
    ]

# Build the training set from the flat token stream.
# NOTE(review): all_tokens concatenates every sentence, so some context windows
# straddle a sentence boundary (the end of one sentence predicting the start of the next).
training_examples = create_nnlm_examples(all_tokens, word_to_id, CONTEXT_SIZE)
print(f"\nTotal training examples created: {len(training_examples)}")
print("Sample NNLM training examples ([context_ids], target_id):")
# Decode the first three examples back to words (assumes at least three examples exist).
for i in range(3):
    print(f"  Context: {[id_to_word[idx] for idx in training_examples[i][0]]} -> Target: {id_to_word[training_examples[i][1]]}")
# Example Output (N=2):
#   Context: ['the', 'cat'] -> Target: sat
#   Context: ['cat', 'sat'] -> Target: on
#   Context: ['sat', 'on'] -> Target: the


Total training examples created: 29
Sample NNLM training examples ([context_ids], target_id):
  Context: ['the', 'cat'] -> Target: sat
  Context: ['cat', 'sat'] -> Target: on
  Context: ['sat', 'on'] -> Target: the


In [61]:
class NNLMDataset(Dataset):
    """Map-style dataset over pre-built ``(context_ids, target_id)`` examples.

    Each item is returned as a pair of ``torch.long`` tensors: the context ids
    as a 1-D tensor and the target id as a 0-D (scalar) tensor.
    """

    def __init__(self, examples):
        # Keep a reference to the example list; tensors are created lazily per item.
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        context_ids, target_id = self.examples[idx]
        # Long dtype because both values are used as indices
        # (embedding lookup for the context, class index for the target).
        context = torch.tensor(context_ids, dtype=torch.long)
        target = torch.tensor(target_id, dtype=torch.long)
        return context, target

# Instantiate the Dataset
nnlm_dataset = NNLMDataset(training_examples)

# Test the dataset - get one item
# (index 1 is the second example: context ['cat', 'sat'] -> target 'on' for this corpus)
context_tensor, target_tensor = nnlm_dataset[1]
print(f"\nSample Dataset Output (Tensors):")
print(f"  Context Tensor: {context_tensor} (shape: {context_tensor.shape})")
print(f"  Target Tensor: {target_tensor} (shape: {target_tensor.shape})")
# Output:
# Sample Dataset Output (Tensors):
#  Context Tensor: tensor([1, 2]) (shape: torch.Size([2]))
#  Target Tensor: 3 (shape: torch.Size([])) # Target is a scalar tensor


Sample Dataset Output (Tensors):
  Context Tensor: tensor([1, 2]) (shape: torch.Size([2]))
  Target Tensor: 3 (shape: torch.Size([]))


In [62]:
class NNLM(nn.Module):
    """Feed-forward neural network language model.

    The context word ids are embedded, the embeddings are concatenated, passed
    through one tanh hidden layer and projected to one logit per vocabulary word.

    Args:
        vocab_size: Number of words in the vocabulary (size of the embedding
            table and of the output layer).
        embedding_dim: Dimension of each word embedding.
        context_size: Number of context words fed to the model per example.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.context_size = context_size
        self.hidden_dim = hidden_dim

        # --- Layers ---
        # 1. Embedding Layer (Projection Matrix C)
        # Input: (batch_size, context_size) -> Output: (batch_size, context_size, embedding_dim)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)

        # 2. Linear layer from concatenated embeddings to hidden layer
        # Input: (batch_size, context_size * embedding_dim) -> Output: (batch_size, hidden_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)

        # 3. Non-linear activation function (tanh)
        self.activation1 = nn.Tanh()

        # 4. Linear layer from hidden layer to output layer (logits)
        # Input: (batch_size, hidden_dim) -> Output: (batch_size, vocab_size)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

        # Replace the default initialisation with a small uniform range.
        self.init_weights()

    def init_weights(self):
        """Initialise weights uniformly in [-0.1, 0.1) and biases to zero."""
        initrange = 0.1  # A common range for initialization
        # nn.init.* runs under no_grad, so there is no need to reach into
        # `.data`. The call order is kept so that the random draws (and hence
        # the seeded results) stay the same.
        nn.init.uniform_(self.embeddings.weight, -initrange, initrange)
        nn.init.uniform_(self.linear1.weight, -initrange, initrange)
        nn.init.zeros_(self.linear1.bias)
        nn.init.uniform_(self.linear2.weight, -initrange, initrange)
        nn.init.zeros_(self.linear2.bias)

    def forward(self, context_ids):
        """
        Forward pass of the NNLM.
        Args:
            context_ids: Tensor of shape (batch_size, context_size) containing context word IDs.
        Returns:
            logits: Tensor of shape (batch_size, vocab_size) containing raw scores for each word.
        """
        # 1. Look up embeddings: (batch_size, context_size, embedding_dim)
        embeds = self.embeddings(context_ids)

        # 2. Concatenate the context embeddings per example:
        #    (batch_size, context_size * embedding_dim).
        #    flatten(start_dim=1) also handles an empty batch, where
        #    view(batch_size, -1) cannot infer the second dimension.
        concatenated_embeds = torch.flatten(embeds, start_dim=1)

        # 3. Hidden layer with tanh: (batch_size, hidden_dim)
        hidden_output = self.linear1(concatenated_embeds)
        activated_output = self.activation1(hidden_output)

        # 4. Output layer: (batch_size, vocab_size) raw logits (no softmax applied here)
        logits = self.linear2(activated_output)

        return logits

# Instantiate the model
model = NNLM(VOCAB_SIZE, EMBEDDING_DIM, CONTEXT_SIZE, HIDDEN_DIM).to(DEVICE)
print(f"\nModel Architecture:\n{model}")

# Example: Pass a dummy batch through the model to check shapes
# (random word ids in [0, VOCAB_SIZE); a smoke test only, no parameters are updated here).
# Note: torch.randint draws from the global RNG, so this cell influences later random draws.
dummy_context = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE, CONTEXT_SIZE)).to(DEVICE)
output_logits = model(dummy_context)
print(f"\nOutput logits shape for dummy batch: {output_logits.shape}") # Should be (BATCH_SIZE, VOCAB_SIZE)


Model Architecture:
NNLM(
  (embeddings): Embedding(16, 30)
  (linear1): Linear(in_features=60, out_features=50, bias=True)
  (activation1): Tanh()
  (linear2): Linear(in_features=50, out_features=16, bias=True)
)

Output logits shape for dummy batch: torch.Size([8, 16])


In [63]:
# --- Training Setup ---
# shuffle=True reshuffles the examples every epoch; the last batch of an epoch
# is smaller than BATCH_SIZE when the example count is not a multiple of it.
dataloader = DataLoader(nnlm_dataset, batch_size=BATCH_SIZE, shuffle=True)

# CrossEntropyLoss combines LogSoftmax and NLLLoss - suitable for classification
# Expects raw logits from the model and class indices (target word IDs) as target
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Echo the run configuration so the recorded output documents the settings used.
print(f"\n--- Starting Training ---")
print(f"Vocab size: {VOCAB_SIZE}")
print(f"Embedding dim: {EMBEDDING_DIM}")
print(f"Context size: {CONTEXT_SIZE}")
print(f"Hidden dim: {HIDDEN_DIM}")
print(f"Epochs: {NUM_EPOCHS}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Learning Rate: {LEARNING_RATE}")
print(f"Training examples: {len(training_examples)}")
print(f"Steps per epoch: {len(dataloader)}")

# --- Training Loop ---
model.train() # Set model to training mode

for epoch in range(NUM_EPOCHS):
    # Running sum of the per-batch losses for this epoch.
    total_loss = 0
    for i, (context_batch, target_batch) in enumerate(dataloader):
        # Move data to device
        context_batch = context_batch.to(DEVICE) # Shape: (batch_size, context_size)
        target_batch = target_batch.to(DEVICE)   # Shape: (batch_size,)

        # Zero gradients (they would otherwise accumulate across steps)
        optimizer.zero_grad()

        # Forward pass -> Get logits
        logits = model(context_batch) # Shape: (batch_size, vocab_size)

        # Calculate loss
        # CrossEntropyLoss needs logits (N, C) and targets (N)
        # where C = number of classes (vocab_size)
        loss = loss_fn(logits, target_batch)

        # Backward pass -> Calculate gradients
        loss.backward()

        # Update weights
        optimizer.step()

        # .item() converts the scalar loss tensor to a Python float
        total_loss += loss.item()

    # Print average loss for the epoch
    # NOTE: this is the mean of the per-batch mean losses, so a smaller final
    # batch carries the same weight as a full one (not a per-example average).
    avg_loss = total_loss / len(dataloader) if len(dataloader) > 0 else 0
    if (epoch + 1) % 10 == 0 or epoch == 0: # Print the first epoch and then every 10th epoch
      print(f'Epoch [{epoch+1}/{NUM_EPOCHS}], Average Loss: {avg_loss:.4f}')

print("--- Training Finished ---")


--- Starting Training ---
Vocab size: 16
Embedding dim: 30
Context size: 2
Hidden dim: 50
Epochs: 100
Batch size: 8
Learning Rate: 0.01
Training examples: 29
Steps per epoch: 4
Epoch [1/100], Average Loss: 2.7711
Epoch [10/100], Average Loss: 0.5803
Epoch [20/100], Average Loss: 0.3141
Epoch [30/100], Average Loss: 0.3534
Epoch [40/100], Average Loss: 0.2448
Epoch [50/100], Average Loss: 0.2900
Epoch [60/100], Average Loss: 0.2931
Epoch [70/100], Average Loss: 0.2708
Epoch [80/100], Average Loss: 0.2642
Epoch [90/100], Average Loss: 0.3077
Epoch [100/100], Average Loss: 0.2875
--- Training Finished ---


In [64]:
def predict_next_word(model, context_words, word_to_id, id_to_word, context_size):
    """Predict the most likely next word given a context.

    Args:
        model: Module that maps a ``(1, context_size)`` long tensor of word ids
            to a ``(1, vocab_size)`` tensor of logits.
        context_words: Sequence of exactly ``context_size`` words.
        word_to_id: Mapping from word to integer id.
        id_to_word: Mapping from integer id back to word.
        context_size: Required number of context words.

    Returns:
        The word with the highest logit, or ``None`` (after printing an error
        message) when the context has the wrong length or contains a word that
        is not in ``word_to_id``.

    Side effects:
        Switches ``model`` to evaluation mode and leaves it in that mode.
    """
    model.eval() # Set model to evaluation mode

    if len(context_words) != context_size:
        print(f"Error: Context length must be {context_size}")
        return None

    # Check if all context words are in vocabulary
    for word in context_words:
        if word not in word_to_id:
            print(f"Error: Context word '{word}' not in vocabulary.")
            return None

    # Run on the device the model actually lives on rather than relying on a
    # notebook-level global; fall back to the CPU for parameter-free models.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Convert context words to a tensor of IDs with a leading batch dimension of 1
    context_ids = [word_to_id[word] for word in context_words]
    context_tensor = torch.tensor([context_ids], dtype=torch.long, device=device)

    # Get logits from the model
    with torch.no_grad(): # No need to calculate gradients during inference
        logits = model(context_tensor) # Shape: (1, vocab_size)

    # The arg-max over the vocabulary axis is the predicted word id.
    # (torch.softmax(logits, dim=1) would give the full probability distribution if needed.)
    predicted_id = torch.argmax(logits, dim=1).item()

    # Convert ID back to word
    return id_to_word[predicted_id]

# --- Prediction ---
# Query the trained model with two contexts taken from the corpus.
# The length guard below mirrors the check inside predict_next_word, so a
# mismatched context is reported here instead of printing a None prediction.
print("\n--- Prediction Example ---")
context1 = ["the", "cat"]
if len(context1) == CONTEXT_SIZE:
    prediction1 = predict_next_word(model, context1, word_to_id, id_to_word, CONTEXT_SIZE)
    print(f"Context: {context1} -> Predicted next word: {prediction1}")
else:
    print(f"Skipping prediction for {context1} (context size mismatch)")


context2 = ["dog", "chased"]
if len(context2) == CONTEXT_SIZE:
    prediction2 = predict_next_word(model, context2, word_to_id, id_to_word, CONTEXT_SIZE)
    print(f"Context: {context2} -> Predicted next word: {prediction2}")
else:
     print(f"Skipping prediction for {context2} (context size mismatch)")

# Note: Predictions on this tiny dataset might not be very meaningful!


--- Prediction Example ---
Context: ['the', 'cat'] -> Predicted next word: sat
Context: ['dog', 'chased'] -> Predicted next word: the
