<a href="https://colab.research.google.com/github/Ali-Backour/6.4610-psets/blob/main/hw1_nlp_shared.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
"""
Homework 1: Language Models

This assignment will guide you through implementing three different types of language models:
1. N-gram model
2. Log-linear model with hand-designed features
3. Continuous Bag of Words (CBOW) model
"""

'\nHomework 1: Language Models\n\nThis assignment will guide you through implementing three different types of language models:\n1. N-gram model\n2. Log-linear model with hand-designed features\n3. Continuous Bag of Words (CBOW) model\n'

In [3]:
import numpy as np
from typing import List, Optional, Tuple
import pickle
from collections import defaultdict, Counter
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from tokenizers.models import BPE
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

In [4]:
class LanguageModel:
    """
    Abstract base class for language models.

    This class defines the interface that all language models should implement.
    Language models take a sequence of tokens (represented as integers) and
    predict probability distributions over the next token in the vocabulary.
    """

    def __init__(self):
        self.eos = 'eos'

    def train(self, token_sequences: List[np.ndarray]) -> None:
        """
        Train the language model on a collection of token sequences.

        Args:
            token_sequences (List[np.ndarray]): List of token sequences, where each
                                               sequence is a numpy array of integers
                                               representing token IDs.
        """
        raise NotImplementedError("Subclasses must implement the train method")

    def get_next_token_probs(self, context: np.ndarray) -> np.ndarray:
        """
        Get probability distribution over next tokens given a context.

        Args:
            context (np.ndarray): Array of token IDs representing the context.
                                 Shape: (context_length,)

        Returns:
            np.ndarray: Probability distribution over vocabulary.
                       Shape: (vocab_size,)
                       Should sum to 1.0.
        """
        raise NotImplementedError("Subclasses must implement the get_next_token_probs method")

    def perplexity(self, token_sequences: List[np.ndarray]) -> float:
        """
        Calculate perplexity of the model on a set of token sequences.

        Perplexity is 2^(-average_log_likelihood), where average log likelihood
        is calculated over all tokens in all sequences.

        Args:
            token_sequences (List[np.ndarray]): List of token sequences to evaluate

        Returns:
            float: Perplexity score (lower is better)
        """
        total_log_likelihood = 0.0
        total_tokens = 0

        for sequence in token_sequences:
            if len(sequence) < 2:
                continue  # Need at least 2 tokens (context + target)

            for i in range(1, len(sequence)):
                context = sequence[:i]
                target_token = sequence[i]

                probs = self.get_next_token_probs(context)
                # Add small epsilon to avoid log(0)
                prob = max(float(probs[target_token]), 1e-10)
                total_log_likelihood += np.log2(prob)
                total_tokens += 1

        if total_tokens == 0:
            return float('inf')

        average_log_likelihood = total_log_likelihood / total_tokens
        return 2 ** (-average_log_likelihood)

    def generate_text(self, context: np.ndarray, max_length: int = 100,
                     temperature: float = 1.0) -> np.ndarray:
        """
        Generate text by sampling from the model.

        Args:
            context (np.ndarray): Initial context tokens
            max_length (int): Maximum number of tokens to generate
            temperature (float): Sampling temperature (higher = more random)

        Returns:
            np.ndarray: Generated sequence including the initial context
        """
        generated = list(context)

        for _ in range(max_length):
            current_context = np.array(generated)
            probs = self.get_next_token_probs(current_context)

            # Apply temperature
            if temperature != 1.0:
                probs = np.power(probs, 1.0 / temperature)
                probs = probs / np.sum(probs)

            # Sample next token
            next_token = np.random.choice(len(probs), p=probs)
            generated.append(next_token)

            # Optional: add stopping criteria here (e.g., end-of-sequence token)
            if next_token == self.eos:
              break

        return np.array(generated)

In [61]:
class NGramModel(LanguageModel):
    """
    N-gram language model using maximum likelihood estimation with smoothing.

    This model predicts the next token based on the previous n-1 tokens.
    """

    def __init__(self, vocab_size: int, n: int = 3, smoothing=None):
        """
        Initialize the N-gram model.

        Args:
            vocab_size (int): Size of the vocabulary
            n (int): Order of the n-gram (default: 3 for trigram)

            Bonus! Without smoothing your perplexity will be pretty bad. If you
            implement some kind of smoothing and get the perplexity below 300
            you'll get extra credit.
            smoothing (str): Smoothing method ('laplace' or 'interpolation')
        """
        super().__init__()
        # TODO: YOUR CODE HERE
        # Hint: Consider using nested dictionaries or defaultdict(Counter) to store counts.
        # Hint: Consider how you will handle different context lengths.
        # At the start of a sentence, you might have 0, 1, or 2 words of context
        # instead of the full n-1 words.
        self.n = n
        self.vocab_size = vocab_size
        self.counts = {}
    def train(self, token_sequences: List[np.ndarray]) -> None:
        """
        Train the n-gram model by counting n-grams in the training data.

        Args:
            token_sequences (List[np.ndarray]): Training sequences
        """
        for doc in token_sequences:
            for i in range(len(doc) - self.n + 1):
                for j in range(self.n):
                    window = tuple(doc[i:i+j+1])
                    self.counts.setdefault(window,0)
                    self.counts[window] +=1


    def get_next_token_probs(self, context: np.ndarray) -> np.ndarray:
        """
        Get probability distribution over next tokens for given context.

        Args:
            context (np.ndarray): Context tokens

        Returns:
            np.ndarray: Probability distribution over vocabulary
        """
        # TODO: YOUR CODE HERE
        # Hint: Try contexts from longest to shortest, i.e., try the full context,
        # and if it is not in the training data, try shorter context
        # Hint: What probability distribution should we output if no valid context is found?
        for i in range(self.n - 1,0 ,-1):
            window = tuple(context[-i:])
            if window in self.counts:
                prb =  np.array([self.counts.get(window + (new_word,),0)/self.counts[window] for new_word in range(self.vocab_size)])
                return prb/sum(prb)
        prb =  np.array([self.counts.get(new_word,0) for new_word in range(self.vocab_size)])
        if sum(prb):
            return prb/sum(prb)
        return np.array([1/self.vocab_size for _ in range(self.vocab_size)])
        

In [None]:
class LogLinearModel(LanguageModel):
    """
    Log-linear language model with hand-designed features.

    This model uses a linear combination of features to predict next token probabilities.
    """

    def __init__(self, vocab_size: int, context_size: int = 3):
        """
        Initialize the log-linear model.

        Args:
            vocab_size (int): Size of the vocabulary
            context_size (int): Number of context tokens to consider
        """
        super().__init__()
        # TODO: YOUR CODE HERE

        self.context_size = context_size
        self.vocab_size = vocab_size
        # Hint: You may find the class nn.Linear useful
        # If this is too slow or using too much memory, check out the nn.EmbeddingBag class
        # and see if that's applicable to your use case
        self.model = torch.nn.Sequential(
            torch.nn.Linear(context_size * vocab_size,vocab_size)
            )
        self.criterion = torch.nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.01)

    def extract_features(self, context: np.ndarray) -> torch.Tensor:
        """
        Extract features from the context.

        Args:
            context (np.ndarray): Context tokens

        Returns:
            torch.Tensor: Feature vector produced from context tokens
        """
        features = torch.zeros(self.vocab_size,self.context_size,dtype=torch.float32)
        relevant_context = context[-self.context_size:]
        for i,word in enumerate(relevant_context):
            features[int(word),i] = 1.0
        return features.view(-1)

    def train(self, token_sequences: List[np.ndarray], epochs: int = 2, batch_size: int = 32) -> None:
        """
        Train the log-linear model using gradient descent.
        """
        # Create training examples (context, target) pairs
        contexts_list = []
        targets_list = []
        for seq in token_sequences:
            contexts_list.extend(seq[i:i+self.context_size] for i in range(len(seq) - self.context_size))
            targets_list.extend(seq[self.context_size:])
        assert len(contexts_list) == len(targets_list)
        all_features = contexts_list
        # TODO: YOUR CODE HERE


        print(f"Training on {len(all_features)} examples for {epochs} epochs...")

        # Training loop
        # TODO: Put your layers in training mode
        self.model.train()
        losses = []  # Potentially useful for debugging (loss should go down!)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)

        for epoch in range(epochs):
            print(f"Epoch {epoch}")
            total_loss = 0.0
            num_batches = 0

            # Mini-batch training
            # Note: tqdm is used to display progress bars for loops, helping visualize training progress.
            for i in tqdm(range(0, len(all_features), batch_size)):
                batch_contexts= contexts_list[i:i+batch_size]
                batch_targets = targets_list[i:i+batch_size]

                # TODO: Get features for the batch
                batch_contexts = torch.stack([self.extract_features(c) for c in batch_contexts])
                batch_contexts = batch_contexts.to(device)
                batch_targets = torch.tensor(batch_targets,device=device,dtype=torch.int64)
                # TODO: Zero the gradients of the optimizer
                
                self.optimizer.zero_grad()


                # TODO: Perform a forward pass to compute predictions for the model.
                logits = self.model(batch_contexts)


                # TODO: Perform the backward pass and gradient update. Remember,
                # you need to compute the loss, perform the backward pass, and
                # update the model parameters.
                # Your code here!
                loss = self.criterion(logits,batch_targets)
                loss.backward()
                self.optimizer.step()
                total_loss += loss.item()
                num_batches += 1

                if i % (batch_size * 10) == 0:  # Print every 10 batches
                    print(f"Epoch {epoch}, Batch {i // batch_size}: Loss = {loss.item():.4f}")

            if epoch % 2 == 0:
                avg_loss = total_loss / num_batches
                print(f"Epoch {epoch}: Average Loss = {avg_loss:.4f}")

            losses.append(total_loss)
        torch.save(self.model.state_dict(),'log_linear.pth')

    def get_next_token_probs(self, context: np.ndarray) -> np.ndarray:
        """
        Get probability distribution using softmax over linear scores.

        Args:
            context (np.ndarray): Context tokens

        Returns:
            np.ndarray: Probability distribution over vocabulary
        """
        self.model.eval()
        with torch.no_grad():
            if len(context) == 0:
                return torch.full((self.vocab_size,),1.0/self.vocab_size)
            features = self.extract_features(context).unsqueeze(0)
            features = features.to(next(self.model.parameters()).device)
            logits = self.model(features)
            prb = torch.softmax(logits, dim=-1).cpu().numpy().flatten()
        return prb
        # TODO: YOUR CODE HERE
        # Hint: What probability distribution should we output if no valid context is found?

In [None]:
import itertools

class CBOWModel(LanguageModel):
    """
    Continuous Bag of Words (CBOW) model.

    This model learns dense vector representations of words and predicts
    the next word from the context words.
    """

    def __init__(self, vocab_size: int, embedding_dim: int = 100, context_size: int = 2, learning_rate: float = 0.01):
        """
        Initialize the CBOW model.

        Args:
            vocab_size (int): Size of the vocabulary
            embedding_dim (int): Dimension of word embeddings
            context_size (int): Number of context words on each side
            learning_rate (float): Learning rate for training
        """
        super().__init__()
        # TODO: YOUR CODE HERE
        # You may find the classes nn.Embedding and nn.EmbeddingBag useful
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.context_size = context_size
        self.w = nn.Embedding(vocab_size,embedding_dim)
        self.u = nn.Embedding(vocab_size,embedding_dim)
        # We use an Adam optimizer. This is a fancy version of SGD which uses momentum and adaptive updates.
        self.optimizer = optim.Adam(itertools.chain(self.w.parameters(), self.u.parameters()), lr=learning_rate)

        # What loss function should we use for Word2Vec?
        self.criterion = torch.nn.CrossEntropyLoss()


    def train(self, token_sequences: List[np.ndarray], epochs: int = 10, batch_size: int = 32) -> None:
        """
        Train the CBOW model.

        Args:
            token_sequences (List[np.ndarray]): Training sequences
            epochs (int): Number of training epochs
            batch_size (int): Batch size for training
        """
        # TODO: YOUR CODE HERE
        # Create training examples (context, target) pairs
        # Hint, extract left context only for next token prediction
        # Hint, pad shorter contexts with 0, this ensures all have the same length for batching
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        contexts_list = []
        targets_list = []
        for seq in token_sequences:
            contexts_list.extend(np.concatenate((seq[i - self.context_size : i],seq[i+1:i + self.context_size + 1])) for i in range(self.context_size, len(seq)-self.context_size - 1))
            targets_list.extend([seq[i] for i in range(self.context_size, len(seq)-self.context_size - 1)])
        assert len(contexts_list) == len(targets_list)
        all_contexts = torch.tensor(contexts_list).to(device=device)
        all_targets = torch.tensor(targets_list).to(device=device)
        print(f"the shape of contexts is {all_contexts.shape}")
        print(f"the shape of targets is {all_targets.shape}")

        # TODO: Put your layers in training mode
        self.w.train()
        self.u.train()
        self.u.to(device=device)
        self.w.to(device=device)
        losses = []  # Potentially useful for debugging (loss should go down!)
        # Note: tqdm is used to display progress bars for loops, helping visualize training progress.
        for epoch in range(epochs):
            print(f"Epoch {epoch}")
            total_loss = 0.0
            num_batches = 0

            # Shuffle data
            indices = torch.randperm(len(all_contexts))
            all_contexts = all_contexts[indices]
            all_targets = all_targets[indices]

            for i in tqdm(range(0, len(all_contexts), batch_size)):
                # As an alternative to this implementation, you can experiment with
                # DataLoader (https://docs.pytorch.org/docs/stable/data.html) for automatic shuffling, parallel loading
                batch_contexts = all_contexts[i:i+batch_size]
                batch_targets = all_targets[i:i+batch_size]

                # TODO: Zero the gradients of the optimizer
                self.optimizer.zero_grad()

                # TODO: Perform a forward pass to compute predictions for the model.
                # Your code here!

                preds = torch.mean(self.w(batch_contexts),dim=1) @ self.u.weight.T

                # TODO: Finish the backward pass and gradient update.
                # Remember, you need to compute the loss, perform the backward pass, and
                # update the model parameters.
                # Your code here!
                loss = self.criterion(preds,batch_targets)
                loss.backward()
                self.optimizer.step()
                
                total_loss += loss.item()
                num_batches += 1

                if i % (batch_size * 10) == 0:  # Print every 10 batches
                    print(f"Epoch {epoch}, Batch {i // batch_size}: Loss = {loss.item():.4f}")

            if epoch % 2 == 0:
                avg_loss = total_loss / num_batches
                print(f"Epoch {epoch}: Average Loss = {avg_loss:.4f}")

            losses.append(total_loss)

    def get_next_token_probs(self, context: np.ndarray) -> np.ndarray:
        """
        Get next-token probability distributions.

        Args:
            context (np.ndarray): Context tokens

        Returns:
            np.ndarray: Probability distribution over vocabulary
        """
        # TODO: YOUR CODE HERE
        # Hints:
        # For next-token prediction, we use the last context_size tokens
        # No valid context, return uniform distribution
        # Pad context to expected size
        # Don't forget to add batch dimension, torch expects (batch_size, context_size)

        with torch.no_grad():
            if len(context) == 0:
                return torch.full((self.vocab_size,),1.0/self.vocab_size)
            padded_context = np.pad(context,(self.context_size,0),mode = 'constant')
            relevant = torch.tensor(padded_context[-self.context_size:]).to(device=next(self.u.parameters().device()))
            w = self.w(relevant).unsqueeze(0)
            w_avg = torch.mean(w,dim = 1)
            logits = w_avg @ self.u.T
            return torch.softmax(logits,dim=-1).squeeze(0).cpu().numpy()


    def get_word_embedding(self, token_id: int) -> np.ndarray:
        """
        Get the learned embedding for a specific token.

        Args:
            token_id (int): Token ID

        Returns:
            np.ndarray: Word embedding vector
        """
        with torch.no_grad():
            return self.w(token_id)

In [7]:
def load_data(filepath: str, tokenizer: Optional[Tokenizer], max_seq_length: int = 512) -> List[np.ndarray]:
    """
    Load and preprocess text data using GPT-2 tokenizer.

    This function is provided complete - students don't need to modify it.

    Args:
        filepath (str): Path to the text file
        tokenizer (Optional[Tokenizer]): Tokenizer to use. If None, a new tokenizer will be created.
        max_seq_length (int): Maximum sequence length for splitting text

    Returns:
        Tuple[List[np.ndarray], GPT2Tokenizer]: List of token sequences and the tokenizer
    """
    # Byte Pair Encoding (BPE)

    if tokenizer is None:
        tokenizer = Tokenizer(BPE())
        tokenizer.pre_tokenizer = Whitespace()
        #trainer = BpeTrainer(special_tokens=["[PAD]"])
        tokenizer.train([filepath])

    # Read the text file
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()

    # Tokenize the entire text
    tokens = tokenizer.encode(text).ids

    # Split into sequences of max_seq_length
    sequences = []
    for i in range(0, len(tokens), max_seq_length):
        sequence = tokens[i:i+max_seq_length]
        if len(sequence) > 1:  # Need at least 2 tokens for language modeling
            sequences.append(np.array(sequence))

    print(f"Loaded {len(sequences)} sequences from {filepath}")
    print(f"Vocabulary size: {tokenizer.get_vocab_size()}")
    print(f"Sample tokens: {tokens[:10]}")
    print(f"Sample text: {tokenizer.decode(tokens[:10])}")

    return sequences, tokenizer

In [8]:
def evaluate_models(models: List[LanguageModel], test_data: List[np.ndarray]) -> None:
    """
    Evaluate and compare multiple language models.

    Args:
        models (List[LanguageModel]): List of trained models
        test_data (List[np.ndarray]): Test sequences
    """
    print("Model Evaluation Results:")
    print("=" * 50)

    for i, model in enumerate(models):
        model_name = model.__class__.__name__
        try:
            ppl = model.perplexity(test_data)
            print(f"{model_name}: Perplexity = {ppl:.2f}")
        except Exception as e:
            print(f"{model_name}: Error calculating perplexity - {e}")

    print("=" * 50)

In [9]:
def analyze(model1: LanguageModel, model2: LanguageModel, test_data: List[np.ndarray],
           tokenizer=None, context_length: int = 2) -> dict:
    """
    Compare two models and find contexts where each performs better.

    Args:
        model1 (LanguageModel): First model to compare
        model2 (LanguageModel): Second model to compare
        test_data (List[np.ndarray]): Test sequences
        tokenizer: Tokenizer for decoding (optional, for display purposes)
        context_length (int): Context length to consider

    Returns:
        dict: Analysis results including overall perplexities and context comparisons
    """
    print("Detailed Model Analysis")
    print("=" * 60)

    # Overall perplexity comparison
    try:
        ppl1 = model1.perplexity(test_data)
        ppl2 = model2.perplexity(test_data)

        model1_name = model1.__class__.__name__
        model2_name = model2.__class__.__name__

        print(f"{model1_name} overall perplexity: {ppl1:.3f}")
        print(f"{model2_name} overall perplexity: {ppl2:.3f}")
        print(f"Better overall model: {model1_name if ppl1 < ppl2 else model2_name}")
        print()

    except Exception as e:
        print(f"Error calculating overall perplexity: {e}")
        return {}

    # Context-level analysis
    context_comparisons = []
    model1_better_contexts = []
    model2_better_contexts = []

    print("Analyzing context-level performance...")

    for seq_idx, sequence in enumerate(test_data):
        for i in range(context_length, len(sequence)):
            context = sequence[i - context_length:i]
            target_token = sequence[i]

            try:
                # Get predictions from both models
                probs1 = model1.get_next_token_probs(context)
                probs2 = model2.get_next_token_probs(context)

                # Calculate log probabilities for the actual target
                prob1 = max(probs1[target_token], 1e-10)
                prob2 = max(probs2[target_token], 1e-10)

                log_prob1 = np.log2(prob1)
                log_prob2 = np.log2(prob2)

                # Store comparison data
                context_info = {
                    'context': context.copy(),
                    'target': target_token,
                    'log_prob1': log_prob1,
                    'log_prob2': log_prob2,
                    'seq_idx': seq_idx,
                    'pos': i
                }
                context_comparisons.append(context_info)

                # Categorize based on which model is better
                if log_prob1 > log_prob2:  # Higher log prob = better
                    model1_better_contexts.append(context_info)
                else:
                    model2_better_contexts.append(context_info)

            except Exception as e:
                print(f"Error analyzing context at seq {seq_idx}, pos {i}: {e}")
                continue

    # Calculate statistics
    total_contexts = len(context_comparisons)
    model1_wins = len(model1_better_contexts)
    model2_wins = len(model2_better_contexts)

    print(f"Total contexts analyzed: {total_contexts}")
    print(f"{model1_name} better contexts: {model1_wins} ({100*model1_wins/total_contexts:.1f}%)")
    print(f"{model2_name} better contexts: {model2_wins} ({100*model2_wins/total_contexts:.1f}%)")
    print()

    # Find patterns in contexts where each model excels
    def analyze_context_patterns(better_contexts, model_name, top_k=10):
        print(f"Top {top_k} unique contexts where {model_name} excels:")
        print("-" * 40)

        # Group contexts by (context, target) pairs
        context_groups = {}
        for ctx_info in better_contexts:
            context_tuple = tuple(ctx_info['context'])
            target = ctx_info['target']
            key = (context_tuple, target)

            if key not in context_groups:
                context_groups[key] = {
                    'contexts': [],
                    'best_diff': 0,
                    'context': ctx_info['context'],
                    'target': target
                }

            context_groups[key]['contexts'].append(ctx_info)

            # Track the best performance difference for this context
            diff = (ctx_info['log_prob1'] - ctx_info['log_prob2']
                   if model_name == model1_name
                   else ctx_info['log_prob2'] - ctx_info['log_prob1'])

            if diff > context_groups[key]['best_diff']:
                context_groups[key]['best_diff'] = diff

        # Sort by best performance difference
        sorted_groups = sorted(context_groups.values(),
                             key=lambda x: x['best_diff'],
                             reverse=True)

        for i, group in enumerate(sorted_groups[:top_k]):
            context = group['context']
            target = group['target']
            count = len(group['contexts'])
            best_diff = group['best_diff']

            # Format context display
            if tokenizer is not None:
                try:
                    context_text = tokenizer.decode(context[-min(5, len(context)):])
                    target_text = tokenizer.decode([target])
                    print(f"{i+1:2d}. Context: '{context_text}' → Target: '{target_text}' (×{count})")
                except:
                    print(f"{i+1:2d}. Context: {context[-min(5, len(context)):]} → Target: {target} (×{count})")
            else:
                print(f"{i+1:2d}. Context: {context[-min(5, len(context)):]} → Target: {target} (×{count})")

            # Show best probability difference for this context type
            print(f"     Best log-prob difference: {best_diff:.3f}")

            # If there are multiple instances, show average difference
            if count > 1:
                avg_diff = sum((ctx['log_prob1'] - ctx['log_prob2']
                              if model_name == model1_name
                              else ctx['log_prob2'] - ctx['log_prob1'])
                             for ctx in group['contexts']) / count
                print(f"     Average log-prob difference: {avg_diff:.3f}")
            print()

    # Analyze patterns for both models
    if model1_better_contexts:
        analyze_context_patterns(model1_better_contexts, model1_name)

    if model2_better_contexts:
        analyze_context_patterns(model2_better_contexts, model2_name)

    # Analyze context length effects
    print("Performance by context length:")
    print("-" * 30)

    context_length_stats = {}
    for ctx_info in context_comparisons:
        ctx_len = len(ctx_info['context'])
        if ctx_len not in context_length_stats:
            context_length_stats[ctx_len] = {'model1_wins': 0, 'model2_wins': 0, 'total': 0}

        context_length_stats[ctx_len]['total'] += 1
        if ctx_info['log_prob1'] > ctx_info['log_prob2']:
            context_length_stats[ctx_len]['model1_wins'] += 1
        else:
            context_length_stats[ctx_len]['model2_wins'] += 1

    for ctx_len in sorted(context_length_stats.keys()):
        stats = context_length_stats[ctx_len]
        model1_pct = 100 * stats['model1_wins'] / stats['total']
        model2_pct = 100 * stats['model2_wins'] / stats['total']
        print(f"Length {ctx_len:2d}: {model1_name} {model1_pct:5.1f}% | {model2_name} {model2_pct:5.1f}% ({stats['total']} examples)")

    print("=" * 60)

    # Return structured results
    return {
        'overall_perplexity': {model1_name: ppl1, model2_name: ppl2},
        'context_level': {
            'total_contexts': total_contexts,
            f'{model1_name}_wins': model1_wins,
            f'{model2_name}_wins': model2_wins,
            f'{model1_name}_better_contexts': model1_better_contexts[:10],  # Top 10
            f'{model2_name}_better_contexts': model2_better_contexts[:10],  # Top 10
        },
        'context_length_stats': context_length_stats
    }

In [62]:
# Feel free to comment out portions of the code and run it multiple times, or to
# take it out of the main() function. If you're struggling to lower your
# perplexity, you can play around with the model hyperparameters like the
# learning rate, batch size, and number of epochs.

def main():
    # Here we're training on train.txt and evaluating on test.txt.
    # However, you might find it useful to play with tiny.txt while you're debugging.
    # If you're running into memory issues, you can try training on a smaller set of
    # sentences by truncating train.txt, but you should always report your final
    # results on test.txt (and think about ways of making your code more efficient)!

    train_data, tokenizer = load_data("train.txt", tokenizer=None) # Feel free to swap with tiny.txt for testing
    print("Loaded training data")
    test_data, _ = load_data("test.txt", tokenizer=tokenizer) # Feel free to swap with tiny.txt for testing
    print("Loaded test data")

    print('N-gram')
    ngram_model = NGramModel(tokenizer.get_vocab_size(), n=3)
    ngram_model.train(train_data)
    print(tokenizer.decode(ngram_model.generate_text(test_data[0][:1])))
    print(f"Perplexity: {ngram_model.perplexity(test_data)}")

    # print('Log-linear')
    # log_linear_model = LogLinearModel(tokenizer.get_vocab_size())
    # log_linear_model.train(train_data)
    # print(tokenizer.decode(log_linear_model.generate_text(test_data[0][:1])))
    # print(f"Perplexity: {log_linear_model.perplexity(test_data)}")

    # print('CBOW')
    # cbow_model = CBOWModel(tokenizer.get_vocab_size(), embedding_dim=100, context_size=3)
    # cbow_model.train(train_data, epochs=10)
    # print(tokenizer.decode(cbow_model.generate_text(test_data[0][:1])))
    # print(f"Perplexity: {cbow_model.perplexity(test_data)}")

    # evaluate_models([ngram_model, log_linear_model,cbow_model], test_data)


if __name__ == "__main__":
    main()




Loaded 3892 sequences from train.txt
Vocabulary size: 13672
Sample tokens: [774, 8, 774, 206, 88, 741, 239, 91, 137, 6]
Sample text: Spot . Spot saw the shiny car and said ,
Loaded training data
Loaded 780 sequences from test.txt
Vocabulary size: 13672
Sample tokens: [1, 2313, 221, 166, 293, 205, 288, 8, 1, 1221]
Sample text: " Who are you ?" Tim asked . " Why
Loaded test data
N-gram
" Okay , mom ." Their parents were very important that Mrs . Lung smiled again because he couldn ' t be selfish . She also put some magic to hide behind a tree , she found some yummy food on it . To the kitten were best friends . He only hears his tummy . She loved to jump so much fun . He was three years old and she couldn ' t believe his luck never ran out of the hill he became a good idea and they were never miserable again . The new flower in the pond
Perplexity: 692.5371509498468


In [None]:
# LOG LINEAR MODEL

def main():
    # Here we're training on train.txt and evaluating on test.txt.
    # However, you might find it useful to play with tiny.txt while you're debugging.
    # If you're running into memory issues, you can try training on a smaller set of
    # sentences by truncating train.txt, but you should always report your final
    # results on test.txt (and think about ways of making your code more efficient)!
    train_data, tokenizer = load_data("train.txt", tokenizer=None) # Feel free to swap with tiny.txt for testing
    print("Loaded training data")
    test_data, _ = load_data("test.txt", tokenizer=tokenizer) # Feel free to swap with tiny.txt for testing
    print("Loaded test data")

    print('Log-linear')
    log_linear_model = LogLinearModel(tokenizer.get_vocab_size())
    log_linear_model.train(train_data)
    print(tokenizer.decode(log_linear_model.generate_text(test_data[0][:1])))
    print(f"Perplexity: {log_linear_model.perplexity(test_data)}")

    # evaluate_models([ngram_model, log_linear_model,cbow_model], test_data)

   
if __name__ == "__main__":
    main()




Loaded 3892 sequences from train.txt
Vocabulary size: 13672
Sample tokens: [774, 8, 774, 206, 88, 741, 239, 91, 137, 6]
Sample text: Spot . Spot saw the shiny car and said ,
Loaded training data
Loaded 780 sequences from test.txt
Vocabulary size: 13672
Sample tokens: [1, 2313, 221, 166, 293, 205, 288, 8, 1, 1221]
Sample text: " Who are you ?" Tim asked . " Why
Loaded test data
Log-linear
Training on 1980603 examples for 2 epochs...
Epoch 0


  0%|          | 1/61894 [00:07<123:55:01,  7.21s/it]

Epoch 0, Batch 0: Loss = 9.5231


  0%|          | 7/61894 [00:45<112:39:40,  6.55s/it]


KeyboardInterrupt: 

In [89]:
# WORD2VEC

def main():
    # Here we're training on train.txt and evaluating on test.txt.
    # However, you might find it useful to play with tiny.txt while you're debugging.
    # If you're running into memory issues, you can try training on a smaller set of
    # sentences by truncating train.txt, but you should always report your final
    # results on test.txt (and think about ways of making your code more efficient)!
    train_data, tokenizer = load_data("train.txt", tokenizer=None) # Feel free to swap with tiny.txt for testing
    print("Loaded training data")
    test_data, _ = load_data("test.txt", tokenizer=tokenizer) # Feel free to swap with tiny.txt for testing
    print("Loaded test data")

    print('CBOW')
    cbow_model = CBOWModel(tokenizer.get_vocab_size(), embedding_dim=100, context_size=3)
    cbow_model.train(train_data, epochs=10)
    print(tokenizer.decode(cbow_model.generate_text(test_data[0][:1])))
    print(f"Perplexity: {cbow_model.perplexity(test_data)}")

    # evaluate_models([ngram_model, log_linear_model,cbow_model], test_data)

   
if __name__ == "__main__":
    main()




Loaded 3892 sequences from train.txt
Vocabulary size: 13672
Sample tokens: [774, 8, 774, 206, 88, 741, 239, 91, 137, 6]
Sample text: Spot . Spot saw the shiny car and said ,
Loaded training data
Loaded 780 sequences from test.txt
Vocabulary size: 13672
Sample tokens: [1, 2313, 221, 166, 293, 205, 288, 8, 1, 1221]
Sample text: " Who are you ?" Tim asked . " Why
Loaded test data
CBOW
the shape of contexts is torch.Size([1965035, 6])
the shape of targets is torch.Size([1965035])
Epoch 0


  0%|          | 7/61408 [00:00<16:37, 61.57it/s]

Epoch 0, Batch 0: Loss = 17.9407


  0%|          | 15/61408 [00:00<15:01, 68.07it/s]

Epoch 0, Batch 10: Loss = 16.6517


  0%|          | 22/61408 [00:00<14:59, 68.26it/s]

Epoch 0, Batch 20: Loss = 15.5308


  0%|          | 30/61408 [00:00<14:04, 72.68it/s]

Epoch 0, Batch 30: Loss = 15.9010


  0%|          | 39/61408 [00:00<14:06, 72.49it/s]

Epoch 0, Batch 40: Loss = 14.7735


  0%|          | 55/61408 [00:00<14:08, 72.30it/s]

Epoch 0, Batch 50: Loss = 14.1472


  0%|          | 63/61408 [00:00<13:52, 73.71it/s]

Epoch 0, Batch 60: Loss = 12.3074


  0%|          | 71/61408 [00:00<14:29, 70.55it/s]

Epoch 0, Batch 70: Loss = 13.5296


  0%|          | 79/61408 [00:01<14:33, 70.21it/s]

Epoch 0, Batch 80: Loss = 13.1890


  0%|          | 96/61408 [00:01<13:25, 76.07it/s]

Epoch 0, Batch 90: Loss = 12.5856


  0%|          | 104/61408 [00:01<13:51, 73.72it/s]

Epoch 0, Batch 100: Loss = 12.8679


  0%|          | 113/61408 [00:01<13:26, 76.02it/s]

Epoch 0, Batch 110: Loss = 12.0728


  0%|          | 121/61408 [00:01<13:53, 73.52it/s]

Epoch 0, Batch 120: Loss = 11.9227


  0%|          | 129/61408 [00:01<14:23, 70.94it/s]

Epoch 0, Batch 130: Loss = 10.9255


  0%|          | 146/61408 [00:02<13:53, 73.54it/s]

Epoch 0, Batch 140: Loss = 9.1367


  0%|          | 154/61408 [00:02<14:48, 68.94it/s]

Epoch 0, Batch 150: Loss = 9.9206


  0%|          | 163/61408 [00:02<14:14, 71.64it/s]

Epoch 0, Batch 160: Loss = 11.1360


  0%|          | 171/61408 [00:02<13:50, 73.73it/s]

Epoch 0, Batch 170: Loss = 11.5290


  0%|          | 179/61408 [00:02<14:10, 72.03it/s]

Epoch 0, Batch 180: Loss = 9.6695


  0%|          | 195/61408 [00:02<14:14, 71.60it/s]

Epoch 0, Batch 190: Loss = 10.4111


  0%|          | 203/61408 [00:02<13:49, 73.77it/s]

Epoch 0, Batch 200: Loss = 10.6051


  0%|          | 211/61408 [00:02<13:47, 73.92it/s]

Epoch 0, Batch 210: Loss = 9.4528


  0%|          | 219/61408 [00:03<13:45, 74.14it/s]

Epoch 0, Batch 220: Loss = 10.6605


  0%|          | 228/61408 [00:03<13:17, 76.70it/s]

Epoch 0, Batch 230: Loss = 7.0992


  0%|          | 244/61408 [00:03<13:51, 73.56it/s]

Epoch 0, Batch 240: Loss = 9.4820


  0%|          | 252/61408 [00:03<14:12, 71.76it/s]

Epoch 0, Batch 250: Loss = 7.9058


  0%|          | 261/61408 [00:03<13:32, 75.22it/s]

Epoch 0, Batch 260: Loss = 8.3517


  0%|          | 269/61408 [00:03<13:46, 73.98it/s]

Epoch 0, Batch 270: Loss = 9.0974


  0%|          | 286/61408 [00:03<13:31, 75.35it/s]

Epoch 0, Batch 280: Loss = 8.9446


  0%|          | 294/61408 [00:04<13:25, 75.84it/s]

Epoch 0, Batch 290: Loss = 8.9368


  0%|          | 302/61408 [00:04<13:51, 73.50it/s]

Epoch 0, Batch 300: Loss = 8.6143


  1%|          | 311/61408 [00:04<13:18, 76.50it/s]

Epoch 0, Batch 310: Loss = 8.7556


  1%|          | 319/61408 [00:04<13:21, 76.18it/s]

Epoch 0, Batch 320: Loss = 7.7309


  1%|          | 335/61408 [00:04<13:36, 74.83it/s]

Epoch 0, Batch 330: Loss = 7.4254


  1%|          | 343/61408 [00:04<13:21, 76.18it/s]

Epoch 0, Batch 340: Loss = 7.7109


  1%|          | 351/61408 [00:04<13:24, 75.93it/s]

Epoch 0, Batch 350: Loss = 7.1386


  1%|          | 359/61408 [00:04<14:06, 72.08it/s]

Epoch 0, Batch 360: Loss = 6.9575


  1%|          | 367/61408 [00:05<14:02, 72.43it/s]

Epoch 0, Batch 370: Loss = 8.1370


  1%|          | 382/61408 [00:05<15:13, 66.79it/s]

Epoch 0, Batch 380: Loss = 8.4983


  1%|          | 390/61408 [00:05<14:50, 68.55it/s]

Epoch 0, Batch 390: Loss = 8.1069


  1%|          | 398/61408 [00:05<14:29, 70.17it/s]

Epoch 0, Batch 400: Loss = 6.8417


  1%|          | 414/61408 [00:05<14:00, 72.56it/s]

Epoch 0, Batch 410: Loss = 9.2662


  1%|          | 422/61408 [00:05<15:14, 66.67it/s]

Epoch 0, Batch 420: Loss = 7.8196


  1%|          | 429/61408 [00:05<15:07, 67.20it/s]

Epoch 0, Batch 430: Loss = 8.9732


  1%|          | 446/61408 [00:06<14:12, 71.48it/s]

Epoch 0, Batch 440: Loss = 8.5208


  1%|          | 454/61408 [00:06<15:08, 67.11it/s]

Epoch 0, Batch 450: Loss = 7.4220


  1%|          | 461/61408 [00:06<16:08, 62.96it/s]

Epoch 0, Batch 460: Loss = 8.0444


  1%|          | 470/61408 [00:06<14:41, 69.17it/s]

Epoch 0, Batch 470: Loss = 7.2384


  1%|          | 478/61408 [00:06<14:58, 67.84it/s]

Epoch 0, Batch 480: Loss = 6.4258


  1%|          | 494/61408 [00:06<14:13, 71.37it/s]

Epoch 0, Batch 490: Loss = 6.5836


  1%|          | 503/61408 [00:06<13:37, 74.51it/s]

Epoch 0, Batch 500: Loss = 8.2591


  1%|          | 511/61408 [00:07<14:33, 69.68it/s]

Epoch 0, Batch 510: Loss = 7.0052


  1%|          | 519/61408 [00:07<14:45, 68.74it/s]

Epoch 0, Batch 520: Loss = 6.7398


  1%|          | 536/61408 [00:07<14:00, 72.38it/s]

Epoch 0, Batch 530: Loss = 7.7510


  1%|          | 544/61408 [00:07<14:29, 69.96it/s]

Epoch 0, Batch 540: Loss = 6.6871


  1%|          | 552/61408 [00:07<14:28, 70.03it/s]

Epoch 0, Batch 550: Loss = 7.3270


  1%|          | 560/61408 [00:07<14:23, 70.49it/s]

Epoch 0, Batch 560: Loss = 6.3533


  1%|          | 569/61408 [00:07<13:49, 73.31it/s]

Epoch 0, Batch 570: Loss = 7.4088


  1%|          | 586/61408 [00:08<13:09, 77.02it/s]

Epoch 0, Batch 580: Loss = 5.8723


  1%|          | 594/61408 [00:08<13:04, 77.51it/s]

Epoch 0, Batch 590: Loss = 6.6539


  1%|          | 602/61408 [00:08<13:35, 74.61it/s]

Epoch 0, Batch 600: Loss = 6.2140


  1%|          | 610/61408 [00:08<13:25, 75.47it/s]

Epoch 0, Batch 610: Loss = 7.7942


  1%|          | 618/61408 [00:08<13:15, 76.37it/s]

Epoch 0, Batch 620: Loss = 6.9001


  1%|          | 634/61408 [00:08<13:55, 72.77it/s]

Epoch 0, Batch 630: Loss = 4.9885


  1%|          | 643/61408 [00:08<13:33, 74.69it/s]

Epoch 0, Batch 640: Loss = 6.7234


  1%|          | 651/61408 [00:09<13:22, 75.70it/s]

Epoch 0, Batch 650: Loss = 6.6262


  1%|          | 659/61408 [00:09<14:12, 71.30it/s]

Epoch 0, Batch 660: Loss = 5.3732


  1%|          | 675/61408 [00:09<13:28, 75.15it/s]

Epoch 0, Batch 670: Loss = 5.8738


  1%|          | 683/61408 [00:09<13:27, 75.16it/s]

Epoch 0, Batch 680: Loss = 7.5528


  1%|          | 692/61408 [00:09<13:02, 77.59it/s]

Epoch 0, Batch 690: Loss = 7.3512


  1%|          | 701/61408 [00:09<12:50, 78.81it/s]

Epoch 0, Batch 700: Loss = 6.9043


  1%|          | 709/61408 [00:09<13:29, 74.96it/s]

Epoch 0, Batch 710: Loss = 6.5409


  1%|          | 725/61408 [00:09<13:43, 73.67it/s]

Epoch 0, Batch 720: Loss = 5.2853


  1%|          | 734/61408 [00:10<13:26, 75.28it/s]

Epoch 0, Batch 730: Loss = 7.0013


  1%|          | 742/61408 [00:10<13:29, 74.96it/s]

Epoch 0, Batch 740: Loss = 6.8750


  1%|          | 750/61408 [00:10<13:38, 74.09it/s]

Epoch 0, Batch 750: Loss = 6.0542


  1%|          | 758/61408 [00:10<13:21, 75.68it/s]

Epoch 0, Batch 760: Loss = 7.6245


  1%|▏         | 774/61408 [00:10<13:46, 73.37it/s]

Epoch 0, Batch 770: Loss = 6.8786


  1%|▏         | 782/61408 [00:10<14:24, 70.10it/s]

Epoch 0, Batch 780: Loss = 5.9996


  1%|▏         | 790/61408 [00:10<14:21, 70.33it/s]

Epoch 0, Batch 790: Loss = 5.7660


  1%|▏         | 806/61408 [00:11<13:44, 73.52it/s]

Epoch 0, Batch 800: Loss = 6.8383


  1%|▏         | 814/61408 [00:11<13:57, 72.38it/s]

Epoch 0, Batch 810: Loss = 5.0152


  1%|▏         | 822/61408 [00:11<13:52, 72.81it/s]

Epoch 0, Batch 820: Loss = 7.1050


  1%|▏         | 830/61408 [00:11<13:54, 72.61it/s]

Epoch 0, Batch 830: Loss = 6.4770


  1%|▏         | 838/61408 [00:11<14:23, 70.18it/s]

Epoch 0, Batch 840: Loss = 5.8903


  1%|▏         | 854/61408 [00:11<14:14, 70.86it/s]

Epoch 0, Batch 850: Loss = 6.0885


  1%|▏         | 862/61408 [00:11<15:02, 67.10it/s]

Epoch 0, Batch 860: Loss = 6.3652


  1%|▏         | 870/61408 [00:12<14:30, 69.57it/s]

Epoch 0, Batch 870: Loss = 5.8280


  1%|▏         | 879/61408 [00:12<13:49, 72.97it/s]

Epoch 0, Batch 880: Loss = 7.6973


  1%|▏         | 895/61408 [00:12<13:21, 75.53it/s]

Epoch 0, Batch 890: Loss = 6.4204


  1%|▏         | 903/61408 [00:12<13:48, 73.06it/s]

Epoch 0, Batch 900: Loss = 5.9371


  1%|▏         | 911/61408 [00:12<13:35, 74.17it/s]

Epoch 0, Batch 910: Loss = 5.5973


  1%|▏         | 919/61408 [00:12<13:20, 75.56it/s]

Epoch 0, Batch 920: Loss = 6.2188


  2%|▏         | 927/61408 [00:12<13:17, 75.82it/s]

Epoch 0, Batch 930: Loss = 6.8759


  2%|▏         | 943/61408 [00:12<13:42, 73.50it/s]

Epoch 0, Batch 940: Loss = 5.8212


  2%|▏         | 951/61408 [00:13<14:00, 71.93it/s]

Epoch 0, Batch 950: Loss = 5.8069


  2%|▏         | 959/61408 [00:13<13:50, 72.81it/s]

Epoch 0, Batch 960: Loss = 6.4239


  2%|▏         | 975/61408 [00:13<13:39, 73.72it/s]

Epoch 0, Batch 970: Loss = 6.8132


  2%|▏         | 983/61408 [00:13<13:38, 73.86it/s]

Epoch 0, Batch 980: Loss = 5.9790


  2%|▏         | 991/61408 [00:13<14:11, 70.92it/s]

Epoch 0, Batch 990: Loss = 4.7491


  2%|▏         | 1000/61408 [00:13<13:42, 73.48it/s]

Epoch 0, Batch 1000: Loss = 6.9914


  2%|▏         | 1016/61408 [00:13<13:29, 74.62it/s]

Epoch 0, Batch 1010: Loss = 6.5831


  2%|▏         | 1024/61408 [00:14<14:04, 71.53it/s]

Epoch 0, Batch 1020: Loss = 6.0828


  2%|▏         | 1032/61408 [00:14<14:12, 70.84it/s]

Epoch 0, Batch 1030: Loss = 5.6701


  2%|▏         | 1040/61408 [00:14<14:28, 69.54it/s]

Epoch 0, Batch 1040: Loss = 5.7368


  2%|▏         | 1056/61408 [00:14<13:49, 72.76it/s]

Epoch 0, Batch 1050: Loss = 7.0968


  2%|▏         | 1065/61408 [00:14<13:12, 76.10it/s]

Epoch 0, Batch 1060: Loss = 4.6419
Epoch 0, Batch 1070: Loss = 6.8965


  2%|▏         | 1082/61408 [00:14<14:13, 70.71it/s]

Epoch 0, Batch 1080: Loss = 6.2603


  2%|▏         | 1090/61408 [00:15<13:55, 72.23it/s]

Epoch 0, Batch 1090: Loss = 5.5726


  2%|▏         | 1098/61408 [00:15<14:02, 71.55it/s]

Epoch 0, Batch 1100: Loss = 6.3729


  2%|▏         | 1114/61408 [00:15<13:59, 71.82it/s]

Epoch 0, Batch 1110: Loss = 5.9485


  2%|▏         | 1122/61408 [00:15<14:48, 67.82it/s]

Epoch 0, Batch 1120: Loss = 4.8287


  2%|▏         | 1129/61408 [00:15<15:01, 66.89it/s]

Epoch 0, Batch 1130: Loss = 6.9956


  2%|▏         | 1143/61408 [00:15<15:32, 64.65it/s]

Epoch 0, Batch 1140: Loss = 5.1928


  2%|▏         | 1150/61408 [00:15<15:30, 64.78it/s]

Epoch 0, Batch 1150: Loss = 5.3216


  2%|▏         | 1159/61408 [00:16<13:56, 72.04it/s]


KeyboardInterrupt: 