In [None]:
# Exercise 5.1

In [1]:
import numpy as np
from collections import Counter

def softmax_with_temperature(logits, temperature=1.0):
    """Compute softmax with temperature scaling"""
    scaled_logits = logits / temperature
    exp_logits = np.exp(scaled_logits - np.max(scaled_logits))  # for numerical stability
    return exp_logits / exp_logits.sum()

def print_sampled_tokens(probs, words, num_samples=1000):
    """Sample tokens according to probabilities and print frequencies"""
    samples = np.random.choice(words, size=num_samples, p=probs)
    freq = Counter(samples)
    for word in words:
        print(f"{word}: {freq[word]/num_samples:.2%}")
    return freq

# Original probabilities (approximated from the graph)
original_probs = np.array([0.1, 0.05, 0.15, 0.05, 0.1, 0.5, 0.05, 0.0])
words = ['closer', 'every effort', 'forward', 'inches', 'moves', 'pizza', 'toward', 'you']

# Temperature values to test
temperatures = [0.1, 1.0, 5.0]
num_samples = 10000

print("Sampling frequencies:")
for temp in temperatures:
    print(f"\nTemperature = {temp}")
    scaled_probs = softmax_with_temperature(original_probs, temperature=temp)
    freq = print_sampled_tokens(scaled_probs, words, num_samples)
    pizza_freq = freq['pizza'] / num_samples
    print(f"\nPizza sampling frequency at T={temp}: {pizza_freq:.2%}")

    # More accurate way to get pizza frequency (without sampling)
    accurate_pizza_prob = scaled_probs[words.index('pizza')]
    print(f"Exact pizza probability at T={temp}: {accurate_pizza_prob:.2%}")

Sampling frequencies:

Temperature = 0.1
closer: 1.73%
every effort: 0.94%
forward: 2.32%
inches: 1.12%
moves: 1.83%
pizza: 90.33%
toward: 1.04%
you: 0.69%

Pizza sampling frequency at T=0.1: 90.33%
Exact pizza probability at T=0.1: 90.34%

Temperature = 1.0
closer: 12.21%
every effort: 11.48%
forward: 13.24%
inches: 11.12%
moves: 11.90%
pizza: 17.82%
toward: 11.03%
you: 11.20%

Pizza sampling frequency at T=1.0: 17.82%
Exact pizza probability at T=1.0: 17.97%

Temperature = 5.0
closer: 12.11%
every effort: 12.39%
forward: 12.43%
inches: 12.39%
moves: 12.16%
pizza: 13.71%
toward: 12.30%
you: 12.51%

Pizza sampling frequency at T=5.0: 13.71%
Exact pizza probability at T=5.0: 13.47%


In [None]:
# Exercise 5.2

In [2]:
import numpy as np
from collections import Counter

def softmax_with_temperature(logits, temperature=1.0):
    scaled_logits = logits / temperature
    exp_logits = np.exp(scaled_logits - np.max(scaled_logits))
    return exp_logits / exp_logits.sum()

def top_k_sampling(probs, k):
    top_k_indices = np.argsort(probs)[-k:]
    top_k_probs = probs[top_k_indices]
    return top_k_probs / top_k_probs.sum(), top_k_indices

def sample_with_settings(logits, words, temperature=1.0, top_k=None, num_samples=1000):
    probs = softmax_with_temperature(logits, temperature)

    if top_k is not None:
        top_k_probs, top_k_indices = top_k_sampling(probs, top_k)
        sampled_words = [words[i] for i in top_k_indices]
        samples = np.random.choice(sampled_words, size=num_samples, p=top_k_probs)
    else:
        samples = np.random.choice(words, size=num_samples, p=probs)

    freq = Counter(samples)
    print(f"\nTemperature: {temperature}, Top-k: {top_k}")
    for word in words:
        if word in freq:
            print(f"{word}: {freq[word]/num_samples:.2%}")
        else:
            print(f"{word}: 0.00%")

    return freq

# Example word distribution (logits)
logits = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
words = ['closer', 'every effort', 'forward', 'inches', 'moves', 'pizza', 'toward', 'you']

# Experiment with different settings
settings = [
    {'temperature': 0.1, 'top_k': None},
    {'temperature': 0.1, 'top_k': 3},
    {'temperature': 1.0, 'top_k': None},
    {'temperature': 1.0, 'top_k': 5},
    {'temperature': 5.0, 'top_k': None},
    {'temperature': 5.0, 'top_k': 8}
]

for setting in settings:
    sample_with_settings(logits, words,
                        temperature=setting['temperature'],
                        top_k=setting['top_k'],
                        num_samples=10000)


Temperature: 0.1, Top-k: None
closer: 0.00%
every effort: 0.00%
forward: 0.00%
inches: 0.00%
moves: 0.00%
pizza: 0.00%
toward: 0.00%
you: 100.00%

Temperature: 0.1, Top-k: 3
closer: 0.00%
every effort: 0.00%
forward: 0.00%
inches: 0.00%
moves: 0.00%
pizza: 0.00%
toward: 0.00%
you: 100.00%

Temperature: 1.0, Top-k: None
closer: 0.07%
every effort: 0.26%
forward: 0.52%
inches: 1.06%
moves: 3.29%
pizza: 9.19%
toward: 23.06%
you: 62.55%

Temperature: 1.0, Top-k: 5
closer: 0.00%
every effort: 0.00%
forward: 0.00%
inches: 1.24%
moves: 3.45%
pizza: 8.35%
toward: 23.33%
you: 63.63%

Temperature: 5.0, Top-k: None
closer: 5.75%
every effort: 6.84%
forward: 8.43%
inches: 9.83%
moves: 12.63%
pizza: 15.65%
toward: 18.43%
you: 22.44%

Temperature: 5.0, Top-k: 8
closer: 6.11%
every effort: 6.45%
forward: 8.53%
inches: 10.56%
moves: 12.35%
pizza: 14.54%
toward: 18.75%
you: 22.71%


In [None]:
# Preferred Applications:
# Technical Writing: Code generation, technical documentation where precision is crucial
# Factual Responses: QA systems where accuracy is paramount
# Medical/Legal Applications: Where incorrect information could have serious consequences
# Summarization: Extracting key information without creative additions
# Translation: Maintaining faithfulness to the original text

In [None]:
# Exercise 5.3

In [6]:
def generate_deterministic(prompt, model, max_length=50):
    """
    Fully deterministic generation that will always produce the same output
    for the same input prompt and model state.

    Parameters:
    - prompt: Input text to start generation
    - model: The language model to use for generation
    - max_length: Maximum number of tokens to generate

    Returns:
    - Generated text (always the same for same inputs)
    """
    # Set all possible randomization controls
    generation_config = {
        'do_sample': False,       # Disable sampling
        'temperature': 0,        # Effectively disables temperature
        'top_k': 1,              # Only consider top 1 token
        'top_p': 0,              # Disable nucleus sampling
        'num_beams': 1,          # No beam search
        'repetition_penalty': 1, # No repetition adjustment
        'seed': 42,              # Fixed random seed
    }

    # Generate text with deterministic settings
    output = model.generate(
        prompt,
        max_length=max_length,
        **generation_config
    )

    return output

In [None]:
# Exercise 5.4

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim

# 1. Tiny Model Configuration
class MiniGPTConfig:
    def __init__(self):
        self.vocab_size = 1000  # Reduced vocabulary
        self.n_layer = 2        # Only 2 layers
        self.n_head = 2         # 2 attention heads
        self.n_embd = 64        # Tiny embeddings
        self.block_size = 32    # Short sequences

# 2. Tiny Model Definition
class MiniGPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)
        self.head = nn.Linear(config.n_embd, config.vocab_size)

    def forward(self, x):
        return self.head(self.tok_emb(x))

# 3. Training Setup
def main():
    # Configuration
    config = MiniGPTConfig()
    device = torch.device("cpu")  # Force CPU to avoid GPU memory issues

    # Initialize tiny model
    model = MiniGPT(config).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Tiny batch of data
    inputs = torch.randint(0, config.vocab_size, (2, 16)).to(device)  # 2 samples, 16 tokens
    targets = torch.randint(0, config.vocab_size, (2, 16)).to(device)

    # Training loop
    model.train()
    for step in range(3):  # Just 3 steps for demonstration
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = nn.functional.cross_entropy(outputs.view(-1, outputs.size(-1)),
                                         targets.view(-1))
        loss.backward()
        optimizer.step()
        print(f"Step {step+1}, Loss: {loss.item():.4f}")

    print("Completed successfully!")

if __name__ == "__main__":
    print("Starting minimal training example...")
    main()

Starting minimal training example...
Step 1, Loss: 7.1478
Step 2, Loss: 7.0858
Step 3, Loss: 7.0240
Completed successfully!


In [None]:
# Exercise 5.5

In [8]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader

def calc_loss_loader(data_loader, model, device):
    """Calculate loss on dataset using data loader"""
    model.eval()
    total_loss = 0.0
    total_samples = 0

    with torch.no_grad():
        for batch in data_loader:
            inputs = batch['input_ids'].to(device)
            masks = batch['attention_mask'].to(device)
            labels = inputs.clone()  # For language modeling, labels are same as inputs

            outputs = model(inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss

            batch_size = inputs.size(0)
            total_loss += loss.item() * batch_size
            total_samples += batch_size

    return total_loss / total_samples

# Initialize GPT model (124M parameter version)
model_name = "gpt2"  # This is the 124M parameter model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set pad token if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Prepare dataset (replace with actual "The Verdict" dataset)
class VerdictDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.encodings = tokenizer(
            texts,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

    def __len__(self):
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx]
        }

# Sample data - REPLACE WITH ACTUAL "The Verdict" DATASET
train_texts = ["The verdict was guilty...", "In the case of State v. Smith..."]
val_texts = ["The judge ruled in favor...", "After considering the evidence..."]

# Create data loaders
train_dataset = VerdictDataset(train_texts, tokenizer)
val_dataset = VerdictDataset(val_texts, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)

# Calculate losses
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

train_loss = calc_loss_loader(train_loader, model, device)
val_loss = calc_loss_loader(val_loader, model, device)

print(f"Training loss: {train_loss}")
print(f"Validation loss: {val_loss}")

  from .autonotebook import tqdm as notebook_tqdm
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Training loss: 10.598292350769043
Validation loss: 8.96030044555664


In [None]:
# Exercise 5.6

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from time import time

def load_model_efficiently(model_size, device):
    """Load model with optimal settings for given hardware"""
    model_name = "gpt2" if model_size == "124M" else "gpt2-xl"

    # Precision control based on device
    torch_dtype = torch.float16 if "cuda" in device else torch.float32

    return GPT2LMHeadModel.from_pretrained(
        model_name,
        torch_dtype=torch_dtype,
        device_map="auto" if "cuda" in device else None,
        low_cpu_mem_usage=True
    )

def generate_efficient_sample(model, tokenizer, prompt, device):
    """Optimized generation with hardware-aware settings"""
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.inference_mode():  # Faster alternative to torch.no_grad()
        return model.generate(
            **inputs,
            max_new_tokens=50,  # More efficient than max_length
            do_sample=True,
            temperature=0.7,
            top_k=50,
            pad_token_id=tokenizer.eos_token_id
        )

def benchmark_model(model_size, prompts, device):
    """Complete benchmarking routine"""
    print(f"\n=== Benchmarking {model_size} model ===")

    # Load with progress monitoring
    start = time()
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2" if model_size == "124M" else "gpt2-xl")
    model = load_model_efficiently(model_size, device)
    load_time = time() - start
    print(f"Model loaded in {load_time:.2f}s | Device: {device}")

    # Generation benchmarks
    for prompt in prompts:
        start_gen = time()
        outputs = generate_efficient_sample(model, tokenizer, prompt, device)
        gen_time = time() - start_gen
        print(f"\nPrompt: '{prompt}'\nGenerated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")
        print(f"Generated in {gen_time:.2f}s ({len(outputs[0])} tokens)")

if __name__ == "__main__":
    # Configuration
    device = "cuda" if torch.cuda.is_available() else "cpu"
    test_prompts = [
        "The verdict in the case was",
        "In a surprising legal development,"
    ]

    # Run benchmarks
    for model_size in ["124M", "1558M"]:
        try:
            benchmark_model(model_size, test_prompts, device)
        except Exception as e:
            print(f"Failed to benchmark {model_size}: {str(e)}")
            if "out of memory" in str(e).lower():
                print("Try reducing model size or using a GPU with more memory")


=== Benchmarking 124M model ===
Model loaded in 1.92s | Device: cpu

Prompt: 'The verdict in the case was'
Generated: The verdict in the case was unanimous.

"In the end, the trial court found that the defendant acted in good faith when he and his friends made a false statement to a police officer, which was an act of maliciousness," the Supreme Court said.

The
Generated in 3.19s (56 tokens)

Prompt: 'In a surprising legal development,'
Generated: In a surprising legal development, the court has allowed the government to use a computer-generated image of a man who has been convicted of raping a 12-year-old girl.

The victim was discovered in the middle of the night last November with her head in her hands
Generated in 2.13s (56 tokens)

=== Benchmarking 1558M model ===


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
