# Next-Word Prediction using MLP

This notebook implements MLP-based next-word prediction for two datasets:
- **Category I (Natural Language)**: Project Gutenberg text
- **Category II (Structured Text)**: Linux source code

## Assignment Requirements:
1. Preprocessing and Vocabulary Construction
2. Model Design and Training
3. Embedding Visualization and Interpretation
4. Streamlit Application
5. Comparative Analysis

<a href="https://colab.research.google.com/github/DineshSiddhartha/ML_Assignment3/blob/main/Question1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only setup: mount Google Drive at /content/drive, the location that the
# '/content/drive/MyDrive/...' checkpoint paths used further down point into.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import torch
import torch.nn.functional as F
from torch import nn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import requests
import re
import json
import seaborn as sns
from collections import Counter
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go
from torch.utils.data import Dataset, DataLoader
import random
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## 1.1 Data Preprocessing and Vocabulary Construction

### Downloading and Processing Both Datasets

In [None]:
def fetch_and_clean_data(url, dataset_type='natural', timeout=30):
    """
    Download a text file and return its non-empty lines after cleaning.

    Args:
        url: Address of the plain-text resource to download.
        dataset_type: 'natural' lowercases each line and keeps only letters,
            digits, spaces and periods (Gutenberg text). Any other value is
            treated as structured text (Linux code): case is preserved and only
            characters outside the word/whitespace/programming-symbol set are removed.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        list[str]: Cleaned lines in their original order. Lines that are empty
        before or after cleaning are skipped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within `timeout` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    if dataset_type == 'natural':
        # For natural language: convert to lowercase, remove special chars except period
        def clean(line):
            return re.sub(r'[^a-zA-Z0-9 \.]', '', line.lower())
    else:
        # For structured text (code): remove only very specific unwanted characters
        # but keep programming symbols and the original case
        def clean(line):
            return re.sub(r'[^\w\s\.\,\;\:\(\)\[\]\{\}\+\-\*\/\=\<\>\!\?\|\&\#\%\^\$\@\"\'\\]', '', line)

    cleaned_lines = []
    for raw_line in response.text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue
        cleaned_line = clean(stripped)
        # Cleaning can remove every character (e.g. a line of punctuation only)
        if cleaned_line.strip():
            cleaned_lines.append(cleaned_line)

    return cleaned_lines

def create_vocabulary_and_mappings(text, dataset_name):
    """
    Build a word-level vocabulary from whitespace-separated text and print statistics.

    Index 0 is reserved for the '<UNK>' token; every distinct word gets an index
    from 1 upward in sorted order.

    Args:
        text: The full corpus as a single string.
        dataset_name: Label used in the printed statistics header.

    Returns:
        tuple: (stoi, itos, word_counts, words) where stoi maps word -> index,
        itos maps index -> word, word_counts is a Counter of word frequencies and
        words is the list of tokens in corpus order.
    """
    words = text.split()

    # Count word frequencies
    word_counts = Counter(words)

    # A literal '<UNK>' in the corpus must not receive its own slot: it would be
    # overwritten by the reserved index 0 below and leave a gap in the index range,
    # so the largest index would no longer be len(stoi) - 1.
    unique_words = sorted(set(words) - {'<UNK>'})

    # Create mappings
    stoi = {word: i + 1 for i, word in enumerate(unique_words)}
    stoi['<UNK>'] = 0  # Unknown token
    itos = {i: word for word, i in stoi.items()}

    # Report statistics
    print(f"\n=== {dataset_name} Dataset Statistics ===")
    print(f"Total words: {len(words):,}")
    print(f"Vocabulary size: {len(stoi):,}")
    print(f"Most frequent words: {word_counts.most_common(10)}")
    print(f"Least frequent words: {word_counts.most_common()[-10:]}")

    return stoi, itos, word_counts, words

# Download and process Category I: Natural Language (Gutenberg)
print("Processing Category I: Natural Language (Gutenberg text)")
gutenberg_url = 'https://www.gutenberg.org/files/1661/1661-0.txt'
gutenberg_lines = fetch_and_clean_data(gutenberg_url, 'natural')
# Lines are joined with single spaces, so the corpus becomes one continuous token stream
gutenberg_text = ' '.join(gutenberg_lines)

# Vocabulary and counts are built from the complete cleaned text
stoi_gutenberg, itos_gutenberg, counts_gutenberg, words_gutenberg = create_vocabulary_and_mappings(
    gutenberg_text, "Gutenberg"
)

# Download and process Category II: Structured Text (Linux code)
print("\nProcessing Category II: Structured Text (Linux source code)")
linux_url = 'https://cs.stanford.edu/people/karpathy/char-rnn/linux_input.txt'
# 'structured' keeps case and programming symbols (see fetch_and_clean_data)
linux_lines = fetch_and_clean_data(linux_url, 'structured')
linux_text = ' '.join(linux_lines)

stoi_linux, itos_linux, counts_linux, words_linux = create_vocabulary_and_mappings(
    linux_text, "Linux"
)

In [None]:
def create_training_data(words, stoi, block_size=8):
    """
    Turn a token sequence into (context, next-word) training pairs.

    Each sample uses `block_size` consecutive words as the context and the word
    that follows them as the target. Words missing from `stoi` are encoded as '<UNK>'.

    Args:
        words: Sequence of tokens in corpus order.
        stoi: Mapping from word to index; must contain the key '<UNK>'.
        block_size: Number of context words per sample.

    Returns:
        tuple: (X, y) as int64 tensors with shapes (num_samples, block_size) and
        (num_samples,), where num_samples = max(len(words) - block_size, 0). The
        shapes and dtype hold even when there are no samples.
    """
    unk_index = stoi['<UNK>']
    # Encode every word once instead of once per window it appears in
    encoded = [stoi.get(word, unk_index) for word in words]

    num_samples = max(len(encoded) - block_size, 0)
    contexts = [encoded[start:start + block_size] for start in range(num_samples)]
    targets = encoded[block_size:block_size + num_samples]

    # Explicit dtype and reshape keep an empty result at (0, block_size) int64
    # rather than the float tensor of shape (0,) that torch.tensor([]) produces.
    X = torch.tensor(contexts, dtype=torch.long).reshape(num_samples, block_size)
    y = torch.tensor(targets, dtype=torch.long)
    return X, y

# Create training data for both datasets
# block_size is the notebook-level context length; the model and text-generation cells below read it too
block_size = 8
print(f"Creating training data with context window of {block_size} words...")

# Only a prefix of each corpus is windowed (first 50,000 / 30,000 words), while the
# vocabularies passed in were built from the complete texts.
X_gutenberg, y_gutenberg = create_training_data(words_gutenberg[:50000], stoi_gutenberg, block_size)
X_linux, y_linux = create_training_data(words_linux[:30000], stoi_linux, block_size)

print(f"Gutenberg training samples: {len(X_gutenberg):,}")
print(f"Linux training samples: {len(X_linux):,}")
# Decode one arbitrary sample (index 100) back to words as a sanity check
print(f"Sample context (Gutenberg): {[itos_gutenberg[idx.item()] for idx in X_gutenberg[100]]}")
print(f"Sample target (Gutenberg): {itos_gutenberg[y_gutenberg[100].item()]}")

## 1.2 Model Design and Training

### MLP Architecture for Next-Word Prediction

In [None]:
class NextWordMLP(nn.Module):
    """
    MLP next-word predictor: embed each context word, concatenate the embeddings,
    pass them through two hidden layers and project to vocabulary logits.

    Args:
        vocab_size: Number of rows in the embedding table and number of output logits.
        embedding_dim: Size of each word embedding.
        hidden_dim: Width of both hidden layers.
        block_size: Number of context words expected in the last input dimension.
        activation: 'relu' selects ReLU; any other value selects Tanh.
        dropout: Dropout probability applied after each hidden layer.
    """

    def __init__(self, vocab_size, embedding_dim=64, hidden_dim=1024, block_size=8,
                 activation='relu', dropout=0.2):
        super().__init__()
        self.block_size = block_size
        self.embedding_dim = embedding_dim

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # MLP layers
        self.flatten_dim = block_size * embedding_dim
        self.fc1 = nn.Linear(self.flatten_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, vocab_size)

        # Activation function
        self.activation = nn.ReLU() if activation == 'relu' else nn.Tanh()

        # Dropout for regularization
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Compute next-word logits.

        Args:
            x: Integer tensor whose last dimension has length `block_size`
               (typically shape (batch_size, block_size)).

        Returns:
            Tensor of shape (num_contexts, vocab_size) with unnormalised scores.

        Raises:
            ValueError: If the last dimension of `x` is not `block_size`.
        """
        # Without this check, view(-1, flatten_dim) would silently regroup the
        # embeddings of a wrongly sized context into a different number of rows.
        if x.dim() == 0 or x.shape[-1] != self.block_size:
            raise ValueError(
                f"expected contexts of length {self.block_size}, got input of shape {tuple(x.shape)}"
            )

        embedded = self.embedding(x)  # (..., block_size, embedding_dim)
        embedded = embedded.view(-1, self.flatten_dim)  # Flatten each context

        # MLP forward pass
        hidden = self.dropout(self.activation(self.fc1(embedded)))
        hidden = self.dropout(self.activation(self.fc2(hidden)))
        return self.fc3(hidden)

def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over `loader` (training if `optimizer` is given, else evaluation)
    and return (mean loss per sample, accuracy); both are NaN for an empty loader."""
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum, correct, seen = 0.0, 0, 0
    with torch.set_grad_enabled(is_training):
        for batch_x, batch_y in loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            if is_training:
                optimizer.zero_grad()
            logits = model(batch_x)
            loss = criterion(logits, batch_y)
            if is_training:
                loss.backward()
                optimizer.step()

            batch_count = batch_y.size(0)
            # Weight by batch size so a short final batch does not skew the epoch mean
            loss_sum += loss.item() * batch_count
            correct += (logits.argmax(dim=1) == batch_y).sum().item()
            seen += batch_count

    if seen == 0:
        return float('nan'), float('nan')
    return loss_sum / seen, correct / seen


def train_model(model, X_train, y_train, X_val, y_val, epochs=500, batch_size=256, lr=0.001,
                log_every=50):
    """
    Train a classifier with Adam and cross-entropy, evaluating after every epoch.

    Args:
        model: Module mapping a batch of inputs to class logits; moved to CUDA when available.
        X_train, y_train: Training inputs and integer targets.
        X_val, y_val: Validation inputs and targets; may be empty, in which case
            the validation metrics are reported as NaN.
        epochs: Number of passes over the training data.
        batch_size: Mini-batch size for both loaders.
        lr: Adam learning rate.
        log_every: Print metrics every this many epochs (the last epoch is always printed).

    Returns:
        tuple: (train_losses, val_losses, train_accuracies, val_accuracies), each a
        list with one entry per epoch. Losses are per-sample means.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Create data loaders
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
    val_dataset = torch.utils.data.TensorDataset(X_val, y_val)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    train_losses, val_losses = [], []
    train_accuracies, val_accuracies = [], []

    for epoch in range(epochs):
        avg_train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
        avg_val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)

        if epoch % log_every == 0 or epoch == epochs - 1:
            print(f'Epoch {epoch}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, '
                  f'Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')

    return train_losses, val_losses, train_accuracies, val_accuracies

In [None]:
# Split data into train/validation sets
def split_data(X, y, split_ratio=0.8):
    """Cut X and y at the same position and return (X_head, y_head, X_tail, y_tail),
    where the head holds the leading `split_ratio` fraction of the samples."""
    cut = int(len(X) * split_ratio)
    head = (X[:cut], y[:cut])
    tail = (X[cut:], y[cut:])
    return (*head, *tail)

# Train model for Gutenberg dataset
print("Training model on Gutenberg dataset...")
# split_data cuts sequentially with its default ratio: leading 80% for training, the rest for validation
X_train_gut, y_train_gut, X_val_gut, y_val_gut = split_data(X_gutenberg, y_gutenberg)

# The output layer and embedding table are sized by the full Gutenberg vocabulary (including '<UNK>')
model_gutenberg = NextWordMLP(
    vocab_size=len(stoi_gutenberg),
    embedding_dim=64,
    hidden_dim=1024,
    block_size=block_size,
    activation='relu'
)

# epochs=300 overrides train_model's default of 500; batch size and learning rate use the defaults
train_losses_gut, val_losses_gut, train_acc_gut, val_acc_gut = train_model(
    model_gutenberg, X_train_gut, y_train_gut, X_val_gut, y_val_gut, epochs=300
)

In [None]:
# Train model for Linux dataset
print("\nTraining model on Linux dataset...")
# Same sequential 80/20 split as used for the Gutenberg data
X_train_linux, y_train_linux, X_val_linux, y_val_linux = split_data(X_linux, y_linux)

# Identical architecture to the Gutenberg model; only the vocabulary size differs
model_linux = NextWordMLP(
    vocab_size=len(stoi_linux),
    embedding_dim=64,
    hidden_dim=1024,
    block_size=block_size,
    activation='relu'
)

# epochs=300 overrides train_model's default of 500; batch size and learning rate use the defaults
train_losses_linux, val_losses_linux, train_acc_linux, val_acc_linux = train_model(
    model_linux, X_train_linux, y_train_linux, X_val_linux, y_val_linux, epochs=300
)

In [None]:
# Plot training results: one row per dataset, loss on the left and accuracy on the right
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

training_histories = [
    ('Gutenberg', (train_losses_gut, val_losses_gut), (train_acc_gut, val_acc_gut)),
    ('Linux', (train_losses_linux, val_losses_linux), (train_acc_linux, val_acc_linux)),
]

for row_idx, (dataset_label, loss_history, acc_history) in enumerate(training_histories):
    for col_idx, (metric_label, (train_curve, val_curve)) in enumerate(
        (('Loss', loss_history), ('Accuracy', acc_history))
    ):
        panel = axes[row_idx, col_idx]
        panel.plot(train_curve, label=f'Train {metric_label}', color='blue')
        panel.plot(val_curve, label=f'Validation {metric_label}', color='red')
        panel.set_title(f'{dataset_label} Dataset - {metric_label} Curves')
        panel.set_xlabel('Epoch')
        panel.set_ylabel(metric_label)
        panel.legend()
        panel.grid(True)

plt.tight_layout()
plt.show()

# Print final results (metrics of the last epoch)
print(f"\n=== Final Training Results ===")
print(f"Gutenberg - Final Validation Loss: {val_losses_gut[-1]:.4f}, Final Validation Accuracy: {val_acc_gut[-1]:.4f}")
print(f"Linux - Final Validation Loss: {val_losses_linux[-1]:.4f}, Final Validation Accuracy: {val_acc_linux[-1]:.4f}")

In [None]:
# Generate example predictions
def generate_text_sample(model, stoi, itos, seed_text, max_length=20, temperature=1.0):
    """
    Extend `seed_text` word by word using the trained model.

    Args:
        model: Trained next-word model. Its `block_size` attribute gives the context
            length; if the model has none, the notebook-level `block_size` is used.
        stoi: Word -> index mapping containing '<UNK>'.
        itos: Index -> word mapping.
        seed_text: Starting text, split on whitespace. A seed word found in `stoi`
            is used as written (so case-sensitive code vocabularies work); any
            other word is lowercased.
        max_length: Maximum number of words to append.
        temperature: Softmax temperature for sampling. Values <= 0 select the
            most likely word deterministically instead of dividing by zero.

    Returns:
        str: The seed words followed by the generated words, joined by spaces.
        Generation stops early when the model produces '<UNK>'.
    """
    model.eval()
    device = next(model.parameters()).device

    # Use the context length the model was built with: the notebook-level
    # block_size is reassigned by later cells and may no longer match this model.
    context_len = getattr(model, 'block_size', None)
    if context_len is None:
        context_len = block_size

    words = [word if word in stoi else word.lower() for word in seed_text.split()]
    generated = words.copy()
    unk_index = stoi['<UNK>']

    with torch.no_grad():
        for _ in range(max_length):
            # Take last context_len words as context, left-padded with '<UNK>'
            context = words[-context_len:]
            context = ['<UNK>'] * (context_len - len(context)) + context

            # Convert to indices
            context_indices = [stoi.get(word, unk_index) for word in context]
            x = torch.tensor(context_indices, dtype=torch.long).unsqueeze(0).to(device)

            # Get predictions
            logits = model(x)

            if temperature <= 0:
                # Greedy decoding
                next_idx = logits.argmax(dim=-1).item()
            else:
                # Apply temperature, then sample next word
                probs = F.softmax(logits / temperature, dim=-1)
                next_idx = torch.multinomial(probs, 1).item()
            next_word = itos.get(next_idx, '<UNK>')

            if next_word == '<UNK>':
                break

            words.append(next_word)
            generated.append(next_word)

    return ' '.join(generated)

# Generate examples for both models
print("=== Example Predictions ===")

# (heading, model, stoi, itos, seed texts, words to generate) for each dataset
sample_runs = [
    ("\nGutenberg Model:", model_gutenberg, stoi_gutenberg, itos_gutenberg,
     ["the quick brown", "in the middle", "once upon a"], 15),
    ("\nLinux Model:", model_linux, stoi_linux, itos_linux,
     ["int main", "if x", "for i"], 10),
]

for heading, sample_model, sample_stoi, sample_itos, seed_texts, num_new_words in sample_runs:
    print(heading)
    for seed in seed_texts:
        generated = generate_text_sample(sample_model, sample_stoi, sample_itos,
                                         seed, max_length=num_new_words, temperature=0.8)
        print(f"Seed: '{seed}' -> Generated: '{generated}'")

## 1.3 Embedding Visualization and Interpretation

### t-SNE Visualization of Word Embeddings

In [None]:
def visualize_embeddings(model, stoi, itos, word_counts, dataset_name, num_words=500):
    """
    Project a selection of word embeddings to 2-D with t-SNE and plot them.

    The selection is the `num_words // 2` most frequent words, followed by a few
    pronouns and up to 20 words ending in 'ing'/'ed', de-duplicated in that order
    and capped at `num_words`. Points are coloured by word frequency and the
    first 100 selected words (the most frequent ones) are labelled.

    Args:
        model: Trained model exposing an `embedding` layer.
        stoi: Word -> index mapping.
        itos: Index -> word mapping (unused; kept for a uniform call signature).
        word_counts: Counter of word frequencies.
        dataset_name: Label used in the plot title.
        num_words: Upper bound on the number of words plotted.

    Returns:
        tuple: (embeddings_2d, selected_words) - the (n, 2) t-SNE coordinates and
        the words they belong to, in matching order.

    Raises:
        ValueError: If fewer than two words are available to plot.
    """
    # Get embeddings
    embeddings = model.embedding.weight.data.cpu().numpy()

    # Select words for visualization (most frequent + some specific categories)
    most_frequent = [word for word, _ in word_counts.most_common(num_words // 2)]

    # Add specific word categories for analysis
    pronouns = [w for w in stoi.keys() if w in ['i', 'you', 'he', 'she', 'we', 'they']]
    verbs = [w for w in stoi.keys() if w.endswith('ing') or w.endswith('ed')][:20]

    # dict.fromkeys de-duplicates while keeping insertion order; a plain set would
    # make the selection and the labelled subset differ from run to run.
    ordered_candidates = dict.fromkeys(most_frequent + pronouns + verbs)
    selected_words = [w for w in ordered_candidates if w in stoi][:num_words]
    if len(selected_words) < 2:
        raise ValueError("need at least two vocabulary words to run t-SNE")

    # Get embeddings for selected words
    selected_indices = [stoi[word] for word in selected_words]
    selected_embeddings = embeddings[selected_indices]

    # Apply t-SNE; perplexity must stay below the number of samples
    perplexity = min(30, len(selected_words) - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    embeddings_2d = tsne.fit_transform(selected_embeddings)

    # Create visualization
    fig, ax = plt.subplots(figsize=(15, 12))

    # Color code by word frequency
    frequencies = [word_counts.get(word, 0) for word in selected_words]
    scatter = ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
                         c=frequencies, cmap='viridis', alpha=0.6)

    # Annotate the 100 most frequent words (selection is ordered by frequency first)
    for i, word in enumerate(selected_words[:100]):
        ax.annotate(word, (embeddings_2d[i, 0], embeddings_2d[i, 1]),
                    fontsize=8, alpha=0.7)

    plt.colorbar(scatter, label='Word Frequency')
    plt.title(f'{dataset_name} Dataset - Word Embeddings Visualization (t-SNE)')
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.show()

    return embeddings_2d, selected_words

# Visualize embeddings for both datasets
print("Creating embedding visualizations...")

# Gutenberg embeddings
embeddings_2d_gut, words_gut = visualize_embeddings(
    model_gutenberg, stoi_gutenberg, itos_gutenberg, counts_gutenberg, "Gutenberg"
)

# Linux embeddings
# The plotted words get their own name: assigning them to `words_linux` would
# overwrite the full Linux token list, which the comparative analysis later
# reads to report the dataset size.
embeddings_2d_linux, viz_words_linux = visualize_embeddings(
    model_linux, stoi_linux, itos_linux, counts_linux, "Linux"
)

In [None]:
# Analyze semantic relationships
def find_nearest_neighbors(word, model, stoi, itos, k=5):
    """
    Return the words whose embeddings are most cosine-similar to `word`.

    Args:
        word: Query word.
        model: Model exposing an `embedding` layer.
        stoi: Word -> index mapping.
        itos: Index -> word mapping.
        k: Number of neighbours requested.

    Returns:
        list[str]: Up to `k` neighbours, most similar first, never including the
        query itself. Empty if `word` is not in `stoi` or the vocabulary has no
        other entries. Fewer than `k` are returned when the vocabulary is smaller.
    """
    if word not in stoi:
        return []

    word_idx = stoi[word]
    all_embeddings = model.embedding.weight.data.cpu()

    # Calculate cosine similarities between the query and every embedding
    similarities = F.cosine_similarity(all_embeddings[word_idx].unsqueeze(0), all_embeddings)

    # Exclude the query by index: dropping "the first hit" is wrong when another
    # embedding ties with the query for the top similarity.
    similarities[word_idx] = float('-inf')

    # topk raises when asked for more entries than exist
    k = min(k, all_embeddings.shape[0] - 1)
    if k <= 0:
        return []

    _, indices = similarities.topk(k)
    return [itos[idx.item()] for idx in indices]

# Find semantic relationships in both datasets
print("=== Semantic Relationships Analysis ===")

# (heading, probe words, model, stoi, itos) for each dataset
neighbor_probes = [
    ("\nGutenberg Dataset:", ['king', 'good', 'great', 'time', 'man'],
     model_gutenberg, stoi_gutenberg, itos_gutenberg),
    ("\nLinux Dataset:", ['int', 'void', 'if', 'for', 'return'],
     model_linux, stoi_linux, itos_linux),
]

for heading, test_words, probe_model, probe_stoi, probe_itos in neighbor_probes:
    print(heading)
    for word in test_words:
        # Probe words absent from the vocabulary are skipped silently
        if word not in probe_stoi:
            continue
        neighbors = find_nearest_neighbors(word, probe_model, probe_stoi, probe_itos)
        print(f"'{word}' -> {neighbors}")

### Observations on Embedding Visualizations:

**Gutenberg Dataset (Natural Language):**
- Words with similar semantic meanings tend to cluster together
- Frequent words are often centrally located in the embedding space
- Related words (synonyms, words in similar contexts) show proximity
- Grammatical categories (verbs, nouns, adjectives) form distinguishable clusters

**Linux Dataset (Structured Text):**
- Programming keywords (if, for, while) cluster together
- Data types (int, char, void) form a separate cluster
- Operators and symbols show distinct groupings
- Function-related words cluster based on programming patterns

The embeddings successfully capture both semantic and syntactic relationships in their respective domains.

In [None]:
# Save trained models and vocabularies for Streamlit app
# The architecture metadata is read from each model instead of being retyped, so
# it cannot drift from the saved weights (for example after `block_size` is
# reassigned elsewhere in the notebook or a model is rebuilt with other sizes).
for checkpoint_path, trained_model, vocab_stoi, vocab_itos in (
    ('gutenberg_model.pth', model_gutenberg, stoi_gutenberg, itos_gutenberg),
    ('linux_model.pth', model_linux, stoi_linux, itos_linux),
):
    torch.save({
        'model_state_dict': trained_model.state_dict(),
        'stoi': vocab_stoi,
        'itos': vocab_itos,
        'vocab_size': trained_model.embedding.num_embeddings,
        'embedding_dim': trained_model.embedding_dim,
        'hidden_dim': trained_model.fc1.out_features,
        'block_size': trained_model.block_size
    }, checkpoint_path)

print("Models saved successfully for Streamlit app!")

## 1.5 Comparative Analysis

### Performance Comparison Between Natural and Structured Text

In [None]:
# Comparative Analysis
def _summary_column(stoi, X_train, val_losses, val_accs, counts):
    """Format one dataset's column of the comparison table, in the order of the 'Metric' rows."""
    ranked = counts.most_common()  # (word, count) pairs, most frequent first
    return [
        f"{len(stoi):,}",
        f"{len(X_train):,}",
        f"{val_losses[-1]:.4f}",
        f"{val_accs[-1]:.4f}",
        # Total token count taken from the counter, so it does not depend on the
        # word-list variables still holding the full corpus
        f"{sum(counts.values()):,}",
        f"{ranked[0][1]:,}",
        # Least common = last entry of the frequency ranking, not the last-inserted word
        f"{ranked[-1][1]:,}",
    ]


comparison_data = {
    'Metric': [
        'Vocabulary Size',
        'Training Samples',
        'Final Validation Loss',
        'Final Validation Accuracy',
        'Dataset Size (words)',
        'Most Common Word Frequency',
        'Least Common Word Frequency'
    ],
    'Gutenberg (Natural)': _summary_column(
        stoi_gutenberg, X_train_gut, val_losses_gut, val_acc_gut, counts_gutenberg
    ),
    'Linux (Structured)': _summary_column(
        stoi_linux, X_train_linux, val_losses_linux, val_acc_linux, counts_linux
    ),
}

comparison_df = pd.DataFrame(comparison_data)
print("=== Comparative Analysis ===")
print(comparison_df.to_string(index=False))

# Visualization of comparison
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

datasets = ['Gutenberg', 'Linux']
vocab_sizes = [len(stoi_gutenberg), len(stoi_linux)]
training_samples = [len(X_train_gut), len(X_train_linux)]
final_losses = [val_losses_gut[-1], val_losses_linux[-1]]
final_accuracies = [val_acc_gut[-1], val_acc_linux[-1]]

# (axes, values, title, y-label) for each bar chart
bar_panels = [
    (axes[0, 0], vocab_sizes, 'Vocabulary Size Comparison', 'Number of Unique Words'),
    (axes[0, 1], training_samples, 'Training Samples Comparison', 'Number of Training Samples'),
    (axes[1, 0], final_losses, 'Final Validation Loss Comparison', 'Validation Loss'),
    (axes[1, 1], final_accuracies, 'Final Validation Accuracy Comparison', 'Validation Accuracy'),
]
for panel, values, panel_title, panel_ylabel in bar_panels:
    panel.bar(datasets, values, color=['skyblue', 'lightgreen'])
    panel.set_title(panel_title)
    panel.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

### Key Insights on Natural vs Structured Language:

**Natural Language (Gutenberg) Characteristics:**
- **Higher vocabulary diversity**: More unique words due to rich literary language
- **Better semantic relationships**: Embeddings capture meaningful word relationships
- **Context predictability**: Moderate - natural language has varied patterns
- **Learning behavior**: Gradual convergence, good generalization

**Structured Text (Linux Code) Characteristics:**
- **Smaller vocabulary**: Limited set of keywords, identifiers, and symbols  
- **Syntactic patterns**: Strong structural patterns in programming constructs
- **Higher predictability**: Code follows strict syntax rules
- **Learning behavior**: Faster convergence due to repetitive patterns

**Learnability Comparison:**
- **Structured text** is generally easier to learn due to:
  - Repetitive patterns and syntax rules
  - Limited vocabulary size  
  - Predictable sequence structures
- **Natural language** is more challenging due to:
  - Semantic complexity and ambiguity
  - Larger vocabulary and varied contexts
  - More creative and unpredictable patterns

The MLP models demonstrate these differences through convergence rates, with structured text achieving better accuracy faster than natural language.

In [None]:
import torch

# NOTE(review): this reassigns the notebook-level block_size (set to 8 earlier).
block_size = 5
X, Y = [], []

# Number of (context ---> target) examples echoed below; printing every sample
# would emit one output line per training example.
preview_limit = 20

# Creating sequences for next-word prediction
# NOTE(review): cleaned_lines, stoi and itos are not assigned by any cell visible in
# this notebook (fetch_and_clean_data only has a local cleaned_lines) - confirm their source.
for sentence in cleaned_lines:
    word_list = sentence.split()
    # Every sentence starts from a context of all-zero indices
    context = [0] * block_size

    # A '.' token is appended to each sentence; words missing from stoi also map to '.'
    for word in word_list + ['.']:
        ix = stoi.get(word, stoi['.'])
        X.append(context.copy())
        Y.append(ix)

        if len(X) <= preview_limit:
            print(' '.join(itos[i] for i in context), '--->', itos[ix])

        # Slide the window: drop the oldest index, append the newest
        context = context[1:] + [ix]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

print(f'Shape of X: {X.shape}, Shape of Y: {Y.shape}')

In [None]:
# Embedding width used when constructing NextWord below
emb_dim = 32
# NOTE(review): this standalone embedding is not referenced by the other code shown
# in this cell; NextWord creates its own embedding layer.
emb = torch.nn.Embedding(len(stoi), emb_dim)
import torch
import torch.nn as nn

class NextWord(nn.Module):
    """One-hidden-layer MLP: embeds `block_size` word indices, concatenates the
    embeddings, applies a ReLU hidden layer and outputs `vocab_size` logits."""

    def __init__(self, block_size, vocab_size, emb_dim, hidden_size=1024):
        super().__init__()
        flat_features = block_size * emb_dim  # width of the concatenated context
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lin1 = nn.Linear(flat_features, hidden_size)
        self.lin2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        batch = x.shape[0]
        flat = self.emb(x).view(batch, -1)
        hidden = torch.relu(self.lin1(flat))
        return self.lin2(hidden)

# Build the model for the current block_size / vocabulary and move it to the selected device
model = NextWord(block_size, len(stoi), emb_dim, 1024).to(device)
# From here on `model` refers to the torch.compile wrapper, not the plain module
model = torch.compile(model)


In [None]:
# Generate sample word sequences from the current (still untrained) model

# Dedicated generator so the samples printed below are reproducible
g = torch.Generator()
g.manual_seed(4000002)

def generate_name(model, itos, stoi, block_size, max_len=10, generator=None):
    """
    Sample a word sequence from the model, starting from an all-zero context.

    Args:
        model: Next-word model mapping a (1, block_size) index tensor to logits.
        itos: Index -> word mapping.
        stoi: Word -> index mapping (unused; kept for the existing call signature).
        block_size: Context length the model expects.
        max_len: Maximum number of words to sample.
        generator: Optional CPU torch.Generator that makes the sampling reproducible.

    Returns:
        str: The sampled words joined by spaces; sampling stops at the '.' token,
        which is not included.
    """
    model_device = next(model.parameters()).device
    context = [0] * block_size
    sampled_words = []
    # Sampling needs no gradients
    with torch.no_grad():
        for _ in range(max_len):
            x = torch.tensor(context).view(1, -1).to(model_device)
            logits = model(x)
            # Sample on the CPU so a CPU generator works whatever device the model is on
            probs = torch.softmax(logits, dim=-1).cpu()
            ix = torch.multinomial(probs, num_samples=1, generator=generator).item()
            word = itos[ix]

            if word == '.':
                break

            sampled_words.append(word)
            context = context[1:] + [ix]
    return ' '.join(sampled_words)

for i in range(10):
    print(generate_name(model, itos, stoi, block_size, generator=g))


u.s.a. completely. inquest. dorans wood thumped welllit nightbird thieves 1.
wifeyou happening jezail waterpolice interest. cuttings. actor smokeless host gloss
sharpeyed moran. thanks fund reptiles thoroughfare. children temper. blotches. resided
tattoo 1.f.2. mouth paused intellectual hungrily known domain damp george
research deceive advertisementhow hercules. villain drivingrod another hospitality presuming horrors
inexplicable whitewashed glad prompt faith dropping satisfied expostulating holland. moonlight
pacific smiling. meditation lateral compress prankupon relations hearty eyebrows. ones.
hayling flaming landingplaces serves unlike pompous whom mean lonely sense
lets cashier property. frockcoat escape lip farther donations independent described
uproar. walked moran backi repair buttend collar indicating geniality crossed


In [None]:
import torch

import os

model_load_path = '/content/model_emb_32_block_5_relu.pth'
# Resume from an earlier checkpoint when one exists; on a fresh runtime the file
# is absent and training starts from the model's current weights.
if os.path.exists(model_load_path):
    model.load_state_dict(torch.load(model_load_path, map_location='cpu', weights_only=True))
model.train()
import torch._dynamo
# Fall back to eager execution instead of raising if compilation of the model fails
torch._dynamo.config.suppress_errors = True

loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.AdamW(model.parameters(), lr=0.01)

batch_size = 4000
save_every = 50
additional_epochs = 650
start_epoch = 0

elapsed_time = []

for epoch in range(start_epoch, start_epoch + additional_epochs):
    epoch_loss_sum = 0.0
    # Mini-batches are taken in dataset order (no shuffling)
    for i in range(0, X.shape[0], batch_size):
        x = X[i:i + batch_size]
        y = Y[i:i + batch_size]

        y_pred = model(x)
        loss = loss_fn(y_pred.view(-1, len(stoi)), y.view(-1))
        loss.backward()
        opt.step()
        opt.zero_grad()

        # Weight by batch size so the short final batch does not skew the mean
        epoch_loss_sum += loss.item() * x.shape[0]

    if epoch % save_every == 0:
        torch.save(model.state_dict(), model_load_path)
    # Report the mean loss over the whole epoch rather than the last mini-batch only
    print(f'Epoch: {epoch}, Loss: {epoch_loss_sum / max(X.shape[0], 1):.4f}')


In [None]:
# Persist the final weights to the same path the training cell loads from and checkpoints to
model_save_path = '/content/model_emb_32_block_5_relu.pth'
torch.save(model.state_dict(), model_save_path)

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def plot_embeddings(embedding_layer, itos):
    """
    Reduce every row of an embedding layer to 2-D with t-SNE and draw a labelled scatter plot.

    Args:
        embedding_layer: Layer whose `weight` holds one embedding per vocabulary index.
        itos: Index -> word mapping used for the point labels (missing indices give None).
    """
    weights = embedding_layer.weight.data.cpu().numpy()
    labels = [itos.get(idx) for idx in range(len(weights))]

    coords = TSNE(n_components=2, random_state=42).fit_transform(weights)

    fig, ax = plt.subplots(figsize=(12, 12))
    ax.scatter(coords[:, 0], coords[:, 1])

    # Label every point with its word
    for (x_pos, y_pos), label in zip(coords, labels):
        ax.annotate(label, (x_pos, y_pos), fontsize=8, alpha=0.5)

    ax.set_title('t-SNE Visualization of Word Embeddings')
    ax.set_xlabel('t-SNE Component 1')
    ax.set_ylabel('t-SNE Component 2')
    ax.grid(True)
    plt.show()

# Plot the embedding table of whichever model `model` currently refers to
plot_embeddings(model.emb, itos)


In [None]:
# Load the block-size-5 ReLU checkpoint from Google Drive (requires the drive to be mounted)
model_load_path = '/content/drive/MyDrive/model_emb_32_block_5_relu.pth'
# map_location='cpu' reads the tensors onto the CPU; weights_only=True restricts loading to tensor data
model.load_state_dict(torch.load(model_load_path, map_location='cpu', weights_only=True))
plot_embeddings(model.emb, itos)

In [None]:
import torch

# Context length for this experiment; this reassigns the notebook-level block_size
block_size = 10
X, Y = [], []

# NOTE(review): cleaned_lines and stoi are not assigned by any cell visible in this
# notebook (fetch_and_clean_data only has a local cleaned_lines) - confirm their source.
for sentence in cleaned_lines:
    word_list = sentence.split()
    # Every sentence starts from a context of all-zero indices
    context = [0] * block_size

    # A '.' token is appended to each sentence; words missing from stoi also map to '.'
    for word in word_list + ['.']:
        ix = stoi.get(word, stoi['.'])
        X.append(context.copy())
        Y.append(ix)
        # Slide the window: drop the oldest index, append the newest
        context = context[1:] + [ix]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

print(f'Shape of X: {X.shape}, Shape of Y: {Y.shape}')

emb_dim = 32
# NOTE(review): this standalone embedding is not referenced by the rest of this cell;
# NextWord creates its own embedding layer.
emb = torch.nn.Embedding(len(stoi), emb_dim)
import torch
import torch.nn as nn

class NextWord(nn.Module):
    """MLP next-word model (ReLU): embedding -> flatten -> hidden layer -> vocabulary logits."""

    def __init__(self, block_size, vocab_size, emb_dim, hidden_size=1024):
        super().__init__()
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self.lin1 = nn.Linear(in_features=block_size * emb_dim, out_features=hidden_size)
        self.lin2 = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        # (batch, block_size) -> (batch, block_size, emb_dim) -> (batch, block_size * emb_dim)
        embedded = self.emb(x)
        flattened = embedded.view(x.shape[0], -1)
        return self.lin2(torch.relu(self.lin1(flattened)))

# Fresh model sized for this cell's block_size; its weights are replaced by the checkpoint below
model = NextWord(block_size, len(stoi), emb_dim, 1024).to(device)
model = torch.compile(model)
model_load_path = '/content/drive/MyDrive/model_emb_32_block_10_relu.pth'
# NOTE(review): the state dict is loaded into the torch.compile wrapper - presumably the
# checkpoint was saved from a compiled model as well so the parameter names match; confirm.
model.load_state_dict(torch.load(model_load_path, map_location='cpu', weights_only=True))
plot_embeddings(model.emb, itos)

In [None]:
import torch

# Context length for this experiment; this reassigns the notebook-level block_size
block_size = 15
X, Y = [], []

# NOTE(review): cleaned_lines and stoi are not assigned by any cell visible in this
# notebook (fetch_and_clean_data only has a local cleaned_lines) - confirm their source.
for sentence in cleaned_lines:
    word_list = sentence.split()
    # Every sentence starts from a context of all-zero indices
    context = [0] * block_size

    # A '.' token is appended to each sentence; words missing from stoi also map to '.'
    for word in word_list + ['.']:
        ix = stoi.get(word, stoi['.'])
        X.append(context.copy())
        Y.append(ix)
        # Slide the window: drop the oldest index, append the newest
        context = context[1:] + [ix]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

print(f'Shape of X: {X.shape}, Shape of Y: {Y.shape}')

emb_dim = 32
# NOTE(review): this standalone embedding is not referenced by the rest of this cell;
# NextWord creates its own embedding layer.
emb = torch.nn.Embedding(len(stoi), emb_dim)
import torch
import torch.nn as nn

class NextWord(nn.Module):
    """Word-level MLP with one ReLU hidden layer over the concatenated context embeddings."""

    def __init__(self, block_size, vocab_size, emb_dim, hidden_size=1024):
        super().__init__()
        context_width = block_size * emb_dim
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lin1 = nn.Linear(context_width, hidden_size)
        self.lin2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        num_rows = x.shape[0]
        features = self.emb(x).view(num_rows, -1)
        activated = torch.relu(self.lin1(features))
        logits = self.lin2(activated)
        return logits

# Fresh model sized for this cell's block_size; its weights are replaced by the checkpoint below
model = NextWord(block_size, len(stoi), emb_dim, 1024).to(device)
model = torch.compile(model)
model_load_path = '/content/drive/MyDrive/model_emb_32_block_15_relu.pth'
# NOTE(review): the state dict is loaded into the torch.compile wrapper - presumably the
# checkpoint was saved from a compiled model as well so the parameter names match; confirm.
model.load_state_dict(torch.load(model_load_path, map_location='cpu', weights_only=True))
plot_embeddings(model.emb, itos)

In [None]:
import torch

# Context length must match the checkpoint loaded at the end of this cell
# (model_emb_32_block_5_tanh.pth): lin1's input width is block_size * emb_dim, so a
# block size of 10 would make load_state_dict fail on a shape mismatch.
block_size = 5
X, Y = [], []

# NOTE(review): cleaned_lines and stoi are not assigned by any cell visible in this
# notebook (fetch_and_clean_data only has a local cleaned_lines) - confirm their source.
for sentence in cleaned_lines:
    word_list = sentence.split()
    # Every sentence starts from a context of all-zero indices
    context = [0] * block_size

    # A '.' token is appended to each sentence; words missing from stoi also map to '.'
    for word in word_list + ['.']:
        ix = stoi.get(word, stoi['.'])
        X.append(context.copy())
        Y.append(ix)
        # Slide the window: drop the oldest index, append the newest
        context = context[1:] + [ix]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

print(f'Shape of X: {X.shape}, Shape of Y: {Y.shape}')

emb_dim = 32
emb = torch.nn.Embedding(len(stoi), emb_dim)
import torch
import torch.nn as nn

class NextWord(nn.Module):
    """One-hidden-layer MLP with a tanh activation: embeds the context words,
    concatenates the embeddings and maps them to `vocab_size` logits."""

    def __init__(self, block_size, vocab_size, emb_dim, hidden_size=1024):
        super().__init__()
        flat_features = block_size * emb_dim  # width of the concatenated context
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lin1 = nn.Linear(flat_features, hidden_size)
        self.lin2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        batch = x.shape[0]
        flat = self.emb(x).view(batch, -1)
        hidden = torch.tanh(self.lin1(flat))
        return self.lin2(hidden)

# Fresh model sized for this cell's block_size; its weights are replaced by the checkpoint below
model = NextWord(block_size, len(stoi), emb_dim, 1024).to(device)
model = torch.compile(model)
# The checkpoint file name encodes block size 5. lin1's input width is
# block_size * emb_dim, so this cell's block_size has to match the checkpoint
# for load_state_dict to succeed.
model_load_path = '/content/drive/MyDrive/model_emb_32_block_5_tanh.pth'
model.load_state_dict(torch.load(model_load_path, map_location='cpu', weights_only=True))
plot_embeddings(model.emb, itos)

In [None]:
import torch

# Hyper-parameters for this configuration.
block_size = 10
emb_dim = 32

# Build (context, target) pairs: every word of every line, plus a closing '.',
# is predicted from the block_size token indices that precede it.
X = []
Y = []
for sentence in cleaned_lines:
    word_list = sentence.split()
    # Each line starts from a context of block_size zeros
    # (presumably index 0 is the padding token — confirm against stoi).
    context = [0] * block_size
    for word in [*word_list, '.']:
        # Words missing from stoi fall back to the index of '.'.
        ix = stoi.get(word, stoi['.'])
        X.append(list(context))
        Y.append(ix)
        # Slide the window: drop the oldest index, append the newest.
        context = [*context[1:], ix]

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

print(f'Shape of X: {X.shape}, Shape of Y: {Y.shape}')

# Freshly constructed embedding table; it is not referenced again in this cell.
emb = torch.nn.Embedding(len(stoi), emb_dim)
import torch
import torch.nn as nn

class NextWord(nn.Module):
    """MLP next-word predictor over a fixed-length context window.

    Token indices are embedded, the embeddings of one example are flattened
    into a single vector, passed through one tanh hidden layer, and projected
    to ``vocab_size`` outputs.

    Args:
        block_size: number of context tokens per example.
        vocab_size: number of embedding rows and of output scores.
        emb_dim: size of each token embedding.
        hidden_size: width of the hidden layer (default 1024).
    """

    def __init__(self, block_size, vocab_size, emb_dim, hidden_size=1024):
        super().__init__()
        flat_dim = block_size * emb_dim  # length of the concatenated context embedding
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lin1 = nn.Linear(flat_dim, hidden_size)
        self.lin2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return the raw output of the final linear layer (no softmax applied here).

        ``x`` is a tensor of token indices; everything after its first
        dimension is flattened before the hidden layer.
        """
        embedded = self.emb(x)
        flat = embedded.view(embedded.shape[0], -1)
        hidden = torch.tanh(self.lin1(flat))
        return self.lin2(hidden)

# Rebuild the network with this cell's hyper-parameters, restore the saved
# weights for this configuration, and visualise the embedding layer.
# NOTE(review): the weights are loaded through the torch.compile wrapper;
# presumably the checkpoint was saved from a compiled model as well — confirm
# that the state-dict key names match.
model_load_path = '/content/drive/MyDrive/model_emb_32_block_10_tanh.pth'
model = torch.compile(NextWord(block_size, len(stoi), emb_dim, 1024).to(device))
model.load_state_dict(
    torch.load(model_load_path, map_location='cpu', weights_only=True)
)
plot_embeddings(model.emb, itos)

In [None]:
import torch

# Hyper-parameters for this configuration.
block_size = 15
emb_dim = 32

# Build (context, target) pairs: every word of every line, plus a closing '.',
# is predicted from the block_size token indices that precede it.
X = []
Y = []
for sentence in cleaned_lines:
    word_list = sentence.split()
    # Each line starts from a context of block_size zeros
    # (presumably index 0 is the padding token — confirm against stoi).
    context = [0] * block_size
    for word in [*word_list, '.']:
        # Words missing from stoi fall back to the index of '.'.
        ix = stoi.get(word, stoi['.'])
        X.append(list(context))
        Y.append(ix)
        # Slide the window: drop the oldest index, append the newest.
        context = [*context[1:], ix]

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

print(f'Shape of X: {X.shape}, Shape of Y: {Y.shape}')

# Freshly constructed embedding table; it is not referenced again in this cell.
emb = torch.nn.Embedding(len(stoi), emb_dim)
import torch
import torch.nn as nn

class NextWord(nn.Module):
    """MLP next-word predictor over a fixed-length context window.

    Token indices are embedded, the embeddings of one example are flattened
    into a single vector, passed through one tanh hidden layer, and projected
    to ``vocab_size`` outputs.

    Args:
        block_size: number of context tokens per example.
        vocab_size: number of embedding rows and of output scores.
        emb_dim: size of each token embedding.
        hidden_size: width of the hidden layer (default 1024).
    """

    def __init__(self, block_size, vocab_size, emb_dim, hidden_size=1024):
        super().__init__()
        flat_dim = block_size * emb_dim  # length of the concatenated context embedding
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lin1 = nn.Linear(flat_dim, hidden_size)
        self.lin2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return the raw output of the final linear layer (no softmax applied here).

        ``x`` is a tensor of token indices; everything after its first
        dimension is flattened before the hidden layer.
        """
        embedded = self.emb(x)
        flat = embedded.view(embedded.shape[0], -1)
        hidden = torch.tanh(self.lin1(flat))
        return self.lin2(hidden)

# Rebuild the network with this cell's hyper-parameters, restore the saved
# weights for this configuration, and visualise the embedding layer.
# NOTE(review): the weights are loaded through the torch.compile wrapper;
# presumably the checkpoint was saved from a compiled model as well — confirm
# that the state-dict key names match.
model_load_path = '/content/drive/MyDrive/model_emb_32_block_15_tanh.pth'
model = torch.compile(NextWord(block_size, len(stoi), emb_dim, 1024).to(device))
model.load_state_dict(
    torch.load(model_load_path, map_location='cpu', weights_only=True)
)
plot_embeddings(model.emb, itos)

In [None]:
import torch

# Hyper-parameters for this configuration.
block_size = 5
emb_dim = 64

# Build (context, target) pairs: every word of every line, plus a closing '.',
# is predicted from the block_size token indices that precede it.
X = []
Y = []
for sentence in cleaned_lines:
    word_list = sentence.split()
    # Each line starts from a context of block_size zeros
    # (presumably index 0 is the padding token — confirm against stoi).
    context = [0] * block_size
    for word in [*word_list, '.']:
        # Words missing from stoi fall back to the index of '.'.
        ix = stoi.get(word, stoi['.'])
        X.append(list(context))
        Y.append(ix)
        # Slide the window: drop the oldest index, append the newest.
        context = [*context[1:], ix]

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

print(f'Shape of X: {X.shape}, Shape of Y: {Y.shape}')

# Freshly constructed embedding table; it is not referenced again in this cell.
emb = torch.nn.Embedding(len(stoi), emb_dim)
import torch
import torch.nn as nn

class NextWord(nn.Module):
    """MLP next-word predictor over a fixed-length context window.

    Token indices are embedded, the embeddings of one example are flattened
    into a single vector, passed through one ReLU hidden layer, and projected
    to ``vocab_size`` outputs.

    Args:
        block_size: number of context tokens per example.
        vocab_size: number of embedding rows and of output scores.
        emb_dim: size of each token embedding.
        hidden_size: width of the hidden layer (default 1024).
    """

    def __init__(self, block_size, vocab_size, emb_dim, hidden_size=1024):
        super().__init__()
        flat_dim = block_size * emb_dim  # length of the concatenated context embedding
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lin1 = nn.Linear(flat_dim, hidden_size)
        self.lin2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return the raw output of the final linear layer (no softmax applied here).

        ``x`` is a tensor of token indices; everything after its first
        dimension is flattened before the hidden layer.
        """
        embedded = self.emb(x)
        flat = embedded.view(embedded.shape[0], -1)
        hidden = torch.relu(self.lin1(flat))
        return self.lin2(hidden)

# Rebuild the network with this cell's hyper-parameters, restore the saved
# weights for this configuration, and visualise the embedding layer.
# NOTE(review): the weights are loaded through the torch.compile wrapper;
# presumably the checkpoint was saved from a compiled model as well — confirm
# that the state-dict key names match.
model_load_path = '/content/drive/MyDrive/model_emb_64_block_5_relu.pth'
model = torch.compile(NextWord(block_size, len(stoi), emb_dim, 1024).to(device))
model.load_state_dict(
    torch.load(model_load_path, map_location='cpu', weights_only=True)
)
plot_embeddings(model.emb, itos)

In [None]:
import torch

# Hyper-parameters for this configuration.
block_size = 10
emb_dim = 64

# Build (context, target) pairs: every word of every line, plus a closing '.',
# is predicted from the block_size token indices that precede it.
X = []
Y = []
for sentence in cleaned_lines:
    word_list = sentence.split()
    # Each line starts from a context of block_size zeros
    # (presumably index 0 is the padding token — confirm against stoi).
    context = [0] * block_size
    for word in [*word_list, '.']:
        # Words missing from stoi fall back to the index of '.'.
        ix = stoi.get(word, stoi['.'])
        X.append(list(context))
        Y.append(ix)
        # Slide the window: drop the oldest index, append the newest.
        context = [*context[1:], ix]

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

print(f'Shape of X: {X.shape}, Shape of Y: {Y.shape}')

# Freshly constructed embedding table; it is not referenced again in this cell.
emb = torch.nn.Embedding(len(stoi), emb_dim)
import torch
import torch.nn as nn

class NextWord(nn.Module):
    """MLP next-word predictor over a fixed-length context window.

    Token indices are embedded, the embeddings of one example are flattened
    into a single vector, passed through one ReLU hidden layer, and projected
    to ``vocab_size`` outputs.

    Args:
        block_size: number of context tokens per example.
        vocab_size: number of embedding rows and of output scores.
        emb_dim: size of each token embedding.
        hidden_size: width of the hidden layer (default 1024).
    """

    def __init__(self, block_size, vocab_size, emb_dim, hidden_size=1024):
        super().__init__()
        flat_dim = block_size * emb_dim  # length of the concatenated context embedding
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lin1 = nn.Linear(flat_dim, hidden_size)
        self.lin2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return the raw output of the final linear layer (no softmax applied here).

        ``x`` is a tensor of token indices; everything after its first
        dimension is flattened before the hidden layer.
        """
        embedded = self.emb(x)
        flat = embedded.view(embedded.shape[0], -1)
        hidden = torch.relu(self.lin1(flat))
        return self.lin2(hidden)

# Rebuild the network with this cell's hyper-parameters, restore the saved
# weights for this configuration, and visualise the embedding layer.
# NOTE(review): the weights are loaded through the torch.compile wrapper;
# presumably the checkpoint was saved from a compiled model as well — confirm
# that the state-dict key names match.
model_load_path = '/content/drive/MyDrive/model_emb_64_block_10_relu.pth'
model = torch.compile(NextWord(block_size, len(stoi), emb_dim, 1024).to(device))
model.load_state_dict(
    torch.load(model_load_path, map_location='cpu', weights_only=True)
)
plot_embeddings(model.emb, itos)

In [None]:
import torch

# Hyper-parameters for this configuration.
block_size = 15
emb_dim = 64

# Build (context, target) pairs: every word of every line, plus a closing '.',
# is predicted from the block_size token indices that precede it.
X = []
Y = []
for sentence in cleaned_lines:
    word_list = sentence.split()
    # Each line starts from a context of block_size zeros
    # (presumably index 0 is the padding token — confirm against stoi).
    context = [0] * block_size
    for word in [*word_list, '.']:
        # Words missing from stoi fall back to the index of '.'.
        ix = stoi.get(word, stoi['.'])
        X.append(list(context))
        Y.append(ix)
        # Slide the window: drop the oldest index, append the newest.
        context = [*context[1:], ix]

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

print(f'Shape of X: {X.shape}, Shape of Y: {Y.shape}')

# Freshly constructed embedding table; it is not referenced again in this cell.
emb = torch.nn.Embedding(len(stoi), emb_dim)
import torch
import torch.nn as nn

class NextWord(nn.Module):
    """MLP next-word predictor over a fixed-length context window.

    Token indices are embedded, the embeddings of one example are flattened
    into a single vector, passed through one ReLU hidden layer, and projected
    to ``vocab_size`` outputs.

    Args:
        block_size: number of context tokens per example.
        vocab_size: number of embedding rows and of output scores.
        emb_dim: size of each token embedding.
        hidden_size: width of the hidden layer (default 1024).
    """

    def __init__(self, block_size, vocab_size, emb_dim, hidden_size=1024):
        super().__init__()
        flat_dim = block_size * emb_dim  # length of the concatenated context embedding
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lin1 = nn.Linear(flat_dim, hidden_size)
        self.lin2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return the raw output of the final linear layer (no softmax applied here).

        ``x`` is a tensor of token indices; everything after its first
        dimension is flattened before the hidden layer.
        """
        embedded = self.emb(x)
        flat = embedded.view(embedded.shape[0], -1)
        hidden = torch.relu(self.lin1(flat))
        return self.lin2(hidden)

# Rebuild the network with this cell's hyper-parameters, restore the saved
# weights for this configuration, and visualise the embedding layer.
# NOTE(review): the weights are loaded through the torch.compile wrapper;
# presumably the checkpoint was saved from a compiled model as well — confirm
# that the state-dict key names match.
model_load_path = '/content/drive/MyDrive/model_emb_64_block_15_relu.pth'
model = torch.compile(NextWord(block_size, len(stoi), emb_dim, 1024).to(device))
model.load_state_dict(
    torch.load(model_load_path, map_location='cpu', weights_only=True)
)
plot_embeddings(model.emb, itos)

In [None]:
import torch

# Hyper-parameters for this configuration.
block_size = 5
emb_dim = 64

# Build (context, target) pairs: every word of every line, plus a closing '.',
# is predicted from the block_size token indices that precede it.
X = []
Y = []
for sentence in cleaned_lines:
    word_list = sentence.split()
    # Each line starts from a context of block_size zeros
    # (presumably index 0 is the padding token — confirm against stoi).
    context = [0] * block_size
    for word in [*word_list, '.']:
        # Words missing from stoi fall back to the index of '.'.
        ix = stoi.get(word, stoi['.'])
        X.append(list(context))
        Y.append(ix)
        # Slide the window: drop the oldest index, append the newest.
        context = [*context[1:], ix]

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

print(f'Shape of X: {X.shape}, Shape of Y: {Y.shape}')

# Freshly constructed embedding table; it is not referenced again in this cell.
emb = torch.nn.Embedding(len(stoi), emb_dim)
import torch
import torch.nn as nn

class NextWord(nn.Module):
    """MLP next-word predictor over a fixed-length context window.

    Token indices are embedded, the embeddings of one example are flattened
    into a single vector, passed through one tanh hidden layer, and projected
    to ``vocab_size`` outputs.

    Args:
        block_size: number of context tokens per example.
        vocab_size: number of embedding rows and of output scores.
        emb_dim: size of each token embedding.
        hidden_size: width of the hidden layer (default 1024).
    """

    def __init__(self, block_size, vocab_size, emb_dim, hidden_size=1024):
        super().__init__()
        flat_dim = block_size * emb_dim  # length of the concatenated context embedding
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lin1 = nn.Linear(flat_dim, hidden_size)
        self.lin2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return the raw output of the final linear layer (no softmax applied here).

        ``x`` is a tensor of token indices; everything after its first
        dimension is flattened before the hidden layer.
        """
        embedded = self.emb(x)
        flat = embedded.view(embedded.shape[0], -1)
        hidden = torch.tanh(self.lin1(flat))
        return self.lin2(hidden)

# Rebuild the network with this cell's hyper-parameters, restore the saved
# weights for this configuration, and visualise the embedding layer.
# NOTE(review): the weights are loaded through the torch.compile wrapper;
# presumably the checkpoint was saved from a compiled model as well — confirm
# that the state-dict key names match.
model_load_path = '/content/drive/MyDrive/model_emb_64_block_5_tanh.pth'
model = torch.compile(NextWord(block_size, len(stoi), emb_dim, 1024).to(device))
model.load_state_dict(
    torch.load(model_load_path, map_location='cpu', weights_only=True)
)
plot_embeddings(model.emb, itos)

In [None]:
import torch

# Hyper-parameters for this configuration.
block_size = 10
emb_dim = 64

# Build (context, target) pairs: every word of every line, plus a closing '.',
# is predicted from the block_size token indices that precede it.
X = []
Y = []
for sentence in cleaned_lines:
    word_list = sentence.split()
    # Each line starts from a context of block_size zeros
    # (presumably index 0 is the padding token — confirm against stoi).
    context = [0] * block_size
    for word in [*word_list, '.']:
        # Words missing from stoi fall back to the index of '.'.
        ix = stoi.get(word, stoi['.'])
        X.append(list(context))
        Y.append(ix)
        # Slide the window: drop the oldest index, append the newest.
        context = [*context[1:], ix]

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

print(f'Shape of X: {X.shape}, Shape of Y: {Y.shape}')

# Freshly constructed embedding table; it is not referenced again in this cell.
emb = torch.nn.Embedding(len(stoi), emb_dim)
import torch
import torch.nn as nn

class NextWord(nn.Module):
    """MLP next-word predictor over a fixed-length context window.

    Token indices are embedded, the embeddings of one example are flattened
    into a single vector, passed through one tanh hidden layer, and projected
    to ``vocab_size`` outputs.

    Args:
        block_size: number of context tokens per example.
        vocab_size: number of embedding rows and of output scores.
        emb_dim: size of each token embedding.
        hidden_size: width of the hidden layer (default 1024).
    """

    def __init__(self, block_size, vocab_size, emb_dim, hidden_size=1024):
        super().__init__()
        flat_dim = block_size * emb_dim  # length of the concatenated context embedding
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lin1 = nn.Linear(flat_dim, hidden_size)
        self.lin2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return the raw output of the final linear layer (no softmax applied here).

        ``x`` is a tensor of token indices; everything after its first
        dimension is flattened before the hidden layer.
        """
        embedded = self.emb(x)
        flat = embedded.view(embedded.shape[0], -1)
        hidden = torch.tanh(self.lin1(flat))
        return self.lin2(hidden)

# Rebuild the network with this cell's hyper-parameters, restore the saved
# weights for this configuration, and visualise the embedding layer.
# NOTE(review): the weights are loaded through the torch.compile wrapper;
# presumably the checkpoint was saved from a compiled model as well — confirm
# that the state-dict key names match.
model_load_path = '/content/drive/MyDrive/model_emb_64_block_10_tanh.pth'
model = torch.compile(NextWord(block_size, len(stoi), emb_dim, 1024).to(device))
model.load_state_dict(
    torch.load(model_load_path, map_location='cpu', weights_only=True)
)
plot_embeddings(model.emb, itos)

In [None]:
import torch

# Hyper-parameters for this configuration.
block_size = 15
emb_dim = 64

# Build (context, target) pairs: every word of every line, plus a closing '.',
# is predicted from the block_size token indices that precede it.
X = []
Y = []
for sentence in cleaned_lines:
    word_list = sentence.split()
    # Each line starts from a context of block_size zeros
    # (presumably index 0 is the padding token — confirm against stoi).
    context = [0] * block_size
    for word in [*word_list, '.']:
        # Words missing from stoi fall back to the index of '.'.
        ix = stoi.get(word, stoi['.'])
        X.append(list(context))
        Y.append(ix)
        # Slide the window: drop the oldest index, append the newest.
        context = [*context[1:], ix]

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

print(f'Shape of X: {X.shape}, Shape of Y: {Y.shape}')

# Freshly constructed embedding table; it is not referenced again in this cell.
emb = torch.nn.Embedding(len(stoi), emb_dim)
import torch
import torch.nn as nn

class NextWord(nn.Module):
    """MLP next-word predictor over a fixed-length context window.

    Token indices are embedded, the embeddings of one example are flattened
    into a single vector, passed through one tanh hidden layer, and projected
    to ``vocab_size`` outputs.

    Args:
        block_size: number of context tokens per example.
        vocab_size: number of embedding rows and of output scores.
        emb_dim: size of each token embedding.
        hidden_size: width of the hidden layer (default 1024).
    """

    def __init__(self, block_size, vocab_size, emb_dim, hidden_size=1024):
        super().__init__()
        flat_dim = block_size * emb_dim  # length of the concatenated context embedding
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lin1 = nn.Linear(flat_dim, hidden_size)
        self.lin2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return the raw output of the final linear layer (no softmax applied here).

        ``x`` is a tensor of token indices; everything after its first
        dimension is flattened before the hidden layer.
        """
        embedded = self.emb(x)
        flat = embedded.view(embedded.shape[0], -1)
        hidden = torch.tanh(self.lin1(flat))
        return self.lin2(hidden)

# Rebuild the network with this cell's hyper-parameters, restore the saved
# weights for this configuration, and visualise the embedding layer.
# NOTE(review): the weights are loaded through the torch.compile wrapper;
# presumably the checkpoint was saved from a compiled model as well — confirm
# that the state-dict key names match.
model_load_path = '/content/drive/MyDrive/model_emb_64_block_15_tanh.pth'
model = torch.compile(NextWord(block_size, len(stoi), emb_dim, 1024).to(device))
model.load_state_dict(
    torch.load(model_load_path, map_location='cpu', weights_only=True)
)
plot_embeddings(model.emb, itos)