# Building a Simple Language Model for Next Word Prediction

This notebook demonstrates the fundamentals of language modeling using PyTorch, including:

- Tokenization
- Word embeddings
- Building and training a neural language model
- Next word prediction


In [None]:
import html
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from IPython.display import HTML, display

## 1️⃣ Training Sentences

We'll start with some simple sentences to train our language model.


In [None]:
# Training corpus, grouped by theme. The groups are flattened (in this order)
# into `sentences`, the flat list used by the rest of the notebook.
sentence_groups = {
    "Pet behaviors": [
        "cat chases small mouse",
        "dog runs very fast",
        "cat likes warm milk",
        "dog barks at strangers",
        "cats hunt at night",
        "kittens play with yarn",
        "puppies chew on toys",
    ],
    "Human-pet interactions": [
        "people love their pets",
        "children feed the animals",
        "family adopts new puppy",
        "woman trains her dog",
        "man walks his dog",
    ],
    "Animal characteristics": [
        "cats are very independent",
        "dogs are loyal companions",
        "birds sing beautiful songs",
        "rabbits hop around quickly",
        "fish swim in tanks",
    ],
    "Daily activities": [
        "students study every day",
        "friends meet for coffee",
        "runners train each morning",
        "artists paint colorful pictures",
        "writers create new stories",
    ],
}
sentences = [text for group in sentence_groups.values() for text in group]

# Show the corpus as a one-column table (last expression of the cell)
pd.DataFrame({"Training Sentences": sentences})

## 2️⃣ Tokenization (Mapping Words to Numbers)

We'll build a vocabulary from all our sentences and convert words to numerical tokens.


In [None]:
# Vocabulary: every distinct word in the corpus, numbered in alphabetical order
words = {word for sentence in sentences for word in sentence.split()}
ordered_words = sorted(words)
vocab = {word: idx for idx, word in enumerate(ordered_words)}
# Inverse mapping (token id -> word), used later to decode predictions
reverse_vocab = dict(enumerate(ordered_words))

# Show the word -> token-id mapping
vocab_df = pd.DataFrame(list(vocab.items()), columns=["Word", "Token ID"])
display(HTML("<h3>Vocabulary:</h3>"))
display(vocab_df)

# Encode each sentence as the list of its words' token ids
tokenized_sentences = [
    [vocab[word] for word in sentence.split()] for sentence in sentences
]

# Pair every sentence with the printed form of its token list
tokenization_data = [
    [sentence, str(token_ids)]
    for sentence, token_ids in zip(sentences, tokenized_sentences)
]

display(HTML("<h3>Tokenized Sentences:</h3>"))
pd.DataFrame(tokenization_data, columns=["Original Sentence", "Tokenized"])

# Interactive LLM Learning Examples

Building on the next-word-prediction example in this notebook, here are six more ideas for interactive notebooks that help someone learn about language models:

## 1. Text Sentiment Analysis

Create an interactive analyzer that takes user input and displays sentiment scores (positive/negative/neutral) with a confidence gauge. Learners will understand how models classify emotional tone.

## 2. Text Generation with Parameters

Build a notebook where users can enter a prompt and adjust generation parameters (temperature, max tokens, etc.) to see how they affect text completion. Include visualizations of token probabilities.

## 3. Named Entity Recognition

Develop an interactive tool that highlights entities (people, places, organizations) in user text with different colors. Show confidence scores for each detected entity.

## 4. Question-Answering System

Create a two-panel interface where users can upload/paste a document in one panel and ask questions in another. Visualize how the model finds and extracts answers from the source text.

## 5. Text Summarization Explorer

Build a notebook that lets users paste longer text and generate summaries at different lengths/compression rates. Compare extraction vs. abstraction methods.

## 6. Semantic Search Visualization

Develop a demo where users can search through a document collection using natural language queries. Visualize document embeddings in 2D/3D space to show how semantic similarity works.

Each example should follow this notebook's interactive pattern: widgets for input, clear visualization of results, and explanatory components that help learners understand what's happening under the hood.


## 3️⃣ Creating Word Embeddings

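We'll create simple vector representations for each word. These vectors are drawn at random (they are not learned), so they only illustrate what an embedding table looks like; the model trained later in this notebook learns its own embeddings.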


In [None]:
embedding_dim = 5  # number of components in each word vector
np.random.seed(42)  # fixed seed: the random vectors are the same on every run
embedding_matrix = np.random.randn(len(vocab), embedding_dim)
# Row `idx` of the matrix is the vector for the word whose token id is `idx`
word_vectors = {word: embedding_matrix[idx] for word, idx in vocab.items()}

# One table row per word: the word followed by its vector components
columns = ["Word", *(f"Dim {i+1}" for i in range(embedding_dim))]
vectors_list = [[word, *vector] for word, vector in word_vectors.items()]
vectors_df = pd.DataFrame(vectors_list, columns=columns)
display(HTML("<h3>Word Embeddings:</h3>"))
vectors_df.style.background_gradient(cmap="coolwarm")

## 4️⃣ Visualizing Word Embeddings

Let's create a 2D visualization of our word embeddings using PCA.


In [None]:
from sklearn.decomposition import PCA

# Stack the word vectors into one 2-D array; word_list[i] labels row i
word_list = list(word_vectors)
embedding_array = np.array(list(word_vectors.values()))

# Project the vectors onto their first two principal components
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embedding_array)

# Scatter the projected points and label each one with its word
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c="blue", alpha=0.5)
for word, (x_coord, y_coord) in zip(word_list, embeddings_2d):
    ax.annotate(word, xy=(x_coord, y_coord), fontsize=12)

ax.set_title("2D PCA projection of Word Embeddings", fontsize=15)
ax.set_xlabel("PCA Dimension 1")
ax.set_ylabel("PCA Dimension 2")
ax.grid(True, linestyle="--", alpha=0.7)
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Stack the word vectors into one array; word_list[i] labels row i
word_list = list(word_vectors.keys())
embedding_array = np.array([word_vectors[word] for word in word_list])

# Apply KMeans clustering to group similar words
n_clusters = 4  # Adjust based on vocabulary size
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(embedding_array)

# Apply PCA to reduce to 2 dimensions for visualization
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embedding_array)

# Row indices of the words in each cluster. Computed once and reused for both
# the plot and the printed summary, so no per-word list search is needed.
cluster_members = [np.flatnonzero(cluster_labels == i) for i in range(n_clusters)]

# Plot the embeddings with colors by cluster
plt.figure(figsize=(12, 10))

# One color per cluster (cycled if there are more clusters than colors)
colors = ["#ff9999", "#66b3ff", "#99ff99", "#ffcc99", "#c2c2f0"]
for i, member_idx in enumerate(cluster_members):
    cluster_points = embeddings_2d[member_idx]

    # Plot points for this cluster
    plt.scatter(
        cluster_points[:, 0],
        cluster_points[:, 1],
        c=colors[i % len(colors)],
        alpha=0.7,
        label=f"Cluster {i+1}",
    )

    # Add word labels
    for idx in member_idx:
        plt.annotate(
            word_list[idx], xy=(embeddings_2d[idx, 0], embeddings_2d[idx, 1]), fontsize=12
        )

plt.title("2D PCA projection of Word Embeddings with Clustering", fontsize=15)
plt.xlabel("PCA Dimension 1")
plt.ylabel("PCA Dimension 2")
plt.grid(True, linestyle="--", alpha=0.7)
plt.legend()
plt.tight_layout()

# Display words in each cluster
print("Words grouped by similarity:")
for i, member_idx in enumerate(cluster_members):
    cluster_words = [word_list[idx] for idx in member_idx]
    print(f"Cluster {i+1}: {', '.join(cluster_words)}")

plt.show()

## 5️⃣ Neural Network Model

Create a simple language model using LSTM for next word prediction.


In [None]:
class SimpleLanguageModel(nn.Module):
    """Embedding -> LSTM -> linear layer that scores the next token.

    Args:
        vocab_size: number of distinct token ids; also the width of the output.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Score every vocabulary entry as the token following each sequence.

        Args:
            x: integer token ids of shape (batch_size, sequence_length).

        Returns:
            Unnormalized scores (logits) of shape (batch_size, vocab_size).
        """
        # (batch_size, sequence_length, hidden_dim): one hidden state per step
        hidden_states, _ = self.lstm(self.embedding(x))
        # Only the state after the final token is used for the prediction
        final_step = hidden_states[:, -1, :]
        return self.fc(final_step)


# Seed PyTorch before creating the model so the initial weights are the same
# on every run (NumPy is seeded elsewhere, but that does not affect torch).
torch.manual_seed(42)

# Initialize network with parameters
vocab_size = len(vocab)
embedding_dim = 5
hidden_dim = 10
model = SimpleLanguageModel(vocab_size, embedding_dim, hidden_dim)

# Display model architecture
display(HTML("<h3>Model Architecture:</h3>"))
display(HTML(f"<pre>{model}</pre>"))

## 6️⃣ Prepare Training Data

Format our tokenized sentences into input-output pairs for training.


In [None]:
def prepare_training_data(tokenized_sentences):
    """Expand token sequences into (prefix, next-token) training pairs.

    For a sentence [t0, t1, ..., tn] this yields the pairs
    ([t0], t1), ([t0, t1], t2), ..., ([t0..t(n-1)], tn). Sentences with
    fewer than two tokens contribute nothing.

    Args:
        tokenized_sentences: iterable of token-id lists.

    Returns:
        (X, y): X is the list of prefixes, y the list of matching targets.
    """
    pairs = [
        (tokens[:end], tokens[end])
        for tokens in tokenized_sentences
        for end in range(1, len(tokens))
    ]
    prefixes = [prefix for prefix, _ in pairs]
    targets = [target for _, target in pairs]
    return prefixes, targets


X, y = prepare_training_data(tokenized_sentences)

# Decode the first (up to) ten input/target pairs back into words for display
examples = [
    [" ".join(reverse_vocab[token] for token in inputs), reverse_vocab[target]]
    for inputs, target in zip(X[:10], y[:10])
]

display(HTML(f"<h3>Training Examples (showing {len(examples)} of {len(X)}):</h3>"))
pd.DataFrame(examples, columns=["Input Sequence", "Target Word"])

## 7️⃣ Training Loop

Train the model on our prepared data.


In [None]:
def train_model(model, X, y, epochs=100, lr=0.01, log_every=10):
    """Train `model` one example at a time with Adam and cross-entropy loss.

    Args:
        model: module mapping a (1, sequence_length) tensor of token ids to
            (1, vocab_size) logits.
        X: list of input token-id sequences.
        y: list of target token ids, one per entry of X.
        epochs: number of full passes over the data.
        lr: learning rate passed to Adam.
        log_every: print the mean loss every this many epochs; 0 or None
            disables printing.

    Returns:
        List with the mean per-example loss of each epoch.

    Raises:
        ValueError: if X and y differ in length or are empty.
    """
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length (got {len(X)} and {len(y)})"
        )
    if len(X) == 0:
        raise ValueError("Cannot train on an empty dataset")

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # The training tensors are identical in every epoch, so build them once.
    # unsqueeze(0) adds the batch dimension the model expects.
    examples = [
        (
            torch.tensor(seq, dtype=torch.long).unsqueeze(0),
            torch.tensor([target], dtype=torch.long),
        )
        for seq, target in zip(X, y)
    ]

    # Make sure training-only layers (if the model has any) are active
    model.train()

    # Keep track of losses for plotting
    losses = []

    for epoch in range(epochs):
        total_loss = 0.0
        for seq, target in examples:
            # Forward pass
            optimizer.zero_grad()
            loss = criterion(model(seq), target)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        epoch_loss = total_loss / len(examples)
        losses.append(epoch_loss)

        if log_every and epoch % log_every == 0:
            print(f"Epoch {epoch}/{epochs}, Loss: {epoch_loss:.4f}")

    return losses


# Fit the model for 100 epochs; `losses` holds the mean loss of each epoch
losses = train_model(model=model, X=X, y=y, epochs=100)

## 8️⃣ Visualize Training Loss


In [None]:
# Line plot of the per-epoch loss values collected during training
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(losses, color="blue", linewidth=2)
ax.set_title("Training Loss Over Time", fontsize=15)
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.grid(True, linestyle="--", alpha=0.7)
plt.show()

## 9️⃣ Next Word Prediction Function

Create a function to predict the next word given a partial sentence.


In [None]:
def predict_next_word(model, sentence, vocab, reverse_vocab, top_k=3):
    """Predict the word most likely to follow `sentence`.

    Args:
        model: module mapping a (1, sequence_length) tensor of token ids to
            (1, vocab_size) logits.
        sentence: whitespace-separated input words.
        vocab: mapping word -> token id.
        reverse_vocab: mapping token id -> word.
        top_k: how many of the most probable words to return; capped at the
            number of scores the model produces.

    Returns:
        (best_word, top_words) where top_words is a list of
        (word, probability) tuples sorted from most to least probable.

    Raises:
        ValueError: if `sentence` contains no words or `top_k` is below 1.
    """
    words = sentence.split()
    if not words:
        raise ValueError("sentence must contain at least one word")
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # Words missing from the vocabulary fall back to token id 0
    tokens = [vocab.get(word, 0) for word in words]
    seq = torch.tensor(tokens, dtype=torch.long).unsqueeze(0)  # add batch dim

    # Run inference in eval mode, then put the model back in whatever mode
    # the caller had it in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            probabilities = torch.softmax(model(seq), dim=1)
    finally:
        model.train(was_training)

    predicted_idx = torch.argmax(probabilities, dim=1).item()

    # Never ask topk for more entries than the model produced
    k = min(top_k, probabilities.shape[1])
    top_probs, top_indices = torch.topk(probabilities, k, dim=1)
    top_words = [
        (reverse_vocab[idx], prob)
        for idx, prob in zip(top_indices[0].tolist(), top_probs[0].tolist())
    ]

    return reverse_vocab[predicted_idx], top_words

## 1️⃣0️⃣ Test the Model with Examples


In [None]:
test_phrases = ["cat chases", "dog runs", "cat", "dog", "people love"]

# One table row per phrase: the phrase, the best guess, and the ranked
# candidates formatted as "word (probability)"
results = []
for phrase in test_phrases:
    best_word, ranked = predict_next_word(model, phrase, vocab, reverse_vocab)
    ranked_text = ", ".join(f"{word} ({prob:.2f})" for word, prob in ranked)
    results.append([phrase, best_word, ranked_text])

# Display results in a nice table
display(HTML("<h3>Next Word Predictions:</h3>"))
result_columns = ["Input Phrase", "Top Prediction", "Top 3 Predictions (with probabilities)"]
pd.DataFrame(results, columns=result_columns)

## 1️⃣1️⃣ Visualize Prediction Probabilities for a Sample Input


In [None]:
# Choose a sample phrase
sample_phrase = "cat chases"

# Get full probability distribution
tokens = [vocab.get(word, 0) for word in sample_phrase.split()]
seq = torch.tensor(tokens, dtype=torch.long).unsqueeze(0)
with torch.no_grad():
    output = model(seq)
    probabilities = torch.softmax(output, dim=1)[0].numpy()

# Create a DataFrame for visualization
prob_data = [(reverse_vocab[i], float(probabilities[i])) for i in range(len(vocab))]
prob_df = pd.DataFrame(prob_data, columns=["Word", "Probability"])
prob_df = prob_df.sort_values("Probability", ascending=False)

# Get only top 5 results
top5_df = prob_df.head(5)

# Plot the top 5 probabilities
plt.figure(figsize=(10, 4))
sns.barplot(x="Probability", y="Word", data=top5_df, palette="viridis")
plt.title(f'Top 5 Prediction Probabilities After "{sample_phrase}"', fontsize=15)
plt.xlabel("Probability")
plt.ylabel("Word")
plt.grid(True, linestyle="--", alpha=0.7)
plt.xlim(0, max(top5_df["Probability"]) * 1.1)  # Set x-axis limit with some padding
plt.tight_layout()
plt.show()

# Print the top 5 predictions with probabilities
print("Top 5 predicted next words:")
for i, row in top5_df.iterrows():
    print(f"{row['Word']}: {row['Probability']:.4f}")

## 1️⃣2️⃣ Interactive Next Word Prediction

Try your own phrases to see what the model predicts!


In [None]:
from ipywidgets import widgets
from IPython.display import display, clear_output

text_input = widgets.Text(
    value="", placeholder="Type a phrase", description="Input:", disabled=False
)

button = widgets.Button(description="Predict Next Word")
output = widgets.Output()


def _make_click_handler(text_box, out):
    """Build the click callback bound to one text box and one output area.

    Binding the widgets here (instead of reading the module-level names
    `text_input` / `output` when the button is clicked) keeps this button
    working even after another cell reassigns those names.
    """

    def on_button_clicked(b):
        """Predict the next word for the text box content and print the result."""
        with out:
            clear_output()
            # The vocabulary is all lowercase, so normalize the input to match
            phrase = text_box.value.strip().lower()
            if not phrase:
                print("Please enter a phrase")
                return

            # Check if words are in vocabulary
            unknown_words = [word for word in phrase.split() if word not in vocab]
            if unknown_words:
                print(f"Warning: Unknown words in input: {', '.join(unknown_words)}")
                print("These will be treated as the first word in vocab for prediction.")

            next_word, top_words = predict_next_word(model, phrase, vocab, reverse_vocab)
            print(f"Input: '{phrase}'")
            print(f"Predicted next word: '{next_word}'")
            print("Top 3 predictions:")
            for word, prob in top_words:
                print(f"  - {word} (probability: {prob:.4f})")

    return on_button_clicked


on_button_clicked = _make_click_handler(text_input, output)
button.on_click(on_button_clicked)
display(text_input, button, output)

In [None]:
from ipywidgets import widgets
from IPython.display import display, clear_output
import time
import matplotlib.pyplot as plt

text_input = widgets.Text(
    value="",
    placeholder="Type a phrase and press Enter",
    description="Input:",
    disabled=False,
    # Sync the value only when the user presses Enter or leaves the field.
    # This avoids one prediction (and one plot) per keystroke without
    # blocking the kernel with a sleep.
    continuous_update=False,
)
clear_button = widgets.Button(description="Clear")
output = widgets.Output()
status = widgets.HTML(value="")


def _make_live_handlers(text_box, out, status_label):
    """Build the value-change and clear callbacks bound to the given widgets.

    The widgets are captured here rather than looked up by their module-level
    names at event time, so re-running another cell that reuses those names
    cannot redirect these callbacks.
    """

    def predict_with_debounce(change):
        """Predict the next word for the newly committed text and render it.

        Args:
            change: ipywidgets change dict; `change["new"]` is the new text.
        """
        phrase = change["new"].lower().strip()  # vocabulary is lowercase
        # Remove punctuation for better matching
        clean_phrase = re.sub(r"[^\w\s]", "", phrase).strip()

        with out:
            clear_output()

            # Nothing to predict for empty or punctuation-only input
            if not clean_phrase:
                status_label.value = ""
                return

            status_label.value = "<i>Predicting...</i>"
            try:
                # Check vocabulary
                unknown_words = [
                    word for word in clean_phrase.split() if word not in vocab
                ]

                next_word, top_words = predict_next_word(
                    model, clean_phrase, vocab, reverse_vocab
                )

                # Render the summary as HTML (print() would show the tags
                # literally); user text is escaped before being embedded.
                lines = [f"<b>Input:</b> '{html.escape(phrase)}'"]
                if unknown_words:
                    lines.append(
                        f"<b>Warning:</b> Unknown words: "
                        f"{html.escape(', '.join(unknown_words))}"
                    )
                lines.append(f"<b>Predicted next word:</b> '{html.escape(next_word)}'")
                lines.append("<b>Top predictions:</b>")
                display(HTML("<br>".join(lines)))

                # Visualize probabilities
                words, probs = zip(*top_words)
                fig, ax = plt.subplots(figsize=(8, 3))
                ax.bar(words, probs)
                ax.set_title("Prediction Probabilities")
                ax.set_ylabel("Probability")
                plt.show()
            finally:
                # Always clear the status line, even if prediction failed
                status_label.value = ""

    def on_clear_clicked(b):
        """Reset the text box, the output area and the status line."""
        text_box.value = ""
        with out:
            clear_output()
        status_label.value = ""

    return predict_with_debounce, on_clear_clicked


predict_with_debounce, on_clear_clicked = _make_live_handlers(text_input, output, status)

# React to committed value changes instead of a button click
text_input.observe(predict_with_debounce, names="value")
clear_button.on_click(on_clear_clicked)

display(text_input, clear_button, status, output)