

Name: Chandeepa Janith

Reg No: SKF2400091


## **Bigram Model**

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import re

# Hyperparameters

In [2]:
# Define hyperparameters for the smaller model version
B = 32  # batch size: how many independent sequences will we process in parallel?
T = 8   # time: what is the maximum context length for predictions?
C = 32  # Feature count: number of different features analyzed
H = 4   # Number of attention heads
L = 4   # Number of layers
learning_rate = 1e-3  # Learning rate for the optimizer


# Hyperparameters
max_iters = 5000
eval_interval = 500
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
dropout = 0.2
torch.manual_seed(1337)


<torch._C.Generator at 0x783cde794990>

#Dataset

In [3]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-09-15 07:42:09--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-09-15 07:42:09 (19.5 MB/s) - ‘input.txt’ saved [1115394/1115394]



# Word-level tokenization

In [12]:
import re
import torch

# Load the text from 'input.txt'
with open('input.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Tokenize the text by splitting on spaces, newlines, or double newlines
tokens = re.split(r'\s+|\n|\n\n', text)
unique_words = sorted(set(tokens))  # Get unique tokens and sort them
vocab_size = len(unique_words)      # Number of unique tokens
word_to_index = {word: index for index, word in enumerate(unique_words)}  # Map words to indices
index_to_word = {index: word for index, word in enumerate(unique_words)}  # Map indices to words

# Functions to encode and decode text
def encode_text(text):
    return [word_to_index.get(word, -1) for word in re.split(r'\s+|\n|\n\n', text)]

def decode_indices(indices):
    return ' '.join([index_to_word.get(index, '<UNK>') for index in indices])

# Display vocabulary details
print(f'Vocabulary Size: {vocab_size}')
print(f'Sample of Vocabulary: {unique_words[:50]}')  # Display the first 50 unique words

# Convert text to tensor and split into training and validation datasets
encoded_data = torch.tensor(encode_text(text), dtype=torch.long)
split_index = int(0.9 * len(encoded_data))  # Use 90% for training and 10% for validation
train_data = encoded_data[:split_index]
val_data = encoded_data[split_index:]

# Function to create batches of data
def create_batch(split, batch_size, seq_length, device):
    data_set = train_data if split == 'train' else val_data
    indices = torch.randint(0, len(data_set) - seq_length, (batch_size,))
    x = torch.stack([data_set[i:i + seq_length] for i in indices])
    y = torch.stack([data_set[i + 1:i + seq_length + 1] for i in indices])
    return x.to(device), y.to(device)


Vocabulary Size: 25671
Sample of Vocabulary: ['', '&C:', '&c.', "'", "'?", "'A", "'Alas,", "'Alas,'", "'Alla", "'An", "'Ay", "'Ay,", "'Ay,'", "'Ay.'", "'Be", "'Beseech", "'Bless", "'Bove", "'Brutus!'", "'By", "'Charge", "'Charge!", "'Citizens!'", "'Clarence", "'Come", "'Come,", "'Commend", "'Con", "'Content'", "'Coriolanus!'", "'Courage!'", "'Courage,", "'Cucullus", "'Dear", "'Death.'", "'Deny", "'Do", "'Fair", "'Faith,", "'Farewell:'", "'Fine;'", "'Fore", "'Forgive", "'Frets,", "'G'", "'Gainst", "'Go", "'God", "'Good", "'Have"]


# Single head of self-attention and MHSA self- attention

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AttentionHead(nn.Module):
    """ Single head of self-attention mechanism """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.key_layer = nn.Linear(input_dim, output_dim, bias=False)
        self.query_layer = nn.Linear(input_dim, output_dim, bias=False)
        self.value_layer = nn.Linear(input_dim, output_dim, bias=False)
        self.register_buffer('mask', torch.tril(torch.ones(T, T)))

    def forward(self, x):
        batch_size, seq_len, input_dim = x.shape  # Extract dimensions

        # Compute key, query, and value matrices
        keys = self.key_layer(x)  # (batch_size, seq_len, output_dim)
        queries = self.query_layer(x)  # (batch_size, seq_len, output_dim)

        # Calculate attention scores
        attention_scores = torch.bmm(queries, keys.transpose(1, 2))  # (batch_size, seq_len, output_dim) @ (batch_size, output_dim, seq_len) -> (batch_size, seq_len, seq_len)
        attention_scores /= input_dim ** 0.5  # Normalize by the square root of input_dim
        attention_scores = attention_scores.masked_fill(self.mask[:seq_len, :seq_len] == 0, float('-inf'))  # Apply mask to upper triangle
        attention_probs = F.softmax(attention_scores, dim=-1)  # Apply softmax to attention scores

        values = self.value_layer(x)  # (batch_size, seq_len, output_dim)
        output = torch.bmm(attention_probs, values)  # (batch_size, seq_len, seq_len) @ (batch_size, seq_len, output_dim) -> (batch_size, seq_len, output_dim)

        return output

class MultiHeadSelfAttention(nn.Module):
    """ Multi-head self-attention mechanism """

    def __init__(self, input_dim, num_heads, head_dim):
        super().__init__()
        self.attention_heads = nn.ModuleList([
            AttentionHead(input_dim=input_dim, output_dim=head_dim) for _ in range(num_heads)
        ])  # Create multiple attention heads

    def forward(self, x):
        # Process input through all attention heads and concatenate their outputs
        head_outputs = [head(x) for head in self.attention_heads]
        concatenated_outputs = torch.cat(head_outputs, dim=-1)  # Concatenate along the last dimension
        return concatenated_outputs


# Transformer Block with multi-head attention and feedforward layers

In [18]:
import torch
import torch.nn as nn

class TransformerBlock(nn.Module):
    """ Transformer block consisting of multi-head self-attention and feedforward layers """

    def __init__(self, input_dim, num_heads, dropout_rate):
        super().__init__()
        head_dim = input_dim // num_heads
        self.multihead_attention = MultiHeadSelfAttention(input_dim, num_heads, head_dim)
        self.feedforward = nn.Sequential(
            nn.Linear(input_dim, 4 * input_dim),
            nn.GELU(),  # Using GeLU activation function
            nn.Linear(4 * input_dim, input_dim),
            nn.Dropout(dropout_rate),
        )
        self.layer_norm1 = nn.LayerNorm(input_dim)
        self.layer_norm2 = nn.LayerNorm(input_dim)

    def forward(self, x):
        # Apply multi-head attention with residual connection and layer normalization
        attention_out = self.multihead_attention(self.layer_norm1(x))
        x = x + attention_out

        # Apply feedforward layer with residual connection and layer normalization
        feedforward_out = self.feedforward(self.layer_norm2(x))
        x = x + feedforward_out

        return x



# Bigram language model with self-attention

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    """ A Bigram Language Model with token and position embeddings, transformer blocks, and a final linear layer """

    def __init__(self, batch_size, seq_length, embed_dim, num_heads, num_layers):
        super().__init__()
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_layers = num_layers

        # Token and position embeddings
        self.token_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.position_embeddings = nn.Embedding(seq_length, embed_dim)

        # Stack of transformer blocks
        self.transformer_blocks = nn.Sequential(*[Block(embed_dim, num_heads) for _ in range(num_layers)])

        # Layer normalization and final linear layer
        self.final_layer_norm = nn.LayerNorm(embed_dim)
        self.output_layer = nn.Linear(embed_dim, vocab_size)

    def forward(self, idx, targets=None):
        # Get token and position embeddings
        token_embeds = self.token_embeddings(idx)  # (batch_size, seq_length, embed_dim)
        position_embeds = self.position_embeddings(torch.arange(self.seq_length, device=idx.device))  # (seq_length, embed_dim)

        # Combine token and position embeddings
        x = token_embeds + position_embeds  # (batch_size, seq_length, embed_dim)

        # Pass through transformer blocks
        x = self.transformer_blocks(x)  # (batch_size, seq_length, embed_dim)

        # Apply final layer normalization
        x = self.final_layer_norm(x)  # (batch_size, seq_length, embed_dim)

        # Compute logits for each token position
        logits = self.output_layer(x)  # (batch_size, seq_length, vocab_size)

        # Compute loss if targets are provided
        if targets is not None:
            logits = logits.view(-1, vocab_size)  # Flatten to (batch_size * seq_length, vocab_size)
            targets = targets.view(-1)  # Flatten to (batch_size * seq_length)
            loss = F.cross_entropy(logits, targets)  # Compute cross-entropy loss
        else:
            loss = None

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """ Generate new tokens given a starting sequence """
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -self.seq_length:]  # Use the last `seq_length` tokens
            logits, _ = self(idx_cond)  # Get logits for the current sequence
            logits = logits[:, -1, :]  # Focus on the logits for the next token
            probs = F.softmax(logits, dim=-1)  # Convert logits to probabilities
            idx_next = torch.multinomial(probs, num_samples=1)  # Sample next token
            idx = torch.cat((idx, idx_next), dim=1)  # Append new token to the sequence

        return idx

# Instantiate and move model to the specified device
model = BigramLanguageModel(batch_size=B, seq_length=T, embed_dim=C, num_heads=H, num_layers=L)
model = model.to(device)


# Training and Estimate loss function

In [21]:
import torch

def get_batch(split, batch_size, sequence_length, device):
    """Generate a batch of data for training or validation."""
    data = train_data if split == 'train' else val_data
    indices = torch.randint(0, len(data) - sequence_length, (batch_size,))
    x = torch.stack([data[i:i + sequence_length] for i in indices])
    y = torch.stack([data[i + 1:i + sequence_length + 1] for i in indices])
    return x.to(device), y.to(device)

@torch.no_grad()
def estimate_loss():
    """Estimate the average loss on training and validation datasets."""
    model.eval()  # Set model to evaluation mode
    out = {}
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split, batch_size=B, sequence_length=T, device=device)  # Adjusted to include necessary arguments
            logits, loss = model(X, Y)  # Forward pass and compute loss
            losses[k] = loss.item()  # Store loss
        out[split] = losses.mean()  # Compute average loss
    model.train()  # Set model back to training mode
    return out

# Set up the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
for iteration in range(max_iters):
    # Periodically evaluate the loss on training and validation datasets
    if iteration % eval_interval == 0:
        losses = estimate_loss()  # Compute and print the average losses
        print(f"Step {iteration}: Train Loss = {losses['train']:.4f}, Val Loss = {losses['val']:.4f}")

    # Perform a training step
    xb, yb = get_batch('train', batch_size=B, sequence_length=T, device=device)  # Adjusted to include necessary arguments
    logits, loss = model(xb, yb)  # Compute the forward pass and loss
    optimizer.zero_grad(set_to_none=True)  # Clear old gradients
    loss.backward()  # Backpropagate the loss
    optimizer.step()  # Update the model parameters


Step 0: Train Loss = 10.3110, Val Loss = 10.3111
Step 500: Train Loss = 7.6186, Val Loss = 8.0379
Step 1000: Train Loss = 7.3869, Val Loss = 7.9309
Step 1500: Train Loss = 7.1510, Val Loss = 7.9031
Step 2000: Train Loss = 6.9346, Val Loss = 7.8515
Step 2500: Train Loss = 6.7392, Val Loss = 7.8452
Step 3000: Train Loss = 6.6016, Val Loss = 7.8182
Step 3500: Train Loss = 6.4349, Val Loss = 7.8920
Step 4000: Train Loss = 6.3246, Val Loss = 7.9458
Step 4500: Train Loss = 6.1783, Val Loss = 7.9822


# Save the model

In [22]:
torch.save(model.state_dict(), "word_level_gpt.pth")

# Test text generation

In [23]:
# Ensure context length is at least T
context = torch.zeros((1, T), dtype=torch.long, device=device)  # Adjust context length to T
print(decode(m.generate(context, max_new_tokens=50)[0].tolist()))


        graced You? breath: strange. Ho! rivals wasted, cheeks. kills Touching confusion's suitors. urged! lords: cast monument! impregnable, thou! Paulina! arms? nutmegs, necessaries guess flay'd? hairless limping swear remission with! rehearse, baffled greet jointure. stirs: above home-bred decreed, grow: before? children, And, amazed. cell philosophy. bottle. Than many's rage. they're own,--be
