# Group 5 GPT Project IS 640

This is the group 5 GPT Project for IS 640 - Programming for Business Analytics

Members:
- Hans 
- Chetan  
- Danish 
- Srujana 
- Bruna 

## Milestone 1: Dataset Exploration and Preparation

### Import all the modules and packages

In [7]:
import zipfile
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np

### Description: 
This dataset contains information about TV series from IMDb, including details such as title, IMDb ID, release year, genre, cast, synopsis, rating, runtime, certificate, number of votes, and gross revenue. The data is scraped from the IMDb website using web scraping techniques and is organized into separate CSV files for each genre.

### Features:

- Title: The title of the TV series.
- IMDb ID: The unique identifier for the series on IMDb.
- Release Year: The year in which the series was released.
- Genre: The genre(s) of the series.
- Cast: The main cast members of the series.
- Synopsis: A brief summary or description of the series.
- Rating: The average rating of the series on IMDb (scaled from 1 to 10).
- Runtime: The duration of each episode or the total runtime of the series.
- Certificate: The content rating or certificate assigned to the series (e.g., PG-13, TV-MA).
- Number of Votes: The total number of votes or ratings received by the series.
- Gross Revenue: The total gross revenue generated by the series (if available).

### Objective:

We aim to generate text using the GPT transformer model, focusing exclusively on the 'Synopsis' column of the TV series dataset. Our goal is to clean and preprocess the 'Synopsis' data by converting all text to lowercase and replacing non-alphanumeric characters (except dots) with spaces, and then utilize the GPT transformer to generate coherent and relevant text based on the cleaned synopsis data.

### Extract the Zip folder

In [8]:
# Where the downloaded archive lives and which folder receives its contents
zip_file_path = 'tv_series_data.zip'
extracted_folder = 'tv_series_data'

# Unpack every archive member; the context manager closes the ZIP handle afterwards
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extracted_folder)

FileNotFoundError: [Errno 2] No such file or directory: 'tv_series_data.zip'

### Combine all CSV files into one DataFrame

In [None]:

# Load every CSV found in the extracted folder and concatenate them in ONE call.
# Calling pd.concat inside the loop re-copies all previously loaded rows on each
# iteration (quadratic work); collecting the frames first avoids that.
# The file names are sorted so the row order does not depend on the arbitrary
# order in which os.listdir() happens to return directory entries.
frames = [
    pd.read_csv(os.path.join(extracted_folder, file))
    for file in sorted(os.listdir(extracted_folder))
    if file.endswith('.csv')
]
# pd.concat raises ValueError on an empty list, so keep the original result
# (an empty DataFrame) when the folder contains no CSV files.
combined_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

### View the first 5 rows of the dataset

In [None]:
# Last expression of the cell, so the notebook renders the first rows of the frame
combined_data.head()

### Inspect the combined data and view the information of the dataset

In [None]:
# Summarise the combined dataset: size, column names, dtypes, statistics, missing values
print("Combined Data Shape:", combined_data.shape)
print("\nCombined Data Columns:", combined_data.columns)
print("\nCombined Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
combined_data.info()
print("\nCombined Data Description:")
print(combined_data.describe())
print("\nMissing Values:")
print(combined_data.isnull().sum())

Combined Data Shape: (236828, 11)

Combined Data Columns: Index(['Title', 'IMDb ID', 'Release Year', 'Genre', 'Cast', 'Synopsis',
       'Rating', 'Runtime', 'Certificate', 'Number of Votes', 'Gross Revenue'],
      dtype='object')

Combined Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236828 entries, 0 to 236827
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Title            236828 non-null  object 
 1   IMDb ID          236828 non-null  object 
 2   Release Year     236819 non-null  object 
 3   Genre            236828 non-null  object 
 4   Cast             235956 non-null  object 
 5   Synopsis         236828 non-null  object 
 6   Rating           236828 non-null  float64
 7   Runtime          216983 non-null  object 
 8   Certificate      169091 non-null  object 
 9   Number of Votes  236828 non-null  object 
 10  Gross Revenue    45611 non-null   object 
dtypes: float64(1), objec

### Clean the data: convert to lowercase and replace non-alphanumeric characters (except dots) with spaces

In [None]:
def clean_text(text):
    """
    Normalise a synopsis string character by character.

    Alphanumeric characters and dots are kept (lower-cased); every other
    character is replaced by a single space.

    Args:
        text: the string to clean

    Returns:
        The cleaned string.
    """
    pieces = []
    for char in text:
        if char == '.' or char.isalnum():
            pieces.append(char.lower())
        else:
            pieces.append(' ')
    return ''.join(pieces)

### Create a new df called cleaned_data which contains only the cleaned text

In [None]:
# Run clean_text over every entry of the 'Synopsis' column
cleaned_text = combined_data['Synopsis'].apply(clean_text)

In [None]:
# Wrap the cleaned synopses in a DataFrame with the single column 'Synopsis'
cleaned_data = pd.DataFrame(cleaned_text, columns=['Synopsis'])

In [6]:
# Display the first few rows of the cleaned data
print(cleaned_data.head())

NameError: name 'cleaned_data' is not defined

### Rename column header from Synopsis to text

In [None]:
# The training cells read the corpus from a column named 'text', so rename 'Synopsis'
cleaned_data = cleaned_data.rename(columns={'Synopsis': 'text'})

### Save it into a csv file called `tv_series_synopsis_full.csv`

In [None]:
# Save the cleaned data to a CSV file
cleaned_data.to_csv('tv_series_synopsis_full.csv', index=False)

### Define the hyperparameters for training

In [9]:
# Global hyperparameters read by the data-loading, model and training cells of this notebook
batch_size = 32 # Number of sequences processed in parallel during training
block_size = 128 # Maximum context length for predictions (sequence length)
max_iters = 5000 # Total number of training iterations
eval_interval = 100 # Estimate train/val loss every `eval_interval` iterations
learning_rate = 1e-3  # Learning rate passed to the AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu' # Use GPU if available, otherwise CPU
eval_iters = 200 # Number of batches averaged per split inside estimate_loss()
n_embd = 128 # Dimensionality of the token embeddings and model's hidden layers
n_head = 8  # Number of attention heads; head size is n_embd // n_head, so n_embd should be divisible by n_head
n_layer = 8 # Number of stacked attention layers / transformer blocks
dropout = 0.1 # Dropout probability (regularization)

torch.manual_seed(1337)  # Set random seed for reproducibility

<torch._C.Generator at 0x10f6a0cf0>

### Choosing TV show data as the dataset

In [11]:
# NOTE(review): the cleaning cell writes 'tv_series_synopsis_full.csv' to the working
# directory with pandas' default encoding; confirm that the copy under 'Dataset/' is the
# same file and that latin-1 really is its encoding.
df = pd.read_csv('Dataset/tv_series_synopsis_full.csv', encoding='latin-1')
# Drop missing rows *before* casting to str: astype(str) turns NaN into the literal
# string "nan", which dropna() can no longer remove and which would then leak into
# the training corpus.
df['combined'] = df['text'].dropna().astype(str)
# Rows that were missing are NaN again after index alignment, hence the second dropna()
text = " ".join(df['combined'].dropna().tolist())
text[:500]  # print the first 500 characters of the text

'miles morales catapults across the multiverse, where he encounters a team of spiderpeople charged with protecting its very existence. when the heroes clash on how to handle a new threat, miles must redefine what it means to be a hero. a c.i.a. operative on the edge of retirement discovers a family secret and is called back into the field for one last job. a hit man from the midwest moves to los angeles and gets caught up in the citys theatre arts scene. john wick uncovers a path to defeating the'

### Converting string to numerical format for training and testing.
1. Extract the unique characters and find the count of the vocabulary
2. Map the characters to integers and vice versa
3. Define the encode function which converts strings into numerical format
4. Define the decode function which converts numbers into strings

In [12]:
# Vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
# Lookup tables between characters and their integer ids (position in `chars`)
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Encoder: take a string, return the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer ids, return the string they spell."""
    return ''.join(itos[i] for i in l)

### Dividing the data into training and validation sets
1. Encode the text into numbers so that it can be processed as a pytorch tensor
2. Define the split ratio
3. Make the training and validation sets

In [13]:
# Encode the whole corpus once, then cut it in order: first 90% train, last 10% validation
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

### Create functions for batch loading and loss estimation
`get_batch`:
Creates small, random batches of input-output pairs for training or validation.
Ensures the model learns from diverse examples within the dataset.

`estimate_loss`:
Provides a measure of the model's performance on both training and validation datasets.
Helps monitor overfitting (training loss much lower than validation loss) and guide hyperparameter tuning.

In [14]:
# data loading
def get_batch(split):
    """
    Sample a random mini-batch of (input, target) windows from one split.

    Args:
        split: 'train' selects train_data; any other value selects val_data

    Returns:
        x: a tensor of shape (batch_size, block_size) holding the input windows
        y: a tensor of the same shape holding the same windows shifted one
           token ahead (the next-token targets)
    """
    source = val_data if split != 'train' else train_data
    # one random start offset per sequence in the batch
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start+block_size])
        targets.append(source[start+1:start+block_size+1])
    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    """
    Estimates the average loss for the training and validation datasets
    over a fixed number of evaluation iterations.

    Reads the module-level `model`, `eval_iters` and `get_batch`. The model is
    switched to eval mode while the batches are scored and is put back into
    whatever mode it was in before the call.

    Returns:
        Dict[str, float]: A dictionary containing the mean loss for both the
        training and validation datasets. Keys are:
            - 'train': Mean loss for the training dataset.
            - 'val': Mean loss for the validation dataset.
    """
    out = {}
    # remember the current mode so evaluation does not silently flip it
    was_training = model.training
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        # .item() turns the 0-dim mean tensor into the plain float the docstring promises
        out[split] = losses.mean().item()
    model.train(was_training)
    return out

## Milestone 2: Basic Model Usage (Bigram Language Model)

Description: This milestone introduces a simple bigram language model. It predicts the next token based solely on the current token, without considering any broader context.

How it works: The model uses a simple lookup table to predict the next token based on the current one.

Code changes:
- Implementation of a basic nn.Embedding layer for token prediction
- Simple forward pass that uses only the current token to predict the next

Metrics: Basic tracking of training and validation loss.

In [15]:
class BigramLanguageModel(nn.Module):
    """
    Minimal bigram language model: the logits for the next token are read
    straight from a (vocab_size x vocab_size) embedding table indexed by the
    current token, so no context beyond that token is used.

    Args:
        vocab_size (int): Number of distinct tokens in the vocabulary.

    Attributes:
        token_embedding_table (nn.Embedding): Row i holds the next-token
            logits emitted when the current token is i.

    Methods:
        forward(idx, targets=None):
            Args:
                idx (torch.Tensor): (B, T) tensor of token indices.
                targets (torch.Tensor, optional): (B, T) tensor of target
                    token indices. Default is None.

            Returns:
                Tuple[torch.Tensor, torch.Tensor or None]:
                    - logits: (B, T, vocab_size) when `targets` is None,
                      otherwise flattened to (B*T, vocab_size).
                    - loss: scalar cross-entropy loss, or None when no
                      targets were supplied.

        generate(idx, max_new_tokens):
            Args:
                idx (torch.Tensor): (B, T) tensor with the starting context.
                max_new_tokens (int): How many tokens to sample and append.

            Returns:
                torch.Tensor: (B, T + max_new_tokens) tensor, the context
                followed by the sampled tokens.

    Examples:
        >>> model = BigramLanguageModel(100)
        >>> idx = torch.tensor([[1, 2, 3]])
        >>> logits, loss = model(idx, targets=torch.tensor([[2, 3, 4]]))
        >>> generated_sequence = model.generate(idx, max_new_tokens=5)
    """

    def __init__(self, vocab_size):
        super().__init__()
        # lookup table: current token id -> logits over the next token
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

        # re-initialise every submodule with small normal weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        # Embeddings: N(0, 0.02) weights and nothing else to do
        if isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            return
        if not isinstance(module, nn.Linear):
            return
        # Linear layers: N(0, 0.02) weights, zero bias when a bias exists
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if module.bias is not None:
            torch.nn.init.zeros_(module.bias)

    def forward(self, idx, targets=None):
        # idx (and targets, when given) are (B, T) integer tensors
        logits = self.token_embedding_table(idx)  # (B, T, C)
        if targets is None:
            return logits, None

        # cross_entropy expects (N, C) logits and (N,) targets, so flatten B and T
        batch, steps, channels = logits.shape
        logits = logits.view(batch * steps, channels)
        loss = F.cross_entropy(logits, targets.view(batch * steps))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is the (B, T) running context; it grows by one column per step
        for _ in range(max_new_tokens):
            step_logits, _ = self(idx)
            # only the prediction at the last position matters -> (B, C)
            last_logits = step_logits[:, -1, :]
            next_probs = F.softmax(last_logits, dim=-1)
            # draw one token per batch row -> (B, 1)
            sampled = torch.multinomial(next_probs, num_samples=1)
            idx = torch.cat((idx, sampled), dim=1)  # (B, T+1)
        return idx

In [16]:
# Instantiate the bigram model and place it on the selected device
model = BigramLanguageModel(vocab_size)
m = model.to(device)
# Report the total parameter count in millions
param_count = sum(p.numel() for p in m.parameters())
print(param_count/1e6, 'M parameters')

0.001681 M parameters


### Create a PyTorch optimizer for updating the model's parameters during training
AdamW is a variant of the Adam optimizer that includes decoupled weight decay, making it better suited for modern deep learning models like transformers.
Key features:
Combines adaptive learning rates (like Adam) with the L2 regularization benefits of weight decay.
Helps prevent overfitting and stabilizes training by penalizing large weights.

In [17]:
# AdamW over all model parameters, using the learning rate set with the other hyperparameters
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [18]:
# Losses recorded at each evaluation checkpoint (one entry per estimate_loss() call)
train_losses = []
val_losses = []

# `step` rather than `iter`: the latter would shadow the built-in iter()
# for every later cell of the notebook.
for step in range(max_iters):

    # every eval_interval steps, and on the final step, estimate train and val loss
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()

        train_loss = losses['train']
        val_loss = losses['val']

        # Store losses
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f"step {step}: train loss {train_loss:.4f}, val loss {val_loss:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then one optimizer update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model, starting from token id 0; sampling needs no gradients
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    generated_text = decode(m.generate(context, max_new_tokens=2000)[0].tolist())
# Means over the periodic evaluation checkpoints (not over every training step).
# float() accepts both plain floats and 0-dim tensors.
avg_train_loss = np.mean([float(v) for v in train_losses])
avg_val_loss = np.mean([float(v) for v in val_losses])
print("\nTraining completed.")
print(f"Average training loss: {avg_train_loss:.4f}")
print(f"Average validation loss: {avg_val_loss:.4f}")
print(generated_text)

In [None]:
# Persist the Milestone 2 sample as UTF-8 text
with open('milestone2.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(generated_text)


## Milestone 3: Self-attention & SoftMax Iteration

### Adding Self-Attention Mechanism

#### Self-attention allows the model to weigh the importance of different tokens in the input sequence. You can implement this using PyTorch's nn.Linear layers for query, key, and value projections, followed by a scaled dot-product attention mechanism.

1. Query, Key, and Value Projections: The input tensor x (of shape (B, T, D), where B is batch size, T is sequence length, and D is feature dimension) is transformed into queries, keys, and values using three separate linear layers. These projections prepare the data for the attention mechanism.

2. Scaled Dot-Product Attention: The attention scores are computed by taking the dot product of queries and transposed keys ($QK^T$), scaled by $1/\sqrt{D}$ for numerical stability. These scores are passed through a softmax to produce attention weights, which are then used to compute a weighted sum of the values: $\text{Attention}(Q, K, V) = \text{softmax}\left(QK^T / \sqrt{D}\right)V$. This results in the output tensor of shape (B, T, D). In the code below, a lower-triangular (causal) mask is applied before the softmax so that each position attends only to itself and earlier positions.


In [114]:
class Head(nn.Module):
    """
    A single causal self-attention head.

    Projects the input to keys, queries and values of width `head_size`,
    masks out future positions with a lower-triangular buffer, and applies
    dropout to the attention weights. Reads the module-level `n_embd`,
    `block_size` and `dropout`.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular ones: position t may look at positions <= t
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x: (batch, time-step, channels) -> result: (batch, time-step, head size)
        _, seq_len, _ = x.shape
        keys = self.key(x)        # (B,T,hs)
        queries = self.query(x)   # (B,T,hs)
        values = self.value(x)    # (B,T,hs)
        # affinities between positions, scaled by 1/sqrt(head size)
        scale = keys.shape[-1]**-0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale  # (B, T, T)
        # hide future positions before normalising
        blocked = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(blocked, float('-inf'))
        weights = self.dropout(F.softmax(scores, dim=-1))  # (B, T, T)
        # attention-weighted sum of the values
        return weights @ values  # (B, T, hs)

### Updating the Model

#### Integrate the `Head` self-attention module into a new `TransformerLanguageModel`: the token embeddings are passed through a single self-attention head before being projected to vocabulary logits.

In [115]:
class TransformerLanguageModel(nn.Module):
    """
    Character-level language model: token embedding -> one causal
    self-attention head -> linear projection to vocabulary logits.

    Args:
        vocab_size (int): Number of distinct tokens.
        n_embd (int): Embedding width; also passed to `Head` as its head size.
            NOTE: `Head` builds its input projections from the module-level
            `n_embd`, so this argument has to match that global value.

    Methods:
        forward(idx, targets=None): returns (logits, loss). Logits are
            (B, T, vocab_size) when `targets` is None; otherwise they are
            flattened to (B*T, vocab_size) and `loss` is the cross-entropy.
        generate(idx, max_new_tokens): appends `max_new_tokens` sampled
            tokens to the (B, T) context and returns the extended tensor.
    """

    def __init__(self, vocab_size, n_embd=128):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.self_attention = Head(n_embd)
        self.fc_layer = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        # Embed tokens
        x = self.token_embedding_table(idx)  # (B, T, D)

        # Apply self-attention
        x = self.self_attention(x)  # (B, T, D)

        # Final linear layer to project back to vocabulary size
        logits = self.fc_layer(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # the causal mask only covers block_size positions, so crop the context
            idx_cond = idx[:, -block_size:]
            # one forward pass per new token (the model used to be called twice
            # here, doubling the work and discarding the first result)
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]  # Focus on the last time step
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

### Training the Model

#### Use the same training loop as in Milestone 2 but with the updated model. Ensure that you track both training and validation loss during training.

In [116]:
# Fresh model and optimizer for the single-head self-attention milestone
model = TransformerLanguageModel(vocab_size=vocab_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Losses recorded at each evaluation checkpoint
train_losses = []
val_losses = []

# `step` rather than `iter`, so the built-in iter() is not shadowed
for step in range(max_iters):
    # every eval_interval steps, and on the final step, estimate train and val loss
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        train_losses.append(losses['train'])
        val_losses.append(losses['val'])
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

context = torch.zeros((1, 1), dtype=torch.long).to(device)  # Start with a blank token
# Sample in eval mode and without gradients: the attention head contains dropout,
# which would otherwise stay active while generating.
model.eval()
with torch.no_grad():
    generated_sequence = decode(model.generate(context, max_new_tokens=500)[0].tolist())
model.train()
# Compute the averages from THIS run's checkpoints; without these two lines the
# prints below would report values left over from an earlier cell.
# float() accepts both plain floats and 0-dim tensors.
avg_train_loss = np.mean([float(v) for v in train_losses])
avg_val_loss = np.mean([float(v) for v in val_losses])
print("\nTraining completed.")
print(f"Average training loss: {avg_train_loss:.4f}")
print(f"Average validation loss: {avg_val_loss:.4f}")
print(generated_sequence)

step 0: train loss 3.7259, val loss 3.7278
step 100: train loss 2.5748, val loss 2.5710
step 200: train loss 2.5181, val loss 2.5172
step 300: train loss 2.5020, val loss 2.5033
step 400: train loss 2.4908, val loss 2.4963
step 500: train loss 2.4812, val loss 2.4796
step 600: train loss 2.4778, val loss 2.4749
step 700: train loss 2.4722, val loss 2.4677
step 800: train loss 2.4700, val loss 2.4632
step 900: train loss 2.4686, val loss 2.4624
step 1000: train loss 2.4677, val loss 2.4619
step 1100: train loss 2.4649, val loss 2.4587
step 1200: train loss 2.4643, val loss 2.4591
step 1300: train loss 2.4676, val loss 2.4600
step 1400: train loss 2.4647, val loss 2.4590
step 1500: train loss 2.4636, val loss 2.4582
step 1600: train loss 2.4653, val loss 2.4587
step 1700: train loss 2.4639, val loss 2.4564
step 1800: train loss 2.4561, val loss 2.4559
step 1900: train loss 2.4573, val loss 2.4553
step 2000: train loss 2.4572, val loss 2.4551
step 2100: train loss 2.4591, val loss 2.4543


### Generate Text

#### Generate text using the trained model and save it to a file named `milestone3.txt`.

In [117]:
# Persist the Milestone 3 sample as UTF-8 text, then confirm on stdout
with open('milestone3.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(generated_sequence)

print("Generated text saved to milestone3.txt")

Generated text saved to milestone3.txt


## Milestone 4: Multi-head Attention
Description: This milestone extends self-attention to multi-head attention, allowing the model to capture different types of relationships between tokens.

How it works: The model computes multiple sets of attention (heads) in parallel, then combines their outputs.

Code changes:

- Implementation of multiple attention heads
- Concatenation and projection of multiple head outputs

Metrics: Possible further reduction in loss; may see improved performance on tasks requiring different types of attention.

In [None]:
# Define the Self-Attention Head
class Head(nn.Module):
    """
    One causal self-attention head (no dropout).

    Args:
        head_size (int): Width of the key/query/value projections.

    forward(x) takes a (B, T, n_embd) tensor and returns (B, T, head_size).
    Reads the module-level `n_embd` and `block_size`.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular mask: position t may attend to positions <= t only
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        _, T, _ = x.shape
        k = self.key(x)    # (B, T, hs)
        q = self.query(x)  # (B, T, hs)
        # Compute attention scores ("affinities"). Scale by the key dimension
        # (head_size), not by the embedding width of x: with several heads the
        # two differ, and dividing by sqrt(n_embd) over-shrinks the scores.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        # Perform the weighted aggregation of the values
        v = self.value(x)  # (B, T, hs)
        out = wei @ v  # (B, T, hs)
        return out

# Define the Multi-head Attention
class MultiHeadAttention(nn.Module):
    """
    Several self-attention heads run in parallel on the same input.

    Args:
        num_heads (int): Number of `Head` modules.
        head_size (int): Output width of each head.

    forward(x) concatenates the head outputs along the last dimension
    (width head_size * num_heads) and projects them to the module-level
    `n_embd`. No dropout is applied.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # Mix the concatenated heads. The projection layer used to be created
        # but never applied, leaving its parameters unused and untrained.
        return self.proj(out)

In [None]:
# Update the Bigram Language Model to include Multi-head attention
class BigramLanguageModelWithMultiHeadAttention(nn.Module):
    """
    Language model built from token + position embeddings, a stack of
    multi-head attention layers, a final LayerNorm and a linear head.

    Args:
        vocab_size (int): Number of distinct tokens.
        n_embd (int): Embedding width.
        block_size (int): Maximum context length (size of the position table).
        n_head (int): Heads per attention layer; head size is n_embd // n_head.
        n_layer (int): Number of stacked attention layers.

    Methods:
        forward(idx, targets=None): returns (logits, loss). Logits are
            (B, T, vocab_size) when `targets` is None; otherwise they are
            flattened to (B*T, vocab_size) and `loss` is the cross-entropy.
        generate(idx, max_new_tokens): appends `max_new_tokens` sampled
            tokens to the (B, T) context and returns the extended tensor.
    """

    def __init__(self, vocab_size, n_embd, block_size, n_head, n_layer):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[MultiHeadAttention(n_head, n_embd // n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.block_size = block_size

        # re-initialise Linear and Embedding weights with a small normal distribution
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Build the position indices on the input's own device instead of the
        # module-level `device`, so the model does not depend on that global.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C)
        x = self.blocks(x)  # apply the n_layer stacked multi-head attention layers
        x = self.ln_f(x)  # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens (the position table has no more rows)
            idx_cond = idx[:, -self.block_size:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

In [None]:
# Build the multi-head attention model from the global hyperparameters and move it to the device.
# A variant of the call that also passes `dropout` is kept commented out below.
# model = BigramLanguageModelWithMultiHeadAttention(vocab_size, n_embd, block_size, n_head, n_layer, dropout)
model = BigramLanguageModelWithMultiHeadAttention(vocab_size, n_embd, block_size, n_head, n_layer)
m = model.to(device)

# Fresh AdamW optimizer bound to this model's parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
# Losses recorded at each evaluation checkpoint (one entry per estimate_loss() call)
train_losses = []
val_losses = []

# `step` rather than `iter`: the latter would shadow the built-in iter()
# for every later cell of the notebook.
for step in range(max_iters):

    # every eval_interval steps, and on the final step, estimate train and val loss
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()

        train_loss = losses['train']
        val_loss = losses['val']

        # Store losses
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f"step {step}: train loss {train_loss:.4f}, val loss {val_loss:.4f}")

    # Sample a batch of data
    xb, yb = get_batch('train')

    # Forward pass, then one optimizer update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Generate from the model, starting from token id 0; sampling needs no gradients
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    generated_text = decode(m.generate(context, max_new_tokens=2000)[0].tolist())
# Means over the periodic evaluation checkpoints (not over every training step).
# float() accepts both plain floats and 0-dim tensors.
avg_train_loss = np.mean([float(v) for v in train_losses])
avg_val_loss = np.mean([float(v) for v in val_losses])
print("\nTraining completed.")
print(f"Average training loss: {avg_train_loss:.4f}")
print(f"Average validation loss: {avg_val_loss:.4f}")
print(generated_text)

# Save the generated text to a file
with open('milestone4.txt', 'w', encoding='utf-8') as f:
    f.write(generated_text)

## MILESTONE 5 FEED FORWARD LAYERS
Updating the model to include Feed Forward Layers enhances its capability by processing each token independently after interactions with other tokens in the sequence. While attention mechanisms enable tokens to 'communicate' and exchange information, Feed Forward Layers allow each token to refine and deepen its understanding of the aggregated context. This additional step adds depth and complexity to the model, ensuring that tokens not only gather information from others but also process and transform it effectively for downstream tasks.

In [None]:
# Define the Feed Forward Layer
class FeedForward(nn.Module):
    """
    Two-layer MLP applied along the last dimension of its input:
    Linear(n_embd -> 4 * n_embd), ReLU, Linear(4 * n_embd -> n_embd).

    Args:
        n_embd (int): Width of the input and of the output.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        out = self.net(x)
        return out

In [None]:
# Define the Transformer Block
class Block(nn.Module):
    """
    One layer of the model: multi-head self-attention followed by a
    feed-forward network, applied one after the other.

    Args:
        n_embd (int): Embedding width.
        n_head (int): Number of attention heads; each head gets n_embd // n_head channels.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)

    def forward(self, x):
        # attention first, then the feed-forward network on its output
        attended = self.sa(x)
        return self.ffwd(attended)

In [None]:
# Update the Bigram Language Model to include Feed Forward Layers
class BigramLanguageModelWithFeedForward(nn.Module):
    """
    Language model built from token + position embeddings, a stack of
    `Block`s (multi-head attention followed by a feed-forward network) and a
    linear head. No final layer norm is applied.

    Args:
        vocab_size (int): Number of distinct tokens.
        n_embd (int): Embedding width.
        block_size (int): Maximum context length (size of the position table).
        n_head (int): Attention heads per block.
        n_layer (int): Number of stacked blocks.

    Methods:
        forward(idx, targets=None): returns (logits, loss). Logits are
            (B, T, vocab_size) when `targets` is None; otherwise they are
            flattened to (B*T, vocab_size) and `loss` is the cross-entropy.
        generate(idx, max_new_tokens): appends `max_new_tokens` sampled
            tokens to the (B, T) context and returns the extended tensor.
    """

    def __init__(self, vocab_size, n_embd, block_size, n_head, n_layer):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.block_size = block_size

        # re-initialise Linear and Embedding weights with a small normal distribution
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Build the position indices on the input's own device instead of the
        # module-level `device`, so the model does not depend on that global.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C)
        x = self.blocks(x)  # apply transformer blocks
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens (the position table has no more rows)
            idx_cond = idx[:, -self.block_size:]
            # get the predictions
            logits, _ = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

In [None]:
# Build the feed-forward model from the global hyperparameters and move it to the device.
# A variant of the call that also passes `dropout` is kept commented out below.
# model = BigramLanguageModelWithFeedForward(vocab_size, n_embd, block_size, n_head, n_layer, dropout)
model = BigramLanguageModelWithFeedForward(vocab_size, n_embd, block_size, n_head, n_layer)
m = model.to(device)

# Fresh AdamW optimizer bound to this model's parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
# Losses recorded at each evaluation checkpoint (one entry per estimate_loss() call)
train_losses = []
val_losses = []

# `step` rather than `iter`: the latter would shadow the built-in iter()
# for every later cell of the notebook.
for step in range(max_iters):

    # every eval_interval steps, and on the final step, estimate train and val loss
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()

        train_loss = losses['train']
        val_loss = losses['val']

        # Store losses
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f"step {step}: train loss {train_loss:.4f}, val loss {val_loss:.4f}")

    # Sample a batch of data
    xb, yb = get_batch('train')

    # Forward pass, then one optimizer update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Generate from the model, starting from token id 0; sampling needs no gradients
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    generated_text = decode(m.generate(context, max_new_tokens=2000)[0].tolist())
# Means over the periodic evaluation checkpoints (not over every training step).
# float() accepts both plain floats and 0-dim tensors.
avg_train_loss = np.mean([float(v) for v in train_losses])
avg_val_loss = np.mean([float(v) for v in val_losses])
print("\nTraining completed.")
print(f"Average training loss: {avg_train_loss:.4f}")
print(f"Average validation loss: {avg_val_loss:.4f}")
print(generated_text)


# Save the generated text to a file
with open('milestone5.txt', 'w', encoding='utf-8') as f:
    f.write(generated_text)