# Group 5 GPT Project IS 640

This is the group 5 GPT Project for IS 640 - Programming for Business Analytics

Members:
- Hans 
- Chetan  
- Danish 
- Srujana 
- Bruna 

## Milestone 1: Dataset Exploration and Preparation

### Import all the modules and packages

In [98]:
import zipfile
import os
import pandas as pd

### Extract the Zip folder

In [99]:
# Path to the local ZIP file
zip_file_path = 'tv_series_data.zip'

# Extract the ZIP file to a folder
extracted_folder = 'tv_series_data'
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder)

### Combine all CSV files into one DataFrame

In [100]:
combined_data = pd.DataFrame()
for file in os.listdir(extracted_folder):
    if file.endswith('.csv'):
        file_path = os.path.join(extracted_folder, file)
        df = pd.read_csv(file_path)
        combined_data = pd.concat([combined_data, df], ignore_index=True)

  combined_data = pd.concat([combined_data, df], ignore_index=True)


### View the first 5 rows of the dataset

In [101]:
combined_data.head()

Unnamed: 0,Title,IMDb ID,Release Year,Genre,Cast,Synopsis,Rating,Runtime,Certificate,Number of Votes,Gross Revenue
0,The Little Mermaid,tt5971474,I) (2023,"Adventure, Family, Fantasy","Director:, Rob Marshall, | , Stars:, Halle...",A young mermaid makes a deal with a sea witch ...,7.2,135 min,PG,69638,
1,Spider-Man: Across the Spider-Verse,tt9362722,2023,"Animation, Action, Adventure","Directors:, Joaquim Dos Santos, , Kemp Powers,...","Miles Morales catapults across the Multiverse,...",9.1,140 min,PG,71960,
2,Dungeons & Dragons: Honor Among Thieves,tt2906216,2023,"Action, Adventure, Comedy","Directors:, John Francis Daley, , Jonathan Gol...",A charming thief and a band of unlikely advent...,7.3,134 min,PG-13,123247,
3,The Super Mario Bros. Movie,tt6718170,2023,"Animation, Adventure, Comedy","Directors:, Aaron Horvath, , Michael Jelenic, ...",A plumber named Mario travels through an under...,7.2,92 min,PG,134835,
4,Spider-Man: Into the Spider-Verse,tt4633694,2018,"Animation, Action, Adventure","Directors:, Bob Persichetti, , Peter Ramsey, ,...",Teen Miles Morales becomes the Spider-Man of h...,8.4,117 min,PG,575321,190241310.0


### Inspect the combined data and view the information of the dataset

In [102]:
print("Combined Data Shape:", combined_data.shape)
print("\nCombined Data Columns:", combined_data.columns)
print("\nCombined Data Info:")
print(combined_data.info())
print("\nCombined Data Description:")
print(combined_data.describe())
print("\nMissing Values:")
print(combined_data.isnull().sum())

Combined Data Shape: (236828, 11)

Combined Data Columns: Index(['Title', 'IMDb ID', 'Release Year', 'Genre', 'Cast', 'Synopsis',
       'Rating', 'Runtime', 'Certificate', 'Number of Votes', 'Gross Revenue'],
      dtype='object')

Combined Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236828 entries, 0 to 236827
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Title            236828 non-null  object 
 1   IMDb ID          236828 non-null  object 
 2   Release Year     236819 non-null  object 
 3   Genre            236828 non-null  object 
 4   Cast             235956 non-null  object 
 5   Synopsis         236828 non-null  object 
 6   Rating           236828 non-null  float64
 7   Runtime          216983 non-null  object 
 8   Certificate      169091 non-null  object 
 9   Number of Votes  236828 non-null  object 
 10  Gross Revenue    45611 non-null   object 
dtypes: float64(1), objec

### Clean the data: convert to lowercase and replace non-alphanumeric characters (except dots) with spaces

In [103]:
def clean_text(text):
    return ''.join(char.lower() if char.isalnum() or char == '.' else ' ' for char in text)

### Create a new df called cleaned_data which contains only the cleaned text

In [104]:
cleaned_text = combined_data['Synopsis'].apply(clean_text)

In [105]:
cleaned_data = pd.DataFrame(cleaned_text, columns=['Synopsis'])

In [106]:
# Display the first few rows of the cleaned data
print(cleaned_data.head())

                                            Synopsis
0  a young mermaid makes a deal with a sea witch ...
1  miles morales catapults across the multiverse ...
2  a charming thief and a band of unlikely advent...
3  a plumber named mario travels through an under...
4  teen miles morales becomes the spider man of h...


### Rename column header from Synopsis to text

In [107]:
cleaned_data = cleaned_data.rename(columns={'Synopsis': 'text'})

### Save it into a csv file called `tv_series_synopsis_full.csv`

In [108]:
# Save the cleaned data to a CSV file
cleaned_data.to_csv('tv_series_synopsis_full.csv', index=False)

## Milestone 2: Basic Model Usage (Bigram Language Model)

Description: This milestone introduces a simple bigram language model. It predicts the next token based solely on the current token, without considering any broader context.

How it works: The model uses a simple lookup table to predict the next token based on the current one.

Code changes:
- Implementation of a basic nn.Embedding layer for token prediction
- Simple forward pass that uses only the current token to predict the next

Metrics: Basic tracking of training and validation loss.

In [109]:
class BigramLanguageModel(nn.Module):
    """
    A simple bigram-based language model that predicts the next token 
    based on the current token using an embedding layer. This model is 
    primarily used as a basic demonstration of language modeling concepts.

    Args:
        vocab_size (int): The size of the vocabulary, defining the number of unique tokens.

    Attributes:
        token_embedding_table (nn.Embedding): Embedding layer that maps tokens to logits 
            for all tokens in the vocabulary.

    Methods:
        forward(idx, targets=None):
            Performs the forward pass of the model, computing logits for the next token 
            and optionally calculating the cross-entropy loss.

            Args:
                idx (torch.Tensor): Tensor of shape (B, T) containing input token indices, 
                    where B is the batch size and T is the sequence length.
                targets (torch.Tensor, optional): Tensor of shape (B, T) containing target 
                    token indices for loss computation. Default is None.

            Returns:
                Tuple[torch.Tensor, torch.Tensor or None]:
                    - logits (torch.Tensor): Tensor of shape (B, T, vocab_size) containing 
                      predicted logits for the next token.
                    - loss (torch.Tensor or None): Scalar tensor representing the cross-entropy 
                      loss if `targets` is provided, otherwise None.

        generate(idx, max_new_tokens):
            Generates a sequence of tokens by sampling from the model's predictions.

            Args:
                idx (torch.Tensor): Tensor of shape (B, T) containing the initial context 
                    (sequence of token indices).
                max_new_tokens (int): Number of new tokens to generate.

            Returns:
                torch.Tensor: Tensor of shape (B, T + max_new_tokens) containing the initial 
                context concatenated with the generated tokens.

    Examples:
        >>> vocab_size = 100
        >>> model = BigramLanguageModel(vocab_size)
        >>> idx = torch.tensor([[1, 2, 3]])
        >>> logits, loss = model(idx, targets=torch.tensor([[2, 3, 4]]))
        >>> generated_sequence = model.generate(idx, max_new_tokens=5)
    """
    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [110]:
model = BigramLanguageModel(vocab_size)
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

0.001681 M parameters


### Create a PyTorch optimizer for updating the model's parameter's during training
AdamW is a variant of the Adam optimizer that includes decoupled weight decay, making it better suited for modern deep learning models like transformers.
Key features:
Combines adaptive learning rates (like Adam) with the L2 regularization benefits of weight decay.
Helps prevent overfitting and stabilizes training by penalizing large weights.

In [111]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [112]:
# Initialize lists to store losses
train_losses = []
val_losses = []


for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()

        train_loss = losses['train']
        val_loss = losses['val']
        
        # Store losses
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_text = decode(m.generate(context, max_new_tokens=2000)[0].tolist())
avg_train_loss = np.mean(train_losses)
avg_val_loss = np.mean(val_losses)
print(f"\nTraining completed.")
print(f"Average training loss: {avg_train_loss:.4f}")
print(f"Average validation loss: {avg_val_loss:.4f}")
print(generated_text)

step 0: train loss 3.7127, val loss 3.7128
step 100: train loss 3.5950, val loss 3.5955
step 200: train loss 3.4866, val loss 3.4873
step 300: train loss 3.3866, val loss 3.3873
step 400: train loss 3.2957, val loss 3.2957
step 500: train loss 3.2125, val loss 3.2124
step 600: train loss 3.1356, val loss 3.1359
step 700: train loss 3.0661, val loss 3.0661
step 800: train loss 3.0028, val loss 3.0032
step 900: train loss 2.9461, val loss 2.9470
step 1000: train loss 2.8941, val loss 2.8944
step 1100: train loss 2.8486, val loss 2.8480
step 1200: train loss 2.8076, val loss 2.8070
step 1300: train loss 2.7689, val loss 2.7683
step 1400: train loss 2.7361, val loss 2.7338
step 1500: train loss 2.7046, val loss 2.7030
step 1600: train loss 2.6782, val loss 2.6782
step 1700: train loss 2.6541, val loss 2.6511
step 1800: train loss 2.6315, val loss 2.6287
step 1900: train loss 2.6118, val loss 2.6085
step 2000: train loss 2.5956, val loss 2.5905
step 2100: train loss 2.5790, val loss 2.5760


In [113]:
# Save the text to a file
with open('milestone2.txt', 'w', encoding='utf-8') as f:
    f.write(generated_text)


## Milestone 3: Self-attention & SoftMax Iteration

### Adding Self-Attention Mechanism

#### Self-attention allows the model to weigh the importance of different tokens in the input sequence. You can implement this using PyTorch's nn.Linear layers for query, key, and value projections, followed by a scaled dot-product attention mechanism.

1. Query, Key, and Value Projections: The input tensor x (of shape (B, T, D), where B is batch size, T is sequence length, and D is feature dimension) is transformed into queries, keys, and values using three separate linear layers. These projections prepare the data for the attention mechanism.

2. Scaled Dot-Product Attention: The attention scores are computed by taking the dot product of queries and transposed keys (QK^T), scaled by \sqrt{D} for numerical stability. These scores are passed through a softmax to produce attention weights, which are then used to compute a weighted sum of the values (Attention(Q, K, V) = \text{Softmax}(QK^T / \sqrt{D})V). This results in the output tensor of shape (B, T, D).


In [114]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

### Updating the Model

#### Integrate the `SelfAttention` module into your `BigramLanguageModel`. Replace or augment the token embedding lookup with a multi-head self-attention layer.

In [115]:
class TransformerLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_embd=128):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.self_attention = Head(n_embd)
        self.fc_layer = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        # Embed tokens
        x = self.token_embedding_table(idx)  # (B, T, D)

        # Apply self-attention
        x = self.self_attention(x)  # (B, T, D)

        # Final linear layer to project back to vocabulary size
        logits = self.fc_layer(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]  # Focus on the last time step
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

### Training the Model

#### Use the same training loop as in Milestone 2 but with the updated model. Ensure that you track both training and validation loss during training.

In [116]:
model = TransformerLanguageModel(vocab_size=vocab_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

train_losses = []
val_losses = []

for iter in range(max_iters):
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        train_losses.append(losses['train'])
        val_losses.append(losses['val'])
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
context = torch.zeros((1, 1), dtype=torch.long).to(device)  # Start with a blank token
generated_sequence = decode(model.generate(context, max_new_tokens=500)[0].tolist())
print(f"\nTraining completed.")
print(f"Average training loss: {avg_train_loss:.4f}")
print(f"Average validation loss: {avg_val_loss:.4f}")
print(generated_sequence)

step 0: train loss 3.7259, val loss 3.7278
step 100: train loss 2.5748, val loss 2.5710
step 200: train loss 2.5181, val loss 2.5172
step 300: train loss 2.5020, val loss 2.5033
step 400: train loss 2.4908, val loss 2.4963
step 500: train loss 2.4812, val loss 2.4796
step 600: train loss 2.4778, val loss 2.4749
step 700: train loss 2.4722, val loss 2.4677
step 800: train loss 2.4700, val loss 2.4632
step 900: train loss 2.4686, val loss 2.4624
step 1000: train loss 2.4677, val loss 2.4619
step 1100: train loss 2.4649, val loss 2.4587
step 1200: train loss 2.4643, val loss 2.4591
step 1300: train loss 2.4676, val loss 2.4600
step 1400: train loss 2.4647, val loss 2.4590
step 1500: train loss 2.4636, val loss 2.4582
step 1600: train loss 2.4653, val loss 2.4587
step 1700: train loss 2.4639, val loss 2.4564
step 1800: train loss 2.4561, val loss 2.4559
step 1900: train loss 2.4573, val loss 2.4553
step 2000: train loss 2.4572, val loss 2.4551
step 2100: train loss 2.4591, val loss 2.4543


### Generate Text

#### Generate text using the trained model and save it to a file named `milestone3.txt`.

In [117]:
with open('milestone3.txt', 'w', encoding='utf-8') as f:
    f.write(generated_sequence)
    
print("Generated text saved to milestone3.txt")

Generated text saved to milestone3.txt


## Milestone 4: Multi-head Attention

Description: This milestone extends self-attention to multi-head attention, allowing the model to capture different types of relationships between tokens.

How it works: The model computes multiple sets of attention (heads) in parallel, then combines their outputs.

Code changes:
- Implementation of multiple attention heads
- Concatenation and projection of multiple head outputs

Metrics: Possible further reduction in loss; may see improved performance on tasks requiring different types of attention.

In [None]:
# Define the Self-Attention Head
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)  # (B, T, C)
        q = self.query(x)  # (B, T, C)
        # Compute attention scores ("affinities")
        wei = q @ k.transpose(-2, -1) * C ** -0.5  # (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        # Perform the weighted aggregation of the values
        v = self.value(x)  # (B, T, C)
        out = wei @ v  # (B, T, C)
        return out

# Define the Multi-head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        # self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # out = self.dropout(self.proj(out))
        return out

In [None]:
# Update the Bigram Language Model to include Multi-head attention
class BigramLanguageModelWithMultiHeadAttention(nn.Module):
    # def __init__(self, vocab_size, n_embd, block_size, n_head, n_layer, dropout):
    def __init__(self, vocab_size, n_embd, block_size, n_head, n_layer):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[MultiHeadAttention(n_head, n_embd // n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.block_size = block_size

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)


    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C)
        x = self.blocks(x)  # apply one multi-head attention block
        x = self.ln_f(x)  # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -self.block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
    



In [None]:
# Create the model and move it to the device
# model = BigramLanguageModelWithMultiHeadAttention(vocab_size, n_embd, block_size, n_head, n_layer, dropout)
model = BigramLanguageModelWithMultiHeadAttention(vocab_size, n_embd, block_size, n_head, n_layer)
m = model.to(device)

# Create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
# Initialize lists to store losses
train_losses = []
val_losses = []


for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()

        train_loss = losses['train']
        val_loss = losses['val']
        
        # Store losses
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Sample a batch of data
    xb, yb = get_batch('train')

    # Evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

#Generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_text = decode(m.generate(context, max_new_tokens=2000)[0].tolist())
avg_train_loss = np.mean(train_losses)
avg_val_loss = np.mean(val_losses)
print(f"\nTraining completed.")
print(f"Average training loss: {avg_train_loss:.4f}")
print(f"Average validation loss: {avg_val_loss:.4f}")
print(generated_text)

# Save the generated text to a file
with open('milestone4.txt', 'w', encoding='utf-8') as f:
    f.write(generated_text)