# SimpleGPT

The objective of this notebook is to create and train a decoder-only model, which is a custom and scaled-down version of GPT, using the specified dataset.



### import libraries

In [None]:
# Import necessary libraries for data manipulation
import pandas as pd
import numpy as np

# Import PyTorch and submodules for neural network construction and operations
import torch
import torch.nn as nn
from torch.nn import functional as F

### Download dataset

In [None]:
!wget https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-08/friends.csv

--2024-04-21 15:45:54--  https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-08/friends.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5383844 (5.1M) [text/plain]
Saving to: ‘friends.csv’


2024-04-21 15:45:55 (93.1 MB/s) - ‘friends.csv’ saved [5383844/5383844]



## Hyperparameters

In [None]:
# --- Hardware -----------------------------------------------------------
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Batching -----------------------------------------------------------
# batch_size: sequences per batch; block_size: length of each sequence fed to the model.
batch_size, block_size = 16, 32

# --- Model size ---------------------------------------------------------
# n_embd: embedding width; n_head: attention heads per block; n_layer: transformer blocks.
n_embd, n_head, n_layer = 64, 4, 4

# --- Optimisation schedule ----------------------------------------------
learning_rate = 1e-3
max_iters = 5000      # total number of training iterations
eval_interval = 100   # evaluate every `eval_interval` iterations
eval_iters = 200      # batches averaged per split at each evaluation

# Seed PyTorch's default generator (kept last so the cell output is unchanged).
torch.manual_seed(1337)


<torch._C.Generator at 0x7df800776e90>

## Preparing the dataset

In [None]:
# Load the transcript CSV saved as 'friends.csv' by the download cell,
# then show the first rows to check which columns are available.
friends_df = pd.read_csv('friends.csv')
friends_df.head()

Unnamed: 0,text,speaker,season,episode,scene,utterance
0,There's nothing to tell! He's just some guy I ...,Monica Geller,1,1,1,1
1,"C'mon, you're going out with the guy! There's ...",Joey Tribbiani,1,1,1,2
2,"All right Joey, be nice. So does he have a hum...",Chandler Bing,1,1,1,3
3,"Wait, does he eat chalk?",Phoebe Buffay,1,1,1,4
4,"(They all stare, bemused.)",Scene Directions,1,1,1,5


In [None]:
# Drop the bookkeeping columns so only `text` and `speaker` remain.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
friends_df = friends_df.drop(columns=['episode', 'season', 'scene', 'utterance'], errors='ignore')

# Remove scene directions (speaker contains 'Scene'). na=True counts a missing
# speaker as a match, so such rows are removed too -- the same rows the
# previous `== False` comparison discarded.
friends_df = friends_df[~friends_df['speaker'].str.contains('Scene', na=True)].copy()

# Normalise the speaker to a capitalised first word, e.g. 'Monica Geller' -> 'Monica'.
friends_df['speaker'] = friends_df['speaker'].str.lower().str.capitalize().str.split(' ').str[0]

friends_df.head()

Unnamed: 0,text,speaker
0,There's nothing to tell! He's just some guy I ...,Monica
1,"C'mon, you're going out with the guy! There's ...",Joey
2,"All right Joey, be nice. So does he have a hum...",Chandler
3,"Wait, does he eat chalk?",Phoebe
5,"Just, 'cause, I don't want her to go through w...",Phoebe


In [None]:
# Build the training corpus: one "Speaker:\nline" entry per row, entries
# separated by a blank line. Zipping the two columns yields the same values as
# iterrows() without building a Series object for every row.
text = '\n\n'.join(
    f"{speaker}:\n{line}"
    for speaker, line in zip(friends_df['speaker'], friends_df['text'])
)
print("Length of dataset in characters:", len(text))

Length of dataset in characters: 3774765


In [None]:
# Print the first 1000 characters of the corpus to check the
# "Speaker:\nline" layout produced by the join above.
print(text[:1000])

Monica:
There's nothing to tell! He's just some guy I work with!

Joey:
C'mon, you're going out with the guy! There's gotta be something wrong with him!

Chandler:
All right Joey, be nice. So does he have a hump? A hump and a hairpiece?

Phoebe:
Wait, does he eat chalk?

Phoebe:
Just, 'cause, I don't want her to go through what I went through with Carl- oh!

Monica:
Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex.

Chandler:
Sounds like a date to me.

Chandler:
Alright, so I'm back in high school, I'm standing in the middle of the cafeteria, and I realize I am totally naked.

#all#:
Oh, yeah. Had that dream.

Chandler:
Then I look down, and I realize there's a phone... there.

Joey:
Instead of...?

Chandler:
That's right.

Joey:
Never had that dream.

Phoebe:
No.

Chandler:
All of a sudden, the phone starts to ring. Now I don't know what to do, everybody starts looking at me.

Monica:
And they weren't looking at you before?!


In [None]:
# Character-level vocabulary: every distinct character of the corpus in sorted
# order; a character's id is its position in that ordering.
chars = sorted(set(text))
vocab_size = len(chars)
id_to_char = dict(enumerate(chars))
char_to_id = {ch: idx for idx, ch in id_to_char.items()}

def encode(string):
    """Return the list of integer ids for the characters of `string`.

    Raises KeyError for a character that is not in the vocabulary.
    """
    return list(map(char_to_id.__getitem__, string))

def decode(ids):
    """Return the string spelled by the iterable of integer ids `ids`.

    Raises KeyError for an id that is not in the vocabulary.
    """
    return ''.join(id_to_char[token_id] for token_id in ids)

In [None]:
# Encode the whole corpus once as a 1-D int64 tensor, then keep the first 90%
# for training and hold out the remaining 10% for validation.
data = torch.tensor(encode(text), dtype=torch.long)
train_part = int(0.9 * len(data))
train_data = data[:train_part]
val_data = data[train_part:]


# Report the vocabulary size and the length of each split
print(
    f"Vocabulary Size: {vocab_size}",
    f"Training Data Length: {len(train_data)}",
    f"Validation Data Length: {len(val_data)}",
    sep="\n",
)

Vocabulary Size: 88
Training Data Length: 3397288
Validation Data Length: 377477


## Utils

In [None]:
# Scratch check of the embedding layer. nn.Embedding's parameters are
# (num_embeddings, embedding_dim); `vocab_size=` / `n_embd=` are not accepted
# keywords and raised TypeError. Using the notebook's hyperparameters gives the
# (16, 32, 64) shape recorded two cells below.
s=nn.Embedding(vocab_size, n_embd).to(device)
# NOTE(review): MultiHeadSelfAttention is defined in the later "Transformer
# block" section, so this line raises NameError on a fresh kernel -- move this
# cell below that section or delete it.
m=MultiHeadSelfAttention(num_heads=4, n_embd=1024, head_size=12).to(device)

In [None]:
# Sample one batch and keep only the inputs (element 0 of the returned pair).
# NOTE(review): get_random_batch is defined in a later cell of this section,
# so this cell fails with NameError under Restart & Run All.
x=get_random_batch(train_data, block_size, batch_size)[0]
print(x.shape)

torch.Size([16, 32])


In [None]:
# Pass the sampled batch through the scratch embedding `s` and show the shape
# of the result.
a=s(x)
a.shape

torch.Size([16, 32, 64])

In [None]:
def get_random_batch(data_source, block_size, batch_size):
    """
    Sample `batch_size` random windows of length `block_size` from `data_source`.

    Parameters:
    - data_source: The dataset from which to sample.
    - block_size: The size of each sequence to be sampled.
    - batch_size: The number of sequences per batch.

    Returns:
    - (inputs, labels): two stacked tensors moved to the global `device`.
      Row i of `labels` is the window of row i of `inputs` shifted one
      position to the right.
    """
    # Start offsets are drawn from [0, len(data_source) - block_size), which
    # leaves room for the one-step-shifted label window.
    start_limit = len(data_source) - block_size
    starts = torch.randint(high=start_limit, size=(batch_size,))

    input_windows, label_windows = [], []
    for start in starts:
        stop = start + block_size
        input_windows.append(data_source[start: stop])
        label_windows.append(data_source[start + 1: stop + 1])

    inputs = torch.stack(input_windows).to(device)
    labels = torch.stack(label_windows).to(device)
    return inputs, labels


def estimate_loss(model, data_sources, block_size, batch_size, eval_iters):
    """
    Estimates the model's loss on different data splits.

    Parameters:
    - model: The model to evaluate. Called as model(inputs, labels) and
      expected to return a pair whose second element is the loss.
    - data_sources: A dictionary of datasets for each split.
    - block_size: The size of each sequence block.
    - batch_size: The number of sequences per batch.
    - eval_iters: The number of iterations for evaluation.

    Returns:
    - A dictionary mapping each split name to the mean loss (a 0-dim tensor)
      over `eval_iters` random batches of that split.

    The model is put in eval mode while the losses are computed and is then
    returned to whichever mode (train or eval) it was in on entry.
    """
    # Remember the caller's mode so evaluation does not silently switch an
    # eval-mode model to train mode.
    was_training = model.training
    losses_dict = {}
    model.eval()
    try:
        with torch.no_grad():
            for split, data_source in data_sources.items():
                losses = [
                    model(*get_random_batch(data_source, block_size, batch_size))[1].item()
                    for _ in range(eval_iters)
                ]
                losses_dict[split] = torch.tensor(losses).mean()
    finally:
        # Restore the mode even if a batch raises.
        model.train(was_training)
    return losses_dict

def generate_text(model, initial_idx, block_size, max_new_tokens):
    """
    Autoregressively samples new tokens and appends them to `initial_idx`.

    Parameters:
    - model: The model to use for text generation. Called as model(idx) and
      expected to return a pair whose first element holds logits indexed as
      [batch, position, vocabulary].
    - initial_idx: The initial indices for generation.
    - block_size: The size of the block to consider for each prediction.
    - max_new_tokens: The maximum number of tokens to generate.

    Returns:
    - The index tensor extended along dimension 1 by `max_new_tokens` sampled
      tokens.

    The model is put in eval mode while sampling and is then returned to
    whichever mode (train or eval) it was in on entry.
    """
    idx = initial_idx
    # Remember the caller's mode so sampling does not silently switch an
    # eval-mode model to train mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Only the last `block_size` tokens are fed to the model.
                idx_cond = idx[:, -block_size:]
                logits, _ = model(idx_cond)
                # Sample the next token from the distribution at the last position.
                probs = F.softmax(logits[:, -1, :], dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
    finally:
        # Restore the mode even if the model raises.
        model.train(was_training)
    return idx


def train_model(model, train_data, val_data, block_size, batch_size, max_iters, eval_interval, optimizer,
                num_eval_iters=None):
    """
    Runs the training loop and periodically prints train/validation loss.

    Parameters:
    - model: The model to train.
    - train_data: The training dataset.
    - val_data: The validation dataset.
    - block_size: The size of each sequence block.
    - batch_size: The number of sequences per batch.
    - max_iters: The maximum number of iterations for training.
    - eval_interval: The interval at which to evaluate the model.
    - optimizer: The optimizer for training the model.
    - num_eval_iters: Number of batches averaged per split at each evaluation.
      When None (the default) the module-level `eval_iters` hyperparameter is
      used, which is what this function always did before the parameter existed.

    Returns:
    - The same `model` object that was passed in, after training.
    """
    if num_eval_iters is None:
        # Backward-compatible fallback to the global hyperparameter.
        num_eval_iters = eval_iters

    data_sources = {'train': train_data, 'val': val_data}
    for iteration in range(max_iters):
        # Evaluate every `eval_interval` iterations and on the final iteration.
        if iteration % eval_interval == 0 or iteration == max_iters - 1:
            losses = estimate_loss(model, data_sources, block_size, batch_size, num_eval_iters)
            print(f"Iteration {iteration}: Train Loss {losses['train']:.4f}, Val Loss {losses['val']:.4f}")

        inputs, labels = get_random_batch(train_data, block_size, batch_size)
        optimizer.zero_grad()
        _, loss = model(inputs, labels)
        loss.backward()
        optimizer.step()

    return model



## Transformer block

In [None]:
class SelfAttentionHead(nn.Module):
    """A single causal (lower-triangular masked) scaled dot-product attention head.

    Parameters:
    - n_embd: Size of the last dimension of the input.
    - head_size: Size of the key/query/value projections, and therefore of the
      last dimension of the output.
    """

    def __init__(self, n_embd, head_size):
        super().__init__()

        self.head_size = head_size
        # Project to head_size (not n_embd): MultiHeadSelfAttention concatenates
        # the heads and feeds them to a Linear expecting num_heads * head_size
        # features, which only holds when each head outputs head_size features.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

    def forward(self, x):
        """Apply masked self-attention along dimension 1 of `x`.

        Position t only attends to positions <= t. The output has the same
        leading dimensions as `x` and `head_size` as its last dimension.
        """
        key = self.key(x)
        query = self.query(x)
        value = self.value(x)

        # Scaled dot-product scores between every pair of positions.
        attn_scores = torch.matmul(query, key.transpose(-2, -1)) / (self.head_size ** 0.5)

        # Lower-triangular mask, created directly on the input's device.
        seq_length = x.size(1)
        mask = torch.tril(torch.ones(seq_length, seq_length, device=x.device))

        # Future positions get -inf so that softmax assigns them zero weight.
        attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))
        attn_weights = torch.softmax(attn_scores, dim=-1)
        out = torch.matmul(attn_weights, value)

        return out


class MultiHeadSelfAttention(nn.Module):
    """Runs several SelfAttentionHead modules on the same input and merges them.

    Parameters:
    - num_heads: Number of SelfAttentionHead modules to create.
    - n_embd: Input width passed to every head, and the output width of the
      final projection.
    - head_size: Passed to every head; the final projection expects
      num_heads * head_size input features.
    """

    def __init__(self, num_heads, n_embd, head_size):
        super().__init__()

        head_modules = []
        for _ in range(num_heads):
            head_modules.append(SelfAttentionHead(n_embd, head_size))
        self.heads = nn.ModuleList(head_modules)
        self.projection = nn.Linear(num_heads * head_size, n_embd)

    def forward(self, x):
        """Apply every head to `x`, concatenate on the last dim, then project."""
        # One output per head, joined side by side along the feature dimension
        per_head = tuple(head(x) for head in self.heads)
        merged = torch.cat(per_head, dim=-1)

        # Linear map of the merged features to n_embd outputs
        return self.projection(merged)


class FeedForward(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Linear.

    Parameters:
    - n_embd: Input and output width. The hidden layer is
      `forward_expansion` (4) times wider.
    """

    def __init__(self, n_embd):
        super().__init__()

        self.forward_expansion = 4
        hidden_dim = self.forward_expansion * n_embd
        # Single assignment (the original bound `self.net` twice in one statement).
        self.net = nn.Sequential(
            nn.Linear(n_embd, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_embd),
        )

    def forward(self, x):
        """Apply the MLP to the last dimension of `x`; the shape is unchanged."""
        return self.net(x)

In [None]:
class TransformerBlock(nn.Module):
    """Self-attention followed by a feed-forward network, each wrapped in a
    residual connection with LayerNorm applied after the addition.

    Parameters:
    - n_embd: Width of the input and output features.
    - num_heads: Number of attention heads.
    - head_size: Head size passed to MultiHeadSelfAttention. Defaults to 64,
      the value that was previously hard-coded here.
    """

    def __init__(self, n_embd, num_heads, head_size=64):
        super().__init__()

        self.self_attention = MultiHeadSelfAttention(num_heads, n_embd, head_size=head_size)
        self.feed_forward = FeedForward(n_embd)
        self.norm1 = nn.LayerNorm(n_embd)
        self.norm2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return the block output; same shape as `x`."""
        # Residual around attention, then normalise.
        attn_output = self.self_attention(x)
        x = self.norm1(x + attn_output)
        # Residual around the feed-forward network, then normalise.
        ff_output = self.feed_forward(x)
        x = self.norm2(x + ff_output)

        return x

## Model

In [None]:
class SimpleGPT(nn.Module):
    """Decoder-only language model: token + position embeddings, a stack of
    TransformerBlock modules, a final LayerNorm and a linear head over the
    vocabulary.

    Parameters:
    - vocab_size: Number of rows of the token embedding and output width of the head.
    - n_embd: Embedding width used throughout the model.
    - block_size: Number of rows of the position embedding, i.e. the longest
      sequence whose positions can be embedded.
    - n_layer: Number of TransformerBlock modules.
    - n_head: Number of attention heads passed to each block.
    """

    def __init__(self, vocab_size, n_embd, block_size, n_layer, n_head):
        super().__init__()

        self.token_embeddings = nn.Embedding(vocab_size, n_embd)
        self.position_embeddings = nn.Embedding(block_size, n_embd)
        layer_stack = [TransformerBlock(n_embd, n_head) for _ in range(n_layer)]
        self.blocks = nn.Sequential(*layer_stack)
        self.layer_norm = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits for `idx` and, when `targets` is given, the loss.

        Parameters:
        - idx: Tensor of token ids; dimension 1 is the sequence dimension.
        - targets: Optional tensor of target ids, flattened for the loss.

        Returns:
        - (logits, loss): `loss` is None when `targets` is None, otherwise the
          cross-entropy between the flattened logits and flattened targets.
        """
        # Positions 0..seq_len-1, created on the same device as the input
        positions = torch.arange(idx.size(1), dtype=torch.long, device=idx.device)
        hidden = self.token_embeddings(idx) + self.position_embeddings(positions)

        hidden = self.blocks(hidden)
        logits = self.lm_head(self.layer_norm(hidden))

        if targets is None:
            return logits, None

        # Collapse all leading dimensions so every position is one sample
        flat_logits = logits.view(-1, logits.size(-1))
        flat_targets = targets.view(-1)
        loss = nn.CrossEntropyLoss()(flat_logits, flat_targets)
        return logits, loss


In [None]:
# Build the model from the hyperparameters and place it on `device`
model = SimpleGPT(
    vocab_size=vocab_size,
    n_embd=n_embd,
    block_size=block_size,
    n_layer=n_layer,
    n_head=n_head,
)
model = model.to(device)

# Total number of elements across all parameter tensors of the model
num_parameters = sum(map(torch.numel, model.parameters()))
print(f'Number of parameters = {num_parameters}')

Number of parameters = 409304


In [None]:
# Print the model structure: the nested tree of submodules with their
# configured sizes, as rendered by the module's repr.
print(model)

SimpleGPT(
  (token_embeddings): Embedding(88, 64)
  (position_embeddings): Embedding(32, 64)
  (blocks): Sequential(
    (0): TransformerBlock(
      (self_attention): MultiHeadSelfAttention(
        (heads): ModuleList(
          (0-3): 4 x SelfAttentionHead(
            (key): Linear(in_features=64, out_features=64, bias=False)
            (query): Linear(in_features=64, out_features=64, bias=False)
            (value): Linear(in_features=64, out_features=64, bias=False)
          )
        )
        (projection): Linear(in_features=256, out_features=64, bias=True)
      )
      (feed_forward): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=64, out_features=256, bias=True)
          (1): ReLU()
          (2): Linear(in_features=256, out_features=64, bias=True)
        )
      )
      (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerBlock(
      (self_

# Training and evaluating the model

In [None]:
# Example of generating output with the initial model (before training).
# The seed context is a single token with id 0 (the first entry of the sorted
# vocabulary); 2000 further tokens are sampled and decoded back to characters.
initial_idx = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_output = generate_text(model, initial_idx, block_size, max_new_tokens=2000)
decoded_output = decode(generated_output[0].tolist())
print(decoded_output)


hYpn+grS?HS9LKgbs%HhS:7gu%Mz_6
Jx!;j_S8-Je,"}MFIAaO:&(mAuUy+O7(ICPF17D r_Pu89`bYK;_Ea'_lm1 LFT,AHIgMj1b(t8;l?rSjN/0$IGe1e!nmEF
GWx#"18_WN{QIqaYh!#DtyE*oL!u8,%AM( Qc{M&0'VuKj**TAEm3o2s*Zm2 0_AFC-yYES 2'A2C}m}gERJU!CMT5TRV1hrgK1zy&Pi*(YKAmqaJdg+C`j*8mQB$F(Cb_d'8j$k-yCA4Sw-NBXIG&QOa40RYS1
tTYb5yU98_ASSY2WIE&Axw#>tRD.M4QJ(m8mkAh& h8"3Hg0Bg9"jPc#"1[tX&7&(
,M&csA,
y,7SkwUVaKZkXE!, gW**(*>SP!7g)Na_yNMd"NT8C0t_b%Nn18;n/+mFE$A+w'CZ36&97JS!qwjJhJ#Q&)
AZLCICE2"!n0$p{uGga`Qp 69kSylCZ+,jd3?Zp myU9#NV{vNK!985Sjm0 kwC;Sq?52l'C,{jFEyE2hy?CA+U lJnA *0{E[9'&}i6*U9}pYYhmp;1,ZyI5?G*""&f*JuE{82bjjK0i`OmJc40'Brmr8K"  NaXUh_[A1/_m8gOT'U}
m0QIWmk5+T8YzufRSki[GfeFX;q,Jjo!aTa(m7&j+,.j+gm
'cu;Bg1$Sw&awv63'Qw}fuA`Ug#ysEcDg+M?e}F,7u)h_J"H2.rgka[K;S&+?SGCA&(yz{8CKCpk1%tcUtYt2pHyAqJJuISp!:hF1abK&0'
$bbo{ S !x'a/vACUTj}1k%pgYc>9CTTqtV+$;2oaXo:G1p}SP4Q2t;,+9c3UQjS
JlYHp#FV2QC${tF. EU1dtSMp74Ke3Wqj8Ql2w{_VWnVQR/a M y(}U9 c?Xnj_2ylc b8nwoMVOmQt_gK"mm{&(0P-TE>(JSt{;Xwo}AG8:
1nF
N7ggkBRbZAK%#7QaG"EWg_
m%ENfv`*YdpvSE*O2{A

In [None]:
# Train with AdamW at the configured learning rate.
# train_model returns the same object it was given, so `trained_model` and
# `model` refer to one and the same (now trained) network.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
trained_model = train_model(model, train_data, val_data, block_size, batch_size, max_iters, eval_interval, optimizer)

Iteration 0: Train Loss 4.5841, Val Loss 4.5871
Iteration 100: Train Loss 2.4675, Val Loss 2.4640
Iteration 200: Train Loss 2.1891, Val Loss 2.2045
Iteration 300: Train Loss 2.0026, Val Loss 2.0233
Iteration 400: Train Loss 1.9201, Val Loss 1.9123
Iteration 500: Train Loss 1.8115, Val Loss 1.8390
Iteration 600: Train Loss 1.7804, Val Loss 1.7962
Iteration 700: Train Loss 1.7209, Val Loss 1.7691
Iteration 800: Train Loss 1.7065, Val Loss 1.7153
Iteration 900: Train Loss 1.6792, Val Loss 1.6954
Iteration 1000: Train Loss 1.6332, Val Loss 1.6754
Iteration 1100: Train Loss 1.6185, Val Loss 1.6560
Iteration 1200: Train Loss 1.5994, Val Loss 1.6320
Iteration 1300: Train Loss 1.5917, Val Loss 1.6215
Iteration 1400: Train Loss 1.5747, Val Loss 1.5973
Iteration 1500: Train Loss 1.5563, Val Loss 1.5811
Iteration 1600: Train Loss 1.5501, Val Loss 1.5800
Iteration 1700: Train Loss 1.5315, Val Loss 1.5773
Iteration 1800: Train Loss 1.5451, Val Loss 1.5532
Iteration 1900: Train Loss 1.5333, Val Loss

In [None]:
# Example of generating output with the trained model.
# Pass the freshly built `context` (a single token with id 0) instead of the
# leftover `initial_idx` from the pre-training cell, so this cell no longer
# depends on that earlier cell having been run.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_output = generate_text(trained_model, context, block_size, max_new_tokens=2000)
decoded_output = decode(generated_output[0].tolist())
print(decoded_output)


Then:
I've too!

Susan:
Corld-Bain plofe.

Riker:
Uhm.

Rachel:
Oh no.

Jane:
Phoebe's retty what are yard?

Phoebe:
So... All He supe ergother all talked heoping thang, that happy us up.

Ross:
What?!

Rachel:
Honey, guys out who didn't her metate.

Frank:
Your honey?

Chandler:
Oh don't you.

Monica:
Donet my stupid a floon hark mainer today, you knice inve?

Joey:
Well, you've get n't olvel this some him of boonsh this a money clork about make forgot of Papoti to makes a too!

Thankler:
I'm sorry, and of a'fore together nothing.

Joey:
OK. Your and tealthing.

Ross:
Oh, hey, happy presting!

Ross:
Acrowy if chance bad on. Emma! Humm y'know gotta with her.

Esomeon# of decide stupid to for that is at without?

Ross:
Yeah, we over Sary Someroma affect of it a wople her earrily, huses, C'lon't tome on-oy money.home fortant our sewing a ban loge this is of they and coolloidebe, Planing pize?

Ross:
Great is doing.

Chandler:
Okay.

Monica:
Yeah?

Phoebe:
Roch!

Phoebe:
I called?

Phole

In [None]:
# Draw another sample from the trained model (sampling is stochastic, so the
# text differs from run to run).
# Pass the freshly built `context` (a single token with id 0) instead of the
# leftover `initial_idx` from the pre-training cell.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_output = generate_text(trained_model, context, block_size, max_new_tokens=2000)
decoded_output = decode(generated_output[0].tolist())
print(decoded_output)


Phoebe:
Your kids!

Joey:
Go, am .I need us tole hated. She's?

Joey:
Wela:
Joey!

Morry:
Okay! Mo, actually's on no!

Rachel:
Have yee toinglo, he same her married am-my stort!

Awhen:
Mo-let's quepian, but be us.

Ross:
Handme us a little brit! Chandler!

Monica:
All right a time and uh opa that Joey Greet I can at our listle and told good some when.

Chandler:
So you check we-wear necard chableth a baloue maginal so reft, and hole you treek up, oo-uh, okay, foush.. Oh, the gonna do!"No.

Monica:
What huh?

Monica:
Hey!

Sthank:
Umm, it's some chass a Dilse Joey.

Rachel:
So, Chandlearw a-leasten' pliess isn't one chose that to him a failf come on! Your show guy.

Janie:
What?hat I'm just doing here upm, okay there. We're gonna go! does those that it comptimes fly at that made little again Rell whole home.

Ross:
That's day. Some Phoebe, really any lace how that grest.

Rachel:
Hi, all you call!

Joey:
Huh?! Right!!

Monica:
Rella!

Rachel:
Oh God! Rachel!

Marry:
See I can't rest.
