In [None]:
#fake_chatgpt_scratch

In [None]:
# Colab only: mount the user's Google Drive into the runtime's filesystem at
# /content/drive so the project folder and corpus stored there can be accessed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the project folder on the mounted Drive the current working directory.
%cd /content/drive/MyDrive/Colab Notebooks/LLM/fake_chatgpt_scratch

/content/drive/MyDrive/Colab Notebooks/LLM/fake_chatgpt_scratch


In [None]:
%pwd #display the current working directory.

'/content/drive/MyDrive/Colab Notebooks/LLM/fake_chatgpt_scratch'

In [None]:
import torch #a popular deep learning framework.
import numpy as np
import requests #used for making HTTP requests.
## import tiktoken
import torch.nn as nn #Imports the neural network module from PyTorch

from torch.nn import functional as F #Imports the functional module from PyTorch's neural network package as 'F'
                      #to use various functions like activation and loss functions.

In [None]:
!pip install requests
!pip install tiktoken    ## requires python   >    3.9

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
############################################################

# Seed PyTorch's random number generator so repeated runs draw the same numbers.
torch.manual_seed(1337)

# --- data / batching ---------------------------------------------------
batch_size = 64          # number of sequences per batch
block_size = 256         # context length: tokens per sequence fed to the model
vocab_size = 65          # number of distinct tokens (re-derived from the corpus once it is loaded)

# --- optimisation schedule ---------------------------------------------
max_iters     = 5000     # total number of optimiser steps
learning_rate = 3e-4     # optimiser step size
eval_interval = 500      # estimate and print the losses every this many steps
eval_iters    = 200      # number of batches averaged per loss estimate

# --- model size ----------------------------------------------------------
n_embd  = 384            # width of every token/position embedding vector
n_head  = 6              # attention heads per Transformer block
n_layer = 6              # number of stacked Transformer blocks
dropout = 0.2            # drop probability used by every dropout layer

# --- hardware --------------------------------------------------------------
# Use the GPU when one is visible to PyTorch, otherwise run on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

############################################################

In [None]:
# Location of the raw training corpus on the mounted Drive.
input_file_path = '/content/drive/MyDrive/Colab Notebooks/LLM/fake_chatgpt_scratch/input.txt'

# Read the whole corpus into memory as a single string.
with open(input_file_path, mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Report the corpus size: the label is printed, and the bare expression that
# closes the cell is what the notebook displays as the cell's value.
print("length of data in characters")
len(text)

############################################################

length of data in characters


1115393

In [None]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(text))

# One token id per distinct character.
vocab_size = len(chars)

# Show the complete alphabet as one string.
print("".join(chars))

############################################################# 


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [None]:
# Character-level tokenizer.
#
# Later cells call `encode` (to build the training tensor) and `decode` (to
# print generated text), so both must be defined before those cells run.

# Distinct characters of the corpus in sorted order; a character's position in
# this list is its token id. Recomputed here so this cell only depends on `text`.
chars = sorted(set(text))
vocab_size = len(chars)

stoi = {ch: i for i, ch in enumerate(chars)}     # character  -> token id
itos = {i: ch for i, ch in enumerate(chars)}     # token id   -> character


def encode(s):
    """Convert a string into a list of integer token ids.

    Raises:
        KeyError: if `s` contains a character that does not occur in the corpus.
    """
    return [stoi[c] for c in s]


def decode(ids):
    """Convert an iterable of integer token ids back into a string.

    Raises:
        KeyError: if an id is outside the vocabulary.
    """
    return ''.join(itos[i] for i in ids)


# Sanity check: encoding then decoding must reproduce the original text.
assert decode(encode(text[:100])) == text[:100]

############################################################# 

In [None]:
# Tokenise the whole corpus once and hold it as a 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.int64)

# Position of the train/validation boundary: 90% of the tokens, rounded down.
n = int(0.9 * len(data))

# Everything before the boundary is training data, the remainder is validation data.
train_data, val_data = data[:n], data[n:]

#############################################################

In [None]:
#for generating batches of data for training or validation.
#It takes a single parameter, split, which determines whether the data is for training or validation.
def get_batch(split):
    """Draw a random batch of (input, target) token windows.

    Args:
        split: "train" selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair (x, y) of tensors of shape (batch_size, block_size), both moved
        to `device`. y holds the same windows as x shifted one position to the
        right, i.e. y[b, t] is the token that follows x[b, t] in the data.
    """
    source = train_data if split == "train" else val_data

    # One random start offset per sequence. The exclusive upper bound leaves
    # room for the target window, which ends one token later than the input.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1 : stop + 1])

    # Stack the windows into (batch_size, block_size) tensors on the target device.
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

############################################################

In [None]:

@torch.no_grad()   # evaluation only: no gradients are recorded inside this function
def estimate_loss():
    """Estimate the mean loss of the global `model` on both data splits.

    The model is put into eval mode, the loss is averaged over `eval_iters`
    random batches for each of 'train' and 'val', and the model is then put
    back into train mode.

    Returns:
        dict mapping 'train' and 'val' to a scalar tensor with the mean loss.
    """
    model.eval()

    mean_losses = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            batch_losses[k] = batch_loss.item()
        mean_losses[split] = batch_losses.mean()

    model.train()
    return mean_losses

##########################################################################################

In [None]:

#Defines the Head class, which inherits from the nn.Module class in PyTorch.
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: output width of this head's key, query and value projections.

    Reads the module-level settings `n_embd` (width of the input vectors),
    `block_size` (longest sequence the causal mask covers) and `dropout`
    (drop probability applied to the attention weights).
    """

    def __init__(self, head_size):
        super().__init__()

        # Bias-free linear projections from the embedding width to the head width.
        self.key   = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # Lower-triangular matrix of ones used as the causal mask. It is
        # registered as a buffer: it belongs to the module (and moves with it
        # across devices) but is not a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        # Dropout applied to the normalised attention weights.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: tensor of shape (B, T, C) with C == n_embd and T <= block_size.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        B, T, C = x.shape

        k = self.key(x)              ## (B, T, head_size)
        q = self.query(x)            ## (B, T, head_size)

        # Scaled dot-product scores. The scale is 1/sqrt(d_k), where d_k is the
        # width of the key/query vectors (head_size) -- not the width C of the
        # input, which would shrink the scores too much.
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5     ## (B, T, hs) @ (B, hs, T) -> (B, T, T)

        # Causal mask: a position may only attend to itself and earlier positions,
        # so every entry above the diagonal is set to -inf before the softmax.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))     ## (B, T, T)

        # Normalise each row into attention weights that sum to 1.
        wei = F.softmax(wei, dim=-1)           ## (B, T, T)
        wei = self.dropout(wei)

        # Weighted aggregation of the value vectors.
        v   = self.value(x)       ## (B, T, head_size)
        out = wei @ v             ## (B, T, T) @ (B, T, head_size) -> (B, T, head_size)

        return out
        
##########################################################################################

In [None]:


class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, followed by an output projection.

    Args:
        num_heads: number of `Head` modules to run side by side.
        head_size: output width of each individual head.

    Reads the module-level settings `n_embd` (output width) and `dropout`.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()

        # nn.ModuleList (not a plain list) so the heads are registered as sub-modules.
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

        # Maps the concatenated head outputs back to the embedding width. The
        # input width is num_heads * head_size, which equals n_embd only when
        # n_embd is an exact multiple of the number of heads; deriving it from
        # the heads keeps the layer valid in the non-divisible case as well.
        self.proj  = nn.Linear(num_heads * head_size, n_embd)

        # Dropout applied to the projected output.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on `x` and merge the results.

        Args:
            x: tensor of shape (B, T, n_embd).

        Returns:
            Tensor of shape (B, T, n_embd).
        """
        # Concatenate the per-head outputs along the channel dimension:
        # (B, T, num_heads * head_size).
        out = torch.cat([h(x) for h in self.heads], dim=-1)

        # Project back to the embedding width, then apply dropout.
        out = self.proj(out)
        out = self.dropout(out)
        return out

##########################################################################################

In [None]:
#feed-forward neural network (FFN) used as a part of the Transformer architecture
class FeedForward(nn.Module):
    """Position-wise MLP: widen to 4x, apply ReLU, narrow back, then dropout.

    Args:
        n_embd: width of both the input and the output vectors.

    The drop probability is read from the module-level `dropout` setting.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),    # expand
            nn.ReLU(),                          # non-linearity
            nn.Linear(hidden_width, n_embd),    # project back to the input width
            nn.Dropout(dropout),                # regularisation
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to `x`; the last dimension of `x` must be n_embd wide."""
        return self.net(x)

##########################################################################################

In [None]:
#The Block class defines a single Transformer block, which consists of a multi-head self-attention mechanism followed by a feed-forward neural network.
#The forward pass applies these components in sequence with residual connections and layer normalization.
class Block(nn.Module):
    """Transformer block: communication (self-attention) followed by computation (feed-forward).

    Args:
        n_embd: embedding width; both the input and the output width of the block.
        n_head: number of self-attention heads.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # Each head gets an equal share of the embedding width (integer division).
        self.sa   = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        # One layer norm in front of each of the two sub-layers.
        self.ln1  = nn.LayerNorm(n_embd)
        self.ln2  = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply attention and feed-forward, each wrapped in a residual connection."""
        # Pre-norm layout: the layer norm is applied to the input of each
        # sub-layer rather than to its output, which is where this differs
        # from the original Vaswani et al. Transformer.
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))
    
##########################################################################################

In [None]:

#class BigramLanguageModel, which represents the main language model architecture used in the project. It is a Transformer-based model adapted for the language modeling task.
class BigramLanguageModel(nn.Module):
    """Transformer language model over integer token ids.

    Despite the name, predictions are conditioned on up to `block_size`
    previous tokens: token and position embeddings are summed, passed through
    `n_layer` Transformer blocks and a final layer norm, then mapped to one
    logit per vocabulary entry.

    Reads the module-level settings `vocab_size`, `n_embd`, `n_head`,
    `n_layer` and `block_size`.
    """

    def __init__(self):
        super().__init__()
        # Token id -> embedding vector of width n_embd.
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)

        # Position (0 .. block_size-1) -> embedding vector of width n_embd.
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        # Stack of n_layer Transformer blocks applied one after another.
        self.blocks = nn.Sequential(
                *[Block(n_embd, n_head=n_head) for _ in range(n_layer)]
        )

        # Final layer norm applied to the output of the last block.
        self.ln_f    = nn.LayerNorm(n_embd)

        # Maps each position's vector to one logit per vocabulary entry.
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids, with T <= block_size.
            targets: optional (B, T) tensor of integer token ids to predict.

        Returns:
            (logits, loss). Without targets, logits has shape
            (B, T, vocab_size) and loss is None. With targets, logits is
            flattened to (B*T, vocab_size) and loss is the cross-entropy
            between the logits and the targets.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)      ## (B, T, C)
        # Build the position indices on the same device as the input, instead
        # of relying on the global `device`, so the model works wherever it
        # and its inputs have been placed.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))      ## (T, C)
        x = tok_emb + pos_emb    ## (B, T, C), pos_emb is broadcast over the batch
        x = self.blocks(x)       ## (B, T, C)
        x = self.ln_f(x)         ## (B, T, C)

        logits = self.lm_head(x)                 ## (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits and (N,) targets, so the
            # batch and time dimensions are merged.
            B, T, C = logits.shape
            logits  = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss    = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after the context `idx`.

        Sampling runs in eval mode (dropout disabled) and without recording
        gradients; the module's previous train/eval mode is restored afterwards.

        Args:
            idx: (B, T) tensor of integer token ids forming the initial context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    ## the position table only covers block_size positions,
                    ## so crop the context to the last block_size tokens
                    idx_cond = idx[:, -block_size:]
                    ## get the predictions
                    logits, _ = self(idx_cond)
                    ## keep only the last time step
                    logits = logits[:, -1, :]           ## (B, C)
                    ## turn the logits into probabilities
                    probs = F.softmax(logits, dim=-1)    ## (B, C)
                    ## sample one token per sequence from the distribution
                    idx_next = torch.multinomial(probs, num_samples=1)     ## (B, 1)
                    ## append the sample to the running sequence
                    idx = torch.cat((idx, idx_next), dim=1)            ## (B, T+1)
        finally:
            # Put the module back into whichever mode the caller had it in.
            self.train(was_training)
        return idx
            
            
            
######################################################################


In [None]:


model = BigramLanguageModel()
# nn.Module.to returns the module itself, so `m` is the same object as `model`,
# now placed on `device`.
m = model.to(device)

######################################################################

# Adam optimiser over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(m.parameters(), lr=learning_rate)

######################################################################
# Training loop: sample a batch, compute the loss, back-propagate and update
# the parameters. Every `eval_interval` steps the averaged training and
# validation losses are printed.
#
# The loop variable is called `step` rather than `iter` so that the builtin
# `iter()` is not overwritten in the notebook namespace for later cells.

for step in range(max_iters):
    if step % eval_interval == 0:
        # Averaged losses on both splits (no parameter update happens here).
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')

    ## forward pass: logits and loss for this batch
    logits, loss = m(xb, yb)

    # Clear the gradients left over from the previous step before back-propagating.
    optimizer.zero_grad(set_to_none=True)

    # Back-propagate, then let the optimiser update the parameters.
    loss.backward()
    optimizer.step()
    

################################################################

step 0: train loss 4.3832, val loss 4.3875
step 500: train loss 1.9829, val loss 2.0668
step 1000: train loss 1.5972, val loss 1.7695
step 1500: train loss 1.4319, val loss 1.6386
step 2000: train loss 1.3373, val loss 1.5677
step 2500: train loss 1.2738, val loss 1.5253
step 3000: train loss 1.2222, val loss 1.4982
step 3500: train loss 1.1833, val loss 1.4900
step 4000: train loss 1.1455, val loss 1.4854
step 4500: train loss 1.1090, val loss 1.5001


In [None]:
################################################################
#### now, regenerate after some training


## Start the generation from a single token with id 0.
# The context is a (1, 1) tensor of int64 zeros on `device`: one sequence
# (batch of 1) containing one token.
context = torch.zeros(1, 1, dtype=torch.long, device=device)

# Sample 500 further tokens with the trained model, then take the only
# sequence in the batch (row 0) as a plain Python list of token ids.
gen_text = m.generate(idx=context, max_new_tokens=500)[0].tolist()

# Map the token ids back to characters and print the resulting text.
print(decode(gen_text))


I cannot cold to hear the brace and league,
No deless might that your tomb of worse you.

Nurse:
Suprehead'st Northumberland.

JULIET:
Come, stamp'st in love her princess!

Nurse:
Fall'n one keep the work that hide to pain
That we did, you no have enter garden.

METRCASURE:
No; I; My father hope.

ROMEO:
No common brief.

CAPULET:
Hart thou weapon
What made thee married, lords, thou art eleven
wounds. See the submissips, but our services. Come,
Event help me rather; and at the wedden day
secred 
