In [2]:
#fake_chatgpt_scratch

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/Colab Notebooks/LLM/fake_chatgpt_scratch/Donald Trump

/content/drive/MyDrive/Colab Notebooks/LLM/fake_chatgpt_scratch/Donald Trump


In [6]:
%pwd #display the current working directory.

'/content/drive/MyDrive/Colab Notebooks/LLM/fake_chatgpt_scratch/Donald Trump'

In [7]:
import torch #a popular deep learning framework.
import numpy as np
import requests #used for making HTTP requests.
## import tiktoken
import torch.nn as nn #Imports the neural network module from PyTorch

from torch.nn import functional as F #Imports the functional module from PyTorch's neural network package as 'F'
                      #to use various functions like activation and loss functions.

In [8]:
!pip install requests
!pip install tiktoken    ## requires python   >    3.9

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tiktoken
  Downloading tiktoken-0.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.3.3


In [9]:
############################################################

torch.manual_seed(1337)     #Sets the random seed for PyTorch to ensure reproducibility.

block_size = 256      ## max content length for predictions
batch_size = 64 
max_iters  = 5000
eval_interval = 500     #Sets the interval for evaluating the model during training.
learning_rate = 3e-4             ## 0.001
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200      #Sets the number of evaluation iterations.
vocab_size = 65     #Sets the size of the vocabulary.
n_embd  = 384     #Sets the size of the embedding vectors for each token.(id)
n_head  = 6     #Sets the number of attention heads for the Transformer model.
n_layer = 6     #Sets the number of layers for the Transformer model.
dropout = 0.2     #Sets the dropout rate for the model's layers.

############################################################

In [11]:
input_file_path = '/content/drive/MyDrive/Colab Notebooks/LLM/fake_chatgpt_scratch/Donald Trump/input.txt'

with open(input_file_path, 'r', encoding='utf-8') as f:
    text = f.read()
    
############################################################

print("length of data in characters")
len(text)

############################################################

length of data in characters


896269

In [13]:
chars = sorted(     list(set(text))   )     #Creates a sorted list of unique characters present in the input text.

vocab_size = len(chars)     #Calculates the vocabulary size by counting the number of unique characters.

print(  ''.join(chars)  )     #Prints the sorted unique characters as a single string.

############################################################# 


 !"$%&'(),-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyzé–—‘’“”…


In [14]:
## tokenizer

#Create dictionaries for character-to-index (stoi) and index-to-character (itos) mappings.
stoi = { ch:i for i, ch in enumerate(chars) }
itos = { i:ch for i, ch in enumerate(chars) }

#for encoding a string to a list of integers (encode
encode = lambda s: [ stoi[c]          for c in s   ]    ## encoder: string to integer

#for decoding a list of integers back to a string (decode), using the dictionaries created above.
decode = lambda l: ''.join(   itos[i] for i in l   )    ## decoder: interger to string

#############################################################

In [15]:
#Encodes the input text as a list of integers, then converts it to a PyTorch tensor of long integers.
data = torch.tensor(   encode(text), dtype=torch.long   )  

#Calculates the number of data points to be used for training (90% of the data).
n    = int(   0.9*len(data)   )

#Splits the data tensor into training and validation sets using the calculated value 'n'.
train_data = data[:n]
val_data   = data[n:]

#############################################################

In [16]:
#for generating batches of data for training or validation.
#It takes a single parameter, split, which determines whether the data is for training or validation.
def get_batch(split):
    if split == "train":
        data = train_data
    else:
        data = val_data
    
    #Generates a 1D tensor of random integers with the shape (batch_size,).
    #The random integers are sampled from a range of 0 to len(data) - block_size.
    ix = torch.randint(   len(data) - block_size, (batch_size,)   )

    #Constructs the input tensor x by extracting slices of length block_size from the data tensor using the random indices from the ix tensor.
    #The resulting tensor is stacked along a new dimension.
    x  = torch.stack(    [  data[ i : i+block_size ]   for i in ix]    ) 

    #Constructs the target tensor y in a similar way to x, but with an offset of 1, effectively shifting the data one step forward.
    y  = torch.stack(    [  data[ i+1 : i+1+block_size ]   for i in ix]    )
    
    #Moves the input and target tensors to the specified device (GPU or CPU).
    x, y = x.to(device), y.to(device)

    return x, y

############################################################

In [17]:

@torch.no_grad()    ## decorator to disable gradient calculations for efficiency.

#estimates the average loss on the training and validation sets without updating the model parameters.
def estimate_loss():
    out = {}      #empty dictionary to store the average losses for both training and validation sets.
    model.eval()   ## no training; Sets the model to evaluation mode, which disables features like dropout and batch normalization.
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)      #Initializes a tensor of zeros with a length equal to the number of evaluation iterations (eval_iters).
        for k in range(eval_iters):
            X, Y = get_batch(split)     #to obtain input-target pairs
            logits, loss = model(X, Y)      #to compute the logits and loss for the given input-target pair.
            losses[k] = loss.item()     #Assigns the current iteration's loss value to the corresponding index in the losses tensor.
        out[split] = losses.mean()      #Calculates the mean of the losses and assigns it to the corresponding key in the out dictionary.
    model.train()  ## model back to training mode, re-enabling features like dropout and batch normalization.
    return out

##########################################################################################

In [18]:

#Defines the Head class, which inherits from the nn.Module class in PyTorch.
class Head(nn.Module):
    """ one head of self-attention """
    
    def __init__(self, head_size):
        super().__init__()

        #These lines define the linear transformation layers for the key, query, and value projections.
        self.key   = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        
        # Creates a lower-triangular mask with ones in the lower triangle and registers it as a buffer. 
        # This mask is used to enforce the causal attention constraint.
        ## the mask tril is not part of the graph since only for masking
        ## so register buffer makes it a thing out of the graph
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
        #Initializes a dropout layer with a dropout rate defined by the dropout variable.
        self.dropout = nn.Dropout(dropout)

    #Defines the forward pass for the self-attention head.
    def forward(self, x):

        #Extracts the batch size (B), sequence length (T), and number of channels (C) from the input tensor x.
        B, T, C = x.shape

        #Applies the key and query linear transformations to the input tensor x.
        k = self.key(x)              ## (B, T, C)
        q = self.query(x)            ## (B, T, C)
        
        #Computes the attention weights by taking the dot product of the query and key tensors and
        #scaling by the inverse square root of the number of channels (C).
        wei = q @ k.transpose(-2, -1) * C**-0.5       ## (B, T, C) @ (B, C, T)  -> (B, T, T)

        #Applies the causal attention mask by filling the upper triangle of the attention weights tensor with negative infinity.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))     ## (B, T, T)

        #Computes the softmax of the attention weights along the last dimension.
        wei = F.softmax(wei, dim= -1)           ## (B, T, T)

        #Applies dropout to the attention weights.
        wei = self.dropout(   wei   )
        
        ## perform the weighted aggregation of the values/Applies the value linear transformation to the input tensor x.
        v   = self.value(  x  )   ## (B, T, C)

        #Computes the output of the self-attention head
        out = wei @ v             ## (B, T, T) @ (B, T, C) -> (B, T, C)
        
        return out
        
##########################################################################################

In [19]:


class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """
    
    #num_heads, which is the number of attention heads, and head_size, which is the size of each attention head.
    def __init__(self, num_heads, head_size):
        super().__init__()

        #Initializes a list of Head instances using the head_size parameter, and wraps it in an nn.ModuleList for correct handling of submodules in PyTorch.
        self.heads = nn.ModuleList(  [Head(head_size) for _ in range(num_heads) ] )

        #Defines a linear layer for projecting the concatenated output of the attention heads back to the original input dimensions.
        self.proj  = nn.Linear(n_embd, n_embd)

        #Initializes a dropout layer with a dropout rate defined by the dropout variable.
        self.dropout = nn.Dropout(dropout)
    
    #Defines the forward pass for the multi-head attention.
    def forward(self, x):

        #Applies each Head instance in self.heads to the input tensor x and concatenates the outputs along the last dimension.
        out = torch.cat(   [ h(x) for h in self.heads], dim = -1   )

        #Applies the linear projection layer to the concatenated output, mapping it back to the original input dimensions.
        out = self.proj(  out   )

        #Applies dropout to the projected output.
        out = self.dropout(   out   )

        #Returns the output tensor of the multi-head attention.
        return out

##########################################################################################

In [20]:
#feed-forward neural network (FFN) used as a part of the Transformer architecture
class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """
    
    #n_embd parameter, which represents the input and output dimensions of the FFN.
    def __init__(self, n_embd):
        super().__init__()

        #Defines the feed-forward network using the nn.Sequential container. The network consists of the following layers:
        self.net = nn.Sequential(
            #A linear layer that maps the input dimensions to 4 * n_embd.
            nn.Linear(n_embd, 4 * n_embd),
            #A rectified linear unit (ReLU) activation function.
            nn.ReLU(),
            #A linear layer that maps the dimensions from 4 * n_embd back to n_embd.
            nn.Linear(4 * n_embd, n_embd),
            #A dropout layer with a dropout rate defined by the dropout variable.
            nn.Dropout(dropout),
        )
    
    #Defines the forward pass for the feed-forward network.
    def forward(self, x):
        #Applies the self.net container to the input tensor x and returns the output tensor.
        return self.net(x)

##########################################################################################

In [21]:

class Block(nn.Module):
    """ Transformer block: comuunication followed by computation """
    
    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa   = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward( n_embd)
        self.ln1  = nn.LayerNorm(n_embd)
        self.ln2  = nn.LayerNorm(n_embd)
        
    def forward(self, x):
        ## these normalizations (ln1, ln2) are about the only thing different from
        ## the original Vaswani paper. In the paper, they are done at the end of forward
        ## but now they are usually done at the beginning of forward
        x = x + self.sa(     self.ln1(x)      )
        x = x + self.ffwd(   self.ln2(x)      )
        return x
    
##########################################################################################

In [22]:


class BigramLanguageModel(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)     ## positional encoding 
        self.blocks = nn.Sequential(
                *[   Block(n_embd, n_head=n_head) for _ in range(n_layer)    ]
        )
        self.ln_f    = nn.LayerNorm(  n_embd    )        ## final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
    
    def forward(self, idx, targets=None):
        
        B, T = idx.shape
        
        ## ids and targets are both (B, T) tensor of integers
        tok_emb = self.token_embedding_table(idx)      ## batch, time, embed (4, 8, 32) 
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))      ## (T, C)
        x = tok_emb + pos_emb    ## (B, T, C)
        x = self.blocks(  x  )   ## (B, T, C)        
        x = self.ln_f(x)         ## (B, T, C)
        logits = self.lm_head(x)                 ## (B, T, vocab_sice)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits  = logits.view(B*T, C)
            targets  = targets.view(B*T)
            loss   = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        
        ## idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            
            ## crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            ## get the predictions
            logits, loss = self(idx_cond)
            ## focus only on last time stamp
            logits = logits[:, -1, :]           ## becomes (B, C)
            ## apply softmax to get probs
            probs = F.softmax(logits, dim= -1)    ## (B, C)
            ## sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)     ## (B, 1)
            ## append sample to the running sequence
            idx = torch.cat(  (idx, idx_next), dim=1  )            ## (B, T+1)
        return idx
            
            
            
######################################################################


In [23]:


model   = BigramLanguageModel()
m = model.to(device)

######################################################################

optimizer = torch.optim.Adam(  m.parameters(), lr=learning_rate   )

######################################################################

for iter in range(max_iters):
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    
    ## evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)   ## zero out
    loss.backward()
    optimizer.step()
    

################################################################

step 0: train loss 4.7092, val loss 4.6695
step 500: train loss 1.8405, val loss 3.7560
step 1000: train loss 1.3263, val loss 2.8281
step 1500: train loss 1.1441, val loss 2.6464
step 2000: train loss 1.0420, val loss 2.7472
step 2500: train loss 0.9609, val loss 2.5894
step 3000: train loss 0.8950, val loss 2.6118
step 3500: train loss 0.8370, val loss 2.6907
step 4000: train loss 0.7881, val loss 2.7162
step 4500: train loss 0.7414, val loss 2.6951


In [24]:
################################################################
#### now, regenerate after some training


## Kick off generation with some starting token. In this case id 0

context = torch.zeros(  (1, 1),  dtype=torch.long, device=device   )

gen_text = m.generate(context, max_new_tokens=500)[0].tolist()

print(  decode(gen_text)   )


somebody says, it and it all these political persons who’ve for one for Israel for proper until Islam Hillary does.
And what’s happening, I’m the only biggest interest that are many thinking now that. So I’m talking all of the problems, it’s — they’ve had throwing to be a total change.
So again, you saw about [ridiculations in people in the Sopeling 175 were into the United States. Donald Everything’s been done is beautiful Islamic terrorism.


You know the liberal day deal is a wall. My vote fo
