#### •	You need a transformer to build the single word or a token this is a sub-transformer, and you need another transformer model to build a sequence of words through the tokens

In [2]:
# Kubrart calls and hyper parameters
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel in tran encoder & decoder?

block_size = 8 # what is the maximum context length for predictions? After predicting this number of characters we should truncate and start from the beginning
# as the trans you train can't predict more than that

max_iters = 3000
eval_interval = 300

# It's the learning rate for the model
learning_rate = 1e-2

# It's for enabling GPU when running the model to make things fater and of course that 'If' you have a GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200


In [3]:
# Reading training data
torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Encoding & Decoding words:
Notice here We are going to use our self method to encode words but of course there are many ready methods that you can use to encode your words such as "tiktokens" used by GPT, "Sentence Piece" in google, and so on

In [10]:

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))

# Length of all characters after separating the characters of the text in a list
vocab_size = len(chars)

# create a mapping from characters to integers and versa
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

# 'stoi': it's a function that's used to convert each character in the string into an integer number
# to perform the embding operation so each word will be represented as list of numbers 
# Its syntax is (stoi(['character']))
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("Hi this is Ahmed"))
print(decode(encode("Hi this is Ahmed")))

[20, 47, 1, 58, 46, 47, 57, 1, 47, 57, 1, 13, 46, 51, 43, 42]
Hi this is Ahmed


# Passing the training data after decoding it to 'Tensor' to apply training

In [11]:
# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype) # Printing info about data
print(data[:1000]) # Printing first 1000 encoded characters from data


torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [13]:
# Splitting our data into 'tarining' and 'testing' data
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [14]:
x = train_data[:batch_size]# It's the input for trans
y = train_data[1:batch_size+1]# It's the output of trans that's why it's shifted by 1 from x as it indicates the next word after the input

for char in range (batch_size):
    context = x[:char+1] #It means print the sequence from the start to the character
    target = y[char] # It means print the next character after the sequence
    print(f"When Input is {context} The output should be {target}")

When Input is tensor([18]) The output should be 47
When Input is tensor([18, 47]) The output should be 56
When Input is tensor([18, 47, 56]) The output should be 57
When Input is tensor([18, 47, 56, 57]) The output should be 58
When Input is tensor([18, 47, 56, 57, 58]) The output should be 1
When Input is tensor([18, 47, 56, 57, 58,  1]) The output should be 15
When Input is tensor([18, 47, 56, 57, 58,  1, 15]) The output should be 47
When Input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) The output should be 58
When Input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58]) The output should be 47
When Input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47]) The output should be 64
When Input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64]) The output should be 43
When Input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43]) The output should be 52
When Input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52]) The output should be 10
When Input is tensor([18, 

In [15]:
# data loading 'Preparing our batch to enter it to the model'
def get_batch(split):
    
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data # Here determining whether you are dealing with training or test data
    
    ix = torch.randint(len(data) - block_size, (batch_size,)) # Here we are taking a random blocks or chunks from our sequence of data
    
    # It's the input for trans 
    # we use 'stack' key word to faltten them in an one dimensional array
    x = torch.stack([data[i:i+block_size] for i in ix]) 
    # It's the output of trans that's why it's shifted by 1 from x as it indicates the next word after the input
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) 
    
    x, y = x.to(device), y.to(device)
    return x, y


#### Preparing our data to fit the model 'BigramLanguageModel'

In [16]:
# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        ''' 
        - The importance of this 'Embedding function':
            It's going to take the target's index as the output then it will take its sequence or context as the input then it will create nn layers between
            the inputs 'sequence' and the output 'target'
        
        - It takes 2D list of the targets predited from particular sequences and it produces a list has 3 information:
            B 'Batch': it refers to number of distinct sequences enters the model
            T 'Time': it refers to number of predictions of targets in each sequence
            C 'Channel': it's the total size of vocab_size which are the length of the characters used in the text
        '''
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        '''What's done here is that it will take each output
        With its row that's used for predicting it and order them in form of (B,T,C) then this is the logit which is the score
        of the prediction of the next character
        NOTICE: (B,C,T) represents the details and info for each token to predit the next one Because unlike the recurrence model, in transfomer
        each token is independent so from each token we should be able to predict the next one without depending on any other tokens that's why each
        token must have all the details that we need to predict the next one and those details are (B, T, and C) for that token'''
        logits = self.token_embedding_table(idx) # (B,T,C) it means (Batch, Time, Channel)

        # After calculating the logitis which is our prediction list for the next tokens we print its shape and compare it with the real target to 
        # see our score in predicitons
        if targets is None:
            loss = None
        else:
            # Notice if we used the 3 results from logits which are (B,T,C) the cross_entropy produces an error as it expects only 2 parameters 
            # that's why we reshaped the coordinates of logits results in the next 3 statements
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            
            # comparing between logits and targets using the score function 'cross_entropy'
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    # This function is used to generate the next tokens from a given list 
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx) # it means apply the constructor of this class on idx list to produces the 3 info (B,T, and C)
            
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C) # It means takes all the elements except the last element because you can't predit what's after the last
            
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C) # Apply softmax on logits to know what's the maximum probability of tokens to generate
            
            # sample from the distribution
             # It's used for obtaining only one probability which is the biggest one
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1) it means for each batch predict one value of the next token
            
            # append sampled index to the running sequence
            # Here we concatenate the sequence with each output 
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1) 
        return idx



#### Training our model to avoid producing rubbish

In [20]:
model = BigramLanguageModel(vocab_size)
m = model.to(device)  

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

@torch.no_grad()
# no_grad: it's used for disabling gradients from updating while calcluating the evaluations

# The main use of this function is averging the loss instead of looking for the optimum loss through multiple iterations so this function is directly average
# the loss in the train and val data
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean() # This is the function that's used to calculate the average of the losses
    model.train()
    return out

for iter in range(max_iters):
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train') # sample a batch of data passing the input data to xb and the output data to yb

    # evaluate the loss
    logits, loss = model(xb, yb) # applying the model to training data returning the loss and logits
    optimizer.zero_grad(set_to_none=True) # zeroing all the gradients from the previous steps
    loss.backward() # Obtaining new gradients through backprobagation through loss function
    optimizer.step() # Finally moves one step using optimizer
 
for step in range (1000):
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb) 
    optimizer.zero_grad(set_to_none=True)
    loss.backward() 
    optimizer.step() 
print(loss.item())



2.3833913803100586


In [21]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))



DUTh gwin s t, pir,is al,
Un f third y wer d w f hen w.

Jonothove m, t ha g gelld?
'?-dllerllle warrerse LO&ju b.

DUMEne theomemThSmungucifajeve'shevoQUBe'Tyout:
Corens mivind thee wilenss wia
TAUCKints rel wie the.
D:
DUPEO;
CUSCaiceaiaplstinge ICllel'ser d hy urwWhelo be


Caleswhe

I d be haVing:
Bu quZI byMPUDUS:
Cipr bjur:
wingt t ook!
T f f w
HEDDWis e ldM
Y Ianes
Alla g iote ake an y$zely titQNor mpld,
BRBur be oupg.
WncerexAqureered?
Gof cop uma I f anqustyoreiY-m.

Nlsleat PThand fur
