In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import requests

# Download the Tiny Shakespeare corpus and keep it in memory as a single string.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# The timeout stops the script from hanging forever on a stalled connection, and
# raise_for_status() prevents silently training on the body of an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text

# Character-level dataset, implemented as a subclass of PyTorch's Dataset.
class CharDataset(Dataset):
    """Character-level next-token dataset over one string.

    Sample ``idx`` is the window ``data[idx : idx + block_size + 1]`` encoded as
    integer ids and returned as ``(x, y)``, where ``y`` is ``x`` shifted one
    character ahead (the next-token prediction targets).

    Args:
        data: The full training text.
        block_size: Number of characters in each input sequence ``x``.

    Attributes:
        stoi: Mapping character -> integer id.
        itos: Mapping integer id -> character.
        vocab_size: Number of distinct characters in ``data``.
    """

    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size
        # Vocabulary: the unique characters of the text. Sorting makes the
        # character <-> id mapping deterministic for a given text.
        chars = sorted(set(data))
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        # Needed by the model to size its embedding table and output layer.
        self.vocab_size = len(chars)

    def get_vocab_size(self):
        """Return the number of distinct characters in the text."""
        return self.vocab_size

    def __len__(self):
        """Return how many full ``block_size + 1`` windows the text contains."""
        # Subtracting block_size guarantees the last window is complete.
        # Clamp at zero: a text shorter than block_size would otherwise give a
        # negative length, which makes len() raise ValueError.
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return ``(x, y)`` long tensors of length ``block_size`` for window ``idx``."""
        # One extra character is read so the targets can be the inputs shifted
        # by one position.
        chunk = self.data[idx:idx + self.block_size + 1]
        dix = [self.stoi[s] for s in chunk]
        # x: ids at positions 0 .. N-1 (model input)
        # y: ids at positions 1 .. N   (the character following each input)
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y



# ---- Training hyperparameters ----
# Chosen with Google Colab's limited RAM in mind.
batch_size = 64       # B: number of sequences processed per optimisation step
block_size = 128      # N: context window length, in characters
# Each optimisation step therefore consumes batch_size sequences of
# block_size characters each.
max_iters = 6000      # total number of optimisation steps
# Author's note: a 3000-step run gave noticeably worse results; a final loss of
# 2.0769 was recorded (NOTE(review): the original note does not say for which run).
learning_rate = 3e-4  # step size handed to the optimizer
# NOTE(review): an earlier comment said 3e-4 "didn't decrease the loss enough",
# yet 3e-4 is the value in use -- confirm which value was actually rejected.

# Prefer CUDA, then Apple MPS, and fall back to the CPU.
device = (
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

print(f"Device used: {device}")

# ---- Model size ----
n_embd = 768          # width of every token / position embedding vector
n_head = 8            # attention heads per layer (must divide n_embd)
n_layer = 12          # number of stacked transformer blocks
dropout = 0.1         # dropout probability used throughout the model,
                      # applied during training to reduce over-fitting



class CausalSelfAttn(nn.Module):
    """Multi-head self-attention with a lower-triangular (causal) mask.

    The dropout probability and the mask size are read from the module-level
    ``dropout`` and ``block_size`` settings at construction time.

    Args:
        n_embd: Width of the input and output vectors; must be divisible by
            ``n_head``.
        n_head: Number of attention heads.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        assert n_embd % n_head == 0
        self.head_size = n_embd // n_head
        self.n_head = n_head
        self.n_embd = n_embd

        # A single fused projection produces queries, keys and values together.
        self.c_attn = nn.Linear(n_embd, 3 * n_embd)
        # Mixes the concatenated head outputs back into one n_embd-wide vector.
        self.c_proj = nn.Linear(n_embd, n_embd)

        # Dropout on the attention weights (after softmax) and on the block output.
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)

        # Lower-triangular matrix of ones: position t may attend to positions <= t.
        causal = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("bias", causal.view(1, 1, block_size, block_size))

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (B, T, C); returns (B, T, C)."""
        B, T, C = x.size()

        def to_heads(t):
            # (B, T, C) -> (B, n_head, T, head_size)
            return t.view(B, T, self.n_head, self.head_size).transpose(1, 2)

        q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        # Scaled dot-product scores: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        scale = 1.0 / (k.size(-1) ** 0.5)
        scores = (q @ k.transpose(-2, -1)) * scale
        # Future positions get -inf so they receive zero weight after softmax.
        hidden = self.bias[:, :, :T, :T] == 0
        scores = scores.masked_fill(hidden, float('-inf'))
        weights = self.attn_dropout(F.softmax(scores, dim=-1))

        # Weighted sum of values, then put the heads back side by side: (B, T, C)
        out = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.resid_dropout(self.c_proj(out))

class MLP(nn.Module):
    """Position-wise feed-forward network applied after attention.

    Expands each vector from ``n_embd`` to ``4 * n_embd``, applies GELU,
    projects back to ``n_embd`` and finishes with dropout. The dropout
    probability is the module-level ``dropout`` setting.

    Args:
        n_embd: Width of the input and output vectors.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),   # widen
            nn.GELU(),                   # non-linearity
            nn.Linear(hidden, n_embd),   # project back to the model width
            nn.Dropout(dropout),         # regularisation on the output
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the feed-forward stack; the shape is unchanged."""
        return self.net(x)

class Block(nn.Module):
    """One transformer block: causal self-attention followed by an MLP.

    Each sub-layer sees a LayerNorm-ed copy of its input and its output is
    added back onto the input (residual connection).

    Args:
        n_embd: Width of the vectors flowing through the block.
        n_head: Number of attention heads.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttn(n_embd, n_head)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = MLP(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        # Residual connection around the attention sub-layer.
        attn_out = self.attn(self.ln1(x))
        x = x + attn_out
        # Residual connection around the feed-forward sub-layer.
        mlp_out = self.mlp(self.ln2(x))
        return x + mlp_out

class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over integer token ids.

    Combines token and position embeddings, a stack of transformer ``Block``s,
    a final LayerNorm and a linear head that scores every vocabulary entry.
    Model dimensions are read from the module-level settings ``n_embd``,
    ``n_head``, ``n_layer``, ``block_size`` and ``dropout`` at construction time.

    Args:
        vocab_size: Number of distinct token ids.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Token embedding table: one n_embd-wide vector per vocabulary entry.
        self.wte = nn.Embedding(vocab_size, n_embd)
        # Position embedding table: one vector per position in the context window.
        self.wpe = nn.Embedding(block_size, n_embd)
        self.drop = nn.Dropout(dropout)
        self.blocks = nn.ModuleList([Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final normalisation before the head
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)

        # Weight tying: the token embedding and the output head share one
        # parameter tensor, which reduces the parameter count.
        self.wte.weight = self.lm_head.weight

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: Long tensor of token ids with shape (B, T).
            targets: Optional long tensor of shape (B, T) holding the id that
                follows each input position.

        Returns:
            ``(logits, loss)``. Without ``targets``, ``logits`` has shape
            (B, T, vocab_size) and ``loss`` is ``None``. With ``targets``,
            ``logits`` is flattened to (B*T, vocab_size) and ``loss`` is the
            cross-entropy over all positions.

        Raises:
            ValueError: If T exceeds the number of positions the model was
                built with.
        """
        B, T = idx.shape
        # Fail early with a readable message instead of an opaque embedding
        # index error (or a CUDA device-side assert) further down.
        max_len = self.wpe.num_embeddings
        if T > max_len:
            raise ValueError(
                f"Sequence length {T} exceeds the model's context window of {max_len}"
            )

        # Look up token vectors and position vectors, then add them.
        tok_emb = self.wte(idx)
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)
        pos_emb = self.wpe(pos)
        x = self.drop(tok_emb + pos_emb)

        for block in self.blocks:
            x = block(x)

        x = self.ln_f(x)
        # Project to one score per vocabulary entry.
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            # cross_entropy expects (N, C) scores and (N,) class ids, so both
            # tensors are flattened over the batch and time dimensions.
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building autograd graphs
    def generate(self, idx, max_new_tokens, temperature=1.0):
        """Sample ``max_new_tokens`` tokens one at a time, extending ``idx``.

        Dropout stays active unless the caller has put the model in eval mode.

        Args:
            idx: Long tensor of shape (B, T) holding the prompt token ids.
            max_new_tokens: Number of tokens to append.
            temperature: Divides the logits before softmax. Values below 1
                sharpen the distribution, values above 1 flatten it; the
                default of 1.0 leaves the logits unchanged.

        Returns:
            Long tensor of shape (B, T + max_new_tokens).

        Raises:
            ValueError: If ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")
        max_len = self.wpe.num_embeddings
        for _ in range(max_new_tokens):
            # Keep only the most recent tokens that fit in the context window.
            idx_cond = idx[:, -max_len:]
            logits, _ = self(idx_cond)
            # Only the prediction at the last position is needed.
            logits = logits[:, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)
            # Sample one token id per batch row and append it to the sequence.
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# ---- Training ----

# Each dataset index is one (input, target) pair of block_size characters taken
# from the Shakespeare text, with the target shifted one character ahead.
train_dataset = CharDataset(text, block_size)
# shuffle=True draws the windows in a new random order on every pass over the
# loader; num_workers=0 keeps data loading in the main process.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)

# Build the model and move it to the selected device.
model = GPTLanguageModel(vocab_size=train_dataset.get_vocab_size())
m = model.to(device)
n_params = sum(p.numel() for p in m.parameters())
print(f"Count of mode: {n_params/1e6:.2f} Milyon")

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

print("Training Starts\n")
model.train()  # training mode: dropout is active
data_iter = iter(train_loader)

for iter_num in range(max_iters):
    # Pull the next batch; when the loader is exhausted, start a new pass.
    try:
        xb, yb = next(data_iter)
    except StopIteration:
        data_iter = iter(train_loader)
        xb, yb = next(data_iter)

    # Move inputs and targets to the device the model lives on.
    xb = xb.to(device)
    yb = yb.to(device)

    # Standard step: clear old gradients, forward pass, backward pass, update.
    optimizer.zero_grad(set_to_none=True)
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()

    # Report the loss of the current batch every 100 steps.
    if iter_num % 100 == 0:
        print(f"Step {iter_num}: Loss {loss.item():.4f}")

print(f"Training Finished. Final Loss: {loss.item():.4f}")

# ---- Inference ----

print("\n--- Inference ---")


def tokenize(s):
    """Encode string ``s`` as a (1, len(s)) long tensor of character ids on ``device``."""
    ids = torch.tensor(
        [train_dataset.stoi[ch] for ch in s],
        dtype=torch.long,
        device=device,
    )
    # Add a leading batch dimension of size one.
    return ids[None, :]


def tokens_to_string(tokens):
    """Decode the first row of a batch of token ids back into text."""
    return ''.join(train_dataset.itos[i] for i in tokens[0].tolist())


model.eval()  # evaluation mode: dropout is disabled
with torch.no_grad():
    # Prompt the model continues from.
    context_str = "O God, O God!"
    tokenized_context = tokenize(context_str)

    # Sample 500 further characters after the prompt.
    y = model.generate(tokenized_context, max_new_tokens=500)

    # Turn the generated ids back into readable text.
    completion = tokens_to_string(y)

    print(completion)

Device used: cuda
Device: cuda
Count of mode: 85.20 Milyon
Training Starts

Step 0: Loss 4.3266
Step 100: Loss 3.3320
Step 200: Loss 3.3487
