In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import requests

# Download the tiny-Shakespeare corpus and keep it in memory as one string.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# A timeout keeps a stalled connection from hanging the notebook forever.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently training on an error page.
response.raise_for_status()
text = response.text

# The dataset class which is inherited from Dataset Class of PyTorch
class CharDataset(Dataset):
    """Character-level dataset for next-character prediction.

    Each sample is a pair ``(x, y)`` of ``torch.long`` tensors, both of
    length ``block_size``, where ``y`` is ``x`` shifted one character to the
    right in the source text.

    Args:
        data: The full training text (here, the Shakespeare corpus).
        block_size: Number of characters in each input window, i.e. the
            context length the model sees at once.
    """

    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size
        # Vocabulary = the unique characters of the text. Sorting makes the
        # character <-> index mapping identical on every run over the same data.
        chars = sorted(set(data))
        # Character -> ID and ID -> character lookup tables (the "tokenizer").
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        # Needed to size the model's embedding table and output layer
        # (the original author notes this is 65 for the Shakespeare text).
        self.vocab_size = len(chars)

    def get_vocab_size(self):
        """Return the number of distinct characters found in the data."""
        return self.vocab_size

    def __len__(self):
        """Return how many full (block_size + 1)-character windows exist.

        Subtracting ``block_size`` guarantees the last window is complete.
        The result is clamped at zero so that text shorter than the window
        yields an empty dataset instead of making ``len()`` raise.
        """
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(input, target)`` pair of character IDs.

        Raises:
            IndexError: If ``idx`` is outside ``[0, len(self))``. Without
                this, out-of-range indices would return truncated or empty
                tensors and plain iteration over the dataset would never end.
        """
        if not 0 <= idx < len(self):
            raise IndexError(f"index {idx} out of range for dataset of length {len(self)}")
        # Take block_size + 1 characters: block_size inputs plus the one extra
        # character needed as the target of the final input position.
        chunk = self.data[idx:idx + self.block_size + 1]
        # Convert characters to their integer IDs.
        dix = [self.stoi[s] for s in chunk]
        # x: what the model reads (characters 0 .. N-1 of the chunk)
        # y: what the model must predict (characters 1 .. N of the chunk)
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y



# Hyperparameters. These are module-level globals read by the model classes
# and the training loop below.
# Note: this was run on Google Colab, so the batch size was reduced to fit in RAM.
batch_size = 32      # B: sequences per optimizer step (reduced to manage RAM with the larger model)
block_size = 128     # N: context window size, in characters
# With batch_size 32, each model update is computed from 32 pieces of text,
# and each of those pieces is 128 characters long.
max_iters = 6000     # Total number of training steps (model updates)
# Author's note: 3000 steps was tried first, but the result was not as good;
# a final loss of 2.0769 was recorded.
learning_rate = 6e-5 # Lower learning rate for stability with the deep model
# Author's note: 3e-4 was tried first, but the loss did not decrease enough.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
n_embd = 768       # Embedding width: each character is represented by a 768-dimensional vector
n_head = 8         # Number of attention heads (768 / 8 = 96 dimensions per head)
n_layer = 12       # Number of transformer blocks, i.e. the depth of the model
dropout = 0.1      # Standard value: randomly zero 10% of activations during training
                  # to reduce over-fitting.

print(f"Device: {device}")

class CausalSelfAttn(nn.Module):
    """Multi-head scaled dot-product self-attention with a causal mask.

    Args:
        n_embd: Width of the input and output embeddings.
        n_head: Number of attention heads; must divide ``n_embd`` evenly.
        max_len: Longest sequence the causal mask supports. Defaults to the
            module-level ``block_size`` when omitted.
        p_drop: Dropout probability used on the attention weights and on the
            projected output. Defaults to the module-level ``dropout`` when
            omitted.
    """

    def __init__(self, n_embd, n_head, max_len=None, p_drop=None):
        super().__init__()
        assert n_embd % n_head == 0
        # Fall back to the notebook-wide hyperparameters so that existing
        # two-argument construction keeps working unchanged.
        if max_len is None:
            max_len = block_size
        if p_drop is None:
            p_drop = dropout

        self.head_size = n_embd // n_head
        self.n_head = n_head
        self.n_embd = n_embd

        # Single projection producing query, key and value side by side.
        self.c_attn = nn.Linear(n_embd, 3 * n_embd)
        # Output projection applied after the heads are re-assembled.
        self.c_proj = nn.Linear(n_embd, n_embd)

        # Dropout on the post-softmax attention weights: randomly removes some
        # token-to-token connections during training.
        self.attn_dropout = nn.Dropout(p_drop)
        # Dropout on the block's output.
        self.resid_dropout = nn.Dropout(p_drop)

        # Causal mask: lower-triangular ones, so position t may only attend to
        # positions <= t (the model never sees the future).
        self.register_buffer("bias", torch.tril(torch.ones(max_len, max_len))
                                     .view(1, 1, max_len, max_len))

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: Tensor of shape ``(B, T, C)`` with ``C == n_embd``.

        Returns:
            Tensor of shape ``(B, T, C)``.

        Raises:
            ValueError: If ``T`` exceeds the length of the causal mask.
        """
        B, T, C = x.size()
        mask_len = self.bias.size(-1)
        if T > mask_len:
            # Without this check the mask slice is smaller than the score
            # matrix, which fails with an opaque broadcast error (or silently
            # broadcasts a 1x1 mask).
            raise ValueError(
                f"sequence length {T} exceeds the causal mask length {mask_len}"
            )

        # One matmul for q, k, v, then split along the channel dimension.
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        # Split into heads: (B, T, n_head, head_size) -> (B, n_head, T, head_size)
        k = k.view(B, T, self.n_head, self.head_size).transpose(1, 2)
        q = q.view(B, T, self.n_head, self.head_size).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_size).transpose(1, 2)

        # Scaled dot-product scores: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / (k.size(-1)**0.5))
        # Masked (future) positions get -inf so softmax assigns them zero weight.
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)

        y = att @ v  # (B, n_head, T, head_size)
        # Re-assemble all head outputs side by side: back to (B, T, C).
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        return self.resid_dropout(self.c_proj(y))

class MLP(nn.Module):
    """Position-wise feed-forward network (multi-layer perceptron).

    Expands each token vector to four times its width, applies GELU,
    projects back to ``n_embd`` and applies dropout. In a transformer block
    this runs after the attention step.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        # Layers are created in the same order they are applied.
        layers = [
            nn.Linear(n_embd, hidden),
            nn.GELU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the feed-forward stack and return the result."""
        out = self.net(x)
        return out

class Block(nn.Module):
    """One transformer block: attention then MLP, each with a residual add.

    LayerNorm is applied to the input of each sub-layer (pre-norm), and the
    sub-layer's output is added back onto the un-normalized input.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = CausalSelfAttn(n_embd, n_head)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = MLP(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        # x <- x + CausalSelfAttn(LayerNorm_1(x))
        attn_out = self.attn(self.ln1(x))
        x = x + attn_out
        # out <- x + MLP(LayerNorm_2(x))
        mlp_out = self.mlp(self.ln2(x))
        return x + mlp_out

class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over a character vocabulary.

    Model width, depth, head count, context length and dropout are read from
    the module-level hyperparameters (``n_embd``, ``n_layer``, ``n_head``,
    ``block_size``, ``dropout``).

    Args:
        vocab_size: Number of distinct token IDs.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Token and position embedding tables.
        self.wte = nn.Embedding(vocab_size, n_embd)
        self.wpe = nn.Embedding(block_size, n_embd)
        self.drop = nn.Dropout(dropout)

        # Stack of n_layer transformer blocks.
        layers = []
        for _ in range(n_layer):
            layers.append(Block(n_embd, n_head=n_head))
        self.blocks = nn.ModuleList(layers)

        # Final LayerNorm followed by the projection onto the vocabulary.
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)

        # Weight tying: the token embedding shares the output layer's matrix.
        self.wte.weight = self.lm_head.weight

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: ``(B, T)`` tensor of token IDs.
            targets: Optional ``(B, T)`` tensor of next-token IDs.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            ``(B, T, vocab_size)`` and ``loss`` is ``None``. With targets,
            ``logits`` is returned flattened to ``(B*T, vocab_size)``.
        """
        _, seq_len = idx.shape

        # Sum of token embeddings and position embeddings, then dropout.
        positions = torch.arange(0, seq_len, dtype=torch.long, device=idx.device)
        hidden = self.drop(self.wte(idx) + self.wpe(positions))

        # Run through every transformer block in order.
        for layer in self.blocks:
            hidden = layer(hidden)

        # Final normalization and projection to vocabulary logits.
        logits = self.lm_head(self.ln_f(hidden))

        if targets is None:
            return logits, None

        # cross_entropy wants (N, C) logits against (N,) class indices.
        n_batch, n_time, n_vocab = logits.shape
        flat_count = n_batch * n_time
        logits = logits.view(flat_count, n_vocab)
        loss = F.cross_entropy(logits, targets.view(flat_count))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            idx: ``(B, T)`` tensor of token IDs used as the starting context.
            max_new_tokens: Number of tokens to append.

        Returns:
            ``(B, T + max_new_tokens)`` tensor: the context plus the samples.
        """
        for _step in range(max_new_tokens):
            # Only the most recent block_size tokens fit in the context window.
            window = idx[:, -block_size:]
            logits, _ = self(window)
            # Keep the prediction for the last position only.
            last_logits = logits[:, -1, :]
            next_probs = F.softmax(last_logits, dim=-1)
            # Sample from the distribution rather than taking the argmax.
            sampled = torch.multinomial(next_probs, num_samples=1)
            idx = torch.cat((idx, sampled), dim=1)
        return idx

#The training begins here


# Wrap the corpus in the windowed dataset and draw shuffled mini-batches from it.
train_dataset = CharDataset(text, block_size)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)

# Build the model and move it to the selected device.
model = GPTLanguageModel(vocab_size=train_dataset.get_vocab_size())
m = model.to(device)
print(f"Model parametre sayısı: {sum(p.numel() for p in m.parameters())/1e6:.2f} Milyon")

# AdamW optimizer over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

print("Eğitim başlıyor (Büyük model olduğu için biraz daha yavaş olabilir)...")
model.train()
data_iter = iter(train_loader)

for iter_num in range(max_iters):
    # Pull the next batch; when the loader is exhausted (end of an epoch),
    # start a fresh pass over the data and continue.
    batch = next(data_iter, None)
    if batch is None:
        data_iter = iter(train_loader)
        batch = next(data_iter)
    xb, yb = batch
    xb, yb = xb.to(device), yb.to(device)

    # Forward pass, then one optimizer update.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Frequent progress report: every 100 steps.
    if not iter_num % 100:
        print(f"Adım {iter_num}: Loss {loss.item():.4f}")

print(f"Eğitim tamamlandı. Final Loss: {loss.item():.4f}")

# Evaluation and inference happen here

print("\n--- Inference ---")

# Tokenizer functions
def tokenize(s):
    """Encode string ``s`` as a ``(1, len(s))`` long tensor of character IDs.

    IDs come from ``train_dataset.stoi``; the tensor is created on ``device``.
    """
    ids = [train_dataset.stoi[c] for c in s]
    encoded = torch.tensor(ids, dtype=torch.long, device=device)
    # Add a leading batch dimension of size 1.
    return encoded.unsqueeze(0)

def tokens_to_string(tokens):
    """Decode the first row of ``tokens`` into text via ``train_dataset.itos``."""
    lookup = train_dataset.itos
    chars = []
    for tok in tokens[0]:
        # Each element is a 0-d tensor; .item() yields the plain integer ID.
        chars.append(lookup[tok.item()])
    return ''.join(chars)

# Switch to evaluation mode so dropout is disabled while sampling.
model.eval()

# Prompt the model continues from.
context_str = "O God, O God!"

# No gradients are needed for generation.
with torch.no_grad():
    tokenized_context = tokenize(context_str)
    # Sample 500 additional characters after the prompt.
    y = model.generate(tokenized_context, max_new_tokens=500)

# Turn the generated IDs back into text and show the result.
completion = tokens_to_string(y)
print(completion)

Cihaz: cuda
Model parametre sayısı: 85.20 Milyon
Eğitim başlıyor (Büyük model olduğu için biraz daha yavaş olabilir)...
Adım 0: Loss 4.3788
Adım 100: Loss 3.3377
Adım 200: Loss 3.3308
Adım 300: Loss 3.2818
Adım 400: Loss 3.3214
Adım 500: Loss 3.3046
Adım 600: Loss 3.3011
Adım 700: Loss 3.2146
Adım 800: Loss 3.1510
Adım 900: Loss 3.0241
Adım 1000: Loss 2.9012
Adım 1100: Loss 2.7980
Adım 1200: Loss 2.7763
Adım 1300: Loss 2.6840
Adım 1400: Loss 2.5941
Adım 1500: Loss 2.5757
Adım 1600: Loss 2.4609
Adım 1700: Loss 2.4945
Adım 1800: Loss 2.4420
Adım 1900: Loss 2.4278
Adım 2000: Loss 2.3819
Adım 2100: Loss 2.3653
Adım 2200: Loss 2.3335
Adım 2300: Loss 2.3505
Adım 2400: Loss 2.2718
Adım 2500: Loss 2.2530
Adım 2600: Loss 2.2450
Adım 2700: Loss 2.1807
Adım 2800: Loss 2.1746
Adım 2900: Loss 2.1431
Adım 3000: Loss 2.1213
Adım 3100: Loss 2.0759
Adım 3200: Loss 2.0635
Adım 3300: Loss 2.0109
Adım 3400: Loss 2.0526
Adım 3500: Loss 2.0084
Adım 3600: Loss 1.9446
Adım 3700: Loss 1.9030
Adım 3800: Loss 1.