In [None]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F

In [None]:
 ### Sample to just understand the multihead

context_length = 5 ## just a sentence length
batch_size = 1
n_embd = 12
n_head = 3
dropout = 0.2

### Two Approaches to Multi-Head Attention

#### 1. Single Linear Layer (Industry Standard Approach)
- In this method, **one Linear layer** generates **Q, K, and V together**.
- The layer projects the input from `d_model ‚Üí 3 √ó d_model` in a single forward pass.
- After that, the output is **reshaped and split into multiple heads**.
- This approach is **fast and efficient**, which is why it is used in almost all production-grade Transformer models (e.g., GPT, BERT, etc.).

#### 2. Separate Per-Head Linear Layers (Parallel Head Construction)
- In this method, **each attention head has its own independent Linear layers** for Q, K, and V.
- Heads operate **independently**, and their outputs are **combined at the end**.
- This approach is **conceptually simpler**, but **slower** because:
  - More linear layers
  - More parameter copies
  - Less efficient GPU batching

---

### ‚úÖ Why Approach 1 is Preferred

1. **Efficiency / Speed**
   - One linear layer for QKV means **fewer matrix multiplications** than separate layers.
   - Computation is done in a **single large matrix operation**, reducing overhead and improving GPU parallelism.

2. **Industry Usage**
   - Most production models (e.g., GPT, BERT) use this method because it is **fast and memory-efficient**.
   - The concatenated heads‚Äô weights are implemented with a **single linear layer** for simplicity and scalability.

3. **Memory Efficiency**
   - Fewer linear layers ‚Üí **fewer parameters** ‚Üí **less memory usage**, which is critical for large-scale models.

---

### ‚ùó Caveats / When Separate Heads Might Make Sense

- Separate per-head layers offer **more flexibility**: each head can learn **very different projections**.
- For **small models or teaching purposes**, the performance difference is **minimal**.
- **Specialized architectures** or **memory-constrained inference** may use variants like **multi-query attention**.
- The single-linear approach **assumes all heads have equal dimension**; if you need **different head sizes**, separate layers are required.

---

### üîé Summary / Conclusion

- ‚úÖ **Single Linear for QKV** is widely used in industry: **faster**, **memory-efficient**, and **scalable**.
- ‚ö†Ô∏è **Separate per-head layers** are **slower** and mostly used for **teaching** or **research experiments**.
- While **Approach 1 dominates in practice**, **Approach 2** can be useful for **custom or experimental models**.

## First approach to build the Multi Head

In [None]:
class MultiheadAttention(nn.Module):
    def __init__(self, n_head: int, d_model: int, context_length: int, dropout: float = 0.0, log_shape: bool = True):
        super().__init__()

        self.log_shape = log_shape
        self.n_head = n_head
        self.d_model = d_model

        # Each attention head must get the same number of dimensions.
        # Example: d_model = 12 and n_head = 3 ‚Üí d_head = 4
        assert d_model % n_head == 0, "Embedding size must be divisible by number of heads"
        self.d_head = d_model // n_head

        # This linear layer produces Q, K, and V together.
        # Input:  (B, T, d_model)
        # Output: (B, T, 3*d_model)
        # Because we need Q, K, V ‚Äî each of size d_model.
        self.qkv = nn.Linear(d_model, 3 * d_model, bias=False)

        # After attention finishes, all heads are merged back
        # and passed through this output projection.
        self.projection = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

        # Causal mask:
        # Upper triangular matrix with True above diagonal.
        # This prevents a token from looking at future tokens.
        self_mask = torch.triu(
            torch.ones(context_length, context_length, dtype=torch.bool),
            diagonal=1
        )
        # register_buffer ensures this mask moves with the model (GPU/CPU)
        self.register_buffer("causal_mask", self_mask)


    def forward(self, x: torch.Tensor):
        B, T, D = x.shape

        '''
        Here we have 3 self-attention heads.
        Each head needs its own Q, K, V vectors, and all of them must have the same size.

        1. We pass 1 sentence ‚Üí it has 5 tokens ‚Üí each token has a 12-dimensional embedding.
        So the input shape is: (1, 5, 12)

        2. The qkv() layer is a Linear layer that creates all three matrices:
            Q, K, and V
        It does this by projecting the input from:
            d_model ‚Üí 3 * d_model
        So output shape becomes: (1, 5, 36)

        3. Now the 36 channels can be evenly divided into 3 heads.
        Because:
            d_model = 12
            n_heads = 3
            head_dim = 12 / 3 = 4

        4. Therefore, each head will receive:
            Q of size 4,
            K of size 4,
            V of size 4
        and this is repeated for all 3 heads.

        In short:
        - Input:       (1, 5, 12)
        - After qkv(): (1, 5, 36)
        - Split into heads: 3 heads √ó 4 dims each for Q, K, V
        '''


        qkv = self.qkv(x)
        qkv = qkv.view(B, T, 3, self.n_head, self.d_head)

        if self.log_shape:
            print(f"Shape of QKV: {qkv.shape}")

        # Step 3: Split into Q, K, V
        # Each has shape (B, T, n_head, d_head)
        q, k, v = qkv.unbind(dim=2)

        '''
    bcz  at present -> (B, T, n_head, d_head). and the multihead attention wants the (B, n_head, T, d_head)

    q = (B, heads, T, d_head)
    k = (B, heads, T, d_head)
    v = (B, heads, T, d_head)

    Each attention head now has access to all T tokens, each represented as a d_head-dim vector.

    '''
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        if self.log_shape:
            print("q:", q.shape, "k:", k.shape, "v:", v.shape)

        # scale factor = 1 / sqrt(d_head)
        scale = 1.0 / math.sqrt(self.d_head)


        '''
        k.transpose(-2, -1):  (B, heads, d_head, T)

        (B,heads,T,d_head)
            @
        (B,heads,d_head,T)
        --------------------------------
        ‚Üí (B, heads, T, T)

        attention score matrix of T*T :->  each head computes a 5*5 matrix
            scores[i,j] = how much token i attends to token j

        scake = 1 / sqrt(d_head)   ->  This avoids very large dot-products which would make softmax explode

        '''
        attention_weight = torch.matmul(q, k.transpose(-2, -1)) * scale


        #  token positions by setting them to -inf
        attention_weight = attention_weight.masked_fill(
            self.causal_mask[:T, :T],
            float('-inf')
        )

        #  Convert scores to probabilities
        # Softmax applied on last dim so each row sums to 1
        attention_weight_prob = F.softmax(attention_weight, dim=-1)
        attention_weight_prob = self.dropout(attention_weight_prob)

        #Multiply probabilities with V to compute context
        # result shape = (B, n_head, T, d_head)
        context = torch.matmul(attention_weight_prob, v)

        if self.log_shape:
            print("weights:", attention_weight_prob.shape, "context:", context.shape)

        '''
        ctx: (B, heads, T, d_head)
        We need to combine them into:
        (B, T, d_model) = (1, 5, 12)


        1. transpose back -> (B, T, heads, d_head)
        2. reshape to (B, T, d_model)
        '''
        out = context.transpose(1, 2).contiguous().view(B, T, self.d_model)

        # linear projection
        out = self.projection(out)

        if self.log_shape:
            print("Multi-Head output:", out.shape)

        return out, attention_weight_prob

## Second way to build the Multi Head

In [None]:
class Head(nn.Module):
    """ One head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # causal mask: lower triangular matrix
        self.register_buffer('tril', torch.tril(torch.ones(context_length, context_length)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x shape: (B, T, D) -> batch, token length, Dimension of one token
        B, T, D = x.shape

        # project inputs to key, query, value
        k = self.key(x)    # (B, T, hs)
        q = self.query(x)  # (B, T, hs)

        # compute attention scores ("affinities")
        attention_weight = q @ k.transpose(-2, -1) * (k.shape[-1] ** -0.5)   # (B, T, T)

        # apply causal mask to prevent looking ahead
        attention_weight = attention_weight.masked_fill(self.tril[:T, :T] == 0, float('-inf'))

        # softmax to get attention probabilities
        attention_weight_prob = F.softmax(attention_weight, dim=-1)   # (B, T, T)
        attention_weight_prob = self.dropout(attention_weight_prob)

        # weighted sum of values
        v = self.value(x)                               # (B, T, hs)
        out = attention_weight_prob @ v                  # (B, T, hs)

        return out



#      nn.Sequential: layers are applied in order, automatically in forward.
#      nn.ModuleList: just stores the layers, but you decide manually in forward how to use them (more flexible).

class MultiHeadAttention(nn.Module):
    ''' Multiple heads of self-Attention in parallel '''

    def __init__(self, num_head, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_head)])
        self.proj = nn.Linear(num_head*head_size , n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [None]:
vocab = 13
d_model = 4

In [None]:
'''
	‚Ä¢	Token embeddings: Each token in the vocabulary has a unique learnable vector. Shape = (vocab_size, d_model).
	‚Ä¢	Example: "apple" and "orange" get completely different learned embeddings.
	‚Ä¢	Positional embeddings: Each position in the sequence (0, 1, 2, ‚Ä¶, context_length-1) has a unique vector. Shape = (context_length, d_model).
	‚Ä¢	Example: position 0 always has the same embedding vector, regardless of which token is there.
	‚Ä¢	It is not per-token, its per-position.

 only the context length positions are trained for positional embeddings, not every token individually
'''


class PositionalEmbeddings(nn.Module):
    def __init__(self, vocab: int , d_model: int, device):
        super().__init__()
        self.pos_embeddings = nn.Embedding(context_length,d_model, device=device)

    def forward(self, x:torch.Tensor):
        pos_emb = self.pos_embeddings(x)
        return pos_emb

In [None]:
pos = PositionalEmbeddings(vocab,d_model,device="cpu")

## Tokenizer

In [None]:
## simple tokenizer

class byteTokenizer:
    def __init__(self, file_path: str):
        with open(file_path, 'r', encoding='utf-8') as f:
            self.text = f.read()

        # Create sorted list of unique characters
        self.chars = sorted(set(self.text))
        print("Unique characters:", self.chars)
        print(f"Length of unique chars: {len(self.chars)}")

        # Mapping: char -> int and int -> char
        self.char_to_int = {ch: i for i, ch in enumerate(self.chars)}
        self.int_to_char = {i: ch for i, ch in enumerate(self.chars)}


    @property
    def vocab_size(self)-> int:
        return len(self.chars)


    def encode(self, s: str) -> list[int]:
        """Convert string to list of integers"""
        return [self.char_to_int[c] for c in s]

    def decode(self, l: list[int]) -> str:
        """Convert list of integers back to string"""
        return ''.join([self.int_to_char[i] for i in l])


tokenizer = byteTokenizer("/Users/abhaykumarsingh/Desktop/Gpt/Data/tulsidas.txt")
encoded = tokenizer.encode("‡§®‡§Æ‡§∏‡•ç‡§§‡•á")
decoded = tokenizer.decode(encoded)

print("Encoded:", encoded)
print("Decoded:", decoded)

Unique characters: ['\n', ' ', '(', ')', ',', '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '√≠', '‡§Å', '‡§Ç', '‡§É', '‡§Ö', '‡§Ü', '‡§á', '‡§à', '‡§â', '‡§ä', '‡§è', '‡§ê', '‡§ì', '‡§î', '‡§ï', '‡§ñ', '‡§ó', '‡§ò', '‡§ô', '‡§ö', '‡§õ', '‡§ú', '‡§ù', '‡§û', '‡§ü', '‡§†', '‡§°', '‡§¢', '‡§£', '‡§§', '‡§•', '‡§¶', '‡§ß', '‡§®', '‡§™', '‡§´', '‡§¨', '‡§≠', '‡§Æ', '‡§Ø', '‡§∞', '‡§≤', '‡§µ', '‡§∂', '‡§∑', '‡§∏', '‡§π', '‡§º', '‡§Ω', '‡§æ', '‡§ø', '‡•Ä', '‡•Å', '‡•Ç', '‡•É', '‡•á', '‡•à', '‡•ã', '‡•å', '‡•ç', '‡•§', '‡••', '‡•¶', '‚Äì']
Length of unique chars: 80
Encoded: [49, 54, 61, 75, 45, 71]
Decoded: ‡§®‡§Æ‡§∏‡•ç‡§§‡•á


## Summary of the Generation Pipeline of the Model


  * **Shape:** `[vocab_size]` (e.g., 50,000 dimensions).
  * **Values:** Unbounded real numbers (negative or positive). Higher value = higher likelihood.

We don't pick the highest number immediately (Greedy Decoding) because it's repetitive. Instead, we pass these logits through a series of **filters** (Temperature, Top\_k, Top\_p) to reshape the probability distribution before we "roll the dice" (sample).


## 1\. Temperature ($T$): The Logit Scaler

In implementation terms, Temperature is simply a **scalar division** applied to the logits *before* the Softmax function.

### The Math

Standard Softmax looks like this:
$$P_i = \frac{e^{x_i}}{\sum e^{x_j}}$$

With Temperature $T$, we modify the input $x$:
$$P_i = \frac{e^{x_i / T}}{\sum e^{x_j / T}}$$

### Implementation Logic

1.  **Input:** A tensor of logits (e.g., `[2.0, 4.0, -1.0]`).
2.  **Operation:** Divide the tensor by $T$.
3.  **Output:** Pass result to Softmax.

### Analysis

  * **If $T < 1$ (e.g., 0.1):**
      * You are dividing by a fraction, which acts like multiplication.
      * The gaps between numbers expand. A distinct winner becomes a *massive* winner after the exponential function ($e^x$) is applied.
      * **Result:** The distribution becomes "spiky" (low entropy). The model becomes deterministic.
  * **If $T > 1$ (e.g., 1.5):**
      * You are dividing by a large number. The values shrink closer to 0.
      * $e^0 = 1$. As values get closer to 0, their exponentials become similar.
      * **Result:** The distribution flattens (high entropy). Even "bad" words get a decent probability.




## 2\. Top\_k: The Rank-Based Hard Filter

Top\_k is a **sorting and masking** operation. It ignores probability mass and looks strictly at rank.

### Implementation Logic

We want to keep the $k$ highest logits and zero out the rest.

1.  **Sort:** Sort the logits (or probabilities) in descending order.
2.  **Cutoff:** Identify the value at index $k$.
3.  **Mask:** Create a boolean mask.
      * Any value **less than** the $k$-th value is set to negative infinity ($-\infty$).
4.  **Renormalize:** Apply Softmax again.
      * Because $e^{-\infty} = 0$, those tokens now have literally 0 probability.
      * The remaining probabilities are scaled up so they sum to 1.0 again.

### trade-off with **Temperature** which controls the distribution

  * **Pros:** Computationally cheap (just a sort/select). Guarantees you never sample from the "long tail" of garbage words.
  * **Cons:** It is static.
      * If the distribution is flat (many good words), $k=5$ cuts off valid options.
      * If the distribution is peaked (only 1 good word), $k=5$ forces the model to consider 4 bad options.


## 3\. Top\_p (Nucleus Sampling): The Cumulative Density Filter

Top\_p is smarter. It uses the **Cumulative Distribution Function (CDF)**. Instead of a fixed *number* of tokens, we want a fixed *mass* of probability.

### Implementation Logic

Let's say `top_p = 0.9`. We want the smallest set of words whose combined probability is 90%.

1.  **Sort:** Sort probabilities in descending order.
2.  **Cumsum:** Calculate the cumulative sum of the vector.
      * Example: `[0.5, 0.3, 0.1, 0.05...]` $\rightarrow$ `[0.5, 0.8, 0.9, 0.95...]`.
3.  **Threshold:** Find the first index where the cumulative sum exceeds $p$ (0.9).
4.  **Mask:**
      * Keep everything *before* that index.
      * Set everything *after* that index to $-\infty$ (or 0 probability).
5.  **Renormalize:** Rescale the remaining chunk so it sums to 1.0.

###  Advantage

This adapts to the model's confidence (entropy).

  * **Low Entropy context (Model is sure):** "The capital of France is..."
      * "Paris" might be 0.99. The cumulative sum hits 0.90 immediately.
      * **Result:** The pool size is effectively **1**.
  * **High Entropy context (Model is unsure):** "I want to eat..."
      * "Pizza" (0.1), "Burger" (0.1), "Salad" (0.1)...
      * It takes many words to stack up to 0.90.
      * **Result:** The pool size dynamically expands to **10 or 20**.



In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional


class Decoder_Multi_Head_Attention(nn.Module):
    def __init__(self, n_head: int, d_model:int, dropout:float=0.0):
        super().__init__()
        assert d_model % n_head == 0   # should be divisible (d_model % n_head = 0)
        self.n_head = n_head
        self.d_head = d_model // n_head
        self.qkv = nn.Linear(d_model, 3 * d_model, bias=False)
        self.proj = nn.Linear(d_model, d_model, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x:torch.Tensor) -> None:
        B, T, C = x.shape
        qkv = self.qkv(x).view(B, T, 3, self.n_head, self.d_head)
        q, k, v = qkv.unbind(dim=2)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)
        out = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout.p if self.training else 0.0, is_causal=True)
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        out = self.proj(out)
        return out

## End Feed Forward of One Block
class FeedForward(nn.Module):
    def __init__(self, d_model: int, hidden_times: int = 3, dropout: float=0.0) -> None:
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, hidden_times*d_model),
            nn.GELU(),
            nn.Linear(hidden_times*d_model, d_model),
            nn.Dropout(dropout)
        )

    def forward(self, x: torch.Tensor):
        return self.net(x)

## one single tranformer block
class Block(nn.Module):
    def __init__(self, d_model: int, n_head: int, dropout: float=0.2) -> None:
        super().__init__()
        self.lr1Norm = nn.LayerNorm(d_model)
        self.multiHead_attention = Decoder_Multi_Head_Attention(n_head, d_model, dropout)
        self.lr2Norm = nn.LayerNorm(d_model)
        self.ffn = FeedForward(d_model, dropout=dropout)

    def forward(self, x: torch.Tensor):
        x = x + self.multiHead_attention(self.lr1Norm(x))
        x = x + self.ffn(self.lr2Norm(x))

        return x



## multi block architecture - GPT

class TinyGPT(nn.Module):
    def __init__(self, vocab_size: int, context_length: int, n_block: int = 4, n_head: int = 4, d_model: int = 256, dropout: float = 0.0):
        super().__init__()
        self.embedings = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model
        self.context_length = context_length
        self.pos_embedings = nn.Embedding(context_length, d_model)
        self.blocks = nn.Sequential(*[Block(d_model, n_head, dropout) for _ in range(n_block)])
        self.out_head = nn.Linear(d_model, vocab_size)
        self.lrNorm = nn.LayerNorm(d_model)

        self.apply(self._init_weights)

    # initialising the weights in normal distribution near zero - this results better than random initialising

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            nn.init.normal_(m.weight, mean=0.0, std= 0.02)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
        elif isinstance(m, nn.Embedding):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)




    def forward(self, idx: torch.Tensor, targets: Optional[torch.Tensor]=None):
        B, T = idx.shape
        if T > self.context_length:
            idx = idx[:, -self.context_length:]
            T = idx.size(1)

        pos = torch.arange(0, T, device=idx.device).unsqueeze(0)
        x = self.embedings(idx) + self.pos_embedings(pos)
        x = self.blocks(x)
        x = self.lrNorm(x)
        logits = self.out_head(x)
        loss = None
        if targets is not None :
            loss = F.cross_entropy(logits.flatten(0, 1), targets.flatten())

        return logits, loss


    @torch.no_grad()
    def generate(
        self,
        prompt: str = "",
        max_new_tokens: int = 200,
        temperature: float = 1,
        top_k: Optional[int] = None,
        top_p: Optional[int] = None,
        tokenizer=None  # pass your byteTokenizer instance
    ) -> str:
        """
        Generate text from a string prompt.
        Returns decoded string.
        """
        self.eval()

        # Encode prompt

        if tokenizer is None:
            raise ValueError("Tokenizer must be provided to encode prompt.")
        idx = tokenizer.encode(prompt)
        idx = torch.tensor([idx], dtype=torch.long, device=next(self.parameters()).device)

        # Generate loop
        for _ in range(max_new_tokens):
            # Crop context if too long
            idx_cond = idx if idx.size(1) <= self.context_length else idx[:, -self.context_length:]

            # Forward pass
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / max(temperature , 1e-6)

            # Optional: top-k sampling
            if top_k is not None and top_k > 0:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')

            # Optional: top-p (nucleus) sampling
            if top_p is not None and top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
                logits[indices_to_remove] = -float('Inf')

            # Sample
            probs = torch.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)

            # Append
            idx = torch.cat([idx, next_token], dim=1)


        return tokenizer.decode(idx[0].tolist())

In [3]:
class byteTokenizer:
    def __init__(self, file_path: str):
        with open(file_path, 'r', encoding='utf-8') as f:
            self.text = f.read()

        # Create sorted list of unique characters
        self.chars = sorted(set(self.text))
        print("Unique characters:", self.chars)
        print(f"Length of unique chars: {len(self.chars)}")

        # Mapping: char -> int and int -> char
        self.char_to_int = {ch: i for i, ch in enumerate(self.chars)}
        self.int_to_char = {i: ch for i, ch in enumerate(self.chars)}


    @property
    def vocab_size(self)-> int:
        return len(self.chars)


    def encode(self, s: str) -> list[int]:
        """Convert string to list of integers"""
        return [self.char_to_int[c] for c in s]

    def decode(self, l: list[int]) -> str:
        """Convert list of integers back to string"""
        return ''.join([self.int_to_char[i] for i in l])

## Dataset loader

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
# from tokenizer import byteTokenizer


class DatasetLoad(Dataset):
    def __init__(self, path: str, context_length: int):
        self.tokenizer = byteTokenizer(path)
        self.data = torch.tensor(self.tokenizer.encode(self.tokenizer.text), dtype=torch.long)
        self.context_length = context_length

    def __len__(self):
        return len(self.data)-self.context_length

    def __getitem__(self, idx):
        x = self.data[idx:idx+self.context_length]
        y = self.data[idx+1:idx+self.context_length+1]
        return x, y


def create_Dataloader(path: str, batch_size: int, context_length: int, train: float=0.8):
    dataset = DatasetLoad(path, context_length)
    n = len(dataset)
    train_size = int(n * train)
    val_size = n - train_size

    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

    return train_loader, val_loader, dataset



In [None]:
data_path = "/Data/Ramayan.txt"

## Training 

# Training Setup 

This guide explains how we teach the AI model. Think of training a model like teaching a student: you need a schedule, a good teaching method (optimizer), and ways to be efficient (mixed precision).

---

## 1. Learning Rate Schedule: How Fast We Learn

**The Concept:** The "Learning Rate" (LR) is the size of the step the model takes when it learns something new.

* **Big Step:** Learns fast, but might miss the correct answer.
* **Small Step:** Precise, but takes forever.

We use a strategy called **Warmup ‚Üí Cosine Decay ‚Üí Minimum LR**.

### How it works (The Journey)

1.  **Linear Warmup (The Takeoff):**
    * **What happens:** We start with a learning rate of `0` and slowly increase it to the maximum (`peak_lr`).
    * **Why:** At the very beginning, the model knows nothing (the weights are random). If we try to learn too fast immediately, the model gets confused and unstable. We start slow to let it "warm up."

2.  **Cosine Decay (The Landing):**
    * **What happens:** After the warmup, we slowly lower the learning rate following a smooth curve (like a slide).
    * **Why:** As the model gets smarter, it needs to make smaller, more careful adjustments to find the perfect answer.

3.  **Minimum LR (The Engine Idle):**
    * **What happens:** We never let the learning rate hit exactly zero. We stop at a `min_lr`.
    * **Why:** We want the model to keep learning slightly until the very last second.



### üìå Key Numbers:
| Parameter | Value | Simple Explanation |
| :--- | :--- | :--- |
| `warmup_steps` | 1000 | Take 1000 steps to slowly speed up from 0. |
| `peak_lr` | `5e-5` | The top speed. |
| `min_lr` | `5e-6` | The slowest speed we allow at the end. |

---

## 2. The Optimizer: AdamW (The Teacher)

We use **AdamW**. This is the algorithm that actually updates the model's brain.

### Why "AdamW" and not just "Adam"?
Standard **Adam** is great, but it handles "Weight Decay" in a way that can be messy. **AdamW** fixes this.

### Key Concepts Simplified:

1.  **Betas (`Œ≤1`, `Œ≤2`): The Memory**
    * Think of these as the optimizer's "momentum."
    * If the optimizer sees the model is improving in a certain direction, **Betas** help it remember that speed and keep going that way.
    * **Your values (0.9, 0.95):** These are standard for Large Language Models (LLMs). They help the model react quickly to new information.

2.  **Weight Decay: The Cleaner**
    * **The Problem:** Sometimes models try to memorize the training data exactly (like a student memorizing answers instead of understanding the topic). This is called "overfitting."
    * **The Solution:** Weight decay penalizes the model for having "weights" that are too large or complex. It forces the model to keep things simple.
    * **Why AdamW is better:** In AdamW, this cleaning process is separate from the learning rate. This means the cleaning stays consistent even when the learning rate slows down.

---

## 3. Mixed Precision Training (Detailed Explanation)

This technique allows us to train larger models faster without buying more expensive computers.

### The Problem: Precision vs. Speed
Computers usually store numbers in **FP32** (32-bit Floating Point).
* **FP32:** Very precise (e.g., `0.123456789`), but takes up a lot of memory and is slower to calculate.
* **FP16:** Less precise (e.g., `0.1234`), but takes half the memory and is super fast.

### The Solution: Mixed Precision
We use **both** formats to get the best of both worlds.



### Step-by-Step Process:

1.  **Master Weights (FP32):**
    We keep a "Master Copy" of the model in high quality (FP32). This is our safety net.
2.  **Forward Pass (FP16):**
    When the model reads data and makes a guess, we convert the weights to the fast, lower quality format (FP16). This makes the calculation lightning fast.
3.  **Backward Pass (FP16):**
    When we calculate the errors (gradients), we also do this in FP16 to stay fast.
4.  **The Update (FP32):**
    We take those small error calculations and update the **Master Copy (FP32)**.
    * *Why?* Because the updates are often very tiny numbers. If we tried to update an FP16 model with a tiny number, the computer might think the number is zero and do nothing. The FP32 Master Copy is sensitive enough to catch these tiny changes.

### The "Loss Scaler" (Important!)
Because FP16 cannot handle very small numbers (they turn to zero), we use a trick called **Scaling**.
1.  **Multiply:** Before we calculate the error, we multiply the Loss by a huge number (e.g., 65,000). This makes the small numbers big enough for FP16 to see.
2.  **Calculate:** We do the math.
3.  **Divide:** Before updating the Master Weights, we divide by that same huge number to return the values to normal size.

---

## 4. Gradient Clipping (Detailed Explanation)

You asked about `torch.nn.utils.clip_grad_norm_`. This is a safety mechanism for training stability.

### The Analogy: Walking Down a Hill
Imagine training is like walking down a steep mountain to find the bottom (the lowest loss).
* Usually, you take reasonable steps.
* Sometimes, you encounter a "bad" piece of data that tells you to take a **massive jump**.
* If you take that jump, you might fly off the map and ruin everything. This is called an **Exploding Gradient**.

### How Clipping Works
**Gradient Clipping** puts a speed limit on your steps.

1.  **Calculate Norm:** The computer looks at the total size (magnitude) of all the proposed updates (gradients) for the whole model.
2.  **Check Limit:** You set a limit (e.g., `1.0`).
3.  **Clip (Cut):**
    * If the total size is **0.8**, nothing happens (it's under the limit).
    * If the total size is **5.0** (too big!), the computer shrinks the step down until the size is exactly **1.0**.



> **Crucial Note:** It shrinks the *size* of the step, but it keeps the *direction* exactly the same. You still go the right way, just with a safer, smaller step.

### Why use it?
* It prevents the training loss from suddenly spiking to `NaN` (Not a Number) or Infinity.
* It allows you to use higher learning rates without crashing the model.

---

## Summary Checklist

| Feature | What it is | Why we use it |
| :--- | :--- | :--- |
| **Cosine Decay** | Slows down learning smoothly | Helps land on the best solution. |
| **AdamW** | The "brain" updater | Handles model cleanup (weight decay) perfectly. |
| **Mixed Precision** | Using FP16 + FP32 | **2x Faster** training, uses **40% less memory**. |
| **Grad Scaler** | Multiplies small numbers | Prevents errors from vanishing in FP16. |
| **Gradient Clipping** | A speed limit for updates | Prevents the model from crashing due to bad data. |


In [None]:
import torch
import torch.optim as optim

from tqdm import tqdm
import time
from pathlib import Path

DATA_PATH = data_path
CONTEXT_LENGTH = 512
BATCH_SIZE = 64
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Training settings
PRECISION = "float16"
MAX_ITERS = 1000
EVAL_ITERS = 50
LOG_INTERVAL = 10
GRAD_CLIP = 1.0
LEARNING_RATE = 6e-3
WARMUP_ITERS = 150
MIN_LR = 6e-5
WEIGHT_DECAY = 0.1

# Model config
N_LAYER = 4
N_HEAD = 6
D_MODEL = 252
DROPOUT = 0.1

# runs/models path
CKPT_DIR = Path("/content/runs")
CKPT_DIR.mkdir(exist_ok=True)

def get_lr(it: int):
    if it < WARMUP_ITERS:
        return LEARNING_RATE * (it + 1) / WARMUP_ITERS
    if it > MAX_ITERS:
        return MIN_LR
    decay_ratio = (it - WARMUP_ITERS) / (MAX_ITERS - WARMUP_ITERS)
    coeff = 0.5 * (1.0 + torch.cos(torch.pi * torch.tensor(decay_ratio)))
    return MIN_LR + coeff * (LEARNING_RATE - MIN_LR)


@torch.no_grad()
def estimate_loss(model, train_loader, val_loader, device, eval_iters=EVAL_ITERS):
    model.eval()
    losses = {}
    for split, loader in [("train", train_loader), ("val", val_loader)]:
        total_loss = 0.0
        for _ in range(eval_iters):
            x, y = next(iter(loader))
            x, y = x.to(device), y.to(device)
            with torch.autocast(device_type="cuda", dtype=getattr(torch, PRECISION)):
                _, loss = model(x, y)
            total_loss += loss.item()
        losses[split] = total_loss / eval_iters
    model.train()
    return losses


def train():
    print(f"Using device: {DEVICE} | Precision: {PRECISION}")

    # Create data loaders
    train_loader, val_loader, dataset = create_Dataloader(
        path=DATA_PATH,
        batch_size=BATCH_SIZE,
        context_length=CONTEXT_LENGTH,
        train=0.9  # 90% train, 10% val
    )

    # Get vocab size from tokenizer (we need to extract it)
    vocab_size = dataset.tokenizer.vocab_size
    print(f"Vocab size: {vocab_size:,}")
    print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")

    # Initialize model
    model = TinyGPT(
        vocab_size=vocab_size,
        context_length=CONTEXT_LENGTH,
        n_block=N_LAYER,
        n_head=N_HEAD,
        d_model=D_MODEL,
        dropout=DROPOUT
    ).to(DEVICE)

    # Compile the model for better training 
    if hasattr(torch, "compile"):
        print("Compiling model..")
        model = torch.compile(model)

    optimizer = optim.AdamW(
        model.parameters(),
        lr=LEARNING_RATE,
        betas=(0.9, 0.95),
        weight_decay=WEIGHT_DECAY
    )
    scaler = torch.GradScaler(enabled=(PRECISION == "float16"))

    print(f"Total parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M") # you can change this to billion if you have more bigger model, if you want. 1e9:.2f}B
    print("Starting training...\n")

    iter_num = 0
    best_val_loss = float("inf")
    start_time = time.time()

    data_iter = iter(train_loader)

    pbar = tqdm(range(MAX_ITERS), desc="Training")
    for iter_num in pbar:
        # Learning rate schedule
        lr = get_lr(iter_num)
        for g in optimizer.param_groups:
            g["lr"] = lr

        try:
            x, y = next(data_iter)
        except StopIteration:
            data_iter = iter(train_loader)
            x, y = next(data_iter)

        x, y = x.to(DEVICE), y.to(DEVICE)

        # Forward + backward with AMP
        with torch.autocast(device_type="cuda", dtype=getattr(torch, PRECISION)):
            _, loss = model(x, y)

        scaler.scale(loss).backward()

        if GRAD_CLIP > 0.0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)

        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)

        # Logging
        if iter_num % LOG_INTERVAL == 0:
            tokens_per_sec = (BATCH_SIZE * CONTEXT_LENGTH * LOG_INTERVAL) / (time.time() - start_time + 1e-8)
            start_time = time.time()
            pbar.set_postfix({
                "loss": f"{loss.item():.4f}",
                "lr": f"{lr:.2e}",
                "tok/s": f"{tokens_per_sec:.0f}"
            })

        # Evaluation
        if iter_num % EVAL_ITERS == 0 or iter_num == MAX_ITERS - 1:
            losses = estimate_loss(model, train_loader, val_loader, DEVICE)
            print(f"\nStep {iter_num:,} | "
                  f"Train: {losses['train']:.4f} | "
                  f"Val: {losses['val']:.4f} | "
                  f"LR: {lr:.2e}")

            # Save best model
            if losses["val"] < best_val_loss:
                best_val_loss = losses["val"]
                torch.save({
                    "iter": iter_num,
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "scaler_state_dict": scaler.state_dict(),
                    "val_loss": best_val_loss,
                    "config": {
                        "vocab_size": vocab_size,
                        "context_length": CONTEXT_LENGTH,
                        "n_block": N_LAYER,
                        "n_head": N_HEAD,
                        "d_model": D_MODEL,
                    }
                }, CKPT_DIR / "best_model.pt")
                print("New best model saved!")

        # Periodic checkpoint
        if iter_num % 200 == 0 and iter_num > 0:
            torch.save(model.state_dict(), CKPT_DIR / f"checkpoint_iter{iter_num}.pt")

    print(f"\nTraining finished! Best validation loss: {best_val_loss:.4f}")


In [16]:
train()

Using device: cuda | Precision: float16
Unique characters: ['\n', ' ', '!', '"', '&', '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '}', '√Å', '√Ü', '√â', '√ö', '√ú', '√†', '√°', '√¢', '√¶', '√ß', '√®', '√©', '√™', '√´', '√¨', '√≠', '√Æ', '√Ø', '√±', '√≤', '√¥', '√∂', '√π', '√∫', '√ª', '√º', '≈Ñ', '≈í', '≈ì', '≈ö', '≈õ', '«π', 'Œë', 'Œö', 'Œü', 'Œ†', 'Œ£', 'Œ¨', 'Œ≠', 'ŒÆ', 'ŒØ', 'Œ±', 'Œ≤', 'Œ≥', 'Œ¥', 'Œµ', 'Œ∑', 'Œ∏', 'Œπ', 'Œ∫', 'Œª', 'Œº', 'ŒΩ', 'Œæ', 'Œø', 'œÄ', 'œÅ', 'œÇ', 'œÉ', 'œÑ', 'œÖ', 'œÜ', 'œá', 'œâ', 'œå', 'œç', 'œé', '◊î', '◊ï', '◊ô', '◊¢', '◊¶', '·∏ç', '·πÖ', '·πá', '·πõ', '·π£', '·π≠', '·ºÄ', '·ºê', '·ºò', '·ºù', '·º°', '·º§', '·º∞', '·ºµ',

Training:   0%|          | 1/1000 [01:34<26:06:47, 94.10s/it, loss=5.2445, lr=4.00e-05, tok/s=6218]


Step 0 | Train: 4.9827 | Val: 4.9908 | LR: 4.00e-05
New best model saved!


Training:   5%|‚ñå         | 51/1000 [01:57<1:13:21,  4.64s/it, loss=2.5092, lr=2.04e-03, tok/s=205838]


Step 50 | Train: 2.5061 | Val: 2.5121 | LR: 2.04e-03
New best model saved!


Training:  10%|‚ñà         | 101/1000 [02:20<1:10:34,  4.71s/it, loss=2.4653, lr=4.04e-03, tok/s=200616]


Step 100 | Train: 2.4526 | Val: 2.4492 | LR: 4.04e-03
New best model saved!


Training:  15%|‚ñà‚ñå        | 151/1000 [02:44<1:07:29,  4.77s/it, loss=2.4336, lr=6.00e-03, tok/s=190642]


Step 150 | Train: 2.4407 | Val: 2.4377 | LR: 6.00e-03
New best model saved!


Training:  20%|‚ñà‚ñà        | 201/1000 [03:07<1:00:32,  4.55s/it, loss=2.4547, lr=5.95e-03, tok/s=188140]


Step 200 | Train: 2.4303 | Val: 2.4293 | LR: 5.95e-03
New best model saved!


Training:  25%|‚ñà‚ñà‚ñå       | 251/1000 [03:31<59:56,  4.80s/it, loss=2.3348, lr=5.80e-03, tok/s=194823]


Step 250 | Train: 2.3357 | Val: 2.3391 | LR: 5.80e-03
New best model saved!


Training:  30%|‚ñà‚ñà‚ñà       | 301/1000 [03:54<54:31,  4.68s/it, loss=2.0633, lr=5.56e-03, tok/s=193530]


Step 300 | Train: 2.0031 | Val: 2.0011 | LR: 5.56e-03
New best model saved!


Training:  35%|‚ñà‚ñà‚ñà‚ñå      | 351/1000 [04:18<51:50,  4.79s/it, loss=1.8022, lr=5.22e-03, tok/s=187446]


Step 350 | Train: 1.7648 | Val: 1.7604 | LR: 5.22e-03
New best model saved!


Training:  40%|‚ñà‚ñà‚ñà‚ñà      | 401/1000 [04:43<47:40,  4.78s/it, loss=1.6874, lr=4.82e-03, tok/s=180062]


Step 400 | Train: 1.6280 | Val: 1.6223 | LR: 4.82e-03
New best model saved!


Training:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 451/1000 [05:07<42:49,  4.68s/it, loss=1.6538, lr=4.35e-03, tok/s=180108]


Step 450 | Train: 1.5553 | Val: 1.5512 | LR: 4.35e-03
New best model saved!


Training:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 501/1000 [05:31<39:05,  4.70s/it, loss=1.5839, lr=3.84e-03, tok/s=179559]


Step 500 | Train: 1.5000 | Val: 1.4925 | LR: 3.84e-03
New best model saved!


Training:  55%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 551/1000 [05:56<36:18,  4.85s/it, loss=1.5503, lr=3.30e-03, tok/s=177843]


Step 550 | Train: 1.4554 | Val: 1.4531 | LR: 3.30e-03
New best model saved!


Training:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 601/1000 [06:21<31:56,  4.80s/it, loss=1.4794, lr=2.76e-03, tok/s=174017]


Step 600 | Train: 1.4202 | Val: 1.4134 | LR: 2.76e-03
New best model saved!


Training:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 651/1000 [06:45<27:52,  4.79s/it, loss=1.4380, lr=2.22e-03, tok/s=179989]


Step 650 | Train: 1.3884 | Val: 1.3792 | LR: 2.22e-03
New best model saved!


Training:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 701/1000 [07:10<24:09,  4.85s/it, loss=1.4003, lr=1.71e-03, tok/s=173588]


Step 700 | Train: 1.3610 | Val: 1.3529 | LR: 1.71e-03
New best model saved!


Training:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 751/1000 [07:34<19:30,  4.70s/it, loss=1.3768, lr=1.24e-03, tok/s=177902]


Step 750 | Train: 1.3374 | Val: 1.3294 | LR: 1.24e-03
New best model saved!


Training:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 801/1000 [07:59<16:01,  4.83s/it, loss=1.3567, lr=8.35e-04, tok/s=180850]


Step 800 | Train: 1.3229 | Val: 1.3052 | LR: 8.35e-04
New best model saved!


Training:  85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 851/1000 [08:24<11:54,  4.79s/it, loss=1.3556, lr=5.05e-04, tok/s=166902]


Step 850 | Train: 1.3030 | Val: 1.2915 | LR: 5.05e-04
New best model saved!


Training:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 901/1000 [08:48<07:44,  4.69s/it, loss=1.3818, lr=2.61e-04, tok/s=171934]


Step 900 | Train: 1.2902 | Val: 1.2814 | LR: 2.61e-04
New best model saved!


Training:  95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 951/1000 [09:13<03:53,  4.76s/it, loss=1.3219, lr=1.11e-04, tok/s=179957]


Step 950 | Train: 1.2815 | Val: 1.2744 | LR: 1.11e-04
New best model saved!


Training: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [09:37<00:00,  1.73it/s, loss=1.3496, lr=6.20e-05, tok/s=181518]


Step 999 | Train: 1.2739 | Val: 1.2712 | LR: 6.00e-05
New best model saved!

Training finished! Best validation loss: 1.2712





# Generation Pipeline

In [None]:
import torch
from pathlib import Path

CKPT_PATH = Path("/content/runs/best_model.pt")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Generation Settings
PROMPT = "ram"
MAX_NEW_TOKENS = 2000
TEMPERATURE = 0.8
TOP_K = 50

def load_best_model(ckpt_path, device):
    """Same loading logic as before to ensure config matches"""
    if not ckpt_path.exists():
        raise FileNotFoundError(f"Checkpoint not found at {ckpt_path}")

    checkpoint = torch.load(ckpt_path, map_location=device)
    config = checkpoint['config']

    model = TinyGPT(
        vocab_size=config['vocab_size'],
        context_length=config['context_length'],
        n_block=config['n_block'],
        n_head=config['n_head'],
        d_model=config['d_model'],
        dropout=0.0
    )

    state_dict = checkpoint['model_state_dict']
    unwanted_prefix = '_orig_mod.'
    for k, v in list(state_dict.items()):
        if k.startswith(unwanted_prefix):
            state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)

    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    return model

@torch.no_grad()
def stream_generate(model, tokenizer, prompt, max_new_tokens, temperature=1.0, top_k=None):
    """
    Generates text and prints it to stdout immediately as tokens are created.
    """
    # 1. Encode and setup
    idx = tokenizer.encode(prompt)
    idx = torch.tensor([idx], dtype=torch.long, device=DEVICE)

    print(f"Prompt: {prompt}", end="", flush=True)

    current_text = tokenizer.decode(idx[0].tolist())
    len_printed = len(current_text)

    # Generation Loop
    for _ in range(max_new_tokens):
        idx_cond = idx if idx.size(1) <= model.context_length else idx[:, -model.context_length:]

        # Forward pass
        logits, _ = model(idx_cond)
        logits = logits[:, -1, :] / max(temperature, 1e-6)

        # Top-K Sampling
        if top_k is not None and top_k > 0:
            v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
            logits[logits < v[:, [-1]]] = -float('Inf')

        probs = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)

        idx = torch.cat([idx, next_token], dim=1)

        full_text = tokenizer.decode(idx[0].tolist())
        new_text = full_text[len_printed:]

        print(new_text, end="", flush=True) 
        len_printed += len(new_text)

    print("\n\n--- End of Generation ---")

try:
    model = load_best_model(CKPT_PATH, DEVICE)
    tokenizer = byteTokenizer(data_path)
    # Generation
    print(f"\n--- Streaming Generation (Temp: {TEMPERATURE}) ---\n")
    stream_generate(
        model=model,
        tokenizer=tokenizer, # Requires your tokenizer object
        prompt=PROMPT,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        top_k=TOP_K
    )

except Exception as e:
    print(f"An error occurred: {e}")

Unique characters: ['\n', ' ', '!', '"', '&', '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '}', '√Å', '√Ü', '√â', '√ö', '√ú', '√†', '√°', '√¢', '√¶', '√ß', '√®', '√©', '√™', '√´', '√¨', '√≠', '√Æ', '√Ø', '√±', '√≤', '√¥', '√∂', '√π', '√∫', '√ª', '√º', '≈Ñ', '≈í', '≈ì', '≈ö', '≈õ', '«π', 'Œë', 'Œö', 'Œü', 'Œ†', 'Œ£', 'Œ¨', 'Œ≠', 'ŒÆ', 'ŒØ', 'Œ±', 'Œ≤', 'Œ≥', 'Œ¥', 'Œµ', 'Œ∑', 'Œ∏', 'Œπ', 'Œ∫', 'Œª', 'Œº', 'ŒΩ', 'Œæ', 'Œø', 'œÄ', 'œÅ', 'œÇ', 'œÉ', 'œÑ', 'œÖ', 'œÜ', 'œá', 'œâ', 'œå', 'œç', 'œé', '◊î', '◊ï', '◊ô', '◊¢', '◊¶', '·∏ç', '·πÖ', '·πá', '·πõ', '·π£', '·π≠', '·ºÄ', '·ºê', '·ºò', '·ºù', '·º°', '·º§', '·º∞', '·ºµ', '·º∏', '·ºº', '·ºΩ', '·ΩÅ', '·Ωâ', '·Ωç