<a href="https://colab.research.google.com/github/BraedynL0530/PortfolioWebsite/blob/master/Multi_head_classfication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from torch.autograd import forward_ad
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
from dataclasses import  dataclass
torch.manual_seed(42)

#config
@dataclass
class config:
  """Hyperparameters shared by every module of the transformer.

  Fields, in constructor order:
    vocab_size:   number of unique tokens (rows of the token embedding table).
    block_size:   maximum context length, i.e. how many tokens back the model
                  can see (rows of the position embedding table).
    n_layer:      number of stacked transformer blocks; more layers means more
                  capacity and more training time.
    n_head:       attention heads per layer; n_embd must be divisible by it.
    n_embd:       size of the vector used for each token.
    dropout:      dropout probability used to reduce overfitting.
    pad_token_id: id of the padding token.
  """

  vocab_size: int
  block_size: int
  n_layer: int
  n_head: int
  n_embd: int
  dropout: float
  pad_token_id: int



"""
self attention: part 1 of transformer
Q K V, query key value. helps use the two embeddings to learn diffrent meanings for words and give the diffrent vectors even if the same word
below is theory class is optimized, it condences the prjections into one huge vector and splits. other than that its nearly identical just more efficent
"""

"""
#learnable components
q_prog = nn.Linear(C, C, bias =False)
k_prog = nn.Linear(C, C, bias =False)
v_prog = nn.Linear(C, C, bias =False)

#weights
q_prog.weight.data = torch.randn(C,C)
k_prog.weight.data = torch.randn(C,C)
v_prog.weight.data = torch.randn(C,C)

#perform projection
q = q_prog(x)
k = k_prog(x)
v = v_prog(x)

scores = q @ k.transpose(-2,-1)
print("scores",scores)



Attention(Q,K,V) = softmax(QK^T / sqrt(d_k)) V

d_k = k.size(-1)  # last dimension of k (size of each key vector)
scaled_scores = scores / math.sqrt(d_k)
attention_weights = F.softmax(scaled_scores, dim=-1)
print("scaled scores", scaled_scores)
print("scaled scores -> percentages", attention_weights)

# aggregation: last part of attention!
output = attention_weights @ v
print("output!:",output)

"""

# Core logic for MultiHead
class CausalSelfAttention(nn.Module):
  """Multi-head self-attention with an optional padding mask.

  Despite the name (kept so existing references and checkpoints keep working),
  no causal mask is applied in forward: attention is bidirectional and every
  query can attend to every non-padded key.

  Args:
    config: object exposing n_embd, n_head, dropout and block_size.
      n_embd must be divisible by n_head.
  """

  def __init__(self, config: config):
    super().__init__()
    assert config.n_embd % config.n_head == 0
    self.n_head = config.n_head
    self.n_embd = config.n_embd
    # Fused layer: one matmul produces Q, K and V together (more efficient
    # than three separate projections).
    self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
    self.attn_drop = nn.Dropout(config.dropout)
    # Lower-triangular causal mask. forward() no longer uses it (attention is
    # bidirectional), but it stays registered so state_dict keys do not change.
    self.register_buffer(
        "bias",  # buffer name
        torch.tril(torch.ones(config.block_size, config.block_size))
        .view(1, 1, config.block_size, config.block_size)
    )

    self.c_proj = nn.Linear(config.n_embd, config.n_embd)

  def forward(self, x, pad_mask=None):
    """Apply self-attention to x.

    Args:
      x: tensor of shape (B, T, C) with C == n_embd.
      pad_mask: optional (B, T) tensor where 0 marks padding positions; those
        positions are excluded as attention keys. None means no masking.

    Returns:
      Tensor of shape (B, T, C).
    """
    B, T, C = x.size()
    head_dim = C // self.n_head

    # project once -> split into query / key / value
    q, k, v = self.c_attn(x).split(C, dim=2)

    # reshape into heads: (B, T, C) -> (B, n_head, T, head_dim)
    q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
    k = k.view(B, T, self.n_head, head_dim).transpose(1, 2)
    v = v.view(B, T, self.n_head, head_dim).transpose(1, 2)

    # scaled dot-product scores, shape (B, n_head, T, T)
    att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))
    # No causal mask here: future tokens stay visible on purpose.

    # Prevent attending to padding tokens (must happen BEFORE softmax).
    if pad_mask is not None:
      # Use the most negative finite value rather than -inf: for ordinary rows
      # the masked keys still get exactly zero weight, but a sequence that is
      # entirely padding no longer turns into NaN after softmax.
      att = att.masked_fill(
          pad_mask[:, None, None, :T] == 0,
          torch.finfo(att.dtype).min
      )

    att = F.softmax(att, dim=-1)
    att = self.attn_drop(att)

    # aggregate values with the attention weights
    y = att @ v

    # merge heads back: (B, n_head, T, head_dim) -> (B, T, C)
    y = y.transpose(1, 2).contiguous().view(B, T, C)

    # final projection
    return self.c_proj(y)


class MLP(nn.Module):
  """Feed-forward part of a transformer block.

  Expands each token vector to 4x its width (more room to combine features),
  applies GELU, projects back to n_embd so the result can be added to the
  attention output, then applies dropout.
  """

  def __init__(self, config: config):
    super().__init__()
    width = config.n_embd
    hidden = 4 * width
    self.fc = nn.Linear(width, hidden)       # expand
    self.proj = nn.Linear(hidden, width)     # condense back
    self.drop = nn.Dropout(config.dropout)   # rate comes from config

  def forward(self, x):
    # GELU keeps fc and proj from collapsing into a single linear map.
    activated = F.gelu(self.fc(x))
    return self.drop(self.proj(activated))

class Block(nn.Module):
  """One transformer layer: attention and MLP, each wrapped in a residual add.

  Every sub-layer computes f(x) + x instead of f(x), so what was already
  learned is carried forward rather than relearned. LayerNorm is applied to
  the input of each sub-layer (pre-norm).
  """

  def __init__(self, config: config):
    super().__init__()
    width = config.n_embd
    self.ln_1 = nn.LayerNorm(width)
    self.attn = CausalSelfAttention(config)
    self.ln_2 = nn.LayerNorm(width)
    self.mlp = MLP(config)

  def forward(self, x, pad_mask=None):
    # Attention sub-layer; pad_mask is handed straight to the attention module.
    attended = self.attn(self.ln_1(x), pad_mask=pad_mask)
    x = x + attended
    # Feed-forward sub-layer.
    return x + self.mlp(self.ln_2(x))

In [None]:

class NLP(nn.Module):
  """Transformer encoder that returns the final hidden state of every token.

  Token and position embeddings are summed, passed through n_layer Blocks
  (padding positions are masked out of attention) and normalized by a final
  LayerNorm. An lm_head whose weight is tied to the token embedding is created
  here, but forward() does not apply it.
  """

  def __init__(self, config: config):
    super().__init__()
    # Input embeddings: one table for token ids, one for positions.
    self.wte = nn.Embedding(config.vocab_size, config.n_embd)
    self.wpe = nn.Embedding(config.block_size, config.n_embd)
    self.drop = nn.Dropout(config.dropout)
    self.config = config
    self.pad_token_id = config.pad_token_id

    # Processing: a stack of blocks; data flows through them in sequence.
    self.h = nn.ModuleList([Block(config) for _ in range(config.n_layer)])

    # Output layers: final layer norm plus a language-model head that maps a
    # hidden vector to one raw score per vocabulary entry.
    self.ln_f = nn.LayerNorm(config.n_embd)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

    # Weight tying: the head reuses the token embedding matrix.
    self.lm_head.weight = self.wte.weight

    self.apply(self._init_weights)

  def _init_weights(self, module):
    """Initialise Linear/Embedding weights from N(0, 0.02); zero Linear biases."""
    if not isinstance(module, (nn.Linear, nn.Embedding)):
      return
    nn.init.normal_(module.weight, mean=0.0, std=0.02)
    if isinstance(module, nn.Linear) and module.bias is not None:
      nn.init.zeros_(module.bias)

  def forward(self, idx):
    """Encode a batch of token ids.

    Args:
      idx: (B, T) tensor of token ids with T <= config.block_size.

    Returns:
      (B, T, n_embd) tensor of hidden states after the final LayerNorm.
    """
    _, T = idx.size()

    assert T <= self.config.block_size, f"Sequence length {T} exceeds block_size {self.config.block_size}"

    is_pad = idx == self.pad_token_id

    # Padding positions contribute no token embedding (position only).
    tok_emb = self.wte(idx).masked_fill(is_pad.unsqueeze(-1), 0)
    positions = torch.arange(T, device=idx.device).unsqueeze(0)
    x = self.drop(tok_emb + self.wpe(positions))

    # Mask: 1 for real tokens, 0 for padding.
    pad_mask = (~is_pad).float()

    # Run the transformer blocks, passing the mask to each one.
    for block in self.h:
      x = block(x, pad_mask=pad_mask)

    # Final layer norm
    return self.ln_f(x)

