# Making Chat Bot to Just HangOut.. :)

# Train The Model

In [1]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer
import numpy as np 
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
#specailTokens = {"additional_special_tokens": ["<context>", "<sep>", "<end>"]}
#tokenizer.add_special_tokens(specailTokens)

batchsiz = 64
blocksiz = 128  
epochs = 6000
evalIntervals = 500
lr = 3e-4
device = "cuda" if torch.cuda.is_available() else "cpu"
evalIters = 200
nemb = 256
nhead = 6
nlayers = 6
dropout = 0.2

with open("/kaggle/input/freind-chatbot-final-model/train_personality.csv", 'r', encoding='utf-8') as file:
    lines = file.readlines()
    txt = lines[:100000]

"""
def prepData(row, tokenizer):
    context = row["Context"]
    response = row["Response"]
    combined = f"<context> {context} <sep> {response} <end>"
    tokens = tokenizer(
        combined,
        return_tensors="pt",
        truncation=True,
        max_length=blocksiz, 
        padding="max_length"
    )["input_ids"]
    return tokens.squeeze(0)

def enc(txt, tokenizer):
    tokens = tokenizer(
        txt, 
        return_tensors="pt", 
        truncation=True, 
        max_length=blocksiz, 
        padding=True
)["input_ids"]

    return tokens.flatten()
"""

def enc(txt, tokenizer):
    tokens = tokenizer(
        txt,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )["input_ids"]
    return tokens.flatten()


#data = enc(txt, tokenizer) #dtype=torch.long // no need Cos a already a tensor
#data = torch.cat([prepData(row, tokenizer) for _, row in df.iterrows()])
data = torch.tensor(enc(txt, tokenizer))
n = int(0.9*len(data))
trainData = data[:n]
valData = data[n:]
vocabsiz = tokenizer.vocab_size

def getBatch(split):
    data = trainData if split == "train" else valData
    ix = torch.randint(len(data) - blocksiz, (batchsiz, ))
    x = torch.stack([data[i:i+blocksiz] for i in ix])
    y = torch.stack([data[i+1:i+blocksiz+1] for i in ix])
    x, y = x.to(device), y.to(device)

    return x, y 
    
@torch.no_grad()
def estimateLoss():
    out = { }
    model.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(evalIters)
        for k in range(evalIters):
            x, y = getBatch(split)
            logits, loss = model(x, y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out 

class Head(nn.Module):
    def __init__(self, headsiz):
        super().__init__()
        self.key = nn.Linear(nemb, headsiz, bias=False)
        self.quary = nn.Linear(nemb, headsiz, bias=False)
        self.value = nn.Linear(nemb, headsiz, bias=False)
        self.dropout = nn.Dropout(dropout)

        self.register_buffer("tril", torch.tril(torch.ones(blocksiz, blocksiz)))
    
    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.quary(x)

        w = q @ k.transpose(-2, -1)  * (k.size(-1)**-0.5)
        w = w.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        w = F.softmax(w, dim=-1)
        w = self.dropout(w)

        v = self.value(x)
        out = w @ v 
        return out 

class MultiHeadAttention(nn.Module):
    def __init__(self, nhead, headsiz):
        super().__init__()
        self.heads = nn.ModuleList([Head(headsiz) for _ in range(nhead)])
        self.proj = nn.Linear(headsiz * nhead, nemb)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out 

class FeedForwardNetwork(nn.Module):
    def __init__(self, nemb):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(nemb, 4 * nemb), 
            nn.ReLU(), 
            nn.Linear(4 * nemb, nemb), 
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class TransformerBlock(nn.Module):
    def __init__(self, nemb, nhead):
        super().__init__()
        headsiz = nemb // nhead
        self.sa = MultiHeadAttention(nhead, headsiz)
        self.ffn = FeedForwardNetwork(nemb)
        self.ln_1 = nn.LayerNorm(nemb)
        self.ln_2 = nn.LayerNorm(nemb)

    def forward(self, x):
        x = x + self.sa(self.ln_1(x))
        x = x + self.ffn(self.ln_2(x))
        return x 

class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.wte = nn.Embedding(vocabsiz, nemb)
        self.wpe = nn.Embedding(blocksiz, nemb)
        self.block = nn.Sequential(*[TransformerBlock(nemb, nhead=nhead) for _ in range(nlayers)])
        self.ln_final = nn.LayerNorm(nemb)
        self.lm_head = nn.Linear(nemb, vocabsiz)
    
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, ix, targt=None):
        B, T = ix.shape
        
        tokEmb = self.wte(ix)
        posEmb = self.wpe(torch.arange(T, device=device))
        x = tokEmb + posEmb
        x = self.block(x)
        x = self.ln_final(x)

        logits = self.lm_head(x)

        if targt is None:
            loss = None
        else:
            B, T, C = logits.shape 
            logits = logits.view(B*T, C)
            targt = targt.view(B*T)
            loss = F.cross_entropy(logits, targt)
        
        return logits, loss 
    
    def generate(self, ix, maxNewTok, tokenizer):
        for _ in range(maxNewTok):
            ixCond = ix[:, -blocksiz:]
            logits, loss = self(ixCond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            ixNxt = torch.multinomial(probs, num_samples=1)
            ix = torch.cat((ix, ixNxt), dim=1)
        
        gen = tokenizer.decode(ix[0].cpu().numpy().tolist(), skip_special_tokens=True)
        return gen 

model = GPT()
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

useCompile = False
if useCompile:
    model = torch.compile(model)
    print("using Compile")
else:
    print("not Using Compile")

optim = torch.optim.AdamW(model.parameters(), lr=lr)

lossi = [ ]
trainLoss = [ ]
valLoss = [ ]
#scaler = torch.cuda.amp.GradScaler()
print("start Training")
for i in range(epochs):
    if i % evalIntervals == 0 or i == epochs - 1:
        losses = estimateLoss()
        trainLoss.append(losses["train"].item())
        valLoss.append(losses['val'].item())
        lossi.append(i)
        print(f"Step: {i} | train loss {losses['train']:.4f} | val loss {losses['val']:.4f}")
    
    xb, yb = getBatch("train")
    #with torch.cuda.amp.autocast():
    logits, loss = model(xb, yb)

    optim.zero_grad(set_to_none=True)
    loss.backward()
    optim.step()
    #scaler.scale(loss).backward()
    #scaler.step(optim)
    #scaler.update()



def saveCheckpnt(model, optimizer, epoch, loss, file):
    checkPnt = {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "epoch": epoch,
        "loss": loss,
    }
    torch.save(checkPnt, file)

# Saving model checkpoint
saveCheckpnt(model, optim, epochs-1, valLoss[-1], "FreindBotModelTrain.pth")

tokenizer_config.json:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

  data = torch.tensor(enc(txt, tokenizer))


29.957632 M parameters
not Using Compile
start Training
Step: 0 | train loss 10.7144 | val loss 10.7140
Step: 500 | train loss 0.8200 | val loss 0.8206
Step: 1000 | train loss 0.6990 | val loss 0.7115
Step: 1500 | train loss 0.6557 | val loss 0.6715
Step: 2000 | train loss 0.6368 | val loss 0.6464
Step: 2500 | train loss 0.6131 | val loss 0.6282
Step: 3000 | train loss 0.5905 | val loss 0.6144
Step: 3500 | train loss 0.5759 | val loss 0.6067
Step: 4000 | train loss 0.5633 | val loss 0.5919
Step: 4500 | train loss 0.5557 | val loss 0.5831
Step: 5000 | train loss 0.5413 | val loss 0.5831
Step: 5500 | train loss 0.5342 | val loss 0.5792
Step: 5999 | train loss 0.5290 | val loss 0.5697


# Genarate From The Model

In [2]:

#generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)  # Initial context
genTxt = model.generate(context, maxNewTok=500, tokenizer=tokenizer)
print(genTxt)


 my dad was conservative before. i once became a student he is an artist. i love my name is jacob. i live in texas.,"hello . my roommate is 2 . what kind of a cabin is yours going to get married
i vacation all of the time
what is your favorite color ? mine is blue .
no , i own 112 f47 siblings . . . i bread
that blows . i hear that , even though ."
"i never heard of
