# Train Model,,.

In [None]:
import torch
import torch.nn as nn 
import torch.nn.functional as F 
from transformers import AutoTokenizer
import pandas as pd 

with open("/kaggle/input/200232823/train.csv", 'r', encoding='utf-8') as file:
    txt = file.read()

# hyper para
blocksiz = 128
batchsiz = 64
epochs = 700
evalIntervals = 200
evaliters = 50
nemb = 42
nhead = 1
nlayers = 1
dropout = 0.0 
device = "cuda" if torch.cuda.is_available() else "cpu"
lr = 1e-2

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")

specialTok = {
    "sep_token" : "<|sep|>"
}
# tokenizer already have eos and pad
tokenizer.add_special_tokens(specialTok)

vocabsiz = tokenizer.vocab_size
print(vocabsiz)

def enc(txt, tokenizer):
    tokens = tokenizer(
        txt,
        return_tensors="pt", 
        truncation=True,
        padding=True,
        add_special_tokens=True
    )["input_ids"]

    return tokens.flatten()

data = enc(txt, tokenizer)
n = int(0.9*len(data))
trainData = data[:n]
valData = data[n:]

trainData = torch.tensor(trainData, dtype=torch.long)
valData = torch.tensor(valData, dtype=torch.long)
print(f"Train Data Shape: {trainData.shape}")
print(f"Validation Data Shape: {valData.shape}")

def getBatch(split):
    data = trainData if split == "train" else valData
    ix = torch.randint(0, len(data) - blocksiz, (batchsiz,))
    x = torch.stack([data[i:i+blocksiz] for i in ix])
    y = torch.stack([data[i+1:i+blocksiz+1] for i in ix])
    x, y = x.to(device), y.to(device)

    return x, y 

@torch.no_grad()
def estimateLoss():
    out = { }
    model.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(evaliters)
        for k in range(evaliters):
            x, y = getBatch(split)
            logits, loss = model(x, y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    
    model.train()
    return out 

class Head(nn.Module):
    def __init__(self, headsiz):
        super().__init__()
        self.key = nn.Linear(nemb, headsiz, bias=False)
        self.quary = nn.Linear(nemb, headsiz, bias=False)
        self.value = nn.Linear(nemb, headsiz, bias=False)
        
        self.register_buffer("tril", torch.tril(torch.ones(blocksiz, blocksiz)))
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.quary(x)

        w = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5
        w = w.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        w = F.softmax(w, dim=-1)
        w = self.dropout(w)
        
        v = self.value(x)
        out = w @ v 
        return out 

class MultiHeadAttention(nn.Module):
    def __init__(self, nhead, headsiz):
        super().__init__()
        self.heads = nn.ModuleList([Head(headsiz) for _ in range(nhead)])
        self.proj = nn.Linear(headsiz * nhead, nemb)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out 

class FeedForwardNetwork(nn.Module):
    def __init__(self, nemb):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(nemb, 4 * nemb), 
            nn.GELU(approximate="tanh"),
            nn.Linear(4 * nemb, nemb), 
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, nemb, nhead):
        super().__init__()
        headsiz = nemb // nhead
        self.sa = MultiHeadAttention(nhead, headsiz)
        self.ffn = FeedForwardNetwork(nemb)
        self.ln_1 = nn.LayerNorm(nemb)
        self.ln_2 = nn.LayerNorm(nemb)  
    
    def forward(self, x):
        x = x + self.sa(self.ln_1(x))
        x = x + self.ffn(self.ln_2(x))
        return x 


class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        newVocabSiz = len(tokenizer)
        self.wte = nn.Embedding(newVocabSiz, nemb)
        torch.nn.init.normal_(model.wte.weight, mean=0.0, std=0.02)

        self.wpe = nn.Embedding(blocksiz, nemb)
        self.block = nn.Sequential(*[Block(nemb, nhead=nhead) for _ in range(nlayers)])
        self.ln_f = nn.LayerNorm(nemb)
        
        self.lm_head = nn.Linear(nemb, vocabsiz)
        torch.nn.init.normal_(model.lm_head.weight, mean=0.0, std=0.02)
        
        self.apply(self._init_weights)
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, ix, targt=None):
        B, T = ix.shape
        tokEmb = self.wte(ix)
        posEmb = self.wpe(torch.arange(T, device=device))

        x = tokEmb + posEmb
        x = self.block(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if targt is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targt = targt.view(B * T)
            loss = F.cross_entropy(logits, targt)
        return logits, loss 

########### Truncate generally means to shorten something by cutting off a portion of it, ####

    def generate(self, ix, max_new_tokens, tokenizer, tempertaure=1.0, top_k=None):
        for _ in range(max_new_tokens):
            # Truncate input to the context window size
            ix_cond = ix[:, -min(blocksiz, ix.shape[1]):]
            logits, _ = self(ix_cond)
            logits = logits[:, -1, :]

            if tempertaure != 1.0:
                logits = logits / tempertaure
            
            if top_k is not None :
                val, indeces = torch.topk(logits, k=top_k, dim=-1)
                logits = torch.zeros_like(logits).scatter_(-1, indeces, val)
            
            probs = F.softmax(logits, dim=-1)

            ixNxt = torch.multinomial(probs, num_samples=1)
            ix = torch.cat((ix, ixNxt), dim=-1)
        
        return tokenizer.decode(ix[0].cpu().numpy().tolist(), skip_special_tokens=True)
        

            
# after add token all this line, [when init model]
newVocabSiz = len(tokenizer)
model = GPT()
model.wte = nn.Embedding(newVocabSiz, nemb)
torch.nn.init.normal_(model.wte.weight, mean=0.0, std=0.02)
m = model.to(device)


# Use compile
useCompile = False 
if useCompile:
    model = torch.compile(model)
    print("Using compile")
else:
    print("not using Compile")

#--------------
# save checkpoint path
def saveCheckpoint(model, optim, epoch, loss, file):
    chekPnt = {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optim.state_dict(),
        "epoch": epoch,
        "loss": loss,
        
    }   
    torch.save(chekPnt, file)
#--------------


# Add Mixed Precision
from torch.cuda.amp import GradScaler, autocast
scaler = GradScaler()

optim = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-3)
# lr schedular
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=epochs)

#ploting loss
traini=[ ]
trainloss_i=[]
valloss_i=[]

for i in range(epochs):
    if i % evaliters == 0 or i == epochs - 1:
        losses = estimateLoss()
        trainloss_i.append(losses["train"].item())
        valloss_i.append(losses["val"].item())
        traini.append(i)
        
        print(f"Epoch {i}/{epochs} | Train loss {losses['train']:.4f} | Val loss {losses['val']:.4f}")
    
    xb, yb = getBatch("train")
    with autocast():
        logits, loss = model(xb, yb)

    scaler.scale(loss).backward()
    scaler.step(optim)
    scaler.update()

    optim.zero_grad(set_to_none=True)

    saveIntervals = 50 if i <= 200 else 400
    if i % saveIntervals == 0 or i == epochs - 1:
        saveCheckpoint(
            model,
            optim,
            i,
            valloss_i[-1] if valloss_i else float("inf"), 
            "Therapy_bot_Trained_model.pth"
        ) 


# Model Finetune 

In [14]:
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.utils
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import pandas as pd

checkPointPath = '/kaggle/input/200232823/Therapy_bot_Trained_model.pth'
batchsiz = 64
lr=1e-4
epochs = 100
device = "cuda" if torch.cuda.is_available() else "cpu"
nemb = 42
nhead = 2
nlayers = 2
blocksiz=128
dropout=0.0

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
specialTok = {
    "sep_token" : "<|sep|>"
}
# tokenizer already have eos and pad
tokenizer.add_special_tokens(specialTok)
tokenizer.model_max_length = 128
vocabsiz = tokenizer.vocab_size
class TherapyDataset(Dataset):
    def __init__(self, data, tokenizer, blocksiz=128):
        self.data = data
        self.tokenizer = tokenizer
        self.blocksiz = blocksiz
            
    def __len__(self,):
        return len(self.data)
    
    def __getitem__(self, ix):
        row = self.data.iloc[ix]
    
        # Ensure column names match your dataset
        context = row["Context"]  # Replace with actual column name if different
        response = row["Response"]  # Replace with actual column name if different
    
        # Prepare input text and tokenize
        inputTxt = f"{context} <|sep|> {response}"
        enc = self.tokenizer(
            inputTxt,
            max_length=self.blocksiz,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
    
        return enc["input_ids"].squeeze(0), enc["attention_mask"].squeeze(0), enc["input_ids"].squeeze(0)

data = pd.read_csv('/kaggle/input/200232823/finetune_train.csv', encoding='utf-8')
dataset = TherapyDataset(data, tokenizer)
dataloader = DataLoader(dataset, batch_size=batchsiz, shuffle=True, )

#### GPT MODEL ####

class Head(nn.Module):
    def __init__(self, headsiz):
        super().__init__()
        self.key = nn.Linear(nemb, headsiz, bias=False)
        self.quary = nn.Linear(nemb, headsiz, bias=False)
        self.value = nn.Linear(nemb, headsiz, bias=False)
        
        self.register_buffer("tril", torch.tril(torch.ones(blocksiz, blocksiz)))
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.quary(x)

        w = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5
        w = w.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        w = F.softmax(w, dim=-1)
        w = self.dropout(w)
        
        v = self.value(x)
        out = w @ v 
        return out 

class MultiHeadAttention(nn.Module):
    def __init__(self, nhead, headsiz):
        super().__init__()
        self.heads = nn.ModuleList([Head(headsiz) for _ in range(nhead)])
        self.proj = nn.Linear(headsiz * nhead, nemb)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out 

class FeedForwardNetwork(nn.Module):
    def __init__(self, nemb):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(nemb, 4 * nemb), 
            nn.GELU(approximate="tanh"),
            nn.Linear(4 * nemb, nemb), 
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, nemb, nhead):
        super().__init__()
        headsiz = nemb // nhead
        self.sa = MultiHeadAttention(nhead, headsiz)
        self.ffn = FeedForwardNetwork(nemb)
        self.ln_1 = nn.LayerNorm(nemb)
        self.ln_2 = nn.LayerNorm(nemb)  
    
    def forward(self, x):
        x = x + self.sa(self.ln_1(x))
        x = x + self.ffn(self.ln_2(x))
        return x 

class GPT(nn.Module):
    def __init__(self, tokenizer):
        super().__init__()
        new_vocab_size = len(tokenizer)
        self.wte = nn.Embedding(new_vocab_size, nemb)
        torch.nn.init.normal_(self.wte.weight, mean=0.0, std=0.02)

        self.wpe = nn.Embedding(blocksiz, nemb)
        self.block = nn.Sequential(*[Block(nemb, nhead=nhead) for _ in range(nlayers)])
        self.ln_f = nn.LayerNorm(nemb)

        self.lm_head = nn.Linear(nemb, new_vocab_size)
        torch.nn.init.normal_(self.lm_head.weight, mean=0.0, std=0.02)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_ids, attention_mask, labels=None):
        B, T = input_ids.shape
        tok_emb = self.wte(input_ids)  # Token embeddings
        pos_emb = self.wpe(torch.arange(T, device=device))  # Positional embeddings

        x = tok_emb + pos_emb
        x = self.block(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            logits = logits.view(-1, logits.size(-1))
            labels = labels.view(-1)
            loss = F.cross_entropy(logits, labels)

        return logits, loss 

model = GPT(tokenizer).to(device)

checkpoint = torch.load(checkPointPath, map_location=device)
model.state_dict(checkpoint["model_state_dict"])

optim = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=epochs)

for i in range(epochs):
    model.train()
    lossi = 0
    for batch in dataloader:
        input_ids, attn_mask, label = [x.to(device) for x in batch]

        optim.zero_grad()
        logits, loss = model(input_ids, attn_mask, label)
        loss.backward()
        lossi += loss.item()
    
    scheduler.step()

    print(f"Epoch {i + 1}/{epochs}, Loss: {lossi / len(dataloader):.4f}")

finteTuneCheckPntPth = 'model_finetune.pth'
torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optim.state_dict(),
    "epoch": epochs,
}, finteTuneCheckPntPth)
print(f"Fine-tuning completed. Model saved at {finteTuneCheckPntPth}.")




  checkpoint = torch.load(checkPointPath, map_location=device)
  model.state_dict(checkpoint["model_state_dict"])


Epoch 1/100, Loss: 10.8270
Epoch 2/100, Loss: 10.8271
Epoch 3/100, Loss: 10.8271
Epoch 4/100, Loss: 10.8271
Epoch 5/100, Loss: 10.8271
Epoch 6/100, Loss: 10.8271
Epoch 7/100, Loss: 10.8270
Epoch 8/100, Loss: 10.8271
Epoch 9/100, Loss: 10.8271
Epoch 10/100, Loss: 10.8271
Epoch 11/100, Loss: 10.8271
Epoch 12/100, Loss: 10.8270
Epoch 13/100, Loss: 10.8270
Epoch 14/100, Loss: 10.8271
Epoch 15/100, Loss: 10.8270
Epoch 16/100, Loss: 10.8271
Epoch 17/100, Loss: 10.8271
Epoch 18/100, Loss: 10.8271
Epoch 19/100, Loss: 10.8271
Epoch 20/100, Loss: 10.8271
Epoch 21/100, Loss: 10.8271
Epoch 22/100, Loss: 10.8271
Epoch 23/100, Loss: 10.8271
Epoch 24/100, Loss: 10.8271
Epoch 25/100, Loss: 10.8271
Epoch 26/100, Loss: 10.8271
Epoch 27/100, Loss: 10.8270
Epoch 28/100, Loss: 10.8271
Epoch 29/100, Loss: 10.8271
Epoch 30/100, Loss: 10.8271
Epoch 31/100, Loss: 10.8270
Epoch 32/100, Loss: 10.8270
Epoch 33/100, Loss: 10.8271
Epoch 34/100, Loss: 10.8271
Epoch 35/100, Loss: 10.8271
Epoch 36/100, Loss: 10.8270
E

In [17]:

finetuneCheckPoint = '/kaggle/working/model_finetune.pth'
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
tokenizer.add_special_tokens({"sep_token": "<|sep|>"})

model = GPT(tokenizer).to(device)
checkpoint = torch.load(finteTuneCheckPntPth, map_location=device)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()


def chat_with_model():
    while True:
        user_input = input("You: ")
        if user_input.lower() in ["exit", "quit"]:
            break

        input_text = f"<|sep|> {user_input}"  # Adjust to your format
        input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).input_ids.to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=None)
            logits = outputs[0][:, -1, :]  # Unpack logits from the first element of the tuple
            next_token = torch.argmax(logits, dim=-1)  # Predict the next token
            generated_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=1)

        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        print(f"Lacan: {generated_text}")

chat_with_model()

  checkpoint = torch.load(finteTuneCheckPntPth, map_location=device)


You:  hi, i feel anxious.


Lacan:  hi, i feel anxious. ABS


KeyboardInterrupt: Interrupted by user