# Finte Tune The Model

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from sklearn.preprocessing import StandardScaler
import os

device = "cuda" if torch.cuda.is_available() else "cpu"

batchsiz = 64
blocksiz = 128
nemb = 256
epochs = 80
nlayers = 6
nhead = 6
dropout = 0.2
lr = 3e-4
scaler = StandardScaler()

# Define the Dataset
class TextDataset(Dataset):
    def __init__(self, txt, tokenizer, blocksiz, scaler=None):
        self.tokenizer = tokenizer
        self.blocksiz = blocksiz
        self.scaler = scaler
        self.data = self.tokenizer(
            txt,
            return_tensors="pt",
            max_length=blocksiz,
            padding=True,
            truncation=True
        )["input_ids"]
        
        # Ensure that 'data' is an integer tensor (token indices)
        self.data = self.data.long()  # Convert to long if it's not already
        
        if self.scaler:
            # Apply scaling to features (if necessary) but not the token IDs
            self.data = self.data.float()  # Make sure it's a float tensor (necessary for scaling)
            self.data = self.data.view(-1, self.data.size(-1))  # Reshape to 2D (batch, features)
            self.data = torch.tensor(self.scaler.fit_transform(self.data), dtype=torch.float32)  # Apply scaling and convert back to tensor
    
    def __len__(self):
        return self.data.size(0)

    def __getitem__(self, ix):
        x = self.data[ix, :-1]
        y = self.data[ix, 1:]
        return x, y

# Define the Transformer Block components

class Head(nn.Module):
    def __init__(self, headsiz):
        super().__init__()
        self.key = nn.Linear(nemb, headsiz, bias=False)
        self.quary = nn.Linear(nemb, headsiz, bias=False)
        self.value = nn.Linear(nemb, headsiz, bias=False)
        self.dropout = nn.Dropout(dropout)

        self.register_buffer("tril", torch.tril(torch.ones(blocksiz, blocksiz)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.quary(x)

        w = q @ k.transpose(-2, -1)  * (k.size(-1)**-0.5)
        w = w.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        w = F.softmax(w, dim=-1)
        w = self.dropout(w)

        v = self.value(x)
        out = w @ v
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, nhead, headsiz):
        super().__init__()
        self.heads = nn.ModuleList([Head(headsiz) for _ in range(nhead)])
        self.proj = nn.Linear(headsiz * nhead, nemb)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForwardNetwork(nn.Module):
    def __init__(self, nemb):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(nemb, 4 * nemb),
            nn.GELU(approximate='tanh'),
            nn.Linear(4 * nemb, nemb),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class TransformerBlock(nn.Module):
    def __init__(self, nemb, nhead):
        super().__init__()
        headsiz = nemb // nhead
        self.sa = MultiHeadAttention(nhead, headsiz)
        self.ffn = FeedForwardNetwork(nemb)
        self.ln_1 = nn.LayerNorm(nemb)
        self.ln_2 = nn.LayerNorm(nemb)

    def forward(self, x):
        x = x + self.sa(self.ln_1(x))
        x = x + self.ffn(self.ln_2(x))
        return x

class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.wte = nn.Embedding(vocabsiz, nemb)
        self.wpe = nn.Embedding(blocksiz, nemb)
        self.block = nn.Sequential(*[TransformerBlock(nemb, nhead=nhead) for _ in range(nlayers)])
        self.ln_final = nn.LayerNorm(nemb)
        self.lm_head = nn.Linear(nemb, vocabsiz)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, ix, targt=None):
        B, T = ix.shape

        tokEmb = self.wte(ix)
        posEmb = self.wpe(torch.arange(T, device=device))
        x = tokEmb + posEmb
        x = self.block(x)
        x = self.ln_final(x)

        logits = self.lm_head(x)

        if targt is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targt = targt.view(B*T)
            loss = F.cross_entropy(logits, targt)

        return logits, loss

    def generate(self, ix, maxNewTok, tokenizer):
        for _ in range(maxNewTok):
            ixCond = ix[:, -min(blocksiz, ix.shape[1]):]
            logits, loss = self(ixCond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            ixNxt = torch.multinomial(probs, num_samples=1)
            ix = torch.cat((ix, ixNxt), dim=1)

        gen = tokenizer.decode(ix[0].cpu().numpy().tolist(), skip_special_tokens=True)
        return gen

# Model Setup
model_checkpoint_path = "/kaggle/input/freind-chatbot-final-model/FriendBotModelTrain.pth"
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
vocabsiz = tokenizer.vocab_size 
model = GPT()
m = model.to(device)
checkPntPath = 'FreindBotModelTrain.pth'
model.load_state_dict(torch.load(model_checkpoint_path, map_location=device), strict=False)
model.eval()

for param in model.block[:nlayers//2].parameters():
    param.requires_grad = False

# Prepare the dataset and dataloader
with open("/kaggle/input/freind-chatbot-final-model/finetune_personality.csv", 'r', encoding='utf-8') as file:
    lines = file.readlines()
    txt = lines[:130000]

dataset = TextDataset(txt, tokenizer, blocksiz, scaler=scaler)
dataloader = DataLoader(dataset, batch_size=batchsiz, shuffle=True, pin_memory=True)

# Optimizer Setup
accum_steps = 4
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
model.train()

# Training Loop
for epoch in range(epochs):
    running_loss = 0.0
    for step, (ix, target) in enumerate(dataloader):
        ix, target = ix.long().to(device), target.long().to(device)

        logits, loss = model(ix, target)
        loss = loss / accum_steps
        loss.backward()

        if (step + 1) % accum_steps == 0 or (step + 1) == len(dataloader):
            optimizer.step()
            optimizer.zero_grad()

        running_loss += loss.item() * accum_steps

    print(f"Epoch: {epoch + 1}/{epochs} | Loss: {running_loss / len(dataloader):.4f}")

# Save the fine-tuned model and tokenizer
torch.save(model.state_dict(), "FineTunedFreindBot.pth")
os.makedirs("fineTuneFreindTokenizer", exist_ok=True)
tokenizer.save_pretrained("fineTuneFreindTokenizer")


  model.load_state_dict(torch.load(model_checkpoint_path, map_location=device), strict=False)


Epoch: 1/80 | Loss: 0.4897
Epoch: 2/80 | Loss: 0.1066
Epoch: 3/80 | Loss: 0.1018


KeyboardInterrupt: 

# Chat With The Friend Model

In [1]:
def chat_with_model(model, tokenizer, device, max_tokens=50):
    print("Chat with the model! Type 'exit' to end the conversation.")
    # Set the model to evaluation mode
    model.eval()
    
    while True:
        # Get user input
        user_input = input("You: ")
        if user_input.lower() == "exit":
            print("Exiting the chat. Goodbye!")
            break

        # Tokenize the input
        input_ids = tokenizer(user_input, return_tensors="pt")["input_ids"].to(device)

        # Generate a response
        with torch.no_grad():
            response_ids = model.generate(
                input_ids=input_ids,
                maxNewTok=max_tokens,
                tokenizer=tokenizer
            )
        
        # Decode the response
        response_text = tokenizer.decode(
            response_ids[0].cpu().numpy().tolist(), skip_special_tokens=True
        )
        
        # Print the response
        print(f"FriendBot: {response_text}")
# Ensure the model and tokenizer are loaded
model_checkpoint_path = "/kaggle/working/FineTunedFreindBot.pth"
model.load_state_dict(torch.load(model_checkpoint_path, map_location=device))
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/fineTuneFreindTokenizer")
chat_with_model(model, tokenizer, device)

NameError: name 'model' is not defined