## Importing tokenizer and loading it

In [None]:
from src.tokenizer.regex_tokenizer import RegexTokenizer
# Instantiate the project's regex tokenizer; its trained model file is loaded
# in the following cell.
tokenizer = RegexTokenizer()

In [None]:
# Load the trained tokenizer model from disk. The raw string keeps the
# backslashes literal.
# NOTE(review): this is a Windows-style relative path and will not resolve on
# POSIX systems (the training cells below use /kaggle paths) — confirm.
tokenizer.load(model_file=r"src\tokenizer\tokenizer_model.model")

In [None]:
def get_vocab_size(tokenizer):
    """Return the number of entries in ``tokenizer.vocab``."""
    return len(tokenizer.vocab)

In [None]:
# Display the vocabulary size of the loaded tokenizer.
get_vocab_size(tokenizer)

## Loading dataset

In [None]:
import json
from pathlib import Path

# Build the path from components. The original literal
# "datasets\fine_tuned_dataset.jsonl" was not a raw string, so "\f" was read as
# a form-feed character and the file name was corrupted.
dataset_path = Path("datasets") / "fine_tuned_dataset.jsonl"

# One JSON document per line (JSONL).
with dataset_path.open("r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing newline at the end of the file), which
    # are not valid JSON and would make json.loads raise.
    dataset = [json.loads(line) for line in f if line.strip()]

In [None]:
# Peek at one record (index 5) to inspect the structure of a conversation.
dataset[5]

## Checking that no conversation exceeds the block size

In [None]:
# Measure every conversation's raw token length (message contents joined with
# newlines, no special tokens) and report any that exceed the block size.
block_size = 1024
max_tokens = 0

for conversation in dataset:
    joined_text = "".join(message['content'] + "\n" for message in conversation)
    token_count = len(tokenizer.encode(joined_text))

    # Track the longest conversation seen so far.
    if token_count > max_tokens:
        max_tokens = token_count

    if token_count > block_size:
        print(f"The tokens which exceed block size with length: {token_count}")

#### Creating encoded data for fine tuning

In [None]:
import re
def clean_format_text(data: list[list[dict]],
                      block_size: int,
                     tokenizer):
    """Turn conversations into encoded fine-tuning samples.

    Each conversation starts with ``<|startoftext|>``. Every message's content
    has all characters other than word characters, whitespace and ``. , ? < > |``
    removed and is stripped, then appended behind a ``<|User|>`` tag (role
    ``"user"``) or an ``<|Assistant|>`` tag (any other role).

    After every non-user message the text accumulated so far, closed with
    ``<|endoftext|>``, is encoded as one sample, so a multi-turn conversation
    yields one cumulative sample per assistant reply.

    Args:
        data: Conversations; each is a list of ``{"role", "content"}`` dicts.
        block_size: Maximum number of tokens a sample may have; longer
            samples are dropped.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``.

    Returns:
        A list of token-id sequences, one per kept sample.
    """
    encoded_samples = []

    for conversation in data:
        running_text = "<|startoftext|>"

        for message in conversation:
            speaker = message["role"]
            cleaned = re.sub(r"[^\w\s.,?<>|]", "", message['content']).strip()

            if speaker == "user":
                running_text += f"<|User|>{cleaned}"
                continue

            # Any non-user role is treated as the assistant and closes a sample.
            running_text += f"<|Assistant|>{cleaned}"
            token_ids = tokenizer.encode(running_text + "<|endoftext|>",
                                         allowed_special="all")

            # Over-long samples are skipped, but the conversation text keeps
            # accumulating for later turns.
            if len(token_ids) > block_size:
                continue
            encoded_samples.append(token_ids)

    return encoded_samples

In [None]:
# Encode the whole dataset; samples longer than 1024 tokens are dropped by
# clean_format_text.
fine_tuned = clean_format_text(data=dataset,
                               block_size=1024,
                               tokenizer=tokenizer)

In [None]:
# Number of encoded samples that fit within the block size.
print("Total data for fine tuning:", len(fine_tuned))

In [None]:
# Decode one encoded sample back to text to sanity-check the prompt format.
print(tokenizer.decode(fine_tuned[2]))

#### Checking the maximum and minimum token lengths of the encoded data

In [None]:
# Collect the sample lengths once and let the built-ins do the scan.
sequence_lengths = [len(sample) for sample in fine_tuned]

# `default=0` keeps this cell from crashing (the original indexed
# `fine_tuned[0]`) when every sample was filtered out and the list is empty.
max_sequence_length = max(sequence_lengths, default=0)
min_token = min(sequence_lengths, default=0)

print("Max length tokens:", max_sequence_length)
print("Minimum length tokens:", min_token)

#### Applying padding so that all sequences have the same shape

In [None]:
import re
from typing import Dict, Optional, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
from tqdm import tqdm
from transformers import get_cosine_schedule_with_warmup

In [None]:
def apply_pad(data: list[list[int]], max_sequence_length: int,
             padding_token: int) -> torch.Tensor:
    """Right-pad every token sequence to a common length and stack them.

    Args:
        data: Token-id sequences of varying length.
        max_sequence_length: Length every row is padded to.
        padding_token: Token id used to fill the padded positions.

    Returns:
        An int64 tensor of shape ``(len(data), max_sequence_length)``.

    Raises:
        ValueError: If a sequence is longer than ``max_sequence_length``.
    """
    # torch.stack rejects an empty list, so return a correctly shaped empty batch.
    if not data:
        return torch.empty((0, max_sequence_length), dtype=torch.long)

    padded_rows = []
    for row_index, token_ids in enumerate(data):
        # A negative pad amount makes F.pad crop the row instead of padding it,
        # which would silently cut off the end of a training sample.
        if len(token_ids) > max_sequence_length:
            raise ValueError(
                f"Sequence {row_index} has {len(token_ids)} tokens, which exceeds "
                f"max_sequence_length={max_sequence_length}"
            )

        row = torch.tensor(token_ids, dtype=torch.long)
        # Pad to the requested length. The original padded to the global
        # `block_size` and ignored the `max_sequence_length` argument.
        padded_rows.append(
            F.pad(
                input=row,
                pad=(0, max_sequence_length - len(token_ids)),
                value=padding_token
            )
        )

    return torch.stack(padded_rows)

In [None]:
# Pad every encoded sample and stack them into one 2-D tensor.
# NOTE(review): 3077 is used as the padding token id throughout this notebook —
# presumably the id of "<|PAD|>" in the tokenizer; confirm.
train_data_tensor = apply_pad(data=fine_tuned,
                             max_sequence_length=1024,
                             padding_token=3077)
print("The shape of train data shape:", train_data_tensor.shape, "\n")
print("\n Train data tensor padded:", train_data_tensor)

In [None]:
# Keep the first 95% of the rows for training and hold out the remaining rows
# for validation (no shuffling before the split).
split_index = int(0.95 * len(train_data_tensor))
train_data_split, val_data_split = (
    train_data_tensor[:split_index],
    train_data_tensor[split_index:],
)

print("Train data split shape:", train_data_split.shape)
print("Validation data split shape:", val_data_split.shape)

#### Creating DataLoaders

In [None]:
class FineTunedDataset(Dataset):
    """Map-style dataset yielding ``(inputs, masked_targets)`` pairs.

    Each row of ``data`` is one token sequence. The targets are the inputs
    shifted left by one position, with ``padding_token`` appended so the length
    is unchanged. Target positions that should not contribute to the loss are
    overwritten with ``padding_token``:

    * every target whose id is listed in ``special_tokens``;
    * every target that precedes the reply following the last ``assist_token``
      in the row, i.e. the whole prompt including the marker itself.

    NOTE(review): this relies on the model ignoring targets equal to
    ``padding_token`` when computing the loss — confirm in ``GPTLanguageModel``.

    Args:
        padding_token: Token id used for padding and for masked targets.
        data: 2-D tensor of token ids, one sample per row.
        device: Device the returned tensors are moved to (a ``torch.device``
            works as well as a string).
        tokenizer: Stored on the instance; not used by the dataset itself.
        assist_token: Id of the assistant marker token. A one-element
            list/tuple (the raw ``tokenizer.encode`` result) is also accepted.
        special_tokens: Token ids that must never be used as targets.

    Raises:
        ValueError: If ``assist_token`` is a sequence that does not contain
            exactly one id.
    """

    def __init__(self, padding_token: int,
                 data: torch.Tensor, device: str,
                 tokenizer, assist_token: int,
                 special_tokens
                ) -> None:

        self.data = data
        self.padding_token = padding_token
        self.device = device
        self.tokenizer = tokenizer
        self.special_tokens = special_tokens

        # Comparing a tensor with a Python list does not produce an
        # element-wise mask, so reduce a one-element encode() result to its id.
        if isinstance(assist_token, (list, tuple)):
            if len(assist_token) != 1:
                raise ValueError(
                    f"assist_token must be a single token id, got {assist_token!r}"
                )
            assist_token = assist_token[0]
        self.assist_token = int(assist_token)

        # Built once here instead of on every __getitem__ call. An explicit
        # dtype keeps an empty list from becoming a float tensor.
        self._special_token_tensor = torch.tensor(
            list(special_tokens), dtype=torch.long, device=device
        )

    def __getitem__(self, index):
        """Return ``(x, y)`` for row ``index``; ``y`` is the masked, shifted target."""
        sample = self.data[index]
        x = sample.to(self.device)

        # Next-token targets: drop the first token and append one padding
        # token so that y has the same length as x.
        pad_tail = torch.tensor([self.padding_token], device=self.device)
        y = torch.cat((sample[1:].to(self.device), pad_tail))

        return x, self._masked(x, y)

    def __len__(self):
        """Return the number of samples (rows of ``data``)."""
        return len(self.data)

    def _masked(self, x: torch.Tensor,
               y: torch.Tensor):
        """Overwrite non-trainable positions of ``y`` with the padding token.

        ``y`` is modified in place and returned.
        """
        # Special tokens are never prediction targets.
        mask = torch.isin(y, self._special_token_tensor)

        # Indices (ascending) of the assistant marker in the input sequence.
        assist_positions = (x == self.assist_token).nonzero(as_tuple=True)[0]
        if assist_positions.numel() > 0:
            # Samples are cumulative (one per assistant turn), so only the
            # reply after the LAST marker is new; everything before it is
            # prompt. The original called .item(), which raised for several
            # markers, and the swallowed exception silently skipped masking.
            last_assist = int(assist_positions[-1])
            # y[i] is the token that follows x[i], so y[last_assist] is the
            # first reply token and must stay trainable. The previous
            # `[:assist_pos + 1]` slice masked it as well.
            mask[:last_assist] = True
        # Without a marker only the special-token mask applies (as before).

        y[mask] = self.padding_token
        return y

In [None]:
batch_size = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Named once instead of repeating the literal.
# NOTE(review): presumably the id of "<|PAD|>" in the tokenizer — confirm.
padding_token = 3077

# Ids of the special tokens that must never be prediction targets.
# "<|endoftext|>" is excluded so the model still learns to stop; "<|PAD|>" is
# excluded because padded targets already carry the padding id.
special_tokens = [tokenizer.encode(tok, allowed_special='all')[0] for tok in tokenizer.special_tokens
                 if tok not in ["<|endoftext|>", "<|PAD|>"]]
# encode() returns a list of ids; the dataset compares tokens against a single
# id, so take the first element (the original passed the whole list).
assist_token = tokenizer.encode('<|Assistant|>', allowed_special='all')[0]


train_dataset = FineTunedDataset(data=train_data_split,
                                padding_token=padding_token,
                                device=device,
                                tokenizer=tokenizer,
                                assist_token=assist_token,
                                special_tokens=special_tokens
                                )

# Shuffle only the training data; validation order stays fixed.
train_loader = DataLoader(dataset=train_dataset,
                         batch_size=batch_size,
                         shuffle=True)

val_dataset = FineTunedDataset(data=val_data_split,
                              padding_token=padding_token,
                              device=device,
                              tokenizer=tokenizer,
                              assist_token=assist_token,
                              special_tokens=special_tokens
                              )
val_loader = DataLoader(dataset=val_dataset,
                       batch_size=batch_size,
                       shuffle=False)

In [None]:
# Display the value the tokenizer stores for the end-of-text special token.
tokenizer.special_tokens['<|endoftext|>']

#### Initializing the model with hyperparameters

model_id: 1KudWncwbEhANs3_WU2Jrk5IeoKYxrCpH

In [None]:
from src.model.GPTModel import GPTLanguageModel

# Architecture hyperparameters.
# NOTE(review): the checkpoints loaded later via load_state_dict presumably
# require exactly these sizes — confirm before changing any of them.
n_layer = 6
n_head = 8
n_embedding = 384
block_size = 1024
dropout = 0.2

# Vocabulary and runtime settings.
vocab_size = get_vocab_size(tokenizer)
padding_token = 3077
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = GPTLanguageModel(
    vocab_size=vocab_size,
    n_embedding=n_embedding,
    n_head=n_head,
    block_size=block_size,
    n_layer=n_layer,
    dropout=dropout,
    padding_token=padding_token,
    device=device,
)

model = model.to(device)
# The bare expression makes the notebook display the model.
model

In [None]:
# Total number of parameter elements in the model, reported in millions.
print("Model has",sum(p.numel() for p in model.parameters())/1e6, "M parameters")

## Estimate loss

In [None]:
@torch.inference_mode()
def estimate_loss(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader) -> Dict[str, float]:
    """Compute the mean batch loss over the full train and validation loaders.

    The model is put in eval mode for the duration of the call and is always
    switched back to train mode before returning, even if evaluation raises.
    Gradients are disabled by the ``torch.inference_mode`` decorator.

    Args:
        model: Module called as ``model(x, y)`` and returning ``(logits, loss)``.
        train_loader: Iterable of ``(x, y)`` training batches supporting ``len()``.
        val_loader: Iterable of ``(x, y)`` validation batches supporting ``len()``.

    Returns:
        ``{"train": mean_loss, "val": mean_loss}``. A split with no batches
        yields ``nan``.
    """
    total_loss = {}
    model.eval()

    try:
        for split, loader in (('train', train_loader), ('val', val_loader)):
            # One slot per batch; averaged after the pass over the loader.
            losses = torch.zeros(len(loader))

            for i, (x, y) in enumerate(loader):
                _, loss = model(x, y)
                losses[i] = loss.item()

            total_loss[split] = losses.mean().item()
    finally:
        # Callers continue training afterwards, so never leave the model in
        # eval mode, even on failure.
        model.train()

    return total_loss

## Checkpoint saving

In [None]:
def save_checkpoint(model: nn.Module,
                   optimizer: torch.optim.Optimizer,
                   epoch: int, loss: float,
                   file_path: str, global_step: Optional[int] = None,
                   scheduler=None) -> None:
    """Serialize the training state to ``file_path`` with ``torch.save``.

    The checkpoint always contains ``model_state_dict``, ``epoch``,
    ``optimizer_state_dict`` and ``loss``. ``global_step`` and
    ``scheduler_state_dict`` are added only when the corresponding argument is
    given (previously both arguments were accepted but silently dropped).

    Args:
        model: Model whose ``state_dict()`` is saved.
        optimizer: Optimizer whose ``state_dict()`` is saved.
        epoch: Epoch index stored under ``"epoch"``.
        loss: Loss value stored under ``"loss"``.
        file_path: Destination file.
        global_step: Optional step counter to store.
        scheduler: Optional object exposing ``state_dict()``.
    """
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "epoch": epoch,
        "optimizer_state_dict": optimizer.state_dict(),
        "loss": loss
    }

    # Optional entries are written only when supplied, so checkpoints saved
    # without them keep exactly the keys they had before.
    if global_step is not None:
        checkpoint["global_step"] = global_step
    if scheduler is not None:
        checkpoint["scheduler_state_dict"] = scheduler.state_dict()

    torch.save(checkpoint, file_path)

In [None]:
# Scale a learning rate linearly with the batch size: the rate used with
# batch size 32 is multiplied by new_batch / old_batch for batch size 16.
lr_old = 1e-5
new_batch = 16
old_batch = 32

lr_new = lr_old * (new_batch/old_batch)
# Display the scaled value.
lr_new

## Training Loop

In [None]:

max_iters = 10
learning_rate = 5e-6
weight_decay = 0.01
# total_steps = len(train_loader) * max_iters
# warmup_steps = int(0.03 * total_steps)
# Evaluate roughly five times per epoch. Clamp to 1 so that a loader with
# fewer than five batches cannot make `global_step % eval_interval` divide by
# zero.
eval_interval = max(1, len(train_loader) // 5)
start_epoch = 0
global_step = 0


# Resume from a previously saved fine-tuning checkpoint.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
checkpoint = torch.load("/kaggle/input/model_fine_8/pytorch/default/1/fine_tuned_checkpoint8 (1).pth",
                       map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
print("Preatrained Model loaded successfully!.....\n")

# NOTE(review): the optimizer starts fresh; the checkpoint's
# "optimizer_state_dict" is not restored. Presumably deliberate because the
# learning rate differs — confirm.
optimizer = torch.optim.AdamW(params=model.parameters(),
                              lr=learning_rate,
                              weight_decay=weight_decay
                              )

# Checkpoints store the index of the last finished epoch, so continue with the
# next one.
start_epoch = checkpoint.get("epoch", 0) + 1
if start_epoch >= max_iters:
    # Make the no-op visible instead of silently skipping the loop.
    print(f"Nothing to train: start epoch {start_epoch} is not below max_iters {max_iters}.")

train_loss = []
val_loss = []
epoch_losses = []

for epoch in range(start_epoch, max_iters):
    model.train()
    epoch_loss = 0.0
    num_batches = 0

    for x_batch, y_batch in tqdm(
        iterable=train_loader, desc=f"Epoch {epoch+1}/{max_iters}",
        total=len(train_loader)):

        # Periodic evaluation; step 0 is skipped so that at least one
        # optimizer update precedes the first measurement.
        if global_step % eval_interval == 0 and global_step > 0:
            losses = estimate_loss(model=model,
                                   train_loader=train_loader,
                                   val_loader=val_loader)

            print(f"step {global_step} |"
                  f"Train Loss: {losses['train']:.4f} |"
                  f"Validation Loss: {losses['val']:.4f}")

            train_loss.append(losses['train'])
            val_loss.append(losses['val'])
            # Defensive: guarantee train mode before the next update.
            model.train()

        logits, loss = model(x_batch, y_batch)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1
        global_step += 1

    # Guard against an empty loader, which previously divided by zero.
    avg_epoch_loss = epoch_loss / num_batches if num_batches else float("nan")
    epoch_losses.append(avg_epoch_loss)

    # "\I" was an invalid escape sequence that printed a stray backslash; a
    # leading newline is the evident intent.
    print(f"\nIteration: {epoch+1} completed | average train loss:{avg_epoch_loss:.4f}\n")

    save_checkpoint(model=model, optimizer=optimizer,
                    epoch=epoch, loss=avg_epoch_loss,
                    file_path=f"/kaggle/working/fine_tuned_checkpoint{epoch+1}.pth"
                    )


print("Fine tuning completed!.......")

#### Plotting Training and Validation Loss

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(8,6))
# The training loop evaluates whenever global_step is a positive multiple of
# eval_interval, so the k-th recorded loss belongs to step k * eval_interval
# (k starting at 1). The previous axis started at step 0.
eval_steps = np.arange(1, len(train_loss) + 1) * eval_interval
plt.plot(eval_steps, train_loss, 'b-',  label="Train Loss", linewidth=2)
plt.plot(eval_steps, val_loss, 'r-', label="Validation Loss", linewidth=2)
plt.title("Training Loss vs Validation Loss")
plt.xlabel("Training steps")
plt.ylabel("Loss")
plt.legend()
plt.grid(True, alpha=0.3)
# `epoch` is the last epoch index left over from the training loop.
plt.savefig(f"loss_fine_tuned{epoch+1}.png")
plt.show()

#### Plotting epoch loss

In [None]:
# Plot, save and show only when at least one epoch ran. Previously savefig and
# show sat outside the `if`, so an empty figure was written and `epoch` could
# be undefined when the training loop had not executed.
if epoch_losses:
    # Start a fresh figure so the curve is never drawn onto an earlier plot.
    plt.figure()
    plt.plot(range(1, len(epoch_losses)+1), epoch_losses, color='purple',
             marker='o', linewidth=2, markersize=6)
    plt.xlabel("Epoch")
    plt.ylabel("Average Loss")
    plt.title("Average Loss per Epoch")
    plt.grid(True, alpha=0.3)
    plt.savefig(f"/kaggle/working/average_loss_fine_tuned_{epoch+1}.png")
    plt.show()

### Inference

In [None]:
# Load the weights of a saved fine-tuning checkpoint into the existing model
# for inference.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
checkpoint = torch.load("/kaggle/input/check_10/pytorch/default/1/fine_tuned_checkpoint10 (1).pth",
                       map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

In [None]:
def order_response(text: str) -> str:
    """Flatten a generated answer into a single clean line.

    Removes a leading ``A.``–``D.`` option label (plus the whitespace after
    it) from the start of every line, replaces each run of newlines with one
    space, and strips surrounding whitespace.
    """
    option_label = re.compile(r"^[A-D]\.\s*", flags=re.MULTILINE)
    newline_run = re.compile(r"\n+")

    without_labels = option_label.sub("", text)
    single_line = newline_run.sub(" ", without_labels)
    return single_line.strip()

In [None]:
# Single-shot generation from a raw prompt (no chat special tokens are added).
prompt = "Given the symptoms of sudden weakness in the left arm and leg, recent long-distance travel, and the presence of swollen and tender right lower leg, what specific cardiac abnormality is most likely to be found upon further evaluation that could explain these findings?"

# Encode the prompt and add a leading batch dimension of size 1.
input_ids = torch.tensor(tokenizer.encode(prompt), dtype=torch.long).unsqueeze(0).to(device)

model.eval()
with torch.inference_mode():
    # NOTE(review): the positional arguments are presumably
    # (input_tokens, max_new_tokens, block_size, temperature) — confirm
    # against GPTLanguageModel.generate.
    output = model.generate(
        input_ids, 200,
        block_size, 0.8,
        top_k=50, top_p=0.9,
    )
output = output.squeeze().tolist()

# Drop the echoed prompt tokens and keep only the newly generated ones.
prompt_length = input_ids.shape[1]
response_tokens = output[prompt_length:]
response = order_response(tokenizer.decode(response_tokens))

print("User:", prompt, "\n")
print("Assistant:", response.replace("<|endoftext|>", ""))

In [None]:
def get_input_tokens(turns: list[dict]) -> torch.Tensor:
    """Format a conversation as a model prompt and encode it.

    The prompt mirrors the layout produced by ``clean_format_text`` for
    training: one ``<|startoftext|>``, then every turn behind a ``<|User|>``
    tag (role ``'user'``) or an ``<|Assistant|>`` tag (any other role), and
    finally an open ``<|Assistant|>`` tag for the model to continue from.

    Args:
        turns: Conversation so far; each item has ``'role'`` and ``'content'``.

    Returns:
        An int64 tensor of shape ``(1, n_tokens)`` on the global ``device``.
    """
    parts = ["<|startoftext|>"]
    for turn in turns:
        # Previously every turn was emitted as "<|startoftext|><|User|>...",
        # regardless of its role.
        tag = "<|User|>" if turn['role'] == 'user' else "<|Assistant|>"
        parts.append(f"{tag}{turn['content']}")

    # Open the assistant reply. The original literal was "|Assistant|>"
    # (missing "<"), which is not the special token.
    parts.append("<|Assistant|>")
    formatted_input = "".join(parts)

    input_tokens = tokenizer.encode(formatted_input, allowed_special='all')
    input_tokens = torch.tensor(input_tokens, dtype=torch.long)
    # Add the batch dimension expected by the model.
    input_tokens = input_tokens.unsqueeze(0).to(device)
    return input_tokens


def generate_message(input_tokens: torch.Tensor, max_new_tokens: int = 512) -> str:
    """Generate an assistant reply one token at a time.

    Sampling stops when the model emits ``<|endoftext|>`` or after
    ``max_new_tokens`` tokens, whichever comes first. The collected tokens are
    decoded in one call and cleaned with ``order_response``.

    Args:
        input_tokens: Prompt token ids as a ``(1, n_tokens)`` tensor, as
            returned by ``get_input_tokens``.
        max_new_tokens: Upper bound on the number of generated tokens, so a
            model that never emits the end token cannot loop forever.

    Returns:
        The cleaned reply text; an empty string if nothing was generated.

    Raises:
        Exception: Whatever ``model.generate`` raises is propagated. The
            original swallowed every exception and retried, which could spin
            forever on a persistent error.
    """
    end_token = tokenizer.special_tokens['<|endoftext|>']
    generated_ids = []

    model.eval()
    with torch.inference_mode():
        for _ in range(max_new_tokens):
            output_tokens = model.generate(
                input_tokens=input_tokens, max_new_tokens=1,
                block_size=1024, top_k=50, top_p=0.9,
                temperature=0.9
            )

            next_token = output_tokens[0, -1].item()
            if next_token == end_token:
                break

            generated_ids.append(next_token)
            # Feed the sampled token back in as context for the next step.
            input_tokens = torch.cat((input_tokens, output_tokens[:, -1:]), dim=1)

    if not generated_ids:
        return ""

    # Decode and clean once at the end. Cleaning after every token stripped
    # trailing whitespace and newlines in the middle of the reply.
    return order_response(tokenizer.decode(generated_ids))

In [None]:
# One-turn chat demo: send a user question and record the model's reply.
user_msg = "Given the symptoms of sudden weakness in the left arm and leg, recent long-distance travel, and the presence of swollen and tender right lower leg, what specific cardiac abnormality is most likely to be found upon further evaluation that could explain these findings?"
turns = [{"role": 'user', "content": user_msg}]

input_tokens = get_input_tokens(turns)
model_answer = generate_message(input_tokens)

# Keep the reply in the conversation history.
turns.append({"role": 'assistant', "content": model_answer})

In [None]:
# Print the conversation transcript; turns with any other role are skipped.
for turn in turns:
    role = turn['role']

    if role == 'user':
        print("User:", turn['content'] + "\n")
        continue

    if role == 'assistant':
        print("Assistant:", turn['content'])