# Import Library

In [1]:
import torch
import wandb
import numpy as np
import pandas as pd
import re
import os
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq, TrainerCallback

from datasets import Dataset
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu

from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from torch.amp import autocast  

from collections import defaultdict


# Cleaning Data

In [2]:
#Load Data
df = pd.read_csv('/kaggle/input/layoutlm/medquad.csv')

# Sampel Data
print("Data Sample")
print(df.head())

#Null value
print("Null Value Data")
print(df.isnull().sum())

duplicates = df.duplicated(['question'], keep=False).sum()
print(f"Total duplicates in 'question' column: {duplicates}")

# Check for duplicate rows

duplicates = df.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")

# Remove duplicate rows
df = df.drop_duplicates()

# Reset the index after removing duplicates
df.reset_index(drop=True, inplace=True)

#Delete Unused column
df = df.drop(columns=['source', 'focus_area'])

#Table Info
print("Table Info")
print(df.info())

# Apply the function
df = df.drop_duplicates(subset='question', keep='first').reset_index(drop=True)
df = df.drop_duplicates(subset='answer', keep='first').reset_index(drop=True)

#Drop rows with null values
df.dropna(inplace=True)

#Checking again of null values
print("Null Value Data")
print(df.isnull().sum())

#Checking again of the data info
print(df.info())

#Check for Unique Data
print(f"Unique questions: {df['question'].nunique()}")
print(f"Unique answers: {df['answer'].nunique()}")

df['question'] = df['question'].str.lower().str.strip().apply(lambda x: re.sub(r'\s+', ' ', x))
df['answer'] = df['answer'].str.lower().str.strip().apply(lambda x: re.sub(r'\s+', ' ', x))
print(df.head())

Data Sample
                                 question  \
0                What is (are) Glaucoma ?   
1                  What causes Glaucoma ?   
2     What are the symptoms of Glaucoma ?   
3  What are the treatments for Glaucoma ?   
4                What is (are) Glaucoma ?   

                                              answer           source  \
0  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   
1  Nearly 2.7 million people have glaucoma, a lea...  NIHSeniorHealth   
2  Symptoms of Glaucoma  Glaucoma can develop in ...  NIHSeniorHealth   
3  Although open-angle glaucoma cannot be cured, ...  NIHSeniorHealth   
4  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   

  focus_area  
0   Glaucoma  
1   Glaucoma  
2   Glaucoma  
3   Glaucoma  
4   Glaucoma  
Null Value Data
question       0
answer         5
source         0
focus_area    14
dtype: int64
Total duplicates in 'question' column: 2319
Number of duplicate rows: 48
Table Info
<class 

# Architecting Model

In [3]:
# callback to track metrics
class MetricsCallback(TrainerCallback):
    def __init__(self):
        self.metrics = defaultdict(list)
        
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs:
            for key, value in logs.items():
                if isinstance(value, (int, float)):
                    self.metrics[key].append(value)
    
    def plot_metrics(self):
        plt.figure(figsize=(15, 10))
        
        # Plot training and validation loss
        plt.subplot(2, 1, 1)
        if 'loss' in self.metrics:
            plt.plot(self.metrics['loss'], label='Training Loss')
        if 'eval_loss' in self.metrics:
            # Interpolate eval_loss to match training loss points
            eval_steps = len(self.metrics['eval_loss'])
            train_steps = len(self.metrics['loss'])
            eval_indices = np.linspace(0, train_steps-1, eval_steps)
            plt.plot(eval_indices, self.metrics['eval_loss'], label='Validation Loss', marker='o')
        plt.title('Training and Validation Loss')
        plt.xlabel('Training Steps')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True)

        # Plot exact match score
        plt.subplot(2, 1, 2)
        if 'eval_exact_match' in self.metrics:
            plt.plot(self.metrics['eval_exact_match'], label='Exact Match Score', marker='o')
        if 'eval_bleu_score' in self.metrics:
            plt.plot(self.metrics['eval_bleu_score'], label='BLEU Score', marker='o')
        plt.title('Evaluation Metrics')
        plt.xlabel('Evaluation Steps')
        plt.ylabel('Score')
        plt.legend()
        plt.grid(True)

        plt.tight_layout()
        plt.savefig('training_metrics.png')
        plt.close()

# Load T5-small model and tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Preprocess function for seq2seq task
def preprocess_function(batch):
    inputs = [f"question: {q}" for q in batch["question"]]
    targets = [f"{a}" for a in batch["answer"]]
    
    model_inputs = tokenizer(
        inputs, 
        max_length=128, 
        truncation=True, 
        padding=True,  
        return_tensors='pt'
    )
    labels = tokenizer(
        targets, 
        max_length=128, 
        truncation=True, 
        padding=True,
        return_tensors='pt'
    )
    
    model_inputs["labels"] = labels["input_ids"]
    model_inputs["attention_mask"] = model_inputs["attention_mask"]
    return model_inputs

# Train-test split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Preprocess datasets
train_dataset = train_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=train_dataset.column_names,
    num_proc=4
)
val_dataset = val_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=val_dataset.column_names,
    num_proc=4
)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=300,
    save_steps=300,
    save_total_limit=3,
    learning_rate=5e-5,  
    num_train_epochs=10, 
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=16,
    lr_scheduler_type="cosine",  
    warmup_ratio=0.1,  
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=True,
    gradient_accumulation_steps=4, 
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
    max_grad_norm=1.0,              
    dataloader_num_workers=4,      
    ddp_find_unused_parameters=False,
    group_by_length=True,
)

# Label smoothing 
class AdaptiveLabelSmoothingLoss(CrossEntropyLoss):
    def __init__(self, smoothing=0.1, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing
        
    def forward(self, input, target):
        log_prob = torch.nn.functional.log_softmax(input, dim=-1)
        weight = input.new_ones(input.size()) * \
                 self.smoothing / (input.size(-1) - 1.)
        weight.scatter_(-1, target.unsqueeze(-1), self.confidence)
        return torch.mean(torch.sum(-weight * log_prob, dim=-1))

# data collator with length-based batching
class SmartDataCollator(DataCollatorForSeq2Seq):
    def __call__(self, features):
        # Sort by length for more efficient batching
        features = sorted(features, key=lambda x: len(x['input_ids']))
        return super().__call__(features)

data_collator = SmartDataCollator(
    tokenizer=tokenizer,
    model=model,
    padding=True,
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Ensure non-empty decoded results
    if not decoded_preds or not decoded_labels:
        print("Empty predictions or labels detected.")
        return {"exact_match": 0, "bleu_score": 0}

    # Calculate Exact Match
    exact_matches = [int(pred.strip() == label.strip()) for pred, label in zip(decoded_preds, decoded_labels)]
    exact_match_score = np.mean(exact_matches) * 100

    # Calculate BLEU Score
    try:
        bleu_score = corpus_bleu(
            [[label.split()] for label in decoded_labels],
            [pred.split() for pred in decoded_preds]
        ) * 100
    except ZeroDivisionError:
        bleu_score = 0

    return {"exact_match": exact_match_score, "bleu_score": bleu_score}


# Trainer with metrics callback
metrics_callback = MetricsCallback()

# Initialize Trainer
model.config.label_smoothing = 0.1
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[metrics_callback]
)

# model summary function
def print_detailed_model_summary(model, show_architecture=False):
    print("\nModel Summary:")
    print("-" * 50)
    print(f"Model Type: T5-base")
    
    # Count parameters by layer type
    layer_params = defaultdict(int)
    for name, param in model.named_parameters():
        layer_type = name.split('.')[0]
        layer_params[layer_type] += param.numel()
    
    # Print total parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    
    print(f"Total Parameters: {total_params:,}")
    print(f"Trainable Parameters: {trainable_params:,}")
    print(f"Non-trainable Parameters: {total_params - trainable_params:,}")
    
    # Print parameters by layer type
    print("\nParameters by Layer Type:")
    for layer_type, params in layer_params.items():
        print(f"{layer_type}: {params:,} parameters ({params/total_params*100:.2f}%)")
    
    if show_architecture:
        # Print model architecture
        print("\nModel Architecture:")
        print("-" * 50)
        for name, module in model.named_modules():
            if len(name) > 0:  # Skip the root module
                print(f"{name}: {module.__class__.__name__}")
        print("-" * 50)
    else:
        print("")

# Training workflow
print_detailed_model_summary(model)
trainer.train()
metrics_callback.plot_metrics()

# Final evaluation
# test_results = trainer.evaluate(test_dataset)
# print("\nTest Set Results:")
# print(json.dumps(test_results, indent=2))

# Save the model
trainer.save_model("./t5_chatbot_model")
tokenizer.save_pretrained("./t5_chatbot_tokenizer")
model_path = "./t5_chatbot_model.h5"
torch.save(model.state_dict(), model_path)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/11570 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2893 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)



Model Summary:
--------------------------------------------------
Model Type: T5-base
Total Parameters: 222,903,552
Trainable Parameters: 222,903,552
Non-trainable Parameters: 0

Parameters by Layer Type:
shared: 24,674,304 parameters (11.07%)
encoder: 84,954,240 parameters (38.11%)
decoder: 113,275,008 parameters (50.82%)



  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss,Validation Loss,Exact Match,Bleu Score
300,1.7559,1.620738,0.0,0.151595
600,1.6209,1.550666,0.034566,0.150061
900,1.5969,1.54037,0.034566,0.146597


  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


# Testing using Input

In [7]:
# Load the trained T5 model and tokenizer
model_path = "/kaggle/working/t5_chatbot_model"
tokenizer_path = "/kaggle/working/t5_chatbot_tokenizer"

tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)
model.eval() 

def generate_response(question):
    input_ids = tokenizer(f"question: {question} </s>", return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(
        input_ids,
        max_length=128,
        num_beams=5,  
        no_repeat_ngram_size=2,  
        top_k=50,  
        top_p=0.95,  
        temperature=1.0  
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
response = generate_response("What causes diabetes ?")
print(response)

what causes diabetes? the exact cause of diabetes is unknown.
