In [1]:
pip install rouge-score sacrebleu evaluate torchsummary

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl.metadata (296 bytes)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchsummary

# Import Library

In [2]:
import torch
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import evaluate
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

from transformers import BartConfig,BartForConditionalGeneration,BartTokenizer
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq, TrainerCallback, Seq2SeqTrainingArguments

from datasets import Dataset
from sklearn.model_selection import train_test_split

from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch.utils.data import TensorDataset
from torchsummary import summary

from collections import defaultdict
warnings.filterwarnings("ignore")

# Cleaning Data

In [4]:

df = pd.read_csv('/kaggle/input/layoutlm/medquad.csv')
print("Data Sample")
print(df.head())
print("Null Value Data")
print(df.isnull().sum())
total_duplicates = df.duplicated(['question'], keep=False)
print(f"Total duplicates in 'question' column: {total_duplicates.sum()}")
duplicates = df.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")
df = df.drop_duplicates()
df.reset_index(drop=True, inplace=True)
print("Table Info")
print(df.info())
df = df.drop_duplicates(subset='question', keep='first').reset_index(drop=True)
df = df.drop_duplicates(subset='answer', keep='first').reset_index(drop=True)
df.dropna(inplace=True)
print("Null Value Data")
print(df.isnull().sum())
print(df.info())
df['question'] = df['question'].fillna('')
df['answer'] = df['answer'].fillna('')
df['prompt'] = df['question'] + ' ' + df['answer']

Data Sample
                                 question  \
0                What is (are) Glaucoma ?   
1                  What causes Glaucoma ?   
2     What are the symptoms of Glaucoma ?   
3  What are the treatments for Glaucoma ?   
4                What is (are) Glaucoma ?   

                                              answer           source  \
0  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   
1  Nearly 2.7 million people have glaucoma, a lea...  NIHSeniorHealth   
2  Symptoms of Glaucoma  Glaucoma can develop in ...  NIHSeniorHealth   
3  Although open-angle glaucoma cannot be cured, ...  NIHSeniorHealth   
4  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   

  focus_area  
0   Glaucoma  
1   Glaucoma  
2   Glaucoma  
3   Glaucoma  
4   Glaucoma  
Null Value Data
question       0
answer         5
source         0
focus_area    14
dtype: int64
Total duplicates in 'question' column: 2319
Number of duplicate rows: 48
Table Info
<class 

# Architecting Model

In [10]:
model_name = "facebook/bart-large"
config = BartConfig.from_pretrained(model_name)
config.dropout_rate = 0.2
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name, config=config)

# Tie weights explicitly
model.resize_token_embeddings(len(tokenizer))

# Print model architecture summary
# Print detailed model summary
print("\nDetailed Model Summary:")
print("=" * 50)

def summarize_model_by_type(model):
    layer_summary = defaultdict(int)
    param_summary = defaultdict(int)

    for name, module in model.named_modules():
        layer_type = type(module).__name__
        layer_summary[layer_type] += 1
        param_summary[layer_type] += sum(p.numel() for p in module.parameters())

    print(f"{'Layer Type':<30}{'Count':<10}{'Parameters':<15}")
    print("=" * 55)
    for layer_type, count in layer_summary.items():
        print(f"{layer_type:<30}{count:<10}{param_summary[layer_type]:<15,}")

summarize_model_by_type(model)

# Preprocess function for seq2seq task
def preprocess_function(batch):
    inputs = [f"question: {q}" for q in batch['question']]
    targets = [f"{a}" for a in batch['answer']]
    
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=64,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
    
    labels["input_ids"][labels["input_ids"] == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Train-test split
train_df, val_df = train_test_split(df, test_size=0.15, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Preprocess datasets
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=32,  
    remove_columns=train_dataset.column_names,
    num_proc=4,   
)

val_dataset = val_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=32,  
    remove_columns=val_dataset.column_names,
    num_proc=4,  
)


# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=1000,  
    save_steps=1000,  
    save_total_limit=2,  
    learning_rate=3e-5,   
    num_train_epochs=1,   
    per_device_train_batch_size=4,   
    per_device_eval_batch_size=4, 
    lr_scheduler_type="cosine_with_restarts",  
    warmup_ratio=0.15,  
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=True,   
    logging_dir="./logs",
    logging_steps=50,  
    load_best_model_at_end=True,
    metric_for_best_model="eval_exact_match",
    greater_is_better=True,
    report_to="none",
    gradient_accumulation_steps=1,   
    max_grad_norm=0.5,
    optim="adamw_torch_fused",  
    generation_max_length=64,  
    generation_num_beams=4,
    dataloader_num_workers=4,   
    group_by_length=True, 
    remove_unused_columns=True,
)

training_args.label_smoothing_factor = 0.1
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer, 
    model=model,  
    padding='longest',  
)

# Create function to show exact match, BLEU and ROUGE
def compute_metrics(eval_pred, tokenizer):
    # Unpack predictions and labels
    predictions, labels = eval_pred
    
    # Handle case where predictions might be a tuple
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    
    # Decode predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in labels with pad token for decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Text normalization function
    def normalize_text(text):
        """Normalize text for consistent comparison"""
        text = text.strip().lower()
        text = re.sub(r'\s+', ' ', text)  # Remove extra whitespaces
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        return text
    
    # Normalize predictions and labels
    decoded_preds = [normalize_text(pred) for pred in decoded_preds]
    decoded_labels = [normalize_text(label) for label in decoded_labels]
    
    # Compute Exact Match
    exact_matches = [pred == label for pred, label in zip(decoded_preds, decoded_labels)]
    exact_match_accuracy = np.mean(exact_matches)
    
    # Load metrics
    bleu_metric = evaluate.load("bleu")
    rouge_metric = evaluate.load("rouge")
    
    # Compute BLEU score
    bleu = bleu_metric.compute(
        predictions=decoded_preds, 
        references=[[label] for label in decoded_labels]
    )
    bleu_score = bleu["bleu"]
    
    # Compute ROUGE score
    rouge = rouge_metric.compute(
        predictions=decoded_preds, 
        references=decoded_labels
    )
    rouge_l = rouge["rougeL"]
    
    return {
        "exact_match": exact_match_accuracy,
        "BLEU": bleu_score,
        "ROUGE-L": rouge_l,
    }

# Initialize data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding='longest',
    return_tensors="pt"
)

# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda eval_pred: compute_metrics(eval_pred, tokenizer)
)

# Train the model
trainer.train()

# Save the model and tokenizer
trainer.save_model("./chatbot_model")
tokenizer.save_pretrained("./chatbot_tokenizer")
model_path = "./chatbot_model.h5"
torch.save(model.state_dict(), model_path)


Detailed Model Summary:
Layer Type                    Count     Parameters     
BartForConditionalGeneration  1         406,291,456    
BartModel                     1         406,291,456    
BartScaledWordEmbedding       1         51,471,360     
BartEncoder                   1         203,678,720    
BartLearnedPositionalEmbedding2         2,101,248      
ModuleList                    2         352,714,752    
BartEncoderLayer              12        151,154,688    
BartSdpaAttention             36        151,142,400    
Linear                        193       404,063,232    
LayerNorm                     62        126,976        
GELUActivation                24        0              
BartDecoder                   1         254,084,096    
BartDecoderLayer              12        201,560,064    


Map (num_proc=4):   0%|          | 0/12288 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2169 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Exact Match,Bleu,Rouge-l
1000,2.9655,2.902961,0.001383,0.269141,0.374712


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


# Generate Responds

In [21]:
model_path = "/kaggle/working/chatbot_model"
tokenizer_path = "/kaggle/working/chatbot_tokenizer"

tokenizer = BartTokenizer.from_pretrained(tokenizer_path)
model = BartForConditionalGeneration.from_pretrained(model_path)
model.eval() 

def generate_response(question):
    input_ids = tokenizer(f"question: {question} </s>", return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(
        input_ids,
        max_length=128,
        num_beams=5,  
        no_repeat_ngram_size=2,  
        top_k=50,  
        top_p=0.95,  
        temperature=1.0  
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
response = generate_response("What causes brain cancer ?")
print(response)

What causes brain cancer? Brain cancer is caused by a combination of genetic and environmental factors. The most common genetic cause is a mutation in the BRCA1 gene. This gene provides instructions for making a protein called bile duct beta-glucosaminidase, which is found in many tissues
