In [1]:
!pip install nlpaug
!pip install torchsummary
!pip install datasets

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Collecting gdown>=4.0.0 (from nlpaug)
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown, nlpaug
Successfully installed gdown-5.2.0 nlpaug-1.1.11
Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl.metadata (296 bytes)
Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1


# Import Library

In [2]:
import pandas as pd
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os
import torch
from datasets import Dataset

# Cleaning Data

In [3]:
os.environ["WANDB_DISABLED"] = "true"
df = pd.read_csv('/kaggle/input/layoutlm/medquad.csv')
print("Data Sample")
print(df.head())
print("Null Value Data")
print(df.isnull().sum())
total_duplicates = df.duplicated(['question'], keep=False)
print(f"Total duplicates in 'question' column: {total_duplicates.sum()}")
duplicates = df.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")
df = df.drop_duplicates()
df.reset_index(drop=True, inplace=True)
print("Table Info")
print(df.info())
df = df.drop_duplicates(subset='question', keep='first').reset_index(drop=True)
df = df.drop_duplicates(subset='answer', keep='first').reset_index(drop=True)
df.dropna(inplace=True)
print("Null Value Data")
print(df.isnull().sum())
print(df.info())
df['question'] = df['question'].fillna('')
df['answer'] = df['answer'].fillna('')
df['prompt'] = df['question'] + ' ' + df['answer']

Data Sample
                                 question  \
0                What is (are) Glaucoma ?   
1                  What causes Glaucoma ?   
2     What are the symptoms of Glaucoma ?   
3  What are the treatments for Glaucoma ?   
4                What is (are) Glaucoma ?   

                                              answer           source  \
0  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   
1  Nearly 2.7 million people have glaucoma, a lea...  NIHSeniorHealth   
2  Symptoms of Glaucoma  Glaucoma can develop in ...  NIHSeniorHealth   
3  Although open-angle glaucoma cannot be cured, ...  NIHSeniorHealth   
4  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   

  focus_area  
0   Glaucoma  
1   Glaucoma  
2   Glaucoma  
3   Glaucoma  
4   Glaucoma  
Null Value Data
question       0
answer         5
source         0
focus_area    14
dtype: int64
Total duplicates in 'question' column: 2319
Number of duplicate rows: 48
Table Info
<class 

# Architecting Model

In [4]:
# Split data into training and validation sets
train_data, val_data = train_test_split(df['prompt'], test_size=0.1, random_state=42)

# Save the train and validation data to text files
train_data.to_csv('/kaggle/working/train.txt', index=False, header=False)
val_data.to_csv('/kaggle/working/val.txt', index=False, header=False)

# Load pre-trained GPT-2 tokenizer 
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Use the PyTorch version of GPT-2
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Tokenize datasets
def tokenize_function(examples):
    return tokenizer(examples, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
# Define a custom compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Flatten both predictions and labels to ignore padding
    flattened_predictions = predictions.flatten()
    flattened_labels = labels.flatten()
    mask = flattened_labels != -100  # Exclude padding (-100 is used by Hugging Face)
    filtered_predictions = flattened_predictions[mask]
    filtered_labels = flattened_labels[mask]
    accuracy = accuracy_score(filtered_labels, filtered_predictions)
    return {"accuracy": accuracy}
# Create dataset for training and evaluation
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='/kaggle/working/train.txt',
    block_size=128
)

val_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='/kaggle/working/val.txt',
    block_size=128
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Set training arguments
training_args = TrainingArguments(
    output_dir='/kaggle/working/chatbot_model',         
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_steps=10,
    save_steps=4100,  
    prediction_loss_only=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)
trainer.train()
# eval_results = trainer.evaluate()
model.save_pretrained('/kaggle/working/chatbot_model.h5')
tokenizer.save_pretrained('/kaggle/working/chatbot_model.h5')
# print(f"Validation Results: {eval_results}")
model_path = "./model.h5"
torch.save(model.state_dict(), model_path)
print("Model and tokenizer saved successfully.")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
10,3.043
20,2.8056
30,2.5868
40,2.5756
50,2.5449
60,2.3096
70,2.4716
80,2.4022
90,2.3133
100,2.2748


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Model and tokenizer saved successfully.


# Testing Using Inputs

In [9]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('/kaggle/working/chatbot_model.h5')
tokenizer = GPT2Tokenizer.from_pretrained('/kaggle/working/chatbot_model.h5')

# Function to generate chatbot response
def generate_response(prompt):
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    outputs = model.generate(inputs, max_length=150, num_return_sequences=1, no_repeat_ngram_size=2, temperature=0.7)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example usage
prompt = "what is flu?"
response = generate_response(prompt)
print(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


what is flu? Flu is a cold-like illness that causes fever, headache, and shortness of breath. It can also cause fever and other symptoms. Flu can be caused by a variety of causes, including:
  
- cold weather  - cold temperatures - flu
 - flu-related illnesses  -- cold illnesses
 flu  flu - other illnesses, such as pneumonia
 influenza  influenza - pneumonia  cold flu, flu or cold viruses
 cold virus  other diseases,such as cold diseases cold cold disease cold illness other cold infections
 or flu cold infection cold or influenza
 and cold  or  cough  and flu cough
or  a cough and  coughing
and  an infection
"What are the symptoms
