In [None]:
!pip install transformers datasets torch

In [13]:
import pandas as pd


data = pd.read_csv('/kaggle/input/layoutlm/medquad.csv')


data = data.dropna(subset=['question', 'answer'])


print(data[['question', 'answer']].head())


                                 question  \
0                What is (are) Glaucoma ?   
1                  What causes Glaucoma ?   
2     What are the symptoms of Glaucoma ?   
3  What are the treatments for Glaucoma ?   
4                What is (are) Glaucoma ?   

                                              answer  
0  Glaucoma is a group of diseases that can damag...  
1  Nearly 2.7 million people have glaucoma, a lea...  
2  Symptoms of Glaucoma  Glaucoma can develop in ...  
3  Although open-angle glaucoma cannot be cured, ...  
4  Glaucoma is a group of diseases that can damag...  


In [16]:
import pandas as pd
from datasets import Dataset

# Load your dataset (replace with the path to your dataset)
data = pd.read_csv('/kaggle/input/layoutlm/medquad.csv')

# Fill missing values (if any) with a placeholder
data['answer'].fillna("No Answer", inplace=True)

# Prepare the dataset with input (question) and target (answer)
def preprocess_data(row):
    return {
        'input_text': f"Q: {row['question']} A:",
        'target_text': row['answer']
    }

# Apply the preprocessing function to the dataset
formatted_data = data.apply(preprocess_data, axis=1)

# Convert to Hugging Face Dataset format
hf_dataset = Dataset.from_pandas(pd.DataFrame(formatted_data.tolist()))
hf_dataset = hf_dataset.train_test_split(test_size=0.1)  # Split into training and validation sets


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['answer'].fillna("No Answer", inplace=True)


In [17]:
from transformers import GPT2Tokenizer

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Add the padding token (GPT-2 doesn't have one by default)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_data(example):
    source = tokenizer(example['input_text'], padding="max_length", truncation=True, max_length=512)
    target = tokenizer(example['target_text'], padding="max_length", truncation=True, max_length=512)
    
    return {
        'input_ids': source['input_ids'],
        'attention_mask': source['attention_mask'],
        'labels': target['input_ids']  # GPT-2 uses the same tokenizer for both input and output
    }

# Apply the tokenization to the dataset
tokenized_dataset = hf_dataset.map(tokenize_data, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])




Map:   0%|          | 0/14770 [00:00<?, ? examples/s]

Map:   0%|          | 0/1642 [00:00<?, ? examples/s]

In [20]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

# Load the GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned-medquad",
    overwrite_output_dir=True,
    eval_strategy="steps",  # Changed from "evaluation_strategy" to "eval_strategy"
    save_strategy="steps",  # Save the model after a specific number of steps
    eval_steps=500,         # Evaluate every 500 steps
    save_steps=500,         # Save the model every 500 steps
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)

# Start fine-tuning the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("./gpt2-finetuned-medquad")


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011114003811114041, max=1.0…

Step,Training Loss,Validation Loss
500,3.4657,3.275872
1000,2.9984,3.287554
1500,2.7667,3.153626
2000,3.393,3.119809
2500,3.8953,3.039443
3000,2.7795,3.046404
3500,2.8491,3.05705
4000,2.7624,3.030465
4500,2.8303,2.981403
5000,2.8893,2.987535


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [30]:
import torch

# Ensure the model is moved to the correct device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set the pad_token_id to eos_token_id (end of sentence token)
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

def generate_answer_gpt2(question):
    input_text = f"Q: {question} A:"
    
    # Tokenize the input and move the input to the correct device
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Generate the answer with attention mask
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=150,
        num_beams=5,
        early_stopping=True
    )
    
    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return generated_text

# Example usage on a single question from your dataset
question_example = "What causes Glaucoma ?"
generated_answer = generate_answer_gpt2(question_example)
print(f"Generated Answer: {generated_answer}")


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Answer: Q: What causes Glaucoma ? A: glau glau glau glau glau glau glau gl glau glau glau glau glau glau glau glau glau glau glauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauauau
