In [None]:
!pip install transformers datasets openai

In [2]:
import pandas as pd
from datasets import Dataset


data = pd.read_csv('/kaggle/input/layoutlm/medquad.csv')

data['answer'].fillna("No Answer", inplace=True)

def preprocess_data(row):
    return {
        'input_text': f"question: {row['question']} </s>",
        'target_text': row['answer'] + " </s>"
    }


formatted_data = data.apply(preprocess_data, axis=1)

hf_dataset = Dataset.from_pandas(pd.DataFrame(formatted_data.tolist()))
hf_dataset = hf_dataset.train_test_split(test_size=0.1)  # Split into training and validation sets


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['answer'].fillna("No Answer", inplace=True)


In [3]:
from transformers import T5Tokenizer


tokenizer = T5Tokenizer.from_pretrained("t5-base")


def tokenize_data(example):
    source = tokenizer(example['input_text'], padding="max_length", truncation=True, max_length=512)
    target = tokenizer(example['target_text'], padding="max_length", truncation=True, max_length=512)

    return {
        'input_ids': source['input_ids'],
        'attention_mask': source['attention_mask'],
        'labels': target['input_ids']  
    }

tokenized_dataset = hf_dataset.map(tokenize_data, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/14770 [00:00<?, ? examples/s]



Map:   0%|          | 0/1642 [00:00<?, ? examples/s]

In [10]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

model = T5ForConditionalGeneration.from_pretrained("t5-base")


training_args = TrainingArguments(
    output_dir="./t5-finetuned-medquad",
    overwrite_output_dir=True,
    eval_strategy="epoch",  
    learning_rate=3e-5,
    per_device_train_batch_size=2, 
    per_device_eval_batch_size=2,   
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    save_strategy="epoch",
    load_best_model_at_end=True,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)


trainer.train()


trainer.save_model("./t5-finetuned-medquad")


Epoch,Training Loss,Validation Loss
1,1.0227,0.932083
2,0.8466,0.884382
3,0.8428,0.872469


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


In [13]:
import torch

# Ensure the model is moved to the correct device (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Function to generate an answer using T5
def generate_answer_t5(question):
    input_text = f"question: {question}"  # Avoid manually adding </s>
    
    # Tokenize the input and move the input to the correct device
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    input_ids = inputs["input_ids"].to(device)  # Move input_ids to the same device as the model
    attention_mask = inputs["attention_mask"].to(device)  # Move attention_mask to the same device

    # Generate the answer
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=150,
        num_beams=5,
        early_stopping=True
    )

    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return generated_text

# Example usage on a single question from the dataset
question_example = "What is glaucoma?"
generated_answer = generate_answer_t5(question_example)
print(f"Question: {question_example}")
print(f"Generated Answer: {generated_answer}")


Question: What is glaucoma?
Generated Answer: Glaucoma is a condition that affects the eyes. It is caused by glaucoma, which is a type of glaucoma that affects the eyes. It is a condition that affects the eye's ability to see clearly. Glaucoma is a type of glaucoma that affects the eye's ability to see clearly. Glaucoma is a type of glaucoma that affects the eye's ability to see clearly. Glaucoma is a condition that affects the eye's ability to see clearly. Glaucoma is a condition that affects the eye's ability to see clearly
