# Medical question and answer

# Preparing the Dataset.

In [18]:
from datasets import load_dataset, DatasetDict

# Q&A modeling
# Load the medical Q&A records from a local JSON file. split_dataset() below
# reads the "train" split of the object returned here.
data_qa =  load_dataset("json", data_files="./src/data/medical_meadow_medqa.json")

def split_dataset(data, test_size=0.1, valid_size=0.1, seed=None):
    """Split ``data["train"]`` into train, validation and test subsets.

    The test subset is held out first; the validation subset is then carved
    out of what remains, so ``valid_size`` is a fraction of the post-test
    remainder, not of the full dataset.

    Args:
        data: Mapping whose ``"train"`` entry exposes ``train_test_split``.
        test_size: Fraction (or count) passed to the first split.
        valid_size: Fraction (or count) passed to the second split.
        seed: Forwarded to both ``train_test_split`` calls. Pass an integer
            to obtain the same partition on every run; ``None`` keeps the
            previous non-deterministic behaviour.

    Returns:
        Tuple ``(train_data, valid_data, test_data)``.
    """
    # First cut: hold out the test set.
    outer = data["train"].train_test_split(test_size=test_size, seed=seed)
    test_data = outer["test"]

    # Second cut: divide the remainder into training and validation sets.
    inner = outer["train"].train_test_split(test_size=valid_size, seed=seed)
    train_data = inner["train"]
    valid_data = inner["test"]

    # Print dataset splits
    print("Train data:", len(train_data))
    print("Validation data:", len(valid_data))
    print("Test data:", len(test_data))

    return train_data, valid_data, test_data

# Partition the loaded records, then expose the three parts under the usual
# split names. (The *_wiki names are kept because later cells refer to them.)
train_data_wiki, valid_data_wiki, test_data_wiki = split_dataset(data_qa)
data_wiki = DatasetDict(
    train=train_data_wiki,
    validation=valid_data_wiki,
    test=test_data_wiki,
)


Train data: 8244
Validation data: 916
Test data: 1018


In [19]:
# Show the split layout, then one training record and its two text fields.
print(data_wiki)

sample_record = data_wiki['train'][10]
print(sample_record)

print(sample_record['input'])
print(sample_record['output'])

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 8244
    })
    validation: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 916
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 1018
    })
})
{'instruction': 'Please answer with one of the option in the bracket', 'input': 'Q:A 67-year-old man presents to his primary care physician primarily complaining of a tremor. He said that his symptoms began approximately 1 month ago, when his wife noticed his right hand making "abnormal movements" while watching television. His tremor worsens when he is distracted and improves with purposeful action, such as brushing his teeth or combing his hair. He reports to having occasional headaches during times of stress. His wife notices he walks with "poor" posture and he finds himself having trouble staying asleep. He has a past medical history of migraine, genera

In [59]:
_EXTRACTIVE_MODEL_TYPES = ("bert", "distilbert", "roberta", "distilroberta-base")
_GENERATIVE_MODEL_TYPES = ("gpt2", "t5", "t5-small")


def _split_question_context(input_text):
    """Split a raw ``input`` string into ``(question, context)``.

    A leading ``"Q:"`` marker is dropped. Everything from the first ``"{"``
    onwards (the multiple-choice options) becomes the context; when there is
    no ``"{"`` the context is the empty string.
    """
    body = input_text[2:] if input_text.startswith("Q:") else input_text
    question, brace, options = body.strip().partition("{")
    context = "{" + options.strip() if brace else ""  # re-add '{' to preserve formatting
    return question.strip(), context


def _find_answer_span(context, answer_text):
    """Return the ``(start, end)`` character span of the answer in ``context``.

    The answer is the text after the first ``":"`` of ``answer_text``.
    Returns ``None`` (and prints a warning) when it is empty or not found.
    """
    answer = answer_text.split(":", 1)[-1].strip()
    start = context.find(answer) if answer else -1
    if start == -1:
        print(f"⚠️ Answer span not found in context:\nAnswer: {answer}\nContext: {context}\n")
        return None
    return start, start + len(answer)


def _char_span_to_token_span(offsets, sequence_ids, char_span):
    """Map a context character span to ``(start_token, end_token)`` indices.

    Returns ``(0, 0)`` when there is no answer, when the example has no
    context tokens, or when truncation cut the answer out of the context.
    """
    if char_span is None:
        return 0, 0
    context_tokens = [i for i, seq_id in enumerate(sequence_ids) if seq_id == 1]
    if not context_tokens:
        return 0, 0

    start_char, end_char = char_span
    first, last = context_tokens[0], context_tokens[-1]
    if offsets[first][0] > start_char or offsets[last][1] < end_char:
        return 0, 0

    # Last context token that starts at or before the answer start.
    token_start = first
    while token_start <= last and offsets[token_start][0] <= start_char:
        token_start += 1
    token_start -= 1

    # First context token that ends at or after the answer end.
    token_end = last
    while token_end >= first and offsets[token_end][1] >= end_char:
        token_end -= 1
    token_end += 1

    return token_start, token_end


def _extractive_features(examples, tokenizer, max_length):
    """Tokenize (question, context) pairs and attach answer token positions."""
    questions, contexts, char_spans = [], [], []
    for input_text, answer_text in zip(examples["input"], examples["output"]):
        question, context = _split_question_context(input_text)
        questions.append(question)
        contexts.append(context)
        char_spans.append(_find_answer_span(context, answer_text))

    tokenized = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length,
        padding="max_length",
        return_offsets_mapping=True,
        return_tensors=None,  # <- return plain lists
    )

    # Offsets are only needed to locate the answer; they are not model inputs.
    offset_mapping = tokenized.pop("offset_mapping")
    start_positions, end_positions = [], []
    for i, offsets in enumerate(offset_mapping):
        token_start, token_end = _char_span_to_token_span(
            offsets, tokenized.sequence_ids(i), char_spans[i]
        )
        start_positions.append(token_start)
        end_positions.append(token_end)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized


def _generative_features(examples, tokenizer, max_length, max_target_length):
    """Tokenize inputs and targets separately; target ids become ``labels``."""
    model_inputs = tokenizer(
        examples["input"],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )
    # text_target= replaces the deprecated as_target_tokenizer() context manager.
    labels = tokenizer(
        text_target=examples["output"],
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
    )
    # Padding positions in the labels keep the tokenizer's padding id.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


def prepare_features(examples, tokenizer, model_type, max_length=512, max_target_length=32):
    """Turn a batch of raw ``input``/``output`` strings into model features.

    Args:
        examples: Batch mapping with parallel ``"input"`` and ``"output"``
            lists of strings.
        tokenizer: Tokenizer called on the text. The extractive branch also
            needs offset mappings and ``sequence_ids`` from its result.
        model_type: Selects the branch. ``bert``/``distilbert``/``roberta``/
            ``distilroberta-base`` produce ``start_positions`` and
            ``end_positions``; ``gpt2``/``t5``/``t5-small`` produce ``labels``.
        max_length: Maximum (padded) input length in tokens.
        max_target_length: Maximum (padded) target length in tokens for the
            generative branch.

    Returns:
        The tokenizer output extended with the branch-specific label fields.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.

    NOTE(review): the generative branch emits only ``labels``. A model with
    an extractive QA head needs ``start_positions``/``end_positions``
    instead; confirm the branch matches the model class being trained.
    """
    if model_type in _EXTRACTIVE_MODEL_TYPES:
        return _extractive_features(examples, tokenizer, max_length)
    if model_type in _GENERATIVE_MODEL_TYPES:
        return _generative_features(examples, tokenizer, max_length, max_target_length)
    raise ValueError(
        f"Unsupported model_type {model_type!r}; expected one of "
        f"{_EXTRACTIVE_MODEL_TYPES + _GENERATIVE_MODEL_TYPES}"
    )


def tokenization_dataset(dataset_dict, tokenizer, max_length=512):
    """Apply ``prepare_features`` to every split and return a new DatasetDict.

    The model type handed to ``prepare_features`` is the last ``/``-separated
    component of ``tokenizer.name_or_path``. Each split's original columns
    are removed from the result.
    """
    model_type = tokenizer.name_or_path.rsplit("/", 1)[-1]
    print(f"Model type: {model_type}")

    def encode_batch(batch):
        return prepare_features(batch, tokenizer, model_type, max_length)

    return DatasetDict({
        split_name: split.map(
            encode_batch,
            batched=True,
            remove_columns=split.column_names,
            desc=f"Tokenizing {split_name} split",
        )
        for split_name, split in dataset_dict.items()
    })

# Modeling and representation

In [57]:
from transformers import (AutoModelForQuestionAnswering,
                          TrainingArguments,
                          Trainer,
                          AutoTokenizer,
                          GPT2ForQuestionAnswering,
                          T5ForQuestionAnswering)

#distilBerttokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")
#distilBertQA = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilroberta-base")

# Reuse the end-of-sequence token as the padding token (presumably because
# this tokenizer defines none -- confirm).
GPT2tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
GPT2tokenizer.pad_token = GPT2tokenizer.eos_token
# NOTE(review): both checkpoints are loaded into *ForQuestionAnswering classes.
# The load-time warning reports qa_outputs.weight/bias as newly initialized,
# and the recorded training run fails because the model returned only
# start_logits/end_logits and no loss.
GPT2QA = GPT2ForQuestionAnswering.from_pretrained("openai-community/gpt2")

T5Tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
T5QA = T5ForQuestionAnswering.from_pretrained("google-t5/t5-small")

Some weights of GPT2ForQuestionAnswering were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at google-t5/t5-small and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
#train_data_distilBert = tokenization_dataset(data_wiki,distilBerttokenizer)

# Despite the train_data_* names, each result is a DatasetDict holding the
# tokenized train, validation and test splits.
train_data_GPT2= tokenization_dataset(data_wiki, GPT2tokenizer)

train_data_T5 = tokenization_dataset(data_wiki, T5Tokenizer)

Model type: gpt2


Tokenizing train split:   0%|          | 0/8244 [00:00<?, ? examples/s]



Tokenizing validation split:   0%|          | 0/916 [00:00<?, ? examples/s]

Tokenizing test split:   0%|          | 0/1018 [00:00<?, ? examples/s]

Model type: t5-small


Tokenizing train split:   0%|          | 0/8244 [00:00<?, ? examples/s]

Tokenizing validation split:   0%|          | 0/916 [00:00<?, ? examples/s]

Tokenizing test split:   0%|          | 0/1018 [00:00<?, ? examples/s]

# Fine-Tuning.


# Lookup table: model family -> names of the modules that LoRA adapters target.
# The "gpt2", "t5" and "roberta" entries match the target_modules passed to the
# LoraConfig objects in this notebook. Nothing in the visible code reads this
# table; it appears to be kept for reference (presumably copied from the PEFT
# utility module linked in the LoRA configuration cell -- confirm).
TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING = {
    "t5": ["q", "v"],
    "mt5": ["q", "v"],
    "bart": ["q_proj", "v_proj"],
    "gpt2": ["c_attn"],
    "bloom": ["query_key_value"],
    "blip-2": ["q", "v", "q_proj", "v_proj"],
    "opt": ["q_proj", "v_proj"],
    "gptj": ["q_proj", "v_proj"],
    "gpt_neox": ["query_key_value"],
    "gpt_neo": ["q_proj", "v_proj"],
    "bert": ["query", "value"],
    "roberta": ["query", "value"],
    "xlm-roberta": ["query", "value"],
    "electra": ["query", "value"],
    "deberta-v2": ["query_proj", "value_proj"],
    "deberta": ["in_proj"],
    "layoutlm": ["query", "value"],
    "llama": ["q_proj", "v_proj"],
    "chatglm": ["query_key_value"],
    "gpt_bigcode": ["c_attn"],
    "mpt": ["Wqkv"],
}

In [None]:
import peft
from peft import LoraConfig, get_peft_model, PeftModel

def define_model(model, lora_config):
    """Wrap ``model`` with LoRA adapters and report its trainable parameters.

    Args:
        model: Base model to adapt.
        lora_config: ``LoraConfig`` describing the adapters to attach.

    Returns:
        The PEFT-wrapped model returned by ``get_peft_model``.
    """
    peft_model = get_peft_model(model, lora_config)
    # print_trainable_parameters() writes the summary itself and returns None,
    # so wrapping it in print() only added a stray "None" line to the output.
    peft_model.print_trainable_parameters()
    return peft_model


# TARGET_MODULES
# https://github.com/huggingface/peft/blob/39ef2546d5d9b8f5f8a7016ec10657887a867041/src/peft/utils/other.py#L220
def _qa_lora_config(target_modules):
    """Build the LoRA settings shared by every model here; only the targets vary."""
    return LoraConfig(
        task_type="QUESTION_ANS",
        target_modules=target_modules,  # module names to adapt; see the URL above
        r=4,  # adapter rank: a larger r means more trainable parameters
        lora_alpha=1,  # scaling factor applied to the adapter update
        lora_dropout=0.05,  # dropout on the adapter path, to limit overfitting
        bias="lora_only",  # selects which bias parameters are trained
    )


lora_config_distilRoberta = _qa_lora_config(["query", "value"])
lora_config_GPT2 = _qa_lora_config(["c_attn"])
lora_config_T5 = _qa_lora_config(["q", "v"])

# peft_distilRoberta = define_model(distilBertQA, lora_config_distilRoberta)
peft_GPT2 = define_model(GPT2QA, lora_config_GPT2)
peft_T5 = define_model(T5QA, lora_config_T5)

trainable params: 176,642 || all params: 124,590,340 || trainable%: 0.1418
None
trainable params: 148,482 || all params: 60,656,132 || trainable%: 0.2448
None




In [29]:

# def print_lora_trainable_parameters(model):
#     print("Trainable Parameters (requires_grad=True):\n")
#     for name, param in model.named_parameters():
#         if param.requires_grad:
#             print(f"{name}: {param.numel()} parameters")

# # Print only trainable parameters
# print("DistilRoberta Trainable Parameters:")
# print_lora_trainable_parameters(peft_distilRoberta)
# print("\nGPT2 Trainable Parameters:")
# print_lora_trainable_parameters(peft_GPT2)
# print("\nT5 Trainable Parameters:")
# print_lora_trainable_parameters(peft_T5)


# Model Evaluation

In [68]:
#Creating the TrainingArgs
import transformers
from transformers import TrainingArguments, Trainer

import evaluate
import numpy as np

# Load all relevant metrics once at module level; compute_metrics() below
# reads these three names as globals on every evaluation.
squad_metric = evaluate.load("squad")
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")

def compute_metrics(p, tokenizer):
    """Decode predictions and labels to text and score them.

    Args:
        p: ``(predictions, labels)`` pair. ``predictions`` holds per-token
            logits (or a tuple whose first element does); ``labels`` holds
            token ids, where ``-100`` marks positions ignored by the loss.
        tokenizer: Tokenizer used to decode ids back to text.

    Returns:
        dict with ``exact_match``, ``f1``, ``rouge1``, ``rouge2``, ``rougeL``,
        ``rougeLsum`` and ``bleu``.
    """
    predictions, labels = p
    print(f"Predictions: {predictions}")
    print(f"Labels: {labels}")

    # Some models hand back several outputs as a tuple.
    # NOTE(review): assumes the first element is the token logits -- confirm
    # for the model class in use.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Convert predictions (logits) to ids
    pred_ids = np.argmax(predictions, axis=-1)

    # -100 is not a vocabulary id and cannot be decoded; map it to the padding
    # id, which skip_special_tokens=True then drops from the text.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
    label_ids = np.asarray(labels)
    label_ids = np.where(label_ids == -100, pad_id, label_ids)

    pred_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in pred_ids]
    ref_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in label_ids]

    # Print one example of Q&A (skipped for an empty evaluation batch)
    example_idx = 0
    if pred_texts:
        print(f"Example {example_idx + 1}:")
        print(f"Predicted Answer: {pred_texts[example_idx]}")
        print(f"Reference Answer: {ref_texts[example_idx]}")
        print()

    # Compute ROUGE and BLEU scores
    rouge_results = rouge_metric.compute(predictions=pred_texts, references=ref_texts)
    bleu_results = bleu_metric.compute(
        predictions=pred_texts, references=[[ref] for ref in ref_texts]
    )

    # Compute SQuAD scores. The metric's reference schema requires an
    # "answer_start" entry next to "text"; the value is not used for scoring.
    squad_preds = [
        {"id": str(i), "prediction_text": pred} for i, pred in enumerate(pred_texts)
    ]
    squad_refs = [
        {"id": str(i), "answers": {"text": [ref], "answer_start": [0]}}
        for i, ref in enumerate(ref_texts)
    ]
    squad_results = squad_metric.compute(predictions=squad_preds, references=squad_refs)

    # Combine all results into one dictionary
    return {
        "exact_match": squad_results["exact_match"],
        "f1": squad_results["f1"],
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"],
        "rougeLsum": rouge_results["rougeLsum"],
        "bleu": bleu_results["bleu"],
    }



from transformers import DataCollatorWithPadding
from torch.utils.data import default_collate
import torch

class QADataCollator:
    """Collate extractive-QA features, keeping answer positions as tensors.

    Every field except ``start_positions``/``end_positions`` is padded and
    batched by ``DataCollatorWithPadding``; the two position fields are then
    added back as ``torch.long`` tensors.
    """

    _POSITION_KEYS = ("start_positions", "end_positions")

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def __call__(self, features):
        """Return one batch built from ``features`` (a list of dicts).

        The incoming dicts are left untouched. Popping the position keys
        would mutate the caller's examples, and collating the same dicts a
        second time would then raise ``KeyError``.
        """
        start_positions = [f["start_positions"] for f in features]
        end_positions = [f["end_positions"] for f in features]

        # Collate everything else using the Hugging Face padding collator.
        remaining = [
            {key: value for key, value in f.items() if key not in self._POSITION_KEYS}
            for f in features
        ]
        batch = self.collator(remaining)

        # Manually add back the start/end positions.
        batch["start_positions"] = torch.tensor(start_positions, dtype=torch.long)
        batch["end_positions"] = torch.tensor(end_positions, dtype=torch.long)

        return batch


from transformers import DataCollatorForSeq2Seq
from torch.nn.utils.rnn import pad_sequence
import torch

class QAGenerativeCollator:
    """Collate raw ``input``/``output`` text pairs for generative training.

    Inputs and targets are tokenized separately with their own length limits
    and padded to the longest sequence in the batch. Padded target positions
    are set to ``-100`` in ``labels``.
    """

    def __init__(self, tokenizer, model=None, max_input_length=512, max_target_length=64):
        self.tokenizer = tokenizer
        self.model = model  # stored only; not used by __call__
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __call__(self, batch):
        """Tokenize ``batch`` (dicts with ``"input"`` and ``"output"`` strings)."""
        inputs = [example["input"] for example in batch]
        targets = [example["output"] for example in batch]

        # Tokenize the inputs
        model_inputs = self.tokenizer(
            inputs,
            padding=True,
            truncation=True,
            max_length=self.max_input_length,
            return_tensors="pt",
        )

        # Tokenize the targets. text_target= replaces the deprecated
        # as_target_tokenizer() context manager.
        labels = self.tokenizer(
            text_target=targets,
            padding=True,
            truncation=True,
            max_length=self.max_target_length,
            return_tensors="pt",
        )

        # Mask padding by attention mask rather than by token id: when the
        # padding token doubles as a real token (e.g. pad == eos), comparing
        # ids would also hide genuine target tokens.
        label_ids = labels["input_ids"]
        if "attention_mask" in labels:
            ignore = labels["attention_mask"] == 0
        else:
            ignore = label_ids == self.tokenizer.pad_token_id
        model_inputs["labels"] = label_ids.masked_fill(ignore, -100)

        return model_inputs


class DataTrainer:
    """Bundle a model, its datasets and a configured ``Trainer``.

    NOTE: ``data_collator`` is kept on the instance but is not passed to the
    ``Trainer``, so the ``Trainer`` uses whatever collation it applies by
    default.
    """

    def __init__(self, model, dataset_train, dataset_valid, tokenizer, data_collator):
        self.model = model
        self.dataset_train = dataset_train
        self.dataset_valid = dataset_valid
        self.tokenizer = tokenizer
        self.data_collator = data_collator

        # Evaluate and checkpoint on the same step-based schedule (every step),
        # then reload the best checkpoint when training ends.
        step_schedule = {
            "eval_strategy": "steps",
            "eval_steps": 1,
            "save_strategy": "steps",
            "save_steps": 1,
            "load_best_model_at_end": True,
        }
        optimisation = {
            "learning_rate": 3e-2,
            "weight_decay": 0.01,
            "num_train_epochs": 3,
            "per_device_train_batch_size": 16,
            "per_device_eval_batch_size": 16,
            "auto_find_batch_size": True,
        }
        self.training_args = TrainingArguments(
            output_dir="results",
            logging_dir="logs",
            report_to="tensorboard",  # report to TensorBoard only
            push_to_hub=False,
            **optimisation,
            **step_schedule,
        )

        def score(eval_prediction):
            # Bind this trainer's tokenizer into the metric callback.
            return compute_metrics(eval_prediction, self.tokenizer)

        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.dataset_train,
            eval_dataset=self.dataset_valid,
            tokenizer=self.tokenizer,
            compute_metrics=score,
        )

    def train(self):
        """Run the wrapped ``Trainer``'s training loop."""
        self.trainer.train()

In [64]:
#collator_distilBert = QADataCollator(distilBerttokenizer)
# One collator per tokenizer.
# NOTE(review): DataTrainer stores its data_collator argument, but the Trainer
# it builds is created without one (that keyword is commented out), so these
# objects currently have no effect on training.
collator_GPT2 = QAGenerativeCollator(GPT2tokenizer)
collator_T5 = QAGenerativeCollator(T5Tokenizer)

In [65]:
# trainer_distilBert = DataTrainer(
#     model=peft_distilRoberta,
#     dataset_train=train_data_distilBert['train'],
#     dataset_valid=train_data_distilBert['validation'],
#     tokenizer=distilBerttokenizer,
#     data_collator=collator_distilBert
# )
# trainer_distilBert.train()

In [69]:
# Fine-tune the LoRA-wrapped GPT-2 on the tokenized train split, evaluating on
# the validation split.
# NOTE(review): the recorded run stops with a ValueError. The model returned
# only start_logits/end_logits and no loss, having received just input_ids and
# attention_mask.
trainer_GPT2 = DataTrainer(
    model=peft_GPT2,
    dataset_train=train_data_GPT2['train'],
    dataset_valid=train_data_GPT2['validation'],
    tokenizer=GPT2tokenizer,
    data_collator=collator_GPT2
)
trainer_GPT2.train()

  0%|          | 0/1548 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


ValueError: The model did not return a loss from the inputs, only the following keys: start_logits,end_logits. For reference, the inputs it received are input_ids,attention_mask.

## Test the fine-tuned model