In [2]:
from datasets import load_dataset
from datasets import DatasetDict
import torch

dataset = load_dataset("keivalya/MedQuad-MedicalQnADataset")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_test_split = dataset['train'].train_test_split(test_size=0.2)

dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test']
})

dataset


README.md:   0%|          | 0.00/233 [00:00<?, ?B/s]

medDataset_processed.csv:   0%|          | 0.00/22.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16407 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 13125
    })
    validation: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 3282
    })
})

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3.5-mini-instruct").to(device)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:  26%|##5       | 1.27G/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

In [5]:
def prepare_data_for_inference(examples):
    # Format the input for BioGPT
    inputs = [f"Q: {q}" for q in examples['Question']]
    return inputs

# Prepare the validation dataset
validation_questions = prepare_data_for_inference(dataset['validation'])


In [7]:
from tqdm import tqdm

def generate_baseline_predictions(model, tokenizer, questions):
    model.eval()  # Set the model to evaluation mode
    predictions = []
    
    for question in tqdm(questions, desc="Generating predictions"):
        inputs = tokenizer(f"{question}", return_tensors="pt").to(device)
        outputs = model.generate(inputs['input_ids'], max_length=70)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(response)
    
    return predictions

# Generate predictions for the validation dataset
predictions = generate_baseline_predictions(model, tokenizer, validation_questions)


Generating predictions: 100%|██████████| 3282/3282 [4:24:03<00:00,  4.83s/it]  


In [8]:
!pip install evaluate
!pip install rouge-score
# Load ROUGE for evaluation
import evaluate

# Load ROUGE for evaluation
rouge = evaluate.load("rouge")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=c4af62f30e82c2eaf3dfb3229f979d6282622995674aedaaeb58064b3be545b1
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [9]:
# Prepare references (ground-truth answers)
references = dataset['validation']['Answer'][:2134]
predictions = predictions[:2134]


In [10]:
import pandas as pd

# Create a DataFrame
df = pd.DataFrame({
    'predictions': predictions,
    'references': references
})

# Save as a CSV file
df.to_csv('/kaggle/working/predictions_references.csv', index=False)

In [11]:
scores = rouge.compute(predictions=predictions, references=references)
print(scores)

{'rouge1': 0.21472259344370082, 'rouge2': 0.08029767957560288, 'rougeL': 0.157391704090382, 'rougeLsum': 0.16312747688787407}


In [12]:
bleu = evaluate.load("bleu")
scores = bleu.compute(predictions=predictions, references=references)
print(scores)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.002595754640371493, 'precisions': [0.46081096213949035, 0.16159009313699985, 0.0818318238418067, 0.04782906988140743], 'brevity_penalty': 0.019866699110842545, 'length_ratio': 0.20330532328835169, 'translation_length': 98229, 'reference_length': 483160}
