In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3.5-mini-instruct").to(device)

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:  66%|######6   | 3.28G/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

In [4]:
from datasets import load_dataset
from datasets import DatasetDict
import torch

dataset = load_dataset("keivalya/MedQuad-MedicalQnADataset")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_test_split = dataset['train'].train_test_split(test_size=0.2)

dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

dataset


DatasetDict({
    train: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 13125
    })
    test: Dataset({
        features: ['qtype', 'Question', 'Answer'],
        num_rows: 3282
    })
})

In [5]:
def prepare_data_for_inference(examples):
    inputs = [f"Q: {q}\nThink like a medical professional step by step." for q in examples['Question']]
    return inputs

test_questions = prepare_data_for_inference(dataset['test'])

In [6]:
import warnings
warnings.filterwarnings("ignore", message="The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\nSetting `pad_token_id` to `eos_token_id`:None for open-end generation.")

In [8]:
from tqdm import tqdm

def generate_baseline_predictions(model, tokenizer, questions):
    model.eval()  # Set the model to evaluation mode
    predictions = []
    
    for question in tqdm(questions, desc="Generating predictions"):
        inputs = tokenizer(f"{question}", return_tensors="pt").to(device)
        outputs = model.generate(inputs['input_ids'], max_length=70)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(response)
    
    return predictions

predictions = generate_baseline_predictions(model, tokenizer, test_questions)

Generating predictions:   0%|          | 0/3282 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Generating predictions: 100%|██████████| 3282/3282 [3:19:45<00:00,  3.65s/it]  


In [9]:
!pip -q install evaluate
!pip -q install rouge-score

import evaluate
references = dataset['test']['Answer']

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
rouge = evaluate.load("rouge")
scores = rouge.compute(predictions=predictions, references=references)
print(scores)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.17937144193498764, 'rouge2': 0.06156423640657316, 'rougeL': 0.13169716671422033, 'rougeLsum': 0.13355506567934036}


In [11]:
bleu = evaluate.load("bleu")
scores = bleu.compute(predictions=predictions, references=references)
print(scores)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.0021513336194622468, 'precisions': [0.3986281880596236, 0.1183092579995465, 0.06006245568101238, 0.033595569876274566], 'brevity_penalty': 0.021781601379091, 'length_ratio': 0.20718133437175493, 'translation_length': 153228, 'reference_length': 739584}


In [12]:
import pandas as pd

df = pd.DataFrame({
    'predictions': predictions,
    'references': references
})


df.to_csv('/kaggle/working/predictions_references.csv', index=False)