In [2]:
# import os
# os.environ["TOGETHER_API_KEY"] = "your_api_key"
# os.environ["HUGGINGFACE_TOKEN"] = "your_huggingface_token"


In [3]:
!pip install datasets
!pip install together
!pip install evaluate
!pip -q install rouge-score
!pip -q install nltk

from huggingface_hub import login
login(token=os.getenv("HUGGINGFACE_TOKEN"))

from datasets import load_dataset
from tqdm import tqdm
import os
from together import Together
import evaluate
from rouge_score import rouge_scorer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

client = Together(api_key=os.getenv("TOGETHER_API_KEY"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [25]:
!pip -q install bert_score
from bert_score import score

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h

## Split Dataset

In [4]:
dataset = load_dataset("keivalya/MedQuad-MedicalQnADataset")
dataset_split = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_dataset, test_dataset = dataset_split['train'], dataset_split['test']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/233 [00:00<?, ?B/s]

medDataset_processed.csv:   0%|          | 0.00/22.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16407 [00:00<?, ? examples/s]

In [6]:
test_dataset

Dataset({
    features: ['qtype', 'Question', 'Answer'],
    num_rows: 3282
})

## Infer and evaluate

In [7]:
print(test_dataset[:5])

{'qtype': ['inheritance', 'information', 'information', 'susceptibility', 'prevention'], 'Question': ['Is D-bifunctional protein deficiency inherited ?', 'What is (are) Tourette syndrome ?', 'What is (are) Compulsive Gambling ?', 'Who is at risk for Pancreatic Neuroendocrine Tumors (Islet Cell Tumors)? ?', 'How to prevent Medullary Sponge Kidney ?'], 'Answer': ['This condition is inherited in an autosomal recessive pattern, which means both copies of the gene in each cell have mutations. The parents of an individual with an autosomal recessive condition each carry one copy of the mutated gene, but they typically do not show signs and symptoms of the condition.', "Tourette syndrome is a complex disorder characterized by repetitive, sudden, and involuntary movements or noises called tics. Tics usually appear in childhood, and their severity varies over time. In most cases, tics become milder and less frequent in late adolescence and adulthood.  Tourette syndrome involves both motor tics,

In [27]:
# Generate a response from the model
def infer_subjective(question):
    prompt = f"Question: {question}\nThink like a medical professional step by step."
    stream = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
        max_tokens=200
    )
    response = ""
    for chunk in stream:
        response += chunk.choices[0].delta.content or ""
    return response.strip()

In [30]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
meteor = evaluate.load("meteor")

predictions = []
references = []

for i in tqdm(range(3281), desc="Generating predictions"):
    question = test_dataset['Question'][i]
    answer = test_dataset['Answer'][i]
    references.append(answer)

    # Generate predicted answer using the LLM inference function
    predicted_answer = infer_subjective(question)
    predictions.append(predicted_answer)
    # print(f"Predicted Answer: {predicted_answer}")
    # print(f"Reference Answer: {answer}")

# Evaluation metrics
rouge_scores = rouge.compute(predictions=predictions, references=references)
bleu_scores = bleu.compute(predictions=predictions, references=references)
meteor_scores = meteor.compute(predictions=predictions, references=references)

print("ROUGE Scores:", rouge_scores)
print("BLEU Score:", bleu_scores)
print("METEOR Score:", meteor_scores)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Generating predictions: 100%|██████████| 3281/3281 [1:16:09<00:00,  1.39s/it]


ROUGE Scores: {'rouge1': 0.28159776858463303, 'rouge2': 0.07827387363509199, 'rougeL': 0.15931899835760366, 'rougeLsum': 0.1988268792080733}
BLEU Score: {'bleu': 0.03885608222537417, 'precisions': [0.30614531340508705, 0.07362193256482844, 0.025971861623745703, 0.011692800643659827], 'brevity_penalty': 0.7596611137288506, 'length_ratio': 0.7843857978506326, 'translation_length': 576602, 'reference_length': 735100}
METEOR Score: {'meteor': 0.21436677342875066}


In [33]:
# Save predictions and references to CSV
import pandas as pd

df = pd.DataFrame({
    'predictions': predictions,
    'references': references
})

df.to_csv('/content/predictions_references.csv', index=False)

In [1]:
!pip -q install bert_score
from bert_score import score
import pandas as pd
import torch

df = pd.read_csv('/content/predictions_references.csv')
predictions = df['predictions'].tolist()
references = df['references'].tolist()

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

P, R, F1 = score(predictions, references, lang="en", device=device)

print(f"Mean BERTScore F1: {F1.mean().item()}")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hUsing device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mean BERTScore F1: 0.8288264870643616
