In [1]:
# Mount Google Drive at /content/drive; the input/output paths used later in
# this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install groq rouge-score bert-score

Collecting groq
  Downloading groq-0.13.1-py3-none-any.whl.metadata (14 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading groq-0.13.1-py3-none-any.whl (109 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=38670a95f2c6afe59c1fb36090ef0c2019e6d0d10c81f281c0a7a642749939d4
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a2

In [3]:
import os
import csv
from groq import Groq
import pandas as pd
from rouge_score import rouge_scorer
from bert_score import score
from IPython.display import display
import warnings

In [4]:
warnings.filterwarnings("ignore", category=UserWarning, module="transformers")

# Groq

In [5]:
# Initialize the Groq client with the API key taken from the GROQ_API_KEY
# environment variable. The key must never be hardcoded in the notebook:
# anything committed here is readable by everyone who can see the file.
_groq_api_key = os.getenv("GROQ_API_KEY")
if not _groq_api_key:
    # Fail early with an actionable message instead of a late authentication error.
    raise RuntimeError(
        "GROQ_API_KEY is not set. Define it in the environment (or load it from "
        "your Colab secrets) before running this cell."
    )

client = Groq(api_key=_groq_api_key)

def llama_api(prompt_content, max_tokens=300, model="gemma-7b-it"):
    """
    Send a single-turn user prompt to a Groq-hosted chat model and return its reply.

    Despite the historical name, the model is not fixed to Llama: it defaults to
    "gemma-7b-it" and can be overridden per call.

    Args:
        prompt_content: Text sent as the only "user" message.
        max_tokens: Upper bound on the number of generated tokens (default 300).
        model: Groq model identifier to query (default "gemma-7b-it").

    Returns:
        The reply text with surrounding whitespace stripped. If anything goes
        wrong (API error, missing content, ...), the error is printed and the
        sentinel string "Error: No response generated." is returned instead, so
        callers should check for that value before treating the result as an answer.
    """
    try:
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt_content}],
            model=model,            # Model selection
            max_tokens=max_tokens,  # Cap the length of the generated reply
        )
        return chat_completion.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort by design: one failed request must not abort a long batch run.
        print(f"Error calling Groq LLM API: {e}")
        return "Error: No response generated."

In [6]:
# List of diseases to query. Each entry must be unique: every entry triggers one
# API call and produces one row in the output CSV.
diseases = [
 'diabetes', 'pneumonia', 'asthma', 'gout', 'malaria', 'migraine',
 'appendicitis', 'diarrhea', 'endometriosis', 'epilepsy', 'fever',
 'fibromyalgia', 'gallstones', 'GERD', 'heart attack', 'hemorrhoids',
 'hepatitis A', 'hepatitis B', 'hepatitis C', 'hernia', 'HIV', 'hives',
 'kidney stones', 'obesity', 'osteoarthritis', 'osteoporosis',
 "Parkinson's disease", 'psoriatic arthritis', 'cirrhosis', 'delirium',
 'heart failure', 'peptic ulcer', 'urinary tract infection (UTI)', 'melanoma',
 'gastritis', 'glaucoma', 'tachycardia', 'anemia', 'pancreatitis',
 'tuberculosis', 'chickenpox', 'mumps', 'tetanus', 'polio', 'lupus',
 'hemophilia', 'Chronic Fatigue Syndrome (CFS)',
]

In [7]:
# Input/output locations, all rooted in the project folder on the mounted Drive
_advice_dir = "/content/drive/MyDrive/P2/LLM/LLL-Advice"

# Generated answers and their evaluation scores
output_file = f"{_advice_dir}/Gemma_advice.csv"
evaluation_results_file = f"{_advice_dir}/Gemma_BERTScore.csv"

# One reference CSV per source, named after the source itself
reference_files = {
    source_name: f"{_advice_dir}/sources/{source_name}.csv"
    for source_name in ("MayoClinic", "Healthline", "MedlinePlus")
}

In [8]:
# Read every reference CSV into a DataFrame, keyed by the name of its source
reference_data = {
    source_name: pd.read_csv(csv_path)
    for source_name, csv_path in reference_files.items()
}

# Prompt the LLM for the causes of each disease

In [9]:
# Query the model once per disease and store the answers in a two-column CSV
with open(output_file, mode="w", newline="", encoding="utf-8") as csv_handle:
    cause_writer = csv.writer(csv_handle)

    # Header row: disease name followed by the model's explanation of its causes
    cause_writer.writerow(["Disease", "Cause"])

    for disease_name in diseases:
        question = f"What are the main causes of {disease_name}?"
        answer = llama_api(question, max_tokens=300)  # reply capped at 300 tokens

        # Trim surrounding whitespace before persisting the answer
        cause_writer.writerow([disease_name, answer.strip()])

# Evaluate with ROUGE and BERTScore

In [10]:
# Reload the generated answers from disk so they can be scored with ROUGE and BERTScore
llm_responses = pd.read_csv(filepath_or_buffer=output_file)

In [11]:
import os

# Do not hardcode a Hugging Face token in the notebook. Authentication is
# optional for public checkpoints, so the evaluation below also runs
# anonymously. If a token is wanted, provide HF_TOKEN through the environment
# or the Colab secrets panel.
if not os.environ.get("HF_TOKEN"):
    print("HF_TOKEN is not set; using anonymous access to the Hugging Face Hub.")

In [12]:
# Per-source accumulators: one BERTScore list and three ROUGE lists for each
# reference source. Every list receives exactly one entry per evaluated answer.
bert_scores_healthline, bert_scores_medlineplus, bert_scores_mayoclinic = [], [], []

rouge1_healthline, rouge2_healthline, rougeL_healthline = [], [], []

rouge1_medlineplus, rouge2_medlineplus, rougeL_medlineplus = [], [], []

rouge1_mayoclinic, rouge2_mayoclinic, rougeL_mayoclinic = [], [], []

# Cache for the scorer objects. Building them is expensive (the BERTScore scorer
# loads a large transformer checkpoint), so they are created once on first use
# and shared by every later call.
_EVAL_SCORERS = {}


def _get_eval_scorers():
    """Return the cached (ROUGE scorer, BERTScore scorer) pair, building it on first use."""
    if "rouge" not in _EVAL_SCORERS or "bert" not in _EVAL_SCORERS:
        # Local import: BERTScorer comes from the same package as the `score` helper.
        from bert_score import BERTScorer

        # Build both before storing either, so a failure never leaves a half-filled cache.
        rouge_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        bert_instance = BERTScorer(lang="en")
        _EVAL_SCORERS["rouge"] = rouge_instance
        _EVAL_SCORERS["bert"] = bert_instance
    return _EVAL_SCORERS["rouge"], _EVAL_SCORERS["bert"]


def evaluate_response(reference, candidate):
    """
    Score a candidate text against a reference text with ROUGE and BERTScore.

    Args:
        reference: The reference (ground-truth) text.
        candidate: The generated text to evaluate.

    Returns:
        A tuple (rouge_result, bert_f1) where rouge_result is the mapping returned
        by the ROUGE scorer for 'rouge1', 'rouge2' and 'rougeL', and bert_f1 is the
        BERTScore F1 as a Python float. If scoring fails for any reason, the error
        is printed and (None, None) is returned.
    """
    try:
        rouge_scorer_instance, bert_scorer_instance = _get_eval_scorers()

        # ROUGE: the scorer expects (target, prediction)
        rouge_result = rouge_scorer_instance.score(reference, candidate)

        # BERTScore: only the F1 component is reported
        _, _, F1 = bert_scorer_instance.score([candidate], [reference], verbose=False)

        return rouge_result, F1.mean().item()
    except Exception as e:
        # Best-effort: a single bad pair (e.g. a missing answer) must not stop the batch.
        print(f"Error in evaluation: {e}")
        return None, None

# Score every generated answer against each reference source.
# Each entry: (source name, BERTScore list, ROUGE-1 list, ROUGE-2 list, ROUGE-L list),
# listed in the order in which the sources are evaluated for every answer.
_source_accumulators = (
    ("Healthline", bert_scores_healthline,
     rouge1_healthline, rouge2_healthline, rougeL_healthline),
    ("MedlinePlus", bert_scores_medlineplus,
     rouge1_medlineplus, rouge2_medlineplus, rougeL_medlineplus),
    ("MayoClinic", bert_scores_mayoclinic,
     rouge1_mayoclinic, rouge2_mayoclinic, rougeL_mayoclinic),
)

for _, response_row in llm_responses.iterrows():
    disease = response_row["Disease"]
    candidate = response_row["Cause"]

    for source_name, bert_list, r1_list, r2_list, rl_list in _source_accumulators:
        source_frame = reference_data[source_name]
        matches = source_frame[source_frame["disease"] == disease]

        # No reference entry for this disease -> every metric stays None
        rouge_result, bert_f1 = None, None
        if not matches.empty:
            # Only the first matching reference row is used
            rouge_result, bert_f1 = evaluate_response(matches.iloc[0]["cause"], candidate)

        # Record the ROUGE F-measures, or None placeholders to keep the lists aligned
        if rouge_result:
            r1_list.append(rouge_result['rouge1'].fmeasure)
            r2_list.append(rouge_result['rouge2'].fmeasure)
            rl_list.append(rouge_result['rougeL'].fmeasure)
        else:
            r1_list.append(None)
            r2_list.append(None)
            rl_list.append(None)

        bert_list.append(bert_f1)

# Assemble the results table: BERTScore columns first, then the ROUGE columns per source
evaluation_results = llm_responses.copy()
for source_name, bert_list, _r1, _r2, _rl in _source_accumulators:
    evaluation_results[f"BERTScore_{source_name}"] = bert_list

for source_name, _bert, r1_list, r2_list, rl_list in _source_accumulators:
    evaluation_results[f"ROUGE1_{source_name}"] = r1_list
    evaluation_results[f"ROUGE2_{source_name}"] = r2_list
    evaluation_results[f"ROUGEL_{source_name}"] = rl_list

# Persist the scores next to the generated answers
evaluation_results.to_csv(evaluation_results_file, index=False)

# Display the evaluation results
display(evaluation_results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

Unnamed: 0,Disease,Cause,BERTScore_Healthline,BERTScore_MedlinePlus,BERTScore_MayoClinic,ROUGE1_Healthline,ROUGE2_Healthline,ROUGEL_Healthline,ROUGE1_MedlinePlus,ROUGE2_MedlinePlus,ROUGEL_MedlinePlus,ROUGE1_MayoClinic,ROUGE2_MayoClinic,ROUGEL_MayoClinic
0,diabetes,**1. Genetic Factors:**\n\n* Strong family his...,0.799798,0.805979,0.790912,0.189024,0.01227,0.079268,0.238636,0.011494,0.147727,0.184127,0.025559,0.08254
1,pneumonia,**1. Bacterial infections:**\n\n* Streptococcu...,0.821217,0.826919,0.797551,0.282895,0.072848,0.164474,0.249158,0.067797,0.175084,0.115163,0.026975,0.072937
2,asthma,**1. Genetic Factors:**\n\n* Strong family his...,0.812237,0.813966,0.809356,0.199288,0.028674,0.106762,0.185185,0.022388,0.118519,0.282443,0.061538,0.145038
3,gout,**1. Genetic Predisposition:**\n\n* Genetic fa...,0.841984,0.83497,0.841851,0.398734,0.076433,0.158228,0.401015,0.081633,0.213198,0.405714,0.114943,0.217143
4,malaria,**1. Environmental factors:**\n\n* Tropical an...,0.821084,0.827585,0.82959,0.258333,0.016807,0.15,0.335664,0.074941,0.181818,0.336,0.075067,0.170667
5,migraine,**1. Neurotransmitter Imbalances:**\n\n* Alter...,0.82015,0.805676,0.830467,0.385675,0.099723,0.165289,0.290541,0.047619,0.135135,0.418182,0.077626,0.177273
6,appendicitis,**1. Bacterial Infection:**\n- Most common cau...,0.824092,0.823109,0.821011,0.19403,0.045455,0.149254,0.165289,0.016807,0.115702,0.3,0.072464,0.2
7,diarrhea,**1. Infectious causes:**\n\n* Viral infection...,0.819982,0.823113,0.799448,0.360759,0.095541,0.189873,0.326241,0.05,0.148936,0.261954,0.050104,0.10395
8,endometriosis,**1. Genetic factors:**\n\n* Family history of...,0.801979,0.803157,0.811533,0.195122,0.053452,0.093126,0.199005,0.045,0.099502,0.246154,0.080495,0.135385
9,epilepsy,**1. Structural abnormalities in the brain:**\...,0.807928,0.807488,0.796897,0.309859,0.056872,0.122066,0.153846,0.017391,0.08547,0.208877,0.020997,0.099217
