# **Environment Setup**

In [3]:
!pip install --upgrade openai

Collecting openai
  Downloading openai-1.57.4-py3-none-any.whl.metadata (24 kB)
Downloading openai-1.57.4-py3-none-any.whl (390 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m390.3/390.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.54.5
    Uninstalling openai-1.54.5:
      Successfully uninstalled openai-1.54.5
Successfully installed openai-1.57.4


In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [4]:
!pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [5]:
import pandas as pd
import json
import re
from datasets import load_dataset
import random
import openai
import os
from sklearn.model_selection import train_test_split
import jsonlines
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score
from sentence_transformers import SentenceTransformer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import numpy as np


In [7]:
#import os
os.environ["OPENAI_API_KEY"]  # Replace with actual API key

from openai import OpenAI
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# **1. Load Dataset : MedQuad**

In [6]:
# Load only the 'train' split
medquad_dataset = load_dataset("keivalya/MedQuad-MedicalQnADataset", split="train")

# Convert to DataFrame
medquad_df = pd.DataFrame(medquad_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/233 [00:00<?, ?B/s]

medDataset_processed.csv:   0%|          | 0.00/22.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16407 [00:00<?, ? examples/s]

In [7]:
unique_qtype_counts = medquad_df['qtype'].value_counts()

# Print the counts
print(unique_qtype_counts)

qtype
information        4535
symptoms           2748
treatment          2442
inheritance        1446
frequency          1120
genetic changes    1087
causes              727
exams and tests     653
research            395
outlook             361
susceptibility      324
considerations      235
prevention          210
stages               77
complications        46
support groups        1
Name: count, dtype: int64


# **2. Data Cleaning and Preprocessing**

**Check NULL values, if any drop them**

In [9]:
# Check for null values in each column
null_counts = medquad_df.isnull().sum()

# Print the null counts
print(null_counts)

qtype       0
Question    0
Answer      0
dtype: int64


**Check DUPLICATE values, if any drop them**

In [10]:
# Check for duplicates based on all columns
duplicates = medquad_df.duplicated()

# Print the number of duplicates
print(f"Number of duplicates: {duplicates.sum()}")

Number of duplicates: 48


In [11]:
# Drop duplicate rows, keeping only the first occurrence
medquad_df = medquad_df.drop_duplicates(keep='first')

# Reset the index
medquad_df = medquad_df.reset_index(drop=True)

In [12]:
medquad_df

Unnamed: 0,qtype,Question,Answer
0,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,LCMV infections can occur after exposure to fr...
1,symptoms,What are the symptoms of Lymphocytic Choriomen...,LCMV is most commonly recognized as causing ne...
2,susceptibility,Who is at risk for Lymphocytic Choriomeningiti...,Individuals of all ages who come into contact ...
3,exams and tests,How to diagnose Lymphocytic Choriomeningitis (...,"During the first phase of the disease, the mos..."
4,treatment,What are the treatments for Lymphocytic Chorio...,"Aseptic meningitis, encephalitis, or meningoen..."
...,...,...,...
16354,symptoms,What are the symptoms of Familial visceral myo...,What are the signs and symptoms of Familial vi...
16355,information,What is (are) Pseudopelade of Brocq ?,Pseudopelade of Brocq (PBB) is a slowly progre...
16356,symptoms,What are the symptoms of Pseudopelade of Brocq ?,What are the signs and symptoms of Pseudopelad...
16357,treatment,What are the treatments for Pseudopelade of Br...,Is there treatment or a cure for pseudopelade ...


## **Selecting a random sample size of 1000 for the project**

In [13]:
# Sample 1000 records randomly from the DataFrame
medquad_1000_sample_df = medquad_df.sample(n=1000, random_state=42).reset_index(drop=True)

In [14]:
medquad_1000_sample_df

Unnamed: 0,qtype,Question,Answer
0,exams and tests,How to diagnose Childhood Liver Cancer ?,Tests that examine the liver and the blood are...
1,information,What is (are) Familial esophageal achalasia ?,Familial esophageal achalasia refers to a clus...
2,frequency,How many people are affected by leukocyte adhe...,Leukocyte adhesion deficiency type 1 is estima...
3,information,What is (are) Food Allergy ?,Food allergy is an abnormal response to a food...
4,research,what research (or clinical trials) is being do...,The mission of the National Institute of Neuro...
...,...,...,...
995,symptoms,What are the symptoms of Hyperlipoproteinemia ...,What are the signs and symptoms of Hyperlipopr...
996,frequency,How many people are affected by Tourette syndr...,Although the exact incidence of Tourette syndr...
997,treatment,What are the treatments for Dementia ?,Drugs to specifically treat Alzheimers disease...
998,treatment,What are the treatments for retinitis pigmento...,These resources address the diagnosis or manag...


## **Defining system prompt and converting data to gpt_4o_mini format, then saving as jsonl file "medquad_gpt4o_mini_formatted_data.json"**

"You are a knowledgeable and empathetic virtual medical assistant trained to provide accurate, concise, and evidence-based responses to medical queries. Your primary goal is to assist users with their health-related questions, guide them toward better understanding of medical conditions, and suggest actions they may take based on their concerns. Always include a disclaimer that the information is not a substitute for professional medical advice. Avoid providing definitive diagnoses or treatment plans and encourage users to consult a licensed healthcare provider for personalized care."

In [15]:
# Define the system prompt
system_prompt = (
    "You are a knowledgeable and empathetic virtual medical assistant trained to provide accurate, concise, and evidence-based responses to medical queries. Your primary goal is to assist users with their health-related questions, guide them toward better understanding of medical conditions, and suggest actions they may take based on their concerns. Always include a disclaimer that the information is not a substitute for professional medical advice. Avoid providing definitive diagnoses or treatment plans and encourage users to consult a licensed healthcare provider for personalized care."
)

In [16]:
# Define the system prompt
system_prompt = (
    "You are a knowledgeable and empathetic virtual medical assistant trained to provide accurate, concise, and evidence-based responses to medical queries. Your primary goal is to assist users with their health-related questions, guide them toward better understanding of medical conditions, and suggest actions they may take based on their concerns. Always include a disclaimer that the information is not a substitute for professional medical advice. Avoid providing definitive diagnoses or treatment plans and encourage users to consult a licensed healthcare provider for personalized care."
)

# Function to convert DataFrame into GPT-4o-mini format
def convert_to_gpt4o_format(df):
    gpt4o_format_data = []
    for _, row in df.iterrows():
        gpt4o_format_data.append({
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": row["Question"]  # User question from the DataFrame
                },
                {
                    "role": "assistant",
                    "content": row["Answer"]  # Assistant response from the DataFrame
                }
            ]
        })
    return gpt4o_format_data

# Convert the medquad_1000_sample_df into GPT-4o-mini format
medquad_gpt4o_mini_formatted_data = convert_to_gpt4o_format(medquad_1000_sample_df)

# Save to JSONL file for inspection or later use
with jsonlines.open("medquad_gpt4o_mini_formatted_data.jsonl", "w") as writer:
    writer.write_all(medquad_gpt4o_mini_formatted_data)

print("Data converted to GPT-4o-mini format and saved as medquad_gpt4o_mini_formatted_data.jsonl.")

Data converted to GPT-4o-mini format and saved as medquad_gpt4o_mini_formatted_data.jsonl.


### **Splitting the data "medquad_gpt4o_mini_formatted_data.json" into train_data 80%, test_data 10%, validation_data 10% and saved them as jsonl files**

In [17]:
# Split the data into train (80%), validation (10%), and test (10%) sets
train_data, temp_data = train_test_split(medquad_gpt4o_mini_formatted_data, test_size=0.2, random_state=42)
validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# Check the number of data points in each split
print(f"Number of records in Training Set: {len(train_data)}")
print(f"Number of records in Validation Set: {len(validation_data)}")
print(f"Number of records in Test Set: {len(test_data)}")

# Save the splits into separate JSONL files
with jsonlines.open("train_data.jsonl", "w") as writer:
    writer.write_all(train_data)

with jsonlines.open("validation_data.jsonl", "w") as writer:
    writer.write_all(validation_data)

with jsonlines.open("test_data.jsonl", "w") as writer:
    writer.write_all(test_data)

print("Data split into training, validation, and testing sets and saved as JSONL files.")

Number of records in Training Set: 800
Number of records in Validation Set: 100
Number of records in Test Set: 100
Data split into training, validation, and testing sets and saved as JSONL files.


In [18]:
print(train_data[:1])

[{'messages': [{'role': 'system', 'content': 'You are a knowledgeable and empathetic virtual medical assistant trained to provide accurate, concise, and evidence-based responses to medical queries. Your primary goal is to assist users with their health-related questions, guide them toward better understanding of medical conditions, and suggest actions they may take based on their concerns. Always include a disclaimer that the information is not a substitute for professional medical advice. Avoid providing definitive diagnoses or treatment plans and encourage users to consult a licensed healthcare provider for personalized care.'}, {'role': 'user', 'content': 'What is (are) Stroke ?'}, {'role': 'assistant', 'content': 'There are two kinds of stroke. The most common kind of stroke is called ischemic stroke. It accounts for approximately 80 percent of all strokes. An ischemic stroke is caused by a blood clot that blocks or plugs a blood vessel in the brain. The other kind of stroke is cal

In [19]:
print(validation_data[:1])

[{'messages': [{'role': 'system', 'content': 'You are a knowledgeable and empathetic virtual medical assistant trained to provide accurate, concise, and evidence-based responses to medical queries. Your primary goal is to assist users with their health-related questions, guide them toward better understanding of medical conditions, and suggest actions they may take based on their concerns. Always include a disclaimer that the information is not a substitute for professional medical advice. Avoid providing definitive diagnoses or treatment plans and encourage users to consult a licensed healthcare provider for personalized care.'}, {'role': 'user', 'content': 'What are the symptoms of Dyssegmental dysplasia Silverman-Handmaker type ?'}, {'role': 'assistant', 'content': 'What are the signs and symptoms of Dyssegmental dysplasia Silverman-Handmaker type? The Human Phenotype Ontology provides the following list of signs and symptoms for Dyssegmental dysplasia Silverman-Handmaker type. If t

In [20]:
print(test_data[:1])

[{'messages': [{'role': 'system', 'content': 'You are a knowledgeable and empathetic virtual medical assistant trained to provide accurate, concise, and evidence-based responses to medical queries. Your primary goal is to assist users with their health-related questions, guide them toward better understanding of medical conditions, and suggest actions they may take based on their concerns. Always include a disclaimer that the information is not a substitute for professional medical advice. Avoid providing definitive diagnoses or treatment plans and encourage users to consult a licensed healthcare provider for personalized care.'}, {'role': 'user', 'content': 'What is (are) Unverricht-Lundborg disease ?'}, {'role': 'assistant', 'content': 'Unverricht-Lundborg disease is an inherited form of progressive myoclonus epilepsy that is characterized by episodes of involuntary muscle jerking or twitching (myoclonus) that increase in frequency and severity over time. Episodes of myoclonus may be

In [21]:
print(test_data[:5])

[{'messages': [{'role': 'system', 'content': 'You are a knowledgeable and empathetic virtual medical assistant trained to provide accurate, concise, and evidence-based responses to medical queries. Your primary goal is to assist users with their health-related questions, guide them toward better understanding of medical conditions, and suggest actions they may take based on their concerns. Always include a disclaimer that the information is not a substitute for professional medical advice. Avoid providing definitive diagnoses or treatment plans and encourage users to consult a licensed healthcare provider for personalized care.'}, {'role': 'user', 'content': 'What is (are) Unverricht-Lundborg disease ?'}, {'role': 'assistant', 'content': 'Unverricht-Lundborg disease is an inherited form of progressive myoclonus epilepsy that is characterized by episodes of involuntary muscle jerking or twitching (myoclonus) that increase in frequency and severity over time. Episodes of myoclonus may be

# **1. ZERO SHOT LEARNING**

In [22]:
# Zero-shot prompt 1
system_prompt = (
    "You are a knowledgeable and empathetic virtual medical assistant trained to provide accurate, concise, and evidence-based responses to medical queries. Your primary goal is to assist users with their health-related questions, guide them toward better understanding of medical conditions, and suggest actions they may take based on their concerns. Always include a disclaimer that the information is not a substitute for professional medical advice. Avoid providing definitive diagnoses or treatment plans and encourage users to consult a licensed healthcare provider for personalized care."
)
# Test a query
query = "What are the symptoms of COVID-19?"

client = OpenAI()
response = client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[{"role": "system", "content": system_prompt},
        {"role": "user", "content": query}
    ])


print(response.choices[0].message.content.strip())

COVID-19 symptoms can vary widely among individuals, but some of the most commonly reported symptoms include:

1. **Fever or chills**
2. **Cough**
3. **Shortness of breath or difficulty breathing**
4. **Fatigue**
5. **Muscle or body aches**
6. **Headache**
7. **New loss of taste or smell**
8. **Sore throat**
9. **Congestion or runny nose**
10. **Nausea or vomiting**
11. **Diarrhea**

Symptoms typically appear 2-14 days after exposure to the virus. It's important to note that some individuals may experience mild symptoms while others may have severe complications. If you or someone you know is experiencing difficulty breathing, persistent chest pain, confusion, or bluish lips or face, seek emergency medical care immediately.

Please remember that this information is not a substitute for professional medical advice. If you have concerns about COVID-19 or your health, consult a licensed healthcare provider for personalized guidance.


In [23]:
# Zero-shot prompt 2
system_prompt = (
    "You are a knowledgeable and empathetic virtual medical assistant trained to provide accurate, concise, and evidence-based responses to medical queries. Your primary goal is to assist users with their health-related questions, guide them toward better understanding of medical conditions, and suggest actions they may take based on their concerns. Always include a disclaimer that the information is not a substitute for professional medical advice. Avoid providing definitive diagnoses or treatment plans and encourage users to consult a licensed healthcare provider for personalized care."
)

# Test a query
query = "What are the symptoms of Cold and Flu?"

client = OpenAI()
response = client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[{"role": "system", "content": system_prompt},
        {"role": "user", "content": query}
    ])


print(response.choices[0].message.content.strip())

Cold and flu are both respiratory illnesses, but they are caused by different viruses and typically present with some varying symptoms. Here’s a brief overview of the symptoms for each:

### Common Cold Symptoms:
- **Runny or stuffy nose**
- **Sneezing**
- **Sore throat**
- **Cough**
- **Mild headache**
- **Fatigue**
- **Slight body aches**
- **Low-grade fever (sometimes)**

### Influenza (Flu) Symptoms:
- **High fever (often sudden onset)**
- **Chills**
- **Headache**
- **Severe body aches and fatigue**
- **Dry cough**
- **Sore throat**
- **Runny or stuffy nose**
- **Nausea or vomiting (more common in children)**

Both illnesses can share some symptoms, making it difficult to distinguish between them based purely on symptoms. The flu tends to be more severe and sudden compared to a cold, which typically develops gradually.

If you are experiencing symptoms and are unsure about their severity or duration, it's important to consult a healthcare provider. They can offer personalized advi

In [24]:
# Zero-shot prompt 3
# Test a query
query = "How many people are affected by Crouzonodermoskeletal syndrome ?"

client = OpenAI()
response = client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[{"role": "system", "content": system_prompt},
        {"role": "user", "content": query}
    ])


print(response.choices[0].message.content.strip())

Crouzonodermoskeletal syndrome is a rare genetic condition, and specific prevalence rates are not widely documented. Estimates suggest that it may occur in about 1 in 100,000 to 1 in 500,000 births. However, exact numbers can vary, and data on rare syndromes can be limited.

If you have concerns or questions about this syndrome, whether for yourself or a loved one, it's important to consult with a healthcare professional who specializes in genetic conditions for more personalized information and guidance.

*Please note that this information is not a substitute for professional medical advice.*


**Actual Answer from the dataset:**. Crouzonodermoskeletal syndrome is rare; this condition is seen in about 1 person per million.

In [25]:
# Zero-shot prompt 4
# Test a query 4
query = "What is (are) Unverricht-Lundborg disease ?"

client = OpenAI()
response = client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[{"role": "system", "content": system_prompt},
        {"role": "user", "content": query}
    ])


print(response.choices[0].message.content.strip())

Unverricht-Lundborg disease (ULD) is a rare genetic disorder characterized primarily by progressive myoclonic epilepsy (PME). It typically manifests in childhood or adolescence and is associated with neurological symptoms that can include:

- Myoclonus: Sudden, involuntary muscle jerks.
- Generalized seizures: These can vary in type and severity.
- Ataxia: Problems with coordination and balance.
- Cognitive decline: This may occur over time, affecting learning and memory.

ULD is caused by mutations in the **C11orf73** gene, which affects the function of certain proteins involved in cellular processes. The condition is inherited in an autosomal recessive pattern, meaning both copies of the gene must be mutated for the disease to manifest.

Management of Unverricht-Lundborg disease involves addressing symptoms, primarily through the use of antiepileptic medications to help control seizures and myoclonus. Physical therapy and supportive care may also be beneficial.

Due to the complexity

**Actual answer from the dataset :** Unverricht-Lundborg disease is an inherited form of progressive myoclonus epilepsy that is characterized by episodes of involuntary muscle jerking or twitching (myoclonus) that increase in frequency and severity over time. Episodes of myoclonus may be brought on by physical exertion, stress, light, or other stimuli. Affected individuals usually begin showing signs and symptoms of the disorder between the ages of 6 and 15. Over time, the myoclonic episodes may become severe enough to interfere with walking and other everyday activities. Other features include seizures involving loss of consciousness, muscle rigidity, and convulsions (tonic-clonic or grand mal seizures). Like the myoclonic episodes, these may increase in frequency over several years but may be controlled with treatment. After several years of progression, the frequency of seizures may stabilize or decrease. Unverricht-Lundborg disease is caused by mutation in the CSTB gene. It is inherited in an autosomal recessive pattern.

# **Zero Shot Evaluation Metrics**

**1. Exact Match Accuracy**

**2. BLEU Score**

**3. Semantic Similarity**

**4. Cosine Similarity**

**5. Token-Level Accuracy**

**6. Accuracy**

**7. F1 Score**

In [60]:


# Questions from the dataset
queries = [
    "What is (are) Unverricht-Lundborg disease?",
    "How many people are affected by Crouzonodermoskeletal syndrome?"
]

expected_answers = [
    "Unverricht-Lundborg disease is an inherited form of progressive myoclonus epilepsy that is characterized by episodes of involuntary muscle jerking or twitching (myoclonus) that increase in frequency and severity over time. Episodes of myoclonus may be brought on by physical exertion, stress, light, or other stimuli. Affected individuals usually begin showing signs and symptoms of the disorder between the ages of 6 and 15. Over time, the myoclonic episodes may become severe enough to interfere with walking and other everyday activities. Other features include seizures involving loss of consciousness, muscle rigidity, and convulsions (tonic-clonic or grand mal seizures). Like the myoclonic episodes, these may increase in frequency over several years but may be controlled with treatment. After several years of progression, the frequency of seizures may stabilize or decrease. Unverricht-Lundborg disease is caused by mutation in the CSTB gene. It is inherited in an autosomal recessive pattern.",
    "Crouzonodermoskeletal syndrome is rare; this condition is seen in about 1 person per million."
]

generated_answers = [
    "Unverricht-Lundborg disease, also known as EPM1 (Progressive Myoclonic Epilepsy type 1), is a rare genetic disorder characterized by progressive myoclonus, generalized tonic-clonic seizures, and neurological deterioration. It typically manifests in adolescence or early adulthood. The myoclonic jerks are sudden, brief involuntary muscle contractions, while the seizures can vary in type and severity. The condition is caused by mutations in the CSTB gene, which encodes for cystatin B, a protein that plays a role in neuronal function. This disorder is inherited in an autosomal recessive manner, meaning that both copies of the gene (one from each parent) must be mutated for a child to be affected. Symptoms may also include cognitive decline, ataxia (loss of coordination), and psychiatric disturbances. While there is currently no cure for Unverricht-Lundborg disease, treatments focus on managing symptoms, particularly seizure control, using antiepileptic medications. If you suspect you or someone you know may have this condition, or if there are concerning symptoms, it is important to consult a healthcare provider for an accurate diagnosis and appropriate management. **Disclaimer:** The information provided here is for educational purposes only and is not a substitute for professional medical advice. Always consult a licensed healthcare provider for personalized care and guidance.",
    "Crouzonodermoskeletal syndrome is a rare genetic disorder characterized by craniosynostosis, skeletal malformations, and skin abnormalities. Due to its rarity, specific prevalence statistics can be challenging to find, but it is estimated to affect only a small number of individuals globally. As with many rare syndromes, the exact number of affected individuals can vary, and new cases can emerge as awareness and genetic testing improve. If you are looking for more detailed statistics or more information on this condition, consulting medical literature or a healthcare provider specializing in genetic disorders may be helpful. Please remember that this information is not a substitute for professional medical advice. Always consult with a licensed healthcare provider for personalized care and guidance."
]

# Initialize Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define evaluation functions
def calculate_exact_match(expected, generated):
    return int(expected.strip().lower() == generated.strip().lower())

def calculate_bleu(expected, generated):
    expected_tokens = expected.split()
    generated_tokens = generated.split()
    smoothing_function = SmoothingFunction().method1
    return sentence_bleu([expected_tokens], generated_tokens, smoothing_function=smoothing_function)

def calculate_semantic_similarity(expected, generated):
    embeddings = model.encode([expected, generated])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

def calculate_token_level_accuracy(expected, generated):
    expected_tokens = set(expected.split())
    generated_tokens = set(generated.split())
    overlap = expected_tokens & generated_tokens
    return len(overlap) / len(expected_tokens)

def calculate_accuracy_f1(expected_tokens, generated_tokens):
    # Convert tokens to binary vector for classification metrics
    expected_vector = [1 if token in expected_tokens else 0 for token in expected_tokens]
    generated_vector = [1 if token in generated_tokens else 0 for token in expected_tokens]
    accuracy = accuracy_score(expected_vector, generated_vector)
    f1 = f1_score(expected_vector, generated_vector, zero_division=1)
    return accuracy, f1



# Evaluate all metrics
results = []
for query, expected, generated in zip(queries, expected_answers, generated_answers):
    exact_match = calculate_exact_match(expected, generated)
    bleu_score = calculate_bleu(expected, generated)
    semantic_similarity = calculate_semantic_similarity(expected, generated)
    token_level_accuracy = calculate_token_level_accuracy(expected, generated)

    # Calculate Accuracy and F1 Score
    expected_tokens = set(expected.split())
    generated_tokens = set(generated.split())
    accuracy, f1 = calculate_accuracy_f1(expected_tokens, generated_tokens)

    results.append({
        "Query": query,
        "Exact Match Accuracy": exact_match,
        "BLEU Score": bleu_score,
        "Semantic Similarity": semantic_similarity,
        "Cosine Similarity": semantic_similarity,  # Semantic similarity is the cosine similarity
        "Token-Level Accuracy": token_level_accuracy,
        "Accuracy": accuracy,
        "F1 Score": f1
    })

# Print results
for result in results:
    print(f"Query: {result['Query']}")
    print(f"Exact Match Accuracy: {result['Exact Match Accuracy']}")
    print(f"BLEU Score: {result['BLEU Score']:.2f}")
    print(f"Semantic Similarity: {result['Semantic Similarity']:.2f}")
    print(f"Cosine Similarity: {result['Cosine Similarity']:.2f}")
    print(f"Token-Level Accuracy: {result['Token-Level Accuracy']:.2f}")
    print(f"Accuracy: {result['Accuracy']:.2f}")
    print(f"F1 Score: {result['F1 Score']:.2f}")


Query: What is (are) Unverricht-Lundborg disease?
Exact Match Accuracy: 0
BLEU Score: 0.05
Semantic Similarity: 0.90
Cosine Similarity: 0.90
Token-Level Accuracy: 0.29
Accuracy: 0.29
F1 Score: 0.45
Query: How many people are affected by Crouzonodermoskeletal syndrome?
Exact Match Accuracy: 0
BLEU Score: 0.01
Semantic Similarity: 0.82
Cosine Similarity: 0.82
Token-Level Accuracy: 0.38
Accuracy: 0.38
F1 Score: 0.56


# **2.ONE SHOT PROMT**

In [27]:
# One shot prompt 1
# Example Question and Answer
question = "What are the treatments for 21-hydroxylase deficiency?"
answer = "The objectives for treating 21-hydroxylase deficiency differ with age. In childhood, the overall goal is to replace cortisol. Obtaining hormonal balance is important and patients growth velocity and bone age is monitored. Routine analysis of blood, urine, and/or saliva may also be necessary. Corrective surgery is frequently required for females born with abnormal genitalia. In late childhood and adolescence, maintaining hormonal balance is equally important. Overtreatment may result in obesity and delayed menarche/puberty, whereas under-replacement will result in sexual precocity. Also, it is important that teens and young adults with 21-hydroxylase deficiency be successfully transitioned to adult care facilities. Follow-up of adult patients should involve multidisciplinary clinics. Problems in adult women include fertility concerns, excessive hair growth, and menstrual irregularity; obesity and impact of short stature; sexual dysfunction and psychological problems. Counseling may be helpful. Adult males may develop enlargement of the testes and if so, should work with an endocrinologist familiar with the management of patients with this deficiency."

# Query to test 1
query = "What are the treatments for Prevent diabetes problems: Keep your nervous system healthy ??"

client = openai.OpenAI()
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer},
        {"role": "user", "content": query}
    ]
)

print(response.choices[0].message.content.strip())

To help prevent diabetes-related complications, particularly those affecting the nervous system (diabetic neuropathy), it's important to focus on several key areas of health management. Here are some strategies that can support nerve health in individuals with diabetes:

1. **Blood Sugar Control**: 
   - Maintaining stable blood glucose levels is crucial. This can often be achieved through a combination of diet, exercise, and medication as prescribed by your healthcare provider.

2. **Healthy Diet**:
   - Eating a balanced diet rich in vegetables, fruits, whole grains, lean proteins, and healthy fats can help manage blood sugar levels. Limiting processed foods and sugars is also beneficial.

3. **Regular Exercise**:
   - Engaging in regular physical activity can improve insulin sensitivity, help control weight, and enhance blood flow, all of which are beneficial for nerve health.

4. **Weight Management**:
   - Maintaining a healthy weight can reduce the risk of developing complication

In [28]:
# One shot prompt 2
# Example Question and Answer
question = "What are the treatments for 21-hydroxylase deficiency?"
answer = "The objectives for treating 21-hydroxylase deficiency differ with age. In childhood, the overall goal is to replace cortisol. Obtaining hormonal balance is important and patients growth velocity and bone age is monitored. Routine analysis of blood, urine, and/or saliva may also be necessary. Corrective surgery is frequently required for females born with abnormal genitalia. In late childhood and adolescence, maintaining hormonal balance is equally important. Overtreatment may result in obesity and delayed menarche/puberty, whereas under-replacement will result in sexual precocity. Also, it is important that teens and young adults with 21-hydroxylase deficiency be successfully transitioned to adult care facilities. Follow-up of adult patients should involve multidisciplinary clinics. Problems in adult women include fertility concerns, excessive hair growth, and menstrual irregularity; obesity and impact of short stature; sexual dysfunction and psychological problems. Counseling may be helpful. Adult males may develop enlargement of the testes and if so, should work with an endocrinologist familiar with the management of patients with this deficiency."

# Query to test 1
query = "What are the treatments for gastrointestinal stromal tumor ?"

client = openai.OpenAI()
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer},
        {"role": "user", "content": query}
    ]
)

print(response.choices[0].message.content.strip())

Gastrointestinal stromal tumors (GISTs) are a type of tumor found in the digestive tract, most often in the stomach or small intestine. Treatment for GISTs typically involves a combination of approaches, which can include:

1. **Surgery**: If the tumor is localized and operable, surgical resection is usually the primary treatment. The goal is to remove the tumor along with a margin of healthy tissue.

2. **Targeted Therapy**: Most GISTs have mutations in the KIT or PDGFRA genes, making them responsive to targeted therapies such as imatinib (Gleevec) and sunitinib (Sutent). These medications can help control tumor growth and are often used in cases where the tumor cannot be surgically removed or if it recurs after surgery.

3. **Imatinib**: This is typically the first-line treatment for advanced GISTs. It can shrink tumors and slow their growth, improving symptoms and quality of life.

4. **Sunitinib**: This may be used if the tumor becomes intolerant to imatinib or in cases where imati

**Expected Answer**: These resources address the diagnosis or management of gastrointestinal stromal tumor: - American Cancer Society: Treating Gastrointestinal Stromal Tumor (GIST) - Cancer.Net: Gastrointestinal Stromal Tumor--Diagnosis - Genetic Testing Registry: Gastrointestinal stromal tumor These resources from MedlinePlus offer information about the diagnosis and management of various health conditions: - Diagnostic Tests - Drug Therapy - Surgery and Rehabilitation - Genetic Counseling - Palliative Care

## **One Shot Evaluation Metrics**

**1. Exact Match Accuracy**

**2. BLEU Score**

**3. Semantic Similarity**

**4. Cosine Similarity**

**5. Token-Level Accuracy**

**6. Accuracy**

**7. F1 Score**

In [57]:


# Example data
query = "What are the treatments for gastrointestinal stromal tumor?"
expected_answer = ("These resources address the diagnosis or management of gastrointestinal stromal tumor: - American Cancer Society: Treating Gastrointestinal Stromal Tumor (GIST) - Cancer.Net: Gastrointestinal Stromal Tumor--Diagnosis - Genetic Testing Registry: Gastrointestinal stromal tumor These resources from MedlinePlus offer information about the diagnosis and management of various health conditions: - Diagnostic Tests - Drug Therapy - Surgery and Rehabilitation - Genetic Counseling - Palliative Care")
generated_answer = ("Gastrointestinal stromal tumors (GISTs) are a type of tumor that occurs in the digestive tract, primarily in the stomach or small intestine. Treatment for GISTs typically involves a combination of approaches, which may include: 1. **Surgery**: The primary treatment for localized GISTs is surgical resection to remove the tumor, along with a margin of healthy tissue. This is often the best option if the tumor has not spread. 2. **Targeted Therapy**: For GISTs that cannot be completely removed by surgery, or for metastatic GISTs, targeted therapy is commonly used. The most effective medication is imatinib (Gleevec), which targets the mutated c-KIT protein that drives the growth of these tumors. Other targeted therapies may be used in cases where resistance to imatinib develops or when the tumor has specific mutations (e.g., sunitinib or regorafenib). 3. **Radiation Therapy**: Although not standard treatment for GISTs, radiation may be used in certain situations, such as to relieve symptoms or control local disease, particularly when surgery is not viable. 4. **Clinical Trials**: Patients may also consider participating in clinical trials, which can provide access to new therapies or combinations of treatments that are not yet widely available. 5. **Regular Follow-up**: After treatment, regular follow-up is essential to monitor for recurrence or progression of the disease. Each patient's treatment plan may vary based on the tumor's size, location, genetic characteristics, and the patient's overall health. It is crucial to consult with an oncologist who specializes in GISTs to determine the most appropriate treatment strategy. **Disclaimer**: The information provided is for educational purposes only and is not a substitute for professional medical advice. Always consult a licensed healthcare provider for personalized care.")

# Initialize Sentence Transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define evaluation functions
def calculate_exact_match(expected, generated):
    return int(expected.strip().lower() == generated.strip().lower())

def calculate_bleu(expected, generated):
    expected_tokens = expected.split()
    generated_tokens = generated.split()
    smoothing_function = SmoothingFunction().method1
    return sentence_bleu([expected_tokens], generated_tokens, smoothing_function=smoothing_function)

def calculate_semantic_similarity(expected, generated):
    embeddings = model.encode([expected, generated])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

def calculate_cosine_similarity(expected, generated):
    embeddings = model.encode([expected, generated])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

def calculate_token_level_accuracy(expected, generated):
    expected_tokens = set(expected.split())
    generated_tokens = set(generated.split())
    overlap = expected_tokens & generated_tokens
    return len(overlap) / len(expected_tokens)

def calculate_accuracy_f1(expected, generated):
    # Create sets for expected and generated tokens
    expected_tokens = set(expected.split())
    generated_tokens = set(generated.split())

    # Create binary vectors for accuracy and F1 score calculation
    all_tokens = expected_tokens | generated_tokens
    expected_vector = [1 if token in expected_tokens else 0 for token in all_tokens]
    generated_vector = [1 if token in generated_tokens else 0 for token in all_tokens]

    accuracy = accuracy_score(expected_vector, generated_vector)
    f1 = f1_score(expected_vector, generated_vector, zero_division=1)
    return accuracy, f1

# Evaluate metrics
exact_match = calculate_exact_match(expected_answer, generated_answer)
bleu_score = calculate_bleu(expected_answer, generated_answer)
semantic_similarity = calculate_semantic_similarity(expected_answer, generated_answer)
cosine_similarity = calculate_cosine_similarity(expected_answer, generated_answer)
token_level_accuracy = calculate_token_level_accuracy(expected_answer, generated_answer)

# Accuracy and F1 Score
accuracy, f1 = calculate_accuracy_f1(expected_answer, generated_answer)

# Print results
print(f"Query: {query}")
print(f"Exact Match Accuracy: {exact_match}")
print(f"BLEU Score: {bleu_score:.2f}")
print(f"Semantic Similarity: {semantic_similarity:.2f}")
print(f"Cosine Similarity: {cosine_similarity:.2f}")
print(f"Token-Level Accuracy: {token_level_accuracy:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")

Query: What are the treatments for gastrointestinal stromal tumor?
Exact Match Accuracy: 0
BLEU Score: 0.00
Semantic Similarity: 0.56
Cosine Similarity: 0.56
Token-Level Accuracy: 0.18
Accuracy: 0.04
F1 Score: 0.07


# **3. FEW SHOT PROMPT**

In [31]:
# Examples
examples = [
    {"question": "How many people are affected by Crouzonodermoskeletal syndrome?", "answer": "Today, many different types of medicines are available to control high blood pressure. These medicines work in different ways. Some lower blood pressure by removing extra fluid and salt from your body. Others affect blood pressure by slowing down the heartbeat, or by relaxing and widening blood vessels. Often, two or more drugs work better than one. Here are the types of medicines used to treat high blood pressure. - Diuretics (water or fluid Pills) flush excess sodium from your body, which reduces the amount of fluid in your blood and helps to lower your blood pressure. Diuretics are often used with other high blood pressure medicines, sometimes in one combined pill. - Beta Blockers help your heart beat slower and with less force. As a result, your heart pumps less blood through your blood vessels, which can help to lower your blood pressure. - Angiotensin-Converting Enzyme (ACE) Inhibitors. Angiotensin-II is a hormone that narrows blood vessels, increasing blood pressure. ACE converts Angiotensin I to Angiotensin II. ACE inhibitors block this process, which stops the production of Angiotensin II, lowering blood pressure. - Angiotensin II Receptor Blockers (ARBs) block angiotensin II hormone from binding with receptors in the blood vessels. When angiotensin II is blocked, the blood vessels do not constrict or narrow, which can lower your blood pressure. - Calcium Channel Blockers keep calcium from entering the muscle cells of your heart and blood vessels. This allows blood vessels to relax, which can lower your blood pressure. - Alpha Blockers reduce nerve impulses that tighten blood vessels. This allows blood to flow more freely, causing blood pressure to go down. - Alpha-Beta Blockers reduce nerve impulses the same way alpha blockers do. However, like beta blockers, they also slow the heartbeat. As a result, blood pressure goes down. - Central Acting Agents act in the brain to decrease nerve signals that narrow blood vessels, which can lower blood pressure. - Vasodilators relax the muscles in blood vessel walls, which can lower blood pressure. Diuretics (water or fluid Pills) flush excess sodium from your body, which reduces the amount of fluid in your blood and helps to lower your blood pressure. Diuretics are often used with other high blood pressure medicines, sometimes in one combined pill. Beta Blockers help your heart beat slower and with less force. As a result, your heart pumps less blood through your blood vessels, which can help to lower your blood pressure. Angiotensin-Converting Enzyme (ACE) Inhibitors. Angiotensin-II is a hormone that narrows blood vessels, increasing blood pressure. ACE converts Angiotensin I to Angiotensin II. ACE inhibitors block this process, which stops the production of Angiotensin II, lowering blood pressure. Angiotensin II Receptor Blockers (ARBs) block angiotensin II hormone from binding with receptors in the blood vessels. When angiotensin II is blocked, the blood vessels do not constrict or narrow, which can lower your blood pressure. Calcium Channel Blockers keep calcium from entering the muscle cells of your heart and blood vessels. This allows blood vessels to relax, which can lower your blood pressure. Alpha Blockers reduce nerve impulses that tighten blood vessels. This allows blood to flow more freely, causing blood pressure to go down. Alpha-Beta Blockers reduce nerve impulses the same way alpha blockers do. However, like beta blockers, they also slow the heartbeat. As a result, blood pressure goes down. Central Acting Agents act in the brain to decrease nerve signals that narrow blood vessels, which can lower blood pressure. Vasodilators relax the muscles in blood vessel walls, which can lower blood pressure."},
    {"question": "How many people are affected by Cystic Fibrosis?", "answer": "Cystic Fibrosis affects about 30,000 people in the United States."},
    {"question": "What is the prevalence of Down syndrome?", "answer": "Down syndrome affects about 1 in 700 babies born in the United States."},
    {"question": "How common is autism spectrum disorder?", "answer": "Autism spectrum disorder affects approximately 1 in 54 children in the United States."},
    {"question": "What is the prevalence of diabetes in the United States?", "answer": "Approximately 37 million Americans have diabetes."}
]

# Query to test
query = "How many people are affected by Attention-Deficit/Hyperactivity Disorder (ADHD)?"

messages = [{"role": "system", "content": system_prompt}]
for example in examples:
    messages.append({"role": "user", "content": example["question"]})
    messages.append({"role": "assistant", "content": example["answer"]})
messages.append({"role": "user", "content": query})


client = openai.OpenAI()
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages
)

print(response.choices[0].message.content.strip())

Attention-Deficit/Hyperactivity Disorder (ADHD) affects about 6-9% of children in the United States, which translates to roughly 4 to 5 million children. It can also persist into adulthood, affecting about 2.5% of adults. However, prevalence rates can vary based on factors such as age, diagnostic criteria, and geographical location. 

As always, please consult a healthcare professional for personalized information and advice.


In [32]:

# Few-shot prompt with multiple examples
prompt = "Given a medical question, provide a comprehensive and informative answer. If the question is ambiguous, ask clarifying questions. Always suggest consulting a healthcare professional for personalized advice."

# Examples
examples = [
    {"question": "How many people are affected by Crouzonodermoskeletal syndrome?", "answer": "Today, many different types of medicines are available to control high blood pressure. These medicines work in different ways. Some lower blood pressure by removing extra fluid and salt from your body. Others affect blood pressure by slowing down the heartbeat, or by relaxing and widening blood vessels. Often, two or more drugs work better than one. Here are the types of medicines used to treat high blood pressure. - Diuretics (water or fluid Pills) flush excess sodium from your body, which reduces the amount of fluid in your blood and helps to lower your blood pressure. Diuretics are often used with other high blood pressure medicines, sometimes in one combined pill. - Beta Blockers help your heart beat slower and with less force. As a result, your heart pumps less blood through your blood vessels, which can help to lower your blood pressure. - Angiotensin-Converting Enzyme (ACE) Inhibitors. Angiotensin-II is a hormone that narrows blood vessels, increasing blood pressure. ACE converts Angiotensin I to Angiotensin II. ACE inhibitors block this process, which stops the production of Angiotensin II, lowering blood pressure. - Angiotensin II Receptor Blockers (ARBs) block angiotensin II hormone from binding with receptors in the blood vessels. When angiotensin II is blocked, the blood vessels do not constrict or narrow, which can lower your blood pressure. - Calcium Channel Blockers keep calcium from entering the muscle cells of your heart and blood vessels. This allows blood vessels to relax, which can lower your blood pressure. - Alpha Blockers reduce nerve impulses that tighten blood vessels. This allows blood to flow more freely, causing blood pressure to go down. - Alpha-Beta Blockers reduce nerve impulses the same way alpha blockers do. However, like beta blockers, they also slow the heartbeat. As a result, blood pressure goes down. - Central Acting Agents act in the brain to decrease nerve signals that narrow blood vessels, which can lower blood pressure. - Vasodilators relax the muscles in blood vessel walls, which can lower blood pressure. Diuretics (water or fluid Pills) flush excess sodium from your body, which reduces the amount of fluid in your blood and helps to lower your blood pressure. Diuretics are often used with other high blood pressure medicines, sometimes in one combined pill. Beta Blockers help your heart beat slower and with less force. As a result, your heart pumps less blood through your blood vessels, which can help to lower your blood pressure. Angiotensin-Converting Enzyme (ACE) Inhibitors. Angiotensin-II is a hormone that narrows blood vessels, increasing blood pressure. ACE converts Angiotensin I to Angiotensin II. ACE inhibitors block this process, which stops the production of Angiotensin II, lowering blood pressure. Angiotensin II Receptor Blockers (ARBs) block angiotensin II hormone from binding with receptors in the blood vessels. When angiotensin II is blocked, the blood vessels do not constrict or narrow, which can lower your blood pressure. Calcium Channel Blockers keep calcium from entering the muscle cells of your heart and blood vessels. This allows blood vessels to relax, which can lower your blood pressure. Alpha Blockers reduce nerve impulses that tighten blood vessels. This allows blood to flow more freely, causing blood pressure to go down. Alpha-Beta Blockers reduce nerve impulses the same way alpha blockers do. However, like beta blockers, they also slow the heartbeat. As a result, blood pressure goes down. Central Acting Agents act in the brain to decrease nerve signals that narrow blood vessels, which can lower blood pressure. Vasodilators relax the muscles in blood vessel walls, which can lower blood pressure."},
    {"question": "How many people are affected by Cystic Fibrosis?", "answer": "Cystic Fibrosis affects about 30,000 people in the United States."},
    {"question": "What is the prevalence of Down syndrome?", "answer": "Down syndrome affects about 1 in 700 babies born in the United States."},
    {"question": "How common is autism spectrum disorder?", "answer": "Autism spectrum disorder affects approximately 1 in 54 children in the United States."},
    {"question": "What is the prevalence of diabetes in the United States?", "answer": "Approximately 37 million Americans have diabetes."}
]

# Query to test
query = "How many people are affected by Attention-Deficit/Hyperactivity Disorder (ADHD)?"

messages = [{"role": "system", "content": prompt}]
for example in examples:
    messages.append({"role": "user", "content": example["question"]})
    messages.append({"role": "assistant", "content": example["answer"]})
messages.append({"role": "user", "content": query})


client = openai.OpenAI()
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages
)

print(response.choices[0].message.content.strip())

Attention-Deficit/Hyperactivity Disorder (ADHD) affects about 6-9% of children in the United States, based on various studies. This translates to approximately 4 to 6 million children diagnosed with the condition. In addition, ADHD can continue into adulthood, and approximately 4% of adults are estimated to have ADHD. Prevalence may vary based on factors such as age, sex, and the criteria used for diagnosis. 

As always, for personalized advice or concerns regarding ADHD, it's best to consult a healthcare professional.


In [34]:
# Few-shot prompt with multiple examples
prompt = "Given a medical question, provide to the point answer in a paragraph style. If the question is ambiguous, ask clarifying questions. Always suggest consulting a healthcare professional for personalized advice."


# Examples
examples = [
    {"question": "How many people are affected by Crouzonodermoskeletal syndrome?", "answer": "Today, many different types of medicines are available to control high blood pressure. These medicines work in different ways. Some lower blood pressure by removing extra fluid and salt from your body. Others affect blood pressure by slowing down the heartbeat, or by relaxing and widening blood vessels. Often, two or more drugs work better than one. Here are the types of medicines used to treat high blood pressure. - Diuretics (water or fluid Pills) flush excess sodium from your body, which reduces the amount of fluid in your blood and helps to lower your blood pressure. Diuretics are often used with other high blood pressure medicines, sometimes in one combined pill. - Beta Blockers help your heart beat slower and with less force. As a result, your heart pumps less blood through your blood vessels, which can help to lower your blood pressure. - Angiotensin-Converting Enzyme (ACE) Inhibitors. Angiotensin-II is a hormone that narrows blood vessels, increasing blood pressure. ACE converts Angiotensin I to Angiotensin II. ACE inhibitors block this process, which stops the production of Angiotensin II, lowering blood pressure. - Angiotensin II Receptor Blockers (ARBs) block angiotensin II hormone from binding with receptors in the blood vessels. When angiotensin II is blocked, the blood vessels do not constrict or narrow, which can lower your blood pressure. - Calcium Channel Blockers keep calcium from entering the muscle cells of your heart and blood vessels. This allows blood vessels to relax, which can lower your blood pressure. - Alpha Blockers reduce nerve impulses that tighten blood vessels. This allows blood to flow more freely, causing blood pressure to go down. - Alpha-Beta Blockers reduce nerve impulses the same way alpha blockers do. However, like beta blockers, they also slow the heartbeat. As a result, blood pressure goes down. - Central Acting Agents act in the brain to decrease nerve signals that narrow blood vessels, which can lower blood pressure. - Vasodilators relax the muscles in blood vessel walls, which can lower blood pressure. Diuretics (water or fluid Pills) flush excess sodium from your body, which reduces the amount of fluid in your blood and helps to lower your blood pressure. Diuretics are often used with other high blood pressure medicines, sometimes in one combined pill. Beta Blockers help your heart beat slower and with less force. As a result, your heart pumps less blood through your blood vessels, which can help to lower your blood pressure. Angiotensin-Converting Enzyme (ACE) Inhibitors. Angiotensin-II is a hormone that narrows blood vessels, increasing blood pressure. ACE converts Angiotensin I to Angiotensin II. ACE inhibitors block this process, which stops the production of Angiotensin II, lowering blood pressure. Angiotensin II Receptor Blockers (ARBs) block angiotensin II hormone from binding with receptors in the blood vessels. When angiotensin II is blocked, the blood vessels do not constrict or narrow, which can lower your blood pressure. Calcium Channel Blockers keep calcium from entering the muscle cells of your heart and blood vessels. This allows blood vessels to relax, which can lower your blood pressure. Alpha Blockers reduce nerve impulses that tighten blood vessels. This allows blood to flow more freely, causing blood pressure to go down. Alpha-Beta Blockers reduce nerve impulses the same way alpha blockers do. However, like beta blockers, they also slow the heartbeat. As a result, blood pressure goes down. Central Acting Agents act in the brain to decrease nerve signals that narrow blood vessels, which can lower blood pressure. Vasodilators relax the muscles in blood vessel walls, which can lower blood pressure."},
    {"question": "How many people are affected by Cystic Fibrosis?", "answer": "Cystic Fibrosis affects about 30,000 people in the United States."},
    {"question": "What is the prevalence of Down syndrome?", "answer": "Down syndrome affects about 1 in 700 babies born in the United States."},
    {"question": "How common is autism spectrum disorder?", "answer": "Autism spectrum disorder affects approximately 1 in 54 children in the United States."},
    {"question": "What is the prevalence of diabetes in the United States?", "answer": "Approximately 37 million Americans have diabetes."}
]

# Query to test
query = "What are the treatments for gastrointestinal stromal tumor?"

messages = [{"role": "system", "content":prompt}]
for example in examples:
    messages.append({"role": "user", "content": example["question"]})
    messages.append({"role": "assistant", "content": example["answer"]})
messages.append({"role": "user", "content": query})


client = openai.OpenAI()
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages
)

print(response.choices[0].message.content.strip())

Treatments for gastrointestinal stromal tumors (GISTs) typically include surgery, targeted therapy, and sometimes chemotherapy or radiation. Surgery is often the primary treatment and aims to remove the tumor entirely, along with some surrounding healthy tissue. Targeted therapies, such as imatinib (Gleevec), are commonly used to treat GISTs, especially those that are advanced or cannot be surgically removed, as they specifically target the cancer cells. In some cases, additional therapies like chemotherapy or radiation may be considered, but they are less common as GISTs are generally resistant to these treatments. It's essential to consult with a healthcare professional for personalized treatment options and management.


**Query :** What are the treatments for gastrointestinal stromal tumor?

**Expected answer :** These resources address the diagnosis or management of gastrointestinal stromal tumor: - American Cancer Society: Treating Gastrointestinal Stromal Tumor (GIST) - Cancer.Net: Gastrointestinal Stromal Tumor--Diagnosis - Genetic Testing Registry: Gastrointestinal stromal tumor These resources from MedlinePlus offer information about the diagnosis and management of various health conditions: - Diagnostic Tests - Drug Therapy - Surgery and Rehabilitation - Genetic Counseling - Palliative Care.

**Generated Answer :** Treatments for gastrointestinal stromal tumors (GISTs) typically include surgery, targeted therapy, and sometimes chemotherapy or radiation. Surgery is often the primary treatment and aims to remove the tumor entirely, along with some surrounding healthy tissue. Targeted therapies, such as imatinib (Gleevec), are commonly used to treat GISTs, especially those that are advanced or cannot be surgically removed, as they specifically target the cancer cells. In some cases, additional therapies like chemotherapy or radiation may be considered, but they are less common as GISTs are generally resistant to these treatments. It's essential to consult with a healthcare professional for personalized treatment options and management.

# **Few Shot Evaluation Metrics**

**1. Exact Match Accuracy**

**2. BLEU Score**

**3. Semantic Similarity**

**4. Cosine Similarity**

**5. Token-Level Accuracy**

**6. Accuracy**

**7. F1 Score**

In [56]:


# Few-shot prompt example
query = "What are the treatments for gastrointestinal stromal tumor?"

expected_answer = ("These resources address the diagnosis or management of gastrointestinal stromal tumor: - American Cancer Society: Treating Gastrointestinal Stromal Tumor (GIST) - Cancer.Net: Gastrointestinal Stromal Tumor--Diagnosis - Genetic Testing Registry: Gastrointestinal stromal tumor These resources from MedlinePlus offer information about the diagnosis and management of various health conditions: - Diagnostic Tests - Drug Therapy - Surgery and Rehabilitation - Genetic Counseling - Palliative Care.")


generated_answer = ("Treatments for gastrointestinal stromal tumors (GISTs) typically include surgery, targeted therapy, and sometimes chemotherapy or radiation. Surgery is often the primary treatment and aims to remove the tumor entirely, along with some surrounding healthy tissue. Targeted therapies, such as imatinib (Gleevec), are commonly used to treat GISTs, especially those that are advanced or cannot be surgically removed, as they specifically target the cancer cells. In some cases, additional therapies like chemotherapy or radiation may be considered, but they are less common as GISTs are generally resistant to these treatments. It's essential to consult with a healthcare professional for personalized treatment options and management.")

# Initialize Sentence Transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define evaluation functions
def calculate_exact_match(expected, generated):
    return int(expected.strip().lower() == generated.strip().lower())

def calculate_bleu(expected, generated):
    expected_tokens = expected.split()
    generated_tokens = generated.split()
    smoothing_function = SmoothingFunction().method1
    return sentence_bleu([expected_tokens], generated_tokens, smoothing_function=smoothing_function)

def calculate_semantic_similarity(expected, generated):
    embeddings = model.encode([expected, generated])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

def calculate_cosine_similarity(expected, generated):
    embeddings = model.encode([expected, generated])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

def calculate_token_level_accuracy(expected, generated):
    expected_tokens = set(expected.split())
    generated_tokens = set(generated.split())
    overlap = expected_tokens & generated_tokens
    return len(overlap) / len(expected_tokens)

def calculate_accuracy_f1(expected, generated):
    # Create sets for expected and generated tokens
    expected_tokens = set(expected.split())
    generated_tokens = set(generated.split())

    # Create binary vectors for accuracy and F1 score calculation
    all_tokens = expected_tokens | generated_tokens
    expected_vector = [1 if token in expected_tokens else 0 for token in all_tokens]
    generated_vector = [1 if token in generated_tokens else 0 for token in all_tokens]

    accuracy = accuracy_score(expected_vector, generated_vector)
    f1 = f1_score(expected_vector, generated_vector, zero_division=1)
    return accuracy, f1

# Evaluate metrics
exact_match = calculate_exact_match(expected_answer, generated_answer)
bleu_score = calculate_bleu(expected_answer, generated_answer)
semantic_similarity = calculate_semantic_similarity(expected_answer, generated_answer)
cosine_similarity = calculate_cosine_similarity(expected_answer, generated_answer)
token_level_accuracy = calculate_token_level_accuracy(expected_answer, generated_answer)

# Accuracy and F1 Score
accuracy, f1 = calculate_accuracy_f1(expected_answer, generated_answer)

# Print results
print(f"Query: {query}")
print(f"Exact Match Accuracy: {exact_match}")
print(f"BLEU Score: {bleu_score:.2f}")
print(f"Semantic Similarity: {semantic_similarity:.2f}")
print(f"Cosine Similarity: {cosine_similarity:.2f}")
print(f"Token-Level Accuracy: {token_level_accuracy:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")



Query: What are the treatments for gastrointestinal stromal tumor?
Exact Match Accuracy: 0
BLEU Score: 0.01
Semantic Similarity: 0.66
Cosine Similarity: 0.66
Token-Level Accuracy: 0.16
Accuracy: 0.06
F1 Score: 0.11


**Testing with a question from the 1000 sample dataset**

In [61]:
# Few-shot prompt with multiple examples
prompt = "Given a medical question, provide informative answer in a paragraph style. If the question is ambiguous, ask clarifying questions. Always suggest consulting a healthcare professional for personalized advice."

# Examples
examples = [
    {"question": "What are the treatments for High Blood Pressure?", "answer": "Treatment includes lifestyle changes such as a healthy diet, regular exercise, and medications like ACE inhibitors, beta-blockers, or diuretics."},
    {"question": "What are the treatments for 21-hydroxylase deficiency?", "answer": "The objectives for treating 21-hydroxylase deficiency differ with age. In childhood, the goal is to replace cortisol, monitor growth, and balance hormones. In adulthood, multidisciplinary care and counseling are critical for addressing fertility, psychological, and hormonal concerns."},
    {"question": "How many people are affected by glutaric acidemia type I?", "answer": "Glutaric acidemia type I occurs in approximately 1 in every 30,000 to 40,000 individuals. It is more common in specific populations, such as the Amish and Ojibwa communities."},
    {"question": "How is HPS diagnosed and treated for Hantavirus?", "answer": "Diagnosing HPS involves evaluating symptoms like fever, fatigue, and respiratory issues, often linked to rodent exposure. Treatment includes intensive care with oxygen therapy to manage respiratory distress."},
    {"question": "What is the prevalence of diabetes in the United States?", "answer": "Approximately 37 million Americans have diabetes."},
    {"question": "What are the symptoms of Microscopic Colitis?", "answer": "Symptoms include chronic, watery, nonbloody diarrhea, abdominal pain, urgency, and weight loss. Symptoms may come and go and often improve without treatment."}
]

# Query to test
query = "what is yersiniosis for Yersinia ?"

messages = [{"role": "system", "content": prompt}]
for example in examples:
    messages.append({"role": "user", "content": example["question"]})
    messages.append({"role": "assistant", "content": example["answer"]})
messages.append({"role": "user", "content": query})


client = openai.OpenAI()
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages
)

print(response.choices[0].message.content.strip())

Yersiniosis is an infection caused by the bacteria Yersinia enterocolitica, which can lead to gastrointestinal symptoms such as diarrhea (often bloody), abdominal pain, fever, and sometimes nausea and vomiting. It is typically transmitted through contaminated food, especially undercooked pork, or through unpasteurized milk and contaminated water. In some cases, it can also cause a more severe illness resembling appendicitis, known as appendicitis-like syndrome. If you suspect an infection or experience severe symptoms, it is important to consult a healthcare professional for proper diagnosis and treatment.


**Dataset Query** : what is yersiniosis for Yersinia ?

**Expected Answer** : Yersiniosis is an infectious disease caused by a bacterium of the genus Yersinia. In the United States, most human illness is caused by one species, Y. enterocolitica. Infection with Y. enterocolitica can cause a variety of symptoms depending on the age of the person infected. Infection with Y. enterocolitica occurs most often in young children. Common symptoms in children are fever, abdominal pain, and diarrhea, which is often bloody. Symptoms typically develop 4 to 7 days after exposure and may last 1 to 3 weeks or longer. In older children and adults, right-sided abdominal pain and fever may be the predominant symptoms, and may be confused with appendicitis. In a small proportion of cases, complications such as skin rash, joint pains, or spread of bacteria to the bloodstream can occur.

**Generated Answer** :Yersiniosis is an infectious disease caused by the bacterium Yersinia enterocolitica, primarily affecting the gastrointestinal system. It typically presents with symptoms such as diarrhea (which can be bloody), abdominal pain, fever, and sometimes nausea and vomiting. The infection can be contracted through consuming contaminated food, particularly undercooked pork, unpasteurized milk, or contaminated water. In some cases, especially in children, yersiniosis can mimic appendicitis due to abdominal symptoms. If you suspect you have yersiniosis or have any concerns about foodborne illnesses, it is essential to consult a healthcare professional for diagnosis and appropriate treatment.

### **Evaluation 2**

In [62]:



# Few-shot prompt example
query = "what is yersiniosis for Yersinia ?"

expected_answer = ("Yersiniosis is an infectious disease caused by a bacterium of the genus Yersinia. In the United States, most human illness is caused by one species, Y. enterocolitica. Infection with Y. enterocolitica can cause a variety of symptoms depending on the age of the person infected. Infection with Y. enterocolitica occurs most often in young children. Common symptoms in children are fever, abdominal pain, and diarrhea, which is often bloody. Symptoms typically develop 4 to 7 days after exposure and may last 1 to 3 weeks or longer. In older children and adults, right-sided abdominal pain and fever may be the predominant symptoms, and may be confused with appendicitis. In a small proportion of cases, complications such as skin rash, joint pains, or spread of bacteria to the bloodstream can occur.")
generated_answer = ("Yersiniosis is an infectious disease caused by the bacterium Yersinia enterocolitica, primarily affecting the gastrointestinal system. It typically presents with symptoms such as diarrhea (which can be bloody), abdominal pain, fever, and sometimes nausea and vomiting. The infection can be contracted through consuming contaminated food, particularly undercooked pork, unpasteurized milk, or contaminated water. In some cases, especially in children, yersiniosis can mimic appendicitis due to abdominal symptoms. If you suspect you have yersiniosis or have any concerns about foodborne illnesses, it is essential to consult a healthcare professional for diagnosis and appropriate treatment.")
# Initialize Sentence Transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define evaluation functions
def calculate_exact_match(expected, generated):
    return int(expected.strip().lower() == generated.strip().lower())

def calculate_bleu(expected, generated):
    expected_tokens = expected.split()
    generated_tokens = generated.split()
    smoothing_function = SmoothingFunction().method1
    return sentence_bleu([expected_tokens], generated_tokens, smoothing_function=smoothing_function)

def calculate_semantic_similarity(expected, generated):
    embeddings = model.encode([expected, generated])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

def calculate_cosine_similarity(expected, generated):
    # Similar to semantic similarity but added explicitly for clarification
    embeddings = model.encode([expected, generated])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

def calculate_token_level_accuracy(expected, generated):
    expected_tokens = set(expected.split())
    generated_tokens = set(generated.split())
    overlap = expected_tokens & generated_tokens
    return len(overlap) / len(expected_tokens)

def calculate_accuracy_f1(expected, generated):
    # Create sets for expected and generated tokens
    expected_tokens = set(expected.split())
    generated_tokens = set(generated.split())

    # Create binary vectors for accuracy and F1 score calculation
    all_tokens = expected_tokens | generated_tokens
    expected_vector = [1 if token in expected_tokens else 0 for token in all_tokens]
    generated_vector = [1 if token in generated_tokens else 0 for token in all_tokens]

    accuracy = accuracy_score(expected_vector, generated_vector)
    f1 = f1_score(expected_vector, generated_vector, zero_division=1)
    return accuracy, f1

# Evaluate metrics
exact_match = calculate_exact_match(expected_answer, generated_answer)
bleu_score = calculate_bleu(expected_answer, generated_answer)
semantic_similarity = calculate_semantic_similarity(expected_answer, generated_answer)
cosine_similarity_score = calculate_cosine_similarity(expected_answer, generated_answer)
token_level_accuracy = calculate_token_level_accuracy(expected_answer, generated_answer)

# Accuracy and F1 Score
accuracy, f1 = calculate_accuracy_f1(expected_answer, generated_answer)

# Print results
print(f"Query: {query}")
print(f"Exact Match Accuracy: {exact_match}")
print(f"BLEU Score: {bleu_score:.2f}")
print(f"Semantic Similarity: {semantic_similarity:.2f}")
print(f"Cosine Similarity: {cosine_similarity_score:.2f}")
print(f"Token-Level Accuracy: {token_level_accuracy:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")



Query: what is yersiniosis for Yersinia ?
Exact Match Accuracy: 0
BLEU Score: 0.06
Semantic Similarity: 0.88
Cosine Similarity: 0.88
Token-Level Accuracy: 0.29
Accuracy: 0.18
F1 Score: 0.31


**Test : Different system prompt, same question**

In [64]:
# Few-shot prompt with multiple examples
prompt = "You are a knowledgeable and empathetic virtual medical assistant trained to provide accurate, concise, and evidence-based responses to medical queries. Your primary goal is to assist users with their health-related questions, guide them toward better understanding of medical conditions, and suggest actions they may take based on their concerns. Always advise the user that the information is not a substitute for professional medical advice. Avoid providing definitive diagnoses or treatment plans and encourage users to consult a licensed healthcare provider for personalized care."

# Examples
examples = [
    {"question": "What are the treatments for High Blood Pressure?", "answer": "Treatment includes lifestyle changes such as a healthy diet, regular exercise, and medications like ACE inhibitors, beta-blockers, or diuretics."},
    {"question": "What are the treatments for 21-hydroxylase deficiency?", "answer": "The objectives for treating 21-hydroxylase deficiency differ with age. In childhood, the goal is to replace cortisol, monitor growth, and balance hormones. In adulthood, multidisciplinary care and counseling are critical for addressing fertility, psychological, and hormonal concerns."},
    {"question": "How many people are affected by glutaric acidemia type I?", "answer": "Glutaric acidemia type I occurs in approximately 1 in every 30,000 to 40,000 individuals. It is more common in specific populations, such as the Amish and Ojibwa communities."},
    {"question": "How is HPS diagnosed and treated for Hantavirus?", "answer": "Diagnosing HPS involves evaluating symptoms like fever, fatigue, and respiratory issues, often linked to rodent exposure. Treatment includes intensive care with oxygen therapy to manage respiratory distress."},
    {"question": "What is the prevalence of diabetes in the United States?", "answer": "Approximately 37 million Americans have diabetes."},
    {"question": "What are the symptoms of Microscopic Colitis?", "answer": "Symptoms include chronic, watery, nonbloody diarrhea, abdominal pain, urgency, and weight loss. Symptoms may come and go and often improve without treatment."}
]

# Query to test
query = "what is yersiniosis for Yersinia ?"

messages = [{"role": "system", "content": prompt}]
for example in examples:
    messages.append({"role": "user", "content": example["question"]})
    messages.append({"role": "assistant", "content": example["answer"]})
messages.append({"role": "user", "content": query})


client = openai.OpenAI()
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages
)

print(response.choices[0].message.content.strip())

Yersiniosis is an infection caused by the bacterium Yersinia enterocolitica, typically associated with consuming contaminated food or water. Symptoms often include diarrhea (which can be bloody), abdominal pain, fever, and can mimic appendicitis. In some cases, it may lead to more severe complications, especially in individuals with weakened immune systems. Most infections resolve on their own, but severe cases may require antibiotics. If you suspect you have yersiniosis, it’s important to consult a healthcare provider.


In [65]:



# Few-shot prompt example
query = "what is yersiniosis for Yersinia ?"

expected_answer = ("Yersiniosis is an infectious disease caused by a bacterium of the genus Yersinia. In the United States, most human illness is caused by one species, Y. enterocolitica. Infection with Y. enterocolitica can cause a variety of symptoms depending on the age of the person infected. Infection with Y. enterocolitica occurs most often in young children. Common symptoms in children are fever, abdominal pain, and diarrhea, which is often bloody. Symptoms typically develop 4 to 7 days after exposure and may last 1 to 3 weeks or longer. In older children and adults, right-sided abdominal pain and fever may be the predominant symptoms, and may be confused with appendicitis. In a small proportion of cases, complications such as skin rash, joint pains, or spread of bacteria to the bloodstream can occur.")
generated_answer = ("Yersiniosis is an infection caused by the bacterium Yersinia enterocolitica, typically associated with consuming contaminated food or water. Symptoms often include diarrhea (which can be bloody), abdominal pain, fever, and can mimic appendicitis. In some cases, it may lead to more severe complications, especially in individuals with weakened immune systems. Most infections resolve on their own, but severe cases may require antibiotics. If you suspect you have yersiniosis, it’s important to consult a healthcare provider.")
# Initialize Sentence Transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define evaluation functions
def calculate_exact_match(expected, generated):
    return int(expected.strip().lower() == generated.strip().lower())

def calculate_bleu(expected, generated):
    expected_tokens = expected.split()
    generated_tokens = generated.split()
    smoothing_function = SmoothingFunction().method1
    return sentence_bleu([expected_tokens], generated_tokens, smoothing_function=smoothing_function)

def calculate_semantic_similarity(expected, generated):
    embeddings = model.encode([expected, generated])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

def calculate_cosine_similarity(expected, generated):
    # Similar to semantic similarity but added explicitly for clarification
    embeddings = model.encode([expected, generated])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

def calculate_token_level_accuracy(expected, generated):
    expected_tokens = set(expected.split())
    generated_tokens = set(generated.split())
    overlap = expected_tokens & generated_tokens
    return len(overlap) / len(expected_tokens)

def calculate_accuracy_f1(expected, generated):
    # Create sets for expected and generated tokens
    expected_tokens = set(expected.split())
    generated_tokens = set(generated.split())

    # Create binary vectors for accuracy and F1 score calculation
    all_tokens = expected_tokens | generated_tokens
    expected_vector = [1 if token in expected_tokens else 0 for token in all_tokens]
    generated_vector = [1 if token in generated_tokens else 0 for token in all_tokens]

    accuracy = accuracy_score(expected_vector, generated_vector)
    f1 = f1_score(expected_vector, generated_vector, zero_division=1)
    return accuracy, f1

# Evaluate metrics
exact_match = calculate_exact_match(expected_answer, generated_answer)
bleu_score = calculate_bleu(expected_answer, generated_answer)
semantic_similarity = calculate_semantic_similarity(expected_answer, generated_answer)
cosine_similarity_score = calculate_cosine_similarity(expected_answer, generated_answer)
token_level_accuracy = calculate_token_level_accuracy(expected_answer, generated_answer)

# Accuracy and F1 Score
accuracy, f1 = calculate_accuracy_f1(expected_answer, generated_answer)

# Print results
print(f"Query: {query}")
print(f"Exact Match Accuracy: {exact_match}")
print(f"BLEU Score: {bleu_score:.2f}")
print(f"Semantic Similarity: {semantic_similarity:.2f}")
print(f"Cosine Similarity: {cosine_similarity_score:.2f}")
print(f"Token-Level Accuracy: {token_level_accuracy:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")



Query: what is yersiniosis for Yersinia ?
Exact Match Accuracy: 0
BLEU Score: 0.01
Semantic Similarity: 0.89
Cosine Similarity: 0.89
Token-Level Accuracy: 0.29
Accuracy: 0.20
F1 Score: 0.33


**We see that by changing syatem prompt, the model is able to generate better response to the same question, evident from the scores**

# **4. FINE TUNING**

### >> **1. Uploading Training and Validation Files**





In [38]:
train_file = client.files.create(
  file=open("train_data.jsonl", "rb"),  # Path to training file
  purpose="fine-tune"
)

valid_file = client.files.create(
  file=open("validation_data.jsonl", "rb"),  # Path to validation file
  purpose="fine-tune"
)

print(f"Training file Info: {train_file}")
print(f"Validation file Info: {valid_file}")

Training file Info: FileObject(id='file-Tahhf3TtrR71ML6a96Jniu', bytes=1657237, created_at=1734325982, filename='train_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)
Validation file Info: FileObject(id='file-PNkGFYmHEkYUjdRap95EKX', bytes=228176, created_at=1734325983, filename='validation_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)


### >> **2. Starting the fine-tuning job**

In [39]:
# Create the fine-tuning job
fine_tuning_job = client.fine_tuning.jobs.create(
  training_file=train_file.id,
  validation_file=valid_file.id,
  model="gpt-4o-mini-2024-07-18",  # The model to be used
  hyperparameters={
    "n_epochs": 3,
    "batch_size": 8,
    "learning_rate_multiplier": 0.1
  }
)

# Get the job ID and status
job_id = fine_tuning_job.id
status = fine_tuning_job.status

# Print details
print(f'Fine-tuning job created with ID: {job_id}')
print(f"Fine-tuning job details: {fine_tuning_job}")
print(f"Fine-tuning job status: {status}")

Fine-tuning job created with ID: ftjob-vFBj6htF6bDMzxHzpu33EsS5
Fine-tuning job details: FineTuningJob(id='ftjob-vFBj6htF6bDMzxHzpu33EsS5', created_at=1734326327, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=8, learning_rate_multiplier=0.1), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-MpPFT9E8Ldg7PXAH8rdeEL75', result_files=[], seed=1536051044, status='validating_files', trained_tokens=None, training_file='file-Tahhf3TtrR71ML6a96Jniu', validation_file='file-PNkGFYmHEkYUjdRap95EKX', estimated_finish=None, integrations=[], user_provided_suffix=None)
Fine-tuning job status: validating_files


## >> **3. Monitor the job**

In [40]:
import time

def monitor_job(job_id):
    """Monitor fine-tuning job progress"""
    while True:
        job = client.fine_tuning.jobs.retrieve(job_id)
        status = job.status

        # Print the status
        print(f"Job ID: {job_id}, Status: {status}")

        # Check if the job is finished
        if status in ["succeeded", "failed", "cancelled"]:
            break  # Exit the loop if the job is done

        # Print events, if any (adjust limit as needed)
        events = client.fine_tuning.jobs.list_events(
            fine_tuning_job_id=job_id,
            limit=5
        )
        for event in events.data:
            print(f"  Event: {event.message}")

        # Wait before checking again (adjust interval as needed)
        time.sleep(60)  # Check every 60 seconds

    # Get the fine-tuned model if successful
    if status == "succeeded":
        fine_tuned_model = job.fine_tuned_model
        print(f"Fine-tuned model ID: {fine_tuned_model}")
    else:
        print(f"Fine-tuning job {job_id} finished with status: {status}")

# Call the function to monitor the job
monitor_job(job_id)

Job ID: ftjob-vFBj6htF6bDMzxHzpu33EsS5, Status: running
  Event: Fine-tuning job started
  Event: Files validated, moving job to queued state
  Event: Validating training file: file-Tahhf3TtrR71ML6a96Jniu and validation file: file-PNkGFYmHEkYUjdRap95EKX
  Event: Created fine-tuning job: ftjob-vFBj6htF6bDMzxHzpu33EsS5
Job ID: ftjob-vFBj6htF6bDMzxHzpu33EsS5, Status: running
  Event: Fine-tuning job started
  Event: Files validated, moving job to queued state
  Event: Validating training file: file-Tahhf3TtrR71ML6a96Jniu and validation file: file-PNkGFYmHEkYUjdRap95EKX
  Event: Created fine-tuning job: ftjob-vFBj6htF6bDMzxHzpu33EsS5
Job ID: ftjob-vFBj6htF6bDMzxHzpu33EsS5, Status: running
  Event: Fine-tuning job started
  Event: Files validated, moving job to queued state
  Event: Validating training file: file-Tahhf3TtrR71ML6a96Jniu and validation file: file-PNkGFYmHEkYUjdRap95EKX
  Event: Created fine-tuning job: ftjob-vFBj6htF6bDMzxHzpu33EsS5
Job ID: ftjob-vFBj6htF6bDMzxHzpu33EsS5, Sta

## **Accessing the Fine-tuned Model using the API**

In [41]:
result = client.fine_tuning.jobs.list()

# Retrieve the fine tuned model
fine_tuned_model = result.data[0].fine_tuned_model
print(fine_tuned_model)

ft:gpt-4o-mini-2024-07-18:personal::AeyfWX7m


# **Testing the fine tuned model on unseen/real world query**

In [50]:
response = client.chat.completions.create(
    model="ft:gpt-4o-mini-2024-07-18:personal::AeyfWX7m",
    messages=[
        {"role": "system", "content": "Given a medical question, provide informative answer in a paragraph style. If the question is ambiguous, ask clarifying questions. Always suggest consulting a healthcare professional for personalized advice."},
        {"role": "user", "content":"What are the treatments for gastrointestinal stromal tumor?"}
    ]
)
print(response.choices[0].message.content.strip())

Treatment for gastrointestinal stromal tumor (GIST) may involve surgery, medications, or both, depending on the tumor's size and location, among other factors.  For example, the cancer may spread, affecting other organs, bone, and the lungs. In addition, the treatment may include careful monitoring or drug therapy, such as an oral or injectable medication, genetic therapy targeting cancer genes, or radiation therapy alone. You may also want to learn about clinical trials, generic drugs, palliative care, and complementary and alternative medicine.


# **Fine Tuned gpt-4o-mini Model response Evaluation Metrics**

**1. Exact Match Accuracy**

**2. BLEU Score**

**3. Semantic Similarity**

**4. Cosine Similarity**

**5. Token-Level Accuracy**

**6. Accuracy**

**7. F1 Score**

In [51]:


# Few-shot prompt example
query = "What are the treatments for gastrointestinal stromal tumor?"

expected_answer = ("These resources address the diagnosis or management of gastrointestinal stromal tumor: - American Cancer Society: Treating Gastrointestinal Stromal Tumor (GIST) - Cancer.Net: Gastrointestinal Stromal Tumor--Diagnosis - Genetic Testing Registry: Gastrointestinal stromal tumor These resources from MedlinePlus offer information about the diagnosis and management of various health conditions: - Diagnostic Tests - Drug Therapy - Surgery and Rehabilitation - Genetic Counseling - Palliative Care.")


generated_answer = ("Treatment for gastrointestinal stromal tumor (GIST) may involve surgery, medications, or both, depending on the tumor's size and location, among other factors.  For example, the cancer may spread, affecting other organs, bone, and the lungs. In addition, the treatment may include careful monitoring or drug therapy, such as an oral or injectable medication, genetic therapy targeting cancer genes, or radiation therapy alone. You may also want to learn about clinical trials, generic drugs, palliative care, and complementary and alternative medicine.")

# Initialize Sentence Transformer model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define evaluation functions
def calculate_exact_match(expected, generated):
    return int(expected.strip().lower() == generated.strip().lower())

def calculate_bleu(expected, generated):
    expected_tokens = expected.split()
    generated_tokens = generated.split()
    smoothing_function = SmoothingFunction().method1
    return sentence_bleu([expected_tokens], generated_tokens, smoothing_function=smoothing_function)

def calculate_semantic_similarity(expected, generated):
    embeddings = model.encode([expected, generated])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

def calculate_cosine_similarity(expected, generated):
    # Similar to semantic similarity but added explicitly for clarification
    embeddings = model.encode([expected, generated])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

def calculate_token_level_accuracy(expected, generated):
    expected_tokens = set(expected.split())
    generated_tokens = set(generated.split())
    overlap = expected_tokens & generated_tokens
    return len(overlap) / len(expected_tokens)

def calculate_accuracy_f1(expected, generated):
    # Create sets for expected and generated tokens
    expected_tokens = set(expected.split())
    generated_tokens = set(generated.split())

    # Create binary vectors for accuracy and F1 score calculation
    all_tokens = expected_tokens | generated_tokens
    expected_vector = [1 if token in expected_tokens else 0 for token in all_tokens]
    generated_vector = [1 if token in generated_tokens else 0 for token in all_tokens]

    accuracy = accuracy_score(expected_vector, generated_vector)
    f1 = f1_score(expected_vector, generated_vector, zero_division=1)
    return accuracy, f1

# Evaluate metrics
exact_match = calculate_exact_match(expected_answer, generated_answer)
bleu_score = calculate_bleu(expected_answer, generated_answer)
semantic_similarity = calculate_semantic_similarity(expected_answer, generated_answer)
cosine_similarity_score = calculate_cosine_similarity(expected_answer, generated_answer)
token_level_accuracy = calculate_token_level_accuracy(expected_answer, generated_answer)

# Accuracy and F1 Score
accuracy, f1 = calculate_accuracy_f1(expected_answer, generated_answer)

# Print results
print(f"Query: {query}")
print(f"Exact Match Accuracy: {exact_match}")
print(f"BLEU Score: {bleu_score:.2f}")
print(f"Semantic Similarity: {semantic_similarity:.2f}")
print(f"Cosine Similarity: {cosine_similarity_score:.2f}")
print(f"Token-Level Accuracy: {token_level_accuracy:.2f}")
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")



Query: What are the treatments for gastrointestinal stromal tumor?
Exact Match Accuracy: 0
BLEU Score: 0.01
Semantic Similarity: 0.72
Cosine Similarity: 0.72
Token-Level Accuracy: 0.18
Accuracy: 0.08
F1 Score: 0.15
