In [5]:
import pandas as pd
import numpy as np
import os
import requests
import time
from dotenv import load_dotenv
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import nltk
nltk.download('punkt', quiet=True)
load_dotenv()

True

In [2]:
# Load dataset and sample 3000 rows
df = pd.read_csv('icliniq_medical_qa_cleaned.csv')
df_sample = df.sample(n=3000, random_state=42).reset_index(drop=True)
print(f"Loaded {len(df_sample)} samples")

Loaded 3000 samples


In [3]:
# API setup - Updated for Llama 3.2 3B
api_key = os.getenv('lama3.2_3B_api_key')  # Changed to match your .env file
headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
url = "https://openrouter.ai/api/v1/chat/completions"

In [4]:
# Evaluation metrics setup
rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
smoothing = SmoothingFunction().method1

In [None]:
# Function to get response from Llama 3.2 3B
def get_response(question):
    payload = {
        "model": "meta-llama/llama-3.2-3b-instruct",  # Changed to 3.2-3b model
        "messages": [{"role": "user", "content": f"Answer this medical question: {question}"}],
        "max_tokens": 300,  # Reduced for faster responses
        "temperature": 0.1
    }
    
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=30)  # Added timeout
        if response.status_code == 200:
            return response.json()['choices'][0]['message']['content'].strip()
        else:
            print(f"API Error: {response.status_code}")
            return "Error: API request failed"
    except Exception as e:
        print(f"Request failed: {str(e)}")
        return "Error: Request timeout or failed"

In [6]:
def calculate_metrics(reference, generated):
    ref_tokens = nltk.word_tokenize(reference.lower())
    gen_tokens = nltk.word_tokenize(generated.lower())
    
    bleu1 = sentence_bleu([ref_tokens], gen_tokens, weights=(1,0,0,0), smoothing_function=smoothing)
    bleu4 = sentence_bleu([ref_tokens], gen_tokens, weights=(0.25,0.25,0.25,0.25), smoothing_function=smoothing)
    
    rouge_scores = rouge_scorer_obj.score(reference, generated)
    
    return {
        'bleu1': bleu1,
        'bleu4': bleu4,
        'rouge1': rouge_scores['rouge1'].fmeasure,
        'rouge2': rouge_scores['rouge2'].fmeasure,
        'rougeL': rouge_scores['rougeL'].fmeasure
    }

In [7]:
# Test API with one sample
print("Testing API connection...")
test_question = "What are the symptoms of diabetes?"
test_response = get_response(test_question)
print(f"Response: {test_response[:100]}...")
print("API test completed!")

Testing API connection...
Response: The symptoms of diabetes can vary from person to person, but here are some common ones to look out f...
API test completed!
Response: The symptoms of diabetes can vary from person to person, but here are some common ones to look out f...
API test completed!


In [8]:
# Run evaluation
results = []
start_time = time.time()

for i, row in df_sample.iterrows():
    if i % 10 == 0:  # More frequent updates
        elapsed = time.time() - start_time
        rate = i / elapsed if elapsed > 0 else 0
        eta = (3000 - i) / rate if rate > 0 else 0
        print(f"Processing {i}/3000 | Rate: {rate:.1f}/min | ETA: {eta/60:.1f} min")
    
    question = row['Question']
    reference = row['Answer']
    
    generated = get_response(question)
    
    # Skip failed requests
    if "Error:" in generated:
        print(f"Skipping failed request at index {i}")
        continue
        
    metrics = calculate_metrics(reference, generated)
    
    results.append({
        'index': i,
        'question': question,
        'reference': reference,
        'generated': generated,
        **metrics
    })
    
    # Save checkpoint every 50 samples
    if (i + 1) % 50 == 0:
        temp_df = pd.DataFrame(results)
        temp_df.to_csv(f'checkpoint_llama32_3b_{i+1}.csv', index=False)
    
    time.sleep(0.2)  # Reduced delay

results_df = pd.DataFrame(results)
results_df.to_csv('evaluation_results_llama32_3b.csv', index=False)
print(f"Evaluation completed! Processed {len(results_df)} samples successfully.")

Processing 0/3000 | Rate: 0.0/min | ETA: 0.0 min
Processing 10/3000 | Rate: 0.3/min | ETA: 182.6 min
Processing 10/3000 | Rate: 0.3/min | ETA: 182.6 min
Processing 20/3000 | Rate: 0.3/min | ETA: 178.1 min
Processing 20/3000 | Rate: 0.3/min | ETA: 178.1 min
Processing 30/3000 | Rate: 0.3/min | ETA: 180.9 min
Processing 30/3000 | Rate: 0.3/min | ETA: 180.9 min
Processing 40/3000 | Rate: 0.3/min | ETA: 180.9 min
Processing 40/3000 | Rate: 0.3/min | ETA: 180.9 min
Processing 50/3000 | Rate: 0.3/min | ETA: 181.9 min
Processing 50/3000 | Rate: 0.3/min | ETA: 181.9 min
Processing 60/3000 | Rate: 0.3/min | ETA: 181.5 min
Processing 60/3000 | Rate: 0.3/min | ETA: 181.5 min
Processing 70/3000 | Rate: 0.3/min | ETA: 186.0 min
Processing 70/3000 | Rate: 0.3/min | ETA: 186.0 min
Processing 80/3000 | Rate: 0.3/min | ETA: 183.9 min
Processing 80/3000 | Rate: 0.3/min | ETA: 183.9 min
Processing 90/3000 | Rate: 0.3/min | ETA: 187.3 min
Processing 90/3000 | Rate: 0.3/min | ETA: 187.3 min
Processing 100/

In [9]:
# Analyze results
print("Average Scores:")
for metric in ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']:
    avg_score = results_df[metric].mean()
    print(f"{metric.upper()}: {avg_score:.4f}")

print(f"\nTotal samples evaluated: {len(results_df)}")
print("Results saved to 'evaluation_results_llama32_3b.csv'")

Average Scores:
BLEU1: 0.2012
BLEU4: 0.0122
ROUGE1: 0.2588
ROUGE2: 0.0355
ROUGEL: 0.1258

Total samples evaluated: 2996
Results saved to 'evaluation_results_llama32_3b.csv'


In [6]:
# Load existing results
try:
    results_df = pd.read_csv('evaluation_results_llama32_3b.csv')
    print(f"Loaded {len(results_df)} results from file")
    
    # Calculate and print standard deviation for each metric
    print("\nStandard Deviations:")
    for metric in ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']:
        std_score = results_df[metric].std()
        print(f"{metric.upper()}: {std_score:.4f}")
        
except FileNotFoundError:
    print("No existing results file found. Please run the evaluation first (cells 1-8)")

Loaded 2996 results from file

Standard Deviations:
BLEU1: 0.0956
BLEU4: 0.0154
ROUGE1: 0.0858
ROUGE2: 0.0276
ROUGEL: 0.0345
