In [9]:
import pandas as pd
import numpy as np
import os
import requests
import time
from dotenv import load_dotenv
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import nltk
nltk.download('punkt', quiet=True)
load_dotenv()

True

In [10]:
# Load dataset and sample 3000 rows
df = pd.read_csv('icliniq_medical_qa_cleaned.csv')
df_sample = df.sample(n=3000, random_state=42).reset_index(drop=True)
print(f"Loaded {len(df_sample)} samples")

Loaded 3000 samples


In [11]:
# API setup
api_key = os.getenv('Llama_3.3_70B_Instruct_api_key')
headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
url = "https://openrouter.ai/api/v1/chat/completions"

In [12]:
# Evaluation metrics setup
rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
smoothing = SmoothingFunction().method1

In [13]:
def get_response(question):
    payload = {
        "model": "meta-llama/llama-3.3-70b-instruct",
        "messages": [{"role": "user", "content": f"Answer this medical question: {question}"}],
        "max_tokens": 300,  # Reduced for faster responses
        "temperature": 0.1
    }
    
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=30)  # Added timeout
        if response.status_code == 200:
            return response.json()['choices'][0]['message']['content'].strip()
        else:
            print(f"API Error: {response.status_code}")
            return "Error: API request failed"
    except Exception as e:
        print(f"Request failed: {str(e)}")
        return "Error: Request timeout or failed"

In [14]:
def calculate_metrics(reference, generated):
    ref_tokens = nltk.word_tokenize(reference.lower())
    gen_tokens = nltk.word_tokenize(generated.lower())
    
    bleu1 = sentence_bleu([ref_tokens], gen_tokens, weights=(1,0,0,0), smoothing_function=smoothing)
    bleu4 = sentence_bleu([ref_tokens], gen_tokens, weights=(0.25,0.25,0.25,0.25), smoothing_function=smoothing)
    
    rouge_scores = rouge_scorer_obj.score(reference, generated)
    
    return {
        'bleu1': bleu1,
        'bleu4': bleu4,
        'rouge1': rouge_scores['rouge1'].fmeasure,
        'rouge2': rouge_scores['rouge2'].fmeasure,
        'rougeL': rouge_scores['rougeL'].fmeasure
    }

In [15]:
# Test API with one sample
print("Testing API connection...")
test_question = "What are the symptoms of diabetes?"
test_response = get_response(test_question)
print(f"Response: {test_response[:100]}...")
print("API test completed!")

Testing API connection...
Response: The symptoms of diabetes can vary depending on the type of diabetes and the individual, but common s...
API test completed!


In [16]:
# Run evaluation
results = []
start_time = time.time()

for i, row in df_sample.iterrows():
    if i % 10 == 0:  # More frequent updates
        elapsed = time.time() - start_time
        rate = i / elapsed if elapsed > 0 else 0
        eta = (3000 - i) / rate if rate > 0 else 0
        print(f"Processing {i}/3000 | Rate: {rate:.1f}/min | ETA: {eta/60:.1f} min")
    
    question = row['Question']
    reference = row['Answer']
    
    generated = get_response(question)
    
    # Skip failed requests
    if "Error:" in generated:
        print(f"Skipping failed request at index {i}")
        continue
        
    metrics = calculate_metrics(reference, generated)
    
    results.append({
        'index': i,
        'question': question,
        'reference': reference,
        'generated': generated,
        **metrics
    })
    
    # Save checkpoint every 50 samples
    if (i + 1) % 50 == 0:
        temp_df = pd.DataFrame(results)
        temp_df.to_csv(f'checkpoint_{i+1}.csv', index=False)
    
    time.sleep(0.2)  # Reduced delay

results_df = pd.DataFrame(results)
results_df.to_csv('evaluation_results.csv', index=False)
print(f"Evaluation completed! Processed {len(results_df)} samples successfully.")

Processing 0/3000 | Rate: 0.0/min | ETA: 0.0 min
Processing 10/3000 | Rate: 0.1/min | ETA: 445.8 min
Processing 20/3000 | Rate: 0.1/min | ETA: 538.7 min
Processing 30/3000 | Rate: 0.1/min | ETA: 494.3 min
Processing 40/3000 | Rate: 0.1/min | ETA: 492.6 min
Processing 50/3000 | Rate: 0.1/min | ETA: 493.1 min
Processing 60/3000 | Rate: 0.1/min | ETA: 474.8 min
Processing 70/3000 | Rate: 0.1/min | ETA: 464.5 min
Processing 80/3000 | Rate: 0.1/min | ETA: 452.5 min
Processing 90/3000 | Rate: 0.1/min | ETA: 455.0 min
Processing 100/3000 | Rate: 0.1/min | ETA: 452.1 min
Processing 110/3000 | Rate: 0.1/min | ETA: 443.9 min
Processing 120/3000 | Rate: 0.1/min | ETA: 449.7 min
Processing 130/3000 | Rate: 0.1/min | ETA: 454.7 min
Processing 140/3000 | Rate: 0.1/min | ETA: 456.3 min
Processing 150/3000 | Rate: 0.1/min | ETA: 451.1 min
Processing 160/3000 | Rate: 0.1/min | ETA: 445.5 min
Processing 170/3000 | Rate: 0.1/min | ETA: 442.0 min
Processing 180/3000 | Rate: 0.1/min | ETA: 442.7 min
Proces

In [17]:
# Analyze results
print("Average Scores:")
for metric in ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']:
    avg_score = results_df[metric].mean()
    print(f"{metric.upper()}: {avg_score:.4f}")

print(f"\nTotal samples evaluated: {len(results_df)}")
print("Results saved to 'evaluation_results.csv'")

Average Scores:
BLEU1: 0.2202
BLEU4: 0.0136
ROUGE1: 0.2754
ROUGE2: 0.0398
ROUGEL: 0.1299

Total samples evaluated: 2234
Results saved to 'evaluation_results.csv'


In [None]:
# Resume evaluation from where it stopped
import pandas as pd

# Load existing results
try:
    existing_results = pd.read_csv('evaluation_results.csv')
    print(f"Found existing results: {len(existing_results)} samples")
    last_processed = existing_results['index'].max()
    print(f"Last processed index: {last_processed}")
except:
    existing_results = pd.DataFrame()
    last_processed = -1
    print("No existing results found, starting from beginning")

# Resume from next sample
start_from = last_processed + 1
print(f"Resuming from sample {start_from}")

# Convert existing results to list format for appending
if len(existing_results) > 0:
    results = existing_results.to_dict('records')
else:
    results = []

start_time = time.time()

for i, row in df_sample.iterrows():
    if i <= last_processed:
        continue
        
    if i % 10 == 0:
        elapsed = time.time() - start_time
        remaining = 3000 - i
        rate = (i - start_from) / elapsed if elapsed > 0 and i > start_from else 0
        eta = remaining / rate if rate > 0 else 0
        print(f"Processing {i}/3000 | Remaining: {remaining} | Rate: {rate:.1f}/min | ETA: {eta/60:.1f} min")
    
    question = row['Question']
    reference = row['Answer']
    
    generated = get_response(question)
    
    # Skip failed requests
    if "Error:" in generated:
        print(f"Skipping failed request at index {i}")
        continue
        
    metrics = calculate_metrics(reference, generated)
    
    results.append({
        'index': i,
        'question': question,
        'reference': reference,
        'generated': generated,
        **metrics
    })
    
    # Save checkpoint every 50 samples
    if (i + 1) % 50 == 0:
        temp_df = pd.DataFrame(results)
        temp_df.to_csv('evaluation_results.csv', index=False)
        print(f"Checkpoint saved at sample {i + 1}")
    
    time.sleep(0.2)

# Final save
results_df = pd.DataFrame(results)
results_df.to_csv('evaluation_results.csv', index=False)
print(f"Evaluation completed! Total processed: {len(results_df)} samples")

Found existing results: 2234 samples
Last processed index: 2233
Resuming from sample 2234
Processing 2240/3000 | Remaining: 760 | Rate: 0.1/min | ETA: 140.5 min
Processing 2240/3000 | Remaining: 760 | Rate: 0.1/min | ETA: 140.5 min
Checkpoint saved at sample 2250
Processing 2250/3000 | Remaining: 750 | Rate: 0.1/min | ETA: 139.8 min
Checkpoint saved at sample 2250
Processing 2250/3000 | Remaining: 750 | Rate: 0.1/min | ETA: 139.8 min
Processing 2260/3000 | Remaining: 740 | Rate: 0.1/min | ETA: 130.9 min
Processing 2260/3000 | Remaining: 740 | Rate: 0.1/min | ETA: 130.9 min
Processing 2270/3000 | Remaining: 730 | Rate: 0.1/min | ETA: 129.0 min
Processing 2270/3000 | Remaining: 730 | Rate: 0.1/min | ETA: 129.0 min
Processing 2280/3000 | Remaining: 720 | Rate: 0.1/min | ETA: 129.0 min
Processing 2280/3000 | Remaining: 720 | Rate: 0.1/min | ETA: 129.0 min
Processing 2290/3000 | Remaining: 710 | Rate: 0.1/min | ETA: 123.7 min
Processing 2290/3000 | Remaining: 710 | Rate: 0.1/min | ETA: 123.

In [20]:
# Analyze complete results
results_df = pd.read_csv('evaluation_results.csv')

print("=== FINAL EVALUATION RESULTS ===")
print(f"Total samples evaluated: {len(results_df)}")
print()

print("Average Scores:")
for metric in ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']:
    avg_score = results_df[metric].mean()
    std_score = results_df[metric].std()
    print(f"{metric.upper()}: {avg_score:.4f} (±{std_score:.4f})")

print()
print("Score Ranges:")
for metric in ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']:
    min_score = results_df[metric].min()
    max_score = results_df[metric].max()
    print(f"{metric.upper()}: {min_score:.4f} - {max_score:.4f}")

print(f"\nResults saved to 'evaluation_results.csv'")
print(f"Processing time saved: 344 minutes + remaining time")

=== FINAL EVALUATION RESULTS ===
Total samples evaluated: 2995

Average Scores:
BLEU1: 0.2207 (±0.0877)
BLEU4: 0.0141 (±0.0167)
ROUGE1: 0.2761 (±0.0828)
ROUGE2: 0.0404 (±0.0287)
ROUGEL: 0.1306 (±0.0339)

Score Ranges:
BLEU1: 0.0000 - 0.5452
BLEU4: 0.0000 - 0.1898
ROUGE1: 0.0000 - 0.5741
ROUGE2: 0.0000 - 0.2637
ROUGEL: 0.0000 - 0.3155

Results saved to 'evaluation_results.csv'
Processing time saved: 344 minutes + remaining time
