In [1]:
# Install required packages first
import subprocess
import sys

def install_package(package):
    try:
        __import__(package)
        print(f"[OK] {package} is already installed")
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"[OK] {package} installed successfully")

# Install all required packages
install_package("replicate")
install_package("tiktoken")
install_package("python-dotenv")
install_package("nltk")
install_package("rouge-score")

# Now import everything
import pandas as pd
import numpy as np
import os
import time
import replicate
from dotenv import load_dotenv
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import nltk
nltk.download('punkt', quiet=True)
load_dotenv()

print("All packages installed and imported successfully!")

[OK] replicate is already installed
[OK] tiktoken is already installed
Installing python-dotenv...
[OK] python-dotenv installed successfully
[OK] python-dotenv installed successfully
[OK] nltk is already installed
Installing rouge-score...
[OK] nltk is already installed
Installing rouge-score...
[OK] rouge-score installed successfully
[OK] rouge-score installed successfully
All packages installed and imported successfully!
All packages installed and imported successfully!


In [2]:
# Load separated dataset from index 870 to end
df = pd.read_csv('dataset_from_870_to_end.csv')
print(f"Dataset size: {len(df)} rows")
print(f"Dataset columns: {df.columns.tolist()}")

# Use the separated dataset for evaluation (from index 870 to end)
df_sample = df.reset_index(drop=True)
print(f"Using {len(df_sample)} samples for evaluation (from original index 870 to end)")
print(f"Sample data:")
print(df_sample.head(2))

Dataset size: 2126 rows
Dataset columns: ['index', 'question', 'reference', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5']
Using 2126 samples for evaluation (from original index 870 to end)
Sample data:
   index                                           question  \
0    870  Hello doctor,\nI have a 6-year-old son and he ...   
1    871  Hello doctor,\nI had unprotected sex with my f...   

                                           reference  Unnamed: 3  Unnamed: 4  \
0  Hello,\nWelcome to icliniq.com.\nAzithromycin ...         NaN         NaN   
1  Hello,\nWelcome to icliniq.com.\nThis really d...         NaN         NaN   

   Unnamed: 5  
0         NaN  
1         NaN  


In [4]:
# Set Replicate API token directly
replicate_api_token = "r8_RqGeBMNSXxmxNqmwlJzIfwQ44bAPIVU4bhV5x"
os.environ['REPLICATE_API_TOKEN'] = replicate_api_token
print("[OK] Replicate API token set successfully")

# Model configuration for Llama-4-Maverick evaluation
model_name = "meta/llama-4-maverick-instruct"  # Correct model name from Replicate
print(f"Using model: {model_name}")
print("✅ Llama-4-Maverick-Instruct model configured")

[OK] Replicate API token set successfully
Using model: meta/llama-4-maverick-instruct
✅ Llama-4-Maverick-Instruct model configured


In [5]:
# Test Replicate connection with Llama-4-Maverick model
def test_replicate_connection():
    if not replicate_api_token:
        print("[ERROR] No API token found.")
        return False
    
    try:
        print(f"Testing model: {model_name}")
        # Test with a medical question using Llama-4-Maverick
        output = replicate.run(
            model_name,
            input={
                "prompt": "Answer this medical question: What are the symptoms of diabetes?",
                "max_tokens": 100,
                "temperature": 0.1,
            }
        )
        
        # Handle different output formats
        if isinstance(output, list):
            response = "".join(output)
        elif isinstance(output, str):
            response = output
        else:
            response = str(output)
            
        print(f"[OK] Llama-4-Maverick connection successful!")
        print(f"Test response: {response.strip()[:100]}...")
        return True
        
    except Exception as e:
        print(f"[ERROR] Connection error: {str(e)}")
        return False

# Test connection
api_ready = test_replicate_connection()

Testing model: meta/llama-4-maverick-instruct
[OK] Llama-4-Maverick connection successful!
Test response: The symptoms of diabetes can vary from person to person, but common symptoms include:

1. **Increase...
[OK] Llama-4-Maverick connection successful!
Test response: The symptoms of diabetes can vary from person to person, but common symptoms include:

1. **Increase...


In [6]:
# Setup instructions if API not ready
if not api_ready:
    print("Please ensure:")
    print("1. You have set the REPLICATE_API_TOKEN environment variable")
    print("2. Your API token is valid")
    print("3. You have sufficient credits in your Replicate account")
    print("\nTo get started:")
    print("- Create account at https://replicate.com")
    print("- Get API token from https://replicate.com/account/api-tokens")
    print("- Add REPLICATE_API_TOKEN=your_token_here to your .env file")

In [6]:
# Evaluation metrics setup
rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
smoothing = SmoothingFunction().method1

In [9]:
def get_response(question, context=""):
    """Get response from Llama-4-Maverick model via Replicate"""
    try:
        # Build the prompt for medical questions
        prompt = f"Answer this medical question: {question}"
        
        # Llama-4-Maverick specific parameters
        input_params = {
            "prompt": prompt,
            "max_tokens": 300,  # Using max_tokens instead of max_new_tokens
            "temperature": 0.1,
        }
        
        output = replicate.run(model_name, input=input_params)
        
        # Handle different output formats
        if isinstance(output, list):
            response = ''.join(output).strip()
        elif isinstance(output, str):
            response = output.strip()
        else:
            response = str(output).strip()
            
        # Clean up the response if needed
        if response.startswith("Answer:"):
            response = response[7:].strip()
            
        return response
            
    except Exception as e:
        print(f"Error getting response: {e}")
        return f"Error: {str(e)}"

In [7]:
def calculate_metrics(reference, generated):
    ref_tokens = nltk.word_tokenize(reference.lower())
    gen_tokens = nltk.word_tokenize(generated.lower())
    
    bleu1 = sentence_bleu([ref_tokens], gen_tokens, weights=(1,0,0,0), smoothing_function=smoothing)
    bleu4 = sentence_bleu([ref_tokens], gen_tokens, weights=(0.25,0.25,0.25,0.25), smoothing_function=smoothing)
    
    rouge_scores = rouge_scorer_obj.score(reference, generated)
    
    return {
        'bleu1': bleu1,
        'bleu4': bleu4,
        'rouge1': rouge_scores['rouge1'].fmeasure,
        'rouge2': rouge_scores['rouge2'].fmeasure,
        'rougeL': rouge_scores['rougeL'].fmeasure
    }

In [10]:
# Test API with a medical question
if api_ready:
    print("Testing API with medical question...")
    test_question = "What are the common symptoms of type 2 diabetes?"
    test_response = get_response(test_question)
    print(f"Question: {test_question}")
    print(f"Response: {test_response}")
    print("\n[OK] API test completed!")
else:
    print("[ERROR] API is not ready. Please check your Replicate API token.")

Testing API with medical question...
Question: What are the common symptoms of type 2 diabetes?
Response: Type 2 diabetes is a chronic condition characterized by insulin resistance and impaired insulin secretion, leading to high blood sugar levels. The common symptoms of type 2 diabetes may develop gradually and can be mild, making it challenging to diagnose. Some people may not experience any noticeable symptoms at all. However, common symptoms include:

1. **Increased thirst and urination**: As the body tries to flush out excess glucose, you may feel the need to drink more water and urinate more frequently.
2. **Fatigue**: High blood sugar levels can cause fatigue, which can be a persistent feeling of tiredness or lack of energy.
3. **Blurred vision**: High blood sugar levels can cause the lens in your eye to swell, leading to blurred vision.
4. **Slow healing of cuts and wounds**: High blood sugar levels can affect the body's ability to heal wounds, making it take longer for cuts an

In [11]:
# EVALUATION SUMMARY - Review before proceeding
print("EVALUATION SUMMARY")
print("=" * 50)
print(f"Total samples to process: {len(df_sample):,}")
print(f"Model: {model_name}")
print(f"Output file: evaluation_results.csv")
print(f"Checkpoints: Every 100 samples")
print("=" * 50)

if api_ready:
    print("[OK] API is ready to proceed")
    user_input = input("\nProceed with full evaluation? (y/n): ")
    if user_input.lower() not in ['y', 'yes']:
        print("[CANCELLED] Evaluation cancelled by user")
    else:
        print("Starting evaluation...")
else:
    print("[ERROR] API not ready. Please check your setup first.")

EVALUATION SUMMARY
Total samples to process: 2,126
Model: meta/llama-4-maverick-instruct
Output file: evaluation_results.csv
Checkpoints: Every 100 samples
[OK] API is ready to proceed
[CANCELLED] Evaluation cancelled by user
[CANCELLED] Evaluation cancelled by user


In [12]:
# Run evaluation on separated dataset (from index 870 to end)
results = []
start_time = time.time()
total_samples = len(df_sample)
failed_requests = 0

print(f"Starting evaluation of {total_samples:,} samples (from original index 870 to end)")
print(f"Using model: {model_name}")
print(f"Expected time: ~{(total_samples * 3) / 60:.1f} minutes (with 3s delay per request)")
print("-" * 80)

for i, row in df_sample.iterrows():
    # Progress updates every 25 samples
    if i % 25 == 0:
        elapsed = time.time() - start_time
        rate = i / elapsed if elapsed > 0 else 0
        eta = (total_samples - i) / rate if rate > 0 else 0
        completed_pct = (i / total_samples) * 100
        print(f"Progress: {i:,}/{total_samples:,} ({completed_pct:.1f}%) | Rate: {rate:.1f}/min | ETA: {eta/60:.1f}min | Failed: {failed_requests}")
    
    question = row['question']
    reference = row['reference']
    original_index = row['index']  # Keep track of original index
    
    generated = get_response(question)
    
    # Skip failed requests but count them
    if generated and "Error:" in generated:
        failed_requests += 1
        print(f"[WARNING] Failed request at index {i} (original {original_index}) (Total failed: {failed_requests})")
        continue
        
    if not generated or len(generated.strip()) < 10:
        failed_requests += 1
        print(f"[WARNING] Empty/short response at index {i} (original {original_index}) (Total failed: {failed_requests})")
        continue
        
    try:
        metrics = calculate_metrics(reference, generated)
    except Exception as e:
        print(f"[WARNING] Metrics calculation failed at index {i} (original {original_index}): {str(e)}")
        continue
    
    results.append({
        'index': i,
        'original_index': original_index,
        'question': question,
        'reference': reference,
        'generated': generated,
        **metrics
    })
    
    # Save checkpoint every 50 samples
    if (i + 1) % 50 == 0:
        temp_df = pd.DataFrame(results)
        temp_df.to_csv(f'checkpoint_separated_{i+1}.csv', index=False)
        print(f"[CHECKPOINT] Checkpoint saved at sample {i + 1} - Total successful: {len(results)}")
    
    # Rate limiting to avoid API limits - 3 second delay
    time.sleep(3)

# Final save
if results:
    results_df = pd.DataFrame(results)
    results_df.to_csv('evaluation_results_separated.csv', index=False)
    total_time = time.time() - start_time

    print("\n" + "=" * 80)
    print(f"EVALUATION COMPLETED!")
    print(f"Successfully processed: {len(results_df):,} samples")
    print(f"Failed requests: {failed_requests}")
    print(f"Success rate: {(len(results_df)/(len(results_df)+failed_requests))*100:.1f}%")
    print(f"Total time: {total_time/3600:.2f} hours")
    print(f"Results saved to 'evaluation_results_separated.csv'")
    print("=" * 80)
else:
    print("\n[ERROR] No results generated. All requests failed.")
    print("Check your API key and model availability.")

Starting evaluation of 2,126 samples (from original index 870 to end)
Using model: meta/llama-4-maverick-instruct
Expected time: ~106.3 minutes (with 3s delay per request)
--------------------------------------------------------------------------------
Progress: 0/2,126 (0.0%) | Rate: 0.0/min | ETA: 0.0min | Failed: 0
Progress: 25/2,126 (1.2%) | Rate: 0.1/min | ETA: 240.3min | Failed: 0
Progress: 25/2,126 (1.2%) | Rate: 0.1/min | ETA: 240.3min | Failed: 0
[CHECKPOINT] Checkpoint saved at sample 50 - Total successful: 50
[CHECKPOINT] Checkpoint saved at sample 50 - Total successful: 50
Progress: 50/2,126 (2.4%) | Rate: 0.1/min | ETA: 241.7min | Failed: 0
Progress: 50/2,126 (2.4%) | Rate: 0.1/min | ETA: 241.7min | Failed: 0
Progress: 75/2,126 (3.5%) | Rate: 0.1/min | ETA: 236.6min | Failed: 0
Progress: 75/2,126 (3.5%) | Rate: 0.1/min | ETA: 236.6min | Failed: 0
[CHECKPOINT] Checkpoint saved at sample 100 - Total successful: 100
[CHECKPOINT] Checkpoint saved at sample 100 - Total successf

In [13]:
# Analyze evaluation results from separated dataset
try:
    results_df = pd.read_csv('evaluation_results_separated.csv')
    
    print("=== FINAL EVALUATION RESULTS ===")
    print(f"Dataset: Separated dataset (original indices 870-2999)")
    print(f"Model: {model_name}")
    print(f"Total samples evaluated: {len(results_df):,}")
    print()
    
    print("Average Scores:")
    for metric in ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']:
        avg_score = results_df[metric].mean()
        std_score = results_df[metric].std()
        print(f"{metric.upper()}: {avg_score:.4f} (±{std_score:.4f})")
    
    print()
    print("Score Ranges:")
    for metric in ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']:
        min_score = results_df[metric].min()
        max_score = results_df[metric].max()
        print(f"{metric.upper()}: {min_score:.4f} - {max_score:.4f}")
    
    print()
    print("Sample Results:")
    print(results_df[['original_index', 'bleu1', 'rouge1', 'rougeL']].head())
    
    print(f"\n[OK] Results saved to 'evaluation_results_separated.csv'")
    
except FileNotFoundError:
    print("[ERROR] No evaluation results found. Please run the evaluation first.")
    print("Expected file: evaluation_results_separated.csv")
except Exception as e:
    print(f"[ERROR] Error loading results: {str(e)}")

=== FINAL EVALUATION RESULTS ===
Dataset: Separated dataset (original indices 870-2999)
Model: meta/llama-4-maverick-instruct
Total samples evaluated: 2,126

Average Scores:
BLEU1: 0.2137 (±0.0817)
BLEU4: 0.0134 (±0.0150)
ROUGE1: 0.2648 (±0.0792)
ROUGE2: 0.0389 (±0.0273)
ROUGEL: 0.1276 (±0.0320)

Score Ranges:
BLEU1: 0.0124 - 0.5083
BLEU4: 0.0006 - 0.1622
ROUGE1: 0.0150 - 0.5435
ROUGE2: 0.0000 - 0.2277
ROUGEL: 0.0148 - 0.3037

Sample Results:
   original_index     bleu1    rouge1    rougeL
0             870  0.199153  0.275168  0.120805
1             871  0.113074  0.173611  0.083333
2             872  0.183007  0.274854  0.140351
3             873  0.253333  0.312343  0.161209
4             874  0.169118  0.247678  0.136223

[OK] Results saved to 'evaluation_results_separated.csv'


In [1]:
# Combine evaluation results from both datasets
import pandas as pd
import numpy as np

print("=== COMBINING EVALUATION RESULTS ===")
print("Loading both datasets...")

# Load the original evaluation results (indices 0-869)
try:
    df_original = pd.read_csv('evaluation_results.csv')
    print(f"Original dataset loaded: {len(df_original)} samples (indices 0-{len(df_original)-1})")
except FileNotFoundError:
    print("[ERROR] evaluation_results.csv not found")
    df_original = pd.DataFrame()

# Load the separated evaluation results (indices 870-2999)
try:
    df_separated = pd.read_csv('evaluation_results_separated.csv')
    print(f"Separated dataset loaded: {len(df_separated)} samples (original indices 870-2999)")
except FileNotFoundError:
    print("[ERROR] evaluation_results_separated.csv not found")
    df_separated = pd.DataFrame()

if not df_original.empty and not df_separated.empty:
    # Align column structures
    # For the separated dataset, we need to adjust the index to continue from where original left off
    df_separated_aligned = df_separated.copy()
    
    # Rename index column to match original format and adjust indices
    if 'original_index' in df_separated_aligned.columns:
        # Keep the original_index for reference but create new sequential index
        df_separated_aligned['index'] = range(len(df_original), len(df_original) + len(df_separated_aligned))
        # Drop original_index column to match original dataset structure
        df_separated_aligned = df_separated_aligned.drop('original_index', axis=1)
    
    # Ensure column order matches
    columns_order = ['index', 'question', 'reference', 'generated', 'bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']
    df_original = df_original[columns_order]
    df_separated_aligned = df_separated_aligned[columns_order]
    
    # Combine datasets
    df_combined = pd.concat([df_original, df_separated_aligned], ignore_index=True)
    
    # Save combined dataset
    df_combined.to_csv('evaluation_results_combined.csv', index=False)
    
    print(f"\n[SUCCESS] Combined dataset created!")
    print(f"Total samples: {len(df_combined):,}")
    print(f"- Original (0-869): {len(df_original):,} samples")
    print(f"- Separated (870-2999): {len(df_separated):,} samples")
    print(f"Combined dataset saved as 'evaluation_results_combined.csv'")
    
else:
    print("[ERROR] Cannot combine datasets - one or both files are missing or empty")
    df_combined = pd.DataFrame()

=== COMBINING EVALUATION RESULTS ===
Loading both datasets...
Original dataset loaded: 865 samples (indices 0-864)
Separated dataset loaded: 2126 samples (original indices 870-2999)

[SUCCESS] Combined dataset created!
Total samples: 2,991
- Original (0-869): 865 samples
- Separated (870-2999): 2,126 samples
Combined dataset saved as 'evaluation_results_combined.csv'


In [2]:
# Comprehensive analysis of the combined evaluation results
if not df_combined.empty:
    print("\n" + "="*60)
    print("=== FINAL EVALUATION RESULTS ===")
    print("="*60)
    print(f"Dataset: Complete dataset (indices 0-2999)")
    print(f"Model: meta/llama-4-maverick-instruct")
    print(f"Total samples evaluated: {len(df_combined):,}")
    print()
    
    # Calculate overall statistics
    print("Average Scores:")
    for metric in ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']:
        avg_score = df_combined[metric].mean()
        std_score = df_combined[metric].std()
        print(f"{metric.upper()}: {avg_score:.4f} (±{std_score:.4f})")
    
    print()
    print("Score Ranges:")
    for metric in ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']:
        min_score = df_combined[metric].min()
        max_score = df_combined[metric].max()
        print(f"{metric.upper()}: {min_score:.4f} - {max_score:.4f}")
    
    print()
    print("="*60)
    print("=== BREAKDOWN BY DATASET SEGMENTS ===")
    print("="*60)
    
    # Analyze original segment (0-869)
    df_segment1 = df_combined[df_combined['index'] < 870]
    print(f"\nSegment 1 (Original indices 0-869): {len(df_segment1):,} samples")
    print("Average Scores:")
    for metric in ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']:
        avg_score = df_segment1[metric].mean()
        std_score = df_segment1[metric].std()
        print(f"  {metric.upper()}: {avg_score:.4f} (±{std_score:.4f})")
    
    # Analyze separated segment (870-2999)
    df_segment2 = df_combined[df_combined['index'] >= 870]
    print(f"\nSegment 2 (Original indices 870-2999): {len(df_segment2):,} samples")
    print("Average Scores:")
    for metric in ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']:
        avg_score = df_segment2[metric].mean()
        std_score = df_segment2[metric].std()
        print(f"  {metric.upper()}: {avg_score:.4f} (±{std_score:.4f})")
    
    print()
    print("="*60)
    print("=== PERFORMANCE COMPARISON ===")
    print("="*60)
    
    # Compare performance between segments
    print("\nPerformance Difference (Segment 2 - Segment 1):")
    for metric in ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']:
        avg1 = df_segment1[metric].mean()
        avg2 = df_segment2[metric].mean()
        diff = avg2 - avg1
        diff_pct = (diff / avg1) * 100 if avg1 > 0 else 0
        print(f"  {metric.upper()}: {diff:+.4f} ({diff_pct:+.1f}%)")
    
    print(f"\n[SUCCESS] Complete analysis saved to 'evaluation_results_combined.csv'")
    print(f"Total dataset size: {len(df_combined):,} samples")
    
else:
    print("[ERROR] No combined data available for analysis")


=== FINAL EVALUATION RESULTS ===
Dataset: Complete dataset (indices 0-2999)
Model: meta/llama-4-maverick-instruct
Total samples evaluated: 2,991

Average Scores:
BLEU1: 0.2089 (±0.0859)
BLEU4: 0.0132 (±0.0152)
ROUGE1: 0.2597 (±0.0838)
ROUGE2: 0.0381 (±0.0277)
ROUGEL: 0.1260 (±0.0340)

Score Ranges:
BLEU1: 0.0000 - 0.5083
BLEU4: 0.0000 - 0.1622
ROUGE1: 0.0000 - 0.5435
ROUGE2: 0.0000 - 0.2277
ROUGEL: 0.0000 - 0.3037

=== BREAKDOWN BY DATASET SEGMENTS ===

Segment 1 (Original indices 0-869): 870 samples
Average Scores:
  BLEU1: 0.1970 (±0.0940)
  BLEU4: 0.0125 (±0.0156)
  ROUGE1: 0.2471 (±0.0928)
  ROUGE2: 0.0359 (±0.0285)
  ROUGEL: 0.1222 (±0.0382)

Segment 2 (Original indices 870-2999): 2,121 samples
Average Scores:
  BLEU1: 0.2137 (±0.0818)
  BLEU4: 0.0134 (±0.0151)
  ROUGE1: 0.2648 (±0.0793)
  ROUGE2: 0.0389 (±0.0273)
  ROUGEL: 0.1276 (±0.0320)

=== PERFORMANCE COMPARISON ===

Performance Difference (Segment 2 - Segment 1):
  BLEU1: +0.0168 (+8.5%)
  BLEU4: +0.0009 (+7.1%)
  ROUGE1: 