In [1]:
# Minimal setup for Llama 3 8B Instruct evaluation
import pandas as pd
import os
import replicate

# Load environment variables if needed
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Load the entire dataset for full evaluation
df = pd.read_csv('dataset.csv')
df_sample = df.reset_index(drop=True)
print(f"Using {len(df_sample)} samples for evaluation")
print(f"Dataset columns: {df_sample.columns.tolist()}")
print(df_sample.head(1))

Using 2996 samples for evaluation
Dataset columns: ['index', 'question', 'reference', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5']
   index                                           question  \
0      0  Hi doctor,\nI am a 51-year-old female with a h...   

                                           reference  Unnamed: 3  Unnamed: 4  \
0  Hi,\nWelcome to icliniq.com.\nSevere jugular v...         NaN         NaN   

   Unnamed: 5  
0         NaN  


In [3]:
# Setup Replicate API for Llama 3 8B Instruct
replicate_api_token = os.getenv('REPLICATE_API_TOKEN')
if replicate_api_token:
    os.environ['REPLICATE_API_TOKEN'] = replicate_api_token
    print("[OK] Replicate API token loaded from environment")
else:
    print("[ERROR] No REPLICATE_API_TOKEN found. Please set it in your .env file")

model_name = "meta/meta-llama-3-8b-instruct"  # Llama 3 8B Instruct model
print(f"Using model: {model_name}")
print("✅ Using Llama 3 8B Instruct model from Replicate")

[OK] Replicate API token loaded from environment
Using model: meta/meta-llama-3-8b-instruct
✅ Using Llama 3 8B Instruct model from Replicate


In [4]:
# Simple test connection for Llama 3 8B Instruct
def test_replicate_connection():
    if not replicate_api_token:
        print("[ERROR] No API token found. Please set REPLICATE_API_TOKEN environment variable.")
        return False
    
    try:
        test_prompt = "Hello, please respond with 'Hi! I can help with medical questions.'"
        
        output = replicate.run(
            model_name,
            input={
                "prompt": test_prompt,
                "max_new_tokens": 50,
                "temperature": 0.1
            }
        )
        
        if isinstance(output, list):
            response = "".join(output)
        else:
            response = str(output)
            
        print(f"[OK] Connection successful with {model_name}")
        print(f"Test response: {response.strip()}")
        return True
            
    except Exception as e:
        print(f"[ERROR] Connection error: {str(e)}")
        return False

# Test connection
api_ready = test_replicate_connection()

[OK] Connection successful with meta/meta-llama-3-8b-instruct
Test response: Hi! I can help with medical questions.


In [5]:
# Simple function to get response from Llama 3 8B Instruct
def get_response(question):
    try:
        output = replicate.run(
            model_name,
            input={
                "prompt": question,
                "max_new_tokens": 512,
                "temperature": 0.2
            }
        )
        if isinstance(output, list):
            return ''.join(output).strip()
        return str(output).strip()
    except Exception as e:
        return f"Error: {str(e)}"

In [6]:
# Test the model with a sample question
if api_ready:
    print("Testing with a medical question:")
    test_question = "What are the common symptoms of type 2 diabetes?"
    test_response = get_response(test_question)
    print(f"Question: {test_question}")
    print(f"Response: {test_response}")
    
    if "Error:" in test_response:
        print("[ERROR] Test failed - check API configuration")
    else:
        print("[OK] Test passed - ready for evaluation!")
else:
    print("[ERROR] API is not ready. Please check your Replicate API token.")

Testing with a medical question:
Question: What are the common symptoms of type 2 diabetes?
Response: Type 2 diabetes is a chronic condition characterized by high blood sugar levels, and it can develop gradually over time. The common symptoms of type 2 diabetes may not always be noticeable, especially in the early stages. However, some common symptoms include:

1. Increased thirst and urination: As the body tries to rid itself of excess glucose, you may feel the need to drink more water and urinate more frequently.
2. Fatigue: High blood sugar levels can cause fatigue, weakness, and a general feeling of being tired or sluggish.
3. Blurred vision: High blood sugar levels can cause the lens in your eye to swell, leading to blurred vision.
4. Slow healing of cuts and wounds: High blood sugar levels can impede the healing process, making cuts and wounds take longer to heal.
5. Tingling or numbness in the hands and feet: High blood sugar levels can damage the nerves, causing tingling, numbn

In [7]:
# Evaluation summary for full dataset
print("FULL DATASET EVALUATION SUMMARY")
print("=" * 50)
print(f"Total samples to process: {len(df_sample)}")
print(f"Model: {model_name}")
print(f"Output file: llama3_8b_full_evaluation.csv")
print(f"Checkpoint interval: Every 100 samples")
print(f"Estimated time: ~{(len(df_sample) * 2) / 60:.1f} minutes")
print("=" * 50)

if api_ready:
    print("[OK] API is ready - starting full evaluation")
else:
    print("[ERROR] API not ready. Please check your setup first.")

FULL DATASET EVALUATION SUMMARY
Total samples to process: 2996
Model: meta/meta-llama-3-8b-instruct
Output file: llama3_8b_full_evaluation.csv
Checkpoint interval: Every 100 samples
Estimated time: ~99.9 minutes
[OK] API is ready - starting full evaluation


In [11]:
# Run evaluation on full dataset with checkpoints
import time

results = []
start_time = time.time()
total_samples = len(df_sample)
failed_requests = 0

print(f"Starting evaluation of {total_samples} samples using {model_name}...")
print(f"Checkpoints will be saved every 100 samples")
print("-" * 60)

for i, row in df_sample.iterrows():
    # Progress updates every 50 samples
    if i % 50 == 0:
        elapsed = time.time() - start_time
        rate = (i / elapsed * 3600) if elapsed > 0 else 0
        eta_hours = (total_samples - i) / rate if rate > 0 else 0
        completed_pct = (i / total_samples) * 100
        print(f"Progress: {i}/{total_samples} ({completed_pct:.1f}%) | Rate: {rate:.1f}/hour | ETA: {eta_hours:.1f}h")
        print(f"Failed requests: {failed_requests}")
    
    question = row['question']
    reference = row['reference'] if 'reference' in row else ''
    
    # Get model response
    generated = get_response(question)
    
    # Check for errors
    if "Error:" in str(generated):
        failed_requests += 1
        print(f"[WARNING] Failed request at sample {i}")
        continue
    
    # Store results
    results.append({
        'index': i,
        'question': question,
        'reference': reference,
        'generated': generated
    })
    
    # Save checkpoint every 100 samples
    if (i + 1) % 100 == 0:
        temp_df = pd.DataFrame(results)
        checkpoint_file = f'checkpoint_{i+1}.csv'
        temp_df.to_csv(checkpoint_file, index=False)
        print(f"[CHECKPOINT] Saved {len(results)} results to {checkpoint_file}")
    
    # Rate limiting - 1 second delay to avoid hitting API limits
    time.sleep(1)

# Final save
if len(results) > 0:
    results_df = pd.DataFrame(results)
    results_df.to_csv('llama3_8b_full_evaluation.csv', index=False)
    total_time = time.time() - start_time

    print("\n" + "=" * 60)
    print("EVALUATION COMPLETED!")
    print(f"Successfully processed: {len(results_df)} samples")
    print(f"Failed requests: {failed_requests}")
    print(f"Total time: {total_time/3600:.2f} hours")
    print(f"Results saved to 'llama3_8b_full_evaluation.csv'")
    print("=" * 60)
else:
    print("\n[ERROR] NO VALID RESULTS - Evaluation failed")
    print("Please check the model configuration")

Starting evaluation of 2996 samples using meta/meta-llama-3-8b-instruct...
Checkpoints will be saved every 100 samples
------------------------------------------------------------
Progress: 0/2996 (0.0%) | Rate: 0.0/hour | ETA: 0.0h
Failed requests: 0


KeyboardInterrupt: 

In [12]:
# Resume evaluation from the last checkpoint
import time
import pandas as pd

# Load existing results to determine where to resume
try:
    existing_results = pd.read_csv('llama3_8b_full_evaluation.csv')
    last_processed_index = existing_results['index'].max()
    print(f"Found existing results up to index: {last_processed_index}")
    print(f"Total existing samples: {len(existing_results)}")
    
    # Convert existing results to list format for continuation
    results = existing_results.to_dict('records')
    print(f"Loaded {len(results)} existing results")
    
except FileNotFoundError:
    print("No existing results file found. Starting fresh.")
    results = []
    last_processed_index = -1

# Resume from the next index
start_index = last_processed_index + 1
total_samples = len(df_sample)
failed_requests = 0

print(f"\nResuming evaluation from index {start_index}")
print(f"Remaining samples to process: {total_samples - start_index}")
print(f"Progress: {start_index}/{total_samples} ({(start_index/total_samples)*100:.1f}% completed)")
print("-" * 60)

# Resume evaluation loop
start_time = time.time()

for i in range(start_index, total_samples):
    row = df_sample.iloc[i]
    
    # Progress updates every 50 samples
    if i % 50 == 0:
        elapsed = time.time() - start_time
        rate = ((i - start_index) / elapsed * 3600) if elapsed > 0 else 0
        eta_hours = (total_samples - i) / rate if rate > 0 else 0
        completed_pct = (i / total_samples) * 100
        print(f"Progress: {i}/{total_samples} ({completed_pct:.1f}%) | Rate: {rate:.1f}/hour | ETA: {eta_hours:.1f}h")
        print(f"Failed requests: {failed_requests}")
    
    question = row['question']
    reference = row['reference'] if 'reference' in row else ''
    
    # Get model response
    generated = get_response(question)
    
    # Check for errors
    if "Error:" in str(generated):
        failed_requests += 1
        print(f"[WARNING] Failed request at sample {i}")
        continue
    
    # Store results
    results.append({
        'index': i,
        'question': question,
        'reference': reference,
        'generated': generated
    })
    
    # Save checkpoint every 100 samples
    if (i + 1) % 100 == 0:
        temp_df = pd.DataFrame(results)
        checkpoint_file = f'checkpoint_{i+1}.csv'
        temp_df.to_csv(checkpoint_file, index=False)
        print(f"[CHECKPOINT] Saved {len(results)} results to {checkpoint_file}")
    
    # Rate limiting - 1 second delay to avoid hitting API limits
    time.sleep(1)

# Final save
if len(results) > 0:
    results_df = pd.DataFrame(results)
    results_df.to_csv('llama3_8b_full_evaluation.csv', index=False)
    total_time = time.time() - start_time

    print("\n" + "=" * 60)
    print("EVALUATION COMPLETED!")
    print(f"Successfully processed: {len(results_df)} samples")
    print(f"Failed requests: {failed_requests}")
    print(f"Total time for this session: {total_time/3600:.2f} hours")
    print(f"Results saved to 'llama3_8b_full_evaluation.csv'")
    print("=" * 60)
else:
    print("\n[ERROR] NO VALID RESULTS - Evaluation failed")
    print("Please check the model configuration")

Found existing results up to index: 1427
Total existing samples: 1427
Loaded 1427 existing results

Resuming evaluation from index 1428
Remaining samples to process: 1568
Progress: 1428/2996 (47.7% completed)
------------------------------------------------------------
Progress: 1450/2996 (48.4%) | Rate: 514.6/hour | ETA: 3.0h
Failed requests: 0
[CHECKPOINT] Saved 1499 results to checkpoint_1500.csv
Progress: 1500/2996 (50.1%) | Rate: 500.5/hour | ETA: 3.0h
Failed requests: 0
Progress: 1550/2996 (51.7%) | Rate: 510.1/hour | ETA: 2.8h
Failed requests: 0
[CHECKPOINT] Saved 1599 results to checkpoint_1600.csv
Progress: 1600/2996 (53.4%) | Rate: 515.2/hour | ETA: 2.7h
Failed requests: 0
Progress: 1650/2996 (55.1%) | Rate: 516.6/hour | ETA: 2.6h
Failed requests: 0
[CHECKPOINT] Saved 1699 results to checkpoint_1700.csv
Progress: 1700/2996 (56.7%) | Rate: 507.9/hour | ETA: 2.6h
Failed requests: 0
Progress: 1750/2996 (58.4%) | Rate: 509.0/hour | ETA: 2.4h
Failed requests: 0
[CHECKPOINT] Saved 

In [13]:
# Display full evaluation results
try:
    results_df = pd.read_csv('llama3_8b_full_evaluation.csv')
    
    print("=== FULL EVALUATION RESULTS ===")
    print(f"Total samples evaluated: {len(results_df)}")
    print(f"Model: {model_name}")
    
    print("\nSample results:")
    for i in range(min(5, len(results_df))):
        print(f"\n--- Sample {i+1} ---")
        print(f"Question: {results_df.iloc[i]['question'][:100]}...")
        print(f"Generated: {results_df.iloc[i]['generated'][:100]}...")
    
    print(f"\nFull results saved to 'llama3_8b_full_evaluation.csv'")
    print(f"Dataset size: {len(results_df)} samples")
    
    # Show statistics
    avg_response_length = results_df['generated'].str.len().mean()
    print(f"Average response length: {avg_response_length:.1f} characters")
    
except FileNotFoundError:
    print("[ERROR] No evaluation results found. Please run the evaluation first.")
    print("Looking for: llama3_8b_full_evaluation.csv")
except Exception as e:
    print(f"[ERROR] Error loading results: {str(e)}")

=== FULL EVALUATION RESULTS ===
Total samples evaluated: 2992
Model: meta/meta-llama-3-8b-instruct

Sample results:

--- Sample 1 ---
Question: Hi doctor,
I am a 51-year-old female with a height of 5 feet 8 inches and a weight of 145 lbs. I hav...
Generated: I'm not a doctor, but I can try to help you understand your situation and provide some general infor...

--- Sample 2 ---
Question: Hello doctor,
I am 29 years old.
CT scan came up negative for problems associated with infection. I ...
Generated: Thank you for reaching out and sharing your concerns with me. I'm here to help you get a better unde...

--- Sample 3 ---
Question: Hi doctor,
For the past two days, I am suffering from a cold because of climatic change. I am having...
Generated: Sorry to hear that you're not feeling well! It's not uncommon to catch a cold due to climatic change...

--- Sample 4 ---
Question: Hi doctor,
I am suffering from kidney stones problem for the past four days.  I took an ultrasound s...
Generated: 

In [14]:
# Import evaluation metrics libraries
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Download required NLTK data
nltk.download('punkt', quiet=True)

# Setup metrics
rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
smoothing = SmoothingFunction().method1

print("Evaluation metrics libraries imported successfully")

Evaluation metrics libraries imported successfully


In [15]:
# Define metrics calculation function
def calculate_metrics(reference, generated):
    """Calculate BLEU and ROUGE metrics for a reference-generated pair"""
    try:
        # Tokenize for BLEU calculation
        ref_tokens = nltk.word_tokenize(reference.lower())
        gen_tokens = nltk.word_tokenize(generated.lower())
        
        # Calculate BLEU scores
        bleu1 = sentence_bleu([ref_tokens], gen_tokens, weights=(1,0,0,0), smoothing_function=smoothing)
        bleu4 = sentence_bleu([ref_tokens], gen_tokens, weights=(0.25,0.25,0.25,0.25), smoothing_function=smoothing)
        
        # Calculate ROUGE scores
        rouge_scores = rouge_scorer_obj.score(reference, generated)
        
        return {
            'bleu1': bleu1,
            'bleu4': bleu4,
            'rouge1': rouge_scores['rouge1'].fmeasure,
            'rouge2': rouge_scores['rouge2'].fmeasure,
            'rougeL': rouge_scores['rougeL'].fmeasure
        }
    except Exception as e:
        print(f"Error calculating metrics for sample: {e}")
        return {
            'bleu1': 0.0,
            'bleu4': 0.0,
            'rouge1': 0.0,
            'rouge2': 0.0,
            'rougeL': 0.0
        }

print("Metrics calculation function ready")

Metrics calculation function ready


In [16]:
# Calculate metrics for each sample in the evaluation results
try:
    # Load the evaluation results
    results_df = pd.read_csv('llama3_8b_full_evaluation.csv')
    print(f"Loaded evaluation results with {len(results_df)} samples")
    
    # Check if metrics columns already exist
    metric_columns = ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']
    missing_metrics = [col for col in metric_columns if col not in results_df.columns]
    
    if missing_metrics:
        print("Calculating metrics for each sample...")
        
        # Add metrics columns
        for metric in metric_columns:
            results_df[metric] = None
        
        # Calculate metrics for each sample
        valid_metrics_count = 0
        for i, row in results_df.iterrows():
            if i % 100 == 0:
                print(f"Processing sample {i}/{len(results_df)}")
            
            reference = row['reference'] if pd.notna(row['reference']) else ''
            generated = row['generated'] if pd.notna(row['generated']) else ''
            
            # Only calculate metrics if both reference and generated text exist
            if reference and reference.strip() and generated and generated.strip():
                metrics = calculate_metrics(reference, generated)
                for metric, value in metrics.items():
                    results_df.at[i, metric] = value
                valid_metrics_count += 1
        
        # Save updated results with metrics
        results_df.to_csv('llama3_8b_full_evaluation_with_metrics.csv', index=False)
        print(f"\nMetrics calculated for {valid_metrics_count} samples")
        print("Updated results saved to 'llama3_8b_full_evaluation_with_metrics.csv'")
    else:
        print("Metrics columns already exist in the dataset")
        valid_metrics_count = results_df['bleu1'].notna().sum()
        print(f"Found metrics for {valid_metrics_count} samples")
    
except FileNotFoundError:
    print("Error: No evaluation results found.")
    print("Please run the evaluation first to generate 'llama3_8b_full_evaluation.csv'")
except Exception as e:
    print(f"Error calculating metrics: {e}")

Loaded evaluation results with 2992 samples
Calculating metrics for each sample...
Processing sample 0/2992
Processing sample 100/2992
Processing sample 100/2992
Processing sample 200/2992
Processing sample 200/2992
Processing sample 300/2992
Processing sample 300/2992
Processing sample 400/2992
Processing sample 400/2992
Processing sample 500/2992
Processing sample 500/2992
Processing sample 600/2992
Processing sample 600/2992
Processing sample 700/2992
Processing sample 700/2992
Processing sample 800/2992
Processing sample 800/2992
Processing sample 900/2992
Processing sample 900/2992
Processing sample 1000/2992
Processing sample 1000/2992
Processing sample 1100/2992
Processing sample 1100/2992
Processing sample 1200/2992
Processing sample 1200/2992
Processing sample 1300/2992
Processing sample 1300/2992
Processing sample 1400/2992
Processing sample 1400/2992
Processing sample 1500/2992
Processing sample 1500/2992
Processing sample 1600/2992
Processing sample 1600/2992
Processing sam

In [17]:
# Calculate and display average metrics
try:
    # Load results with metrics
    try:
        results_df = pd.read_csv('llama3_8b_full_evaluation_with_metrics.csv')
        print("Loaded results with calculated metrics")
    except FileNotFoundError:
        results_df = pd.read_csv('llama3_8b_full_evaluation.csv')
        print("Loaded original results (metrics may not be available)")
    
    print("\n" + "=" * 60)
    print("EVALUATION METRICS ANALYSIS")
    print("=" * 60)
    print(f"Total samples: {len(results_df)}")
    print(f"Model: {model_name}")
    
    # Check which metrics are available
    metric_columns = ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']
    metric_names = ['BLEU-1', 'BLEU-4', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L']
    
    available_metrics = [col for col in metric_columns if col in results_df.columns]
    
    if available_metrics:
        print(f"\n=== AVERAGE METRICS ({len(available_metrics)} metrics available) ===")
        
        metrics_summary = {}
        for metric, name in zip(metric_columns, metric_names):
            if metric in results_df.columns:
                # Filter out None/NaN values
                valid_scores = results_df[metric].dropna()
                if len(valid_scores) > 0:
                    avg_score = valid_scores.mean()
                    std_score = valid_scores.std()
                    min_score = valid_scores.min()
                    max_score = valid_scores.max()
                    
                    metrics_summary[metric] = {
                        'avg': avg_score, 
                        'std': std_score, 
                        'min': min_score, 
                        'max': max_score,
                        'count': len(valid_scores)
                    }
                    
                    print(f"{name:>10}: {avg_score:.4f} (±{std_score:.4f}) | Range: [{min_score:.4f}, {max_score:.4f}] | {len(valid_scores)} samples")
                else:
                    print(f"{name:>10}: No valid scores")
        
        # Show quartile distribution
        if metrics_summary:
            print(f"\n=== SCORE DISTRIBUTION (QUARTILES) ===")
            for metric, name in zip(metric_columns, metric_names):
                if metric in metrics_summary:
                    valid_scores = results_df[metric].dropna()
                    q25 = valid_scores.quantile(0.25)
                    q50 = valid_scores.quantile(0.50)  # median
                    q75 = valid_scores.quantile(0.75)
                    print(f"{name:>10}: Q25={q25:.4f} | Median={q50:.4f} | Q75={q75:.4f}")
        
        # Show best and worst performing samples
        if 'bleu1' in results_df.columns:
            print(f"\n=== SAMPLE ANALYSIS ===")
            valid_samples = results_df.dropna(subset=['bleu1'])
            
            if len(valid_samples) > 0:
                # Best performing sample (highest BLEU-1)
                best_idx = valid_samples['bleu1'].idxmax()
                best_sample = results_df.loc[best_idx]
                print(f"\nBest performing sample (highest BLEU-1: {best_sample['bleu1']:.4f}):")
                print(f"Question: {best_sample['question'][:100]}...")
                print(f"Reference: {best_sample['reference'][:100]}...")
                print(f"Generated: {best_sample['generated'][:100]}...")
                
                # Worst performing sample (lowest BLEU-1)
                worst_idx = valid_samples['bleu1'].idxmin()
                worst_sample = results_df.loc[worst_idx]
                print(f"\nWorst performing sample (lowest BLEU-1: {worst_sample['bleu1']:.4f}):")
                print(f"Question: {worst_sample['question'][:100]}...")
                print(f"Reference: {worst_sample['reference'][:100]}...")
                print(f"Generated: {worst_sample['generated'][:100]}...")
        
        # Summary statistics
        print(f"\n=== SUMMARY STATISTICS ===")
        samples_with_metrics = results_df[available_metrics[0]].notna().sum()
        samples_with_reference = results_df['reference'].notna().sum() if 'reference' in results_df.columns else 0
        avg_response_length = results_df['generated'].str.len().mean()
        
        print(f"Samples with calculated metrics: {samples_with_metrics}")
        print(f"Samples with reference text: {samples_with_reference}")
        print(f"Average response length: {avg_response_length:.1f} characters")
        
    else:
        print("\nNo metrics available. Please run the metrics calculation first.")
    
    print("=" * 60)
    
except FileNotFoundError:
    print("Error: No evaluation results found.")
    print("Please run the evaluation and metrics calculation first.")
except Exception as e:
    print(f"Error analyzing metrics: {e}")

Loaded results with calculated metrics

EVALUATION METRICS ANALYSIS
Total samples: 2992
Model: meta/meta-llama-3-8b-instruct

=== AVERAGE METRICS (5 metrics available) ===
    BLEU-1: 0.1739 (±0.0926) | Range: [0.0000, 0.4878] | 2992 samples
    BLEU-4: 0.0127 (±0.0147) | Range: [0.0000, 0.1400] | 2992 samples
   ROUGE-1: 0.2419 (±0.0896) | Range: [0.0000, 0.5686] | 2992 samples
   ROUGE-2: 0.0379 (±0.0270) | Range: [0.0000, 0.2289] | 2992 samples
   ROUGE-L: 0.1219 (±0.0351) | Range: [0.0000, 0.2793] | 2992 samples

=== SCORE DISTRIBUTION (QUARTILES) ===
    BLEU-1: Q25=0.1163 | Median=0.1701 | Q75=0.2301
    BLEU-4: Q25=0.0040 | Median=0.0082 | Q75=0.0162
   ROUGE-1: Q25=0.1831 | Median=0.2435 | Q75=0.3025
   ROUGE-2: Q25=0.0202 | Median=0.0340 | Q75=0.0499
   ROUGE-L: Q25=0.1017 | Median=0.1243 | Q75=0.1420

=== SAMPLE ANALYSIS ===

Best performing sample (highest BLEU-1: 0.4878):
Question: Hello doctor,I am a 65-year-old woman concerned about my overall respiratory health, particul

In [None]:
# Export metrics summary to CSV
try:
    # Load results with metrics
    try:
        results_df = pd.read_csv('llama3_8b_full_evaluation_with_metrics.csv')
    except FileNotFoundError:
        results_df = pd.read_csv('llama3_8b_full_evaluation.csv')
    
    # Create comprehensive metrics summary
    metric_columns = ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']
    summary_data = []
    
    print("Creating metrics summary report...")
    
    for metric in metric_columns:
        if metric in results_df.columns:
            valid_scores = results_df[metric].dropna()
            if len(valid_scores) > 0:
                summary_data.append({
                    'metric': metric.upper(),
                    'mean': valid_scores.mean(),
                    'std': valid_scores.std(),
                    'min': valid_scores.min(),
                    'q25': valid_scores.quantile(0.25),
                    'median': valid_scores.quantile(0.50),
                    'q75': valid_scores.quantile(0.75),
                    'max': valid_scores.max(),
                    'sample_count': len(valid_scores),
                    'model': model_name,
                    'total_samples': len(results_df)
                })
    
    if summary_data:
        # Create and save summary DataFrame
        summary_df = pd.DataFrame(summary_data)
        summary_df.to_csv('llama3_8b_metrics_summary.csv', index=False)
        
        print("Metrics summary exported to 'llama3_8b_metrics_summary.csv'")
        print("\nMETRICS SUMMARY TABLE:")
        print("-" * 80)
        
        # Display formatted summary
        for _, row in summary_df.iterrows():
            print(f"{row['metric']:>8}: {row['mean']:.4f} ± {row['std']:.4f} "
                  f"(range: {row['min']:.4f}-{row['max']:.4f}) | {row['sample_count']} samples")
        
        # Calculate overall performance score (weighted average)
        if len(summary_df) >= 3:  # If we have at least 3 metrics
            weights = {'BLEU1': 0.2, 'BLEU4': 0.2, 'ROUGE1': 0.2, 'ROUGE2': 0.2, 'ROUGEL': 0.2}
            total_score = 0
            total_weight = 0
            
            for _, row in summary_df.iterrows():
                metric_name = row['metric']
                if metric_name in weights:
                    total_score += row['mean'] * weights[metric_name]
                    total_weight += weights[metric_name]
            
            if total_weight > 0:
                overall_score = total_score / total_weight
                print(f"\nOVERALL PERFORMANCE SCORE: {overall_score:.4f}")
                
                # Performance interpretation
                if overall_score >= 0.4:
                    print("INTERPRETATION: EXCELLENT performance")
                elif overall_score >= 0.3:
                    print("INTERPRETATION: GOOD performance") 
                elif overall_score >= 0.2:
                    print("INTERPRETATION: MODERATE performance")
                else:
                    print("INTERPRETATION: NEEDS IMPROVEMENT")
        
        print(f"\nFiles created:")
        print(f"- llama3_8b_full_evaluation_with_metrics.csv (detailed results)")
        print(f"- llama3_8b_metrics_summary.csv (summary statistics)")
        
    else:
        print("No metrics data available to export")
        
except Exception as e:
    print(f"Error creating metrics summary: {e}")

In [None]:
# Final Metrics Summary - Calculate 5 metrics for all samples and show averages
try:
    # Load the final results
    results_df = pd.read_csv('llama3_8b_full_evaluation_with_metrics.csv')
    
    print("=" * 70)
    print("FINAL EVALUATION SUMMARY - LLAMA 3 8B INSTRUCT")
    print("=" * 70)
    print(f"Model: {model_name}")
    print(f"Total samples processed: {len(results_df)}")
    
    # Calculate metrics for all samples
    metric_columns = ['bleu1', 'bleu4', 'rouge1', 'rouge2', 'rougeL']
    metric_names = ['BLEU-1', 'BLEU-4', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L']
    
    print(f"\n{'METRIC':<12} {'AVERAGE':<10} {'SAMPLES':<8} {'MIN':<8} {'MAX':<8}")
    print("-" * 60)
    
    final_metrics = {}
    for metric, name in zip(metric_columns, metric_names):
        if metric in results_df.columns:
            valid_scores = results_df[metric].dropna()
            if len(valid_scores) > 0:
                avg = valid_scores.mean()
                min_val = valid_scores.min()
                max_val = valid_scores.max()
                count = len(valid_scores)
                
                final_metrics[name] = avg
                print(f"{name:<12} {avg:<10.4f} {count:<8} {min_val:<8.4f} {max_val:<8.4f}")
    
    print("\n" + "=" * 70)
    print("EVALUATION COMPLETED SUCCESSFULLY!")
    print("Files saved:")
    print("- llama3_8b_full_evaluation.csv (raw results)")
    print("- llama3_8b_full_evaluation_with_metrics.csv (with metrics)")
    print("- llama3_8b_metrics_summary.csv (summary report)")
    print("=" * 70)
    
except FileNotFoundError:
    print("Error: Evaluation files not found. Please run the complete evaluation pipeline first.")
except Exception as e:
    print(f"Error in final summary: {e}")