In [None]:
import argparse
import os
import sys
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd
import numpy as np
from tqdm import tqdm
from openai import OpenAI

from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    factual_correctness,
    semantic_similarity,
    answer_relevancy
)

In [None]:
def load_dataset(csv_path: str) -> pd.DataFrame:
    """Read the evaluation examples from a CSV file.

    Args:
        csv_path: Location of the CSV file to read.

    Returns:
        The parsed file as a DataFrame; the number of rows is reported on
        stdout before returning.
    """
    examples = pd.read_csv(csv_path)
    print(f"Loaded {len(examples)} examples from {csv_path}")
    return examples

In [None]:
def generate_answers(
    df: pd.DataFrame,
    endpoint: str,
    model_name: str,
    api_key: str,
    answer_column: str,
    max_new_tokens: int = 256,
    temperature: float = 0.0
) -> pd.DataFrame:
    """Ask a chat-completions endpoint to answer every question in ``df``.

    For each row a prompt is assembled from the ``contexts`` and ``question``
    columns and sent, together with a fixed radiologist system prompt, through
    an ``OpenAI`` client pointed at ``endpoint``. The stripped reply is stored
    per row; if a request raises, the error is printed and an empty string is
    stored for that row so the remaining rows are still processed.

    Args:
        df: Frame with ``question`` and ``contexts`` columns.
        endpoint: Base URL handed to the ``OpenAI`` client.
        model_name: Model identifier sent with every request.
        api_key: API key handed to the ``OpenAI`` client.
        answer_column: Name of the column that receives the answers.
        max_new_tokens: Forwarded as ``max_tokens`` on every request.
        temperature: Sampling temperature forwarded on every request.

    Returns:
        The same ``df`` object that was passed in, with ``answer_column``
        added (the frame is modified in place).
    """
    system_prompt = (
        "You are an expert radiologist with over 20 years of clinical experience "
        "in diagnostic imaging and medical image interpretation. "
        "Provide accurate, evidence-based answers using the provided medical context. "
        "Your responses should reflect deep clinical expertise while remaining clear and precise."
    )

    client = OpenAI(
        base_url=endpoint,
        api_key=api_key
    )

    answers = []

    print(f"Generating answers with {model_name}.")
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        question = row['question']
        contexts = row['contexts']

        # A list of passages is joined with blank lines; anything else
        # (a plain string included) is used in its string form.
        if isinstance(contexts, list):
            context_text = "\n\n".join(contexts)
        else:
            context_text = str(contexts)

        prompt = (
            f"Context:\n{context_text}\n\n"
            f"Question: {question}\n\n"
            "Answer:"
        )

        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=[
                    # Each message needs an explicit "content" key.
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=max_new_tokens,
                temperature=temperature
            )
            content = response.choices[0].message.content
            # The message content may be None; treat that as an empty answer.
            answer = (content or "").strip()
        except Exception as e:
            # Best effort: record an empty answer and continue with the next row.
            print(f"Error generating answer for row {idx}: {e}")
            answer = ""

        answers.append(answer)

    df[answer_column] = answers
    return df

In [None]:
def prepare_ragas_dataset(
    df: pd.DataFrame,
    answer_column: str
) -> Dataset:
    """Assemble the columns handed to the evaluator into a ``Dataset``.

    Every ``contexts`` cell is normalised to a list: a string becomes a
    one-element list, a list is passed through untouched, and any other value
    is converted with ``str`` and wrapped in a list.

    Args:
        df: Frame with ``question``, ``contexts`` and ``reference_answer``
            columns plus the column named by ``answer_column``.
        answer_column: Column whose values are exported as ``answer``.

    Returns:
        ``Dataset.from_dict`` applied to the ``question``, ``contexts``,
        ``answer`` and ``ground_truth`` columns.
    """
    def as_context_list(cell):
        # Lists are already in the target shape; everything else gets wrapped.
        if isinstance(cell, list):
            return cell
        return [cell] if isinstance(cell, str) else [str(cell)]

    normalised_contexts = [as_context_list(cell) for cell in df['contexts']]

    return Dataset.from_dict({
        'question': df['question'].tolist(),
        'contexts': normalised_contexts,
        'answer': df[answer_column].tolist(),
        'ground_truth': df['reference_answer'].tolist()
    })

In [None]:
def evaluate_model(
    df: pd.DataFrame,
    answer_column: str,
    model_name: str
) -> Dict[str, List[float]]:
    """Score one set of answers with the three configured metrics.

    Args:
        df: Frame holding the questions, contexts, references and answers.
        answer_column: Column of ``df`` containing the answers to score.
        model_name: Label used only in the progress message.

    Returns:
        A dict keyed by ``factual_correctness``, ``semantic_similarity`` and
        ``answer_relevancy``, each holding whatever the evaluation result
        yields for that key (presumably per-example scores; confirm against
        the installed ragas version).
    """
    print(f"Evaluating {model_name}.")

    ragas_input = prepare_ragas_dataset(df, answer_column)
    selected_metrics = [factual_correctness, semantic_similarity, answer_relevancy]
    results = evaluate(dataset=ragas_input, metrics=selected_metrics)

    # Pull out only the three metrics of interest, in a fixed order.
    metric_keys = ('factual_correctness', 'semantic_similarity', 'answer_relevancy')
    return {key: results[key] for key in metric_keys}

In [None]:
def compute_statistics(scores: List[float]) -> Dict[str, float]:
    """Summarise a sequence of scores.

    Args:
        scores: The values to summarise.

    Returns:
        A dict with the NumPy mean under ``'mean'`` and the NumPy standard
        deviation (default settings) under ``'std'``.
    """
    values = np.array(scores)
    summary = {}
    summary['mean'] = values.mean()
    summary['std'] = values.std()
    return summary

In [None]:
def create_comparison_df(
    df: pd.DataFrame,
    baseline_scores: Dict[str, List[float]],
    finetuned_scores: Dict[str, List[float]]
) -> pd.DataFrame:
    """Lay out both models' scores side by side, one row per example.

    Args:
        df: Source frame; supplies ``question`` and, when present, ``id``
            (otherwise ``range(len(df))`` is used as the id).
        baseline_scores: Metric name -> scores of the baseline model.
        finetuned_scores: Metric name -> scores of the fine-tuned model; must
            contain every key of ``baseline_scores``.

    Returns:
        A frame with ``id``, ``question`` and, for each metric,
        ``<metric>_baseline``, ``<metric>_finetuned`` and ``<metric>_diff``
        (fine-tuned minus baseline).
    """
    columns = {
        'id': df.get('id', range(len(df))),
        'question': df['question'],
    }

    # Three columns per metric, in the order the baseline dict lists them.
    for metric, base_values in baseline_scores.items():
        tuned_values = finetuned_scores[metric]
        columns[f'{metric}_baseline'] = base_values
        columns[f'{metric}_finetuned'] = tuned_values
        columns[f'{metric}_diff'] = [
            tuned - base for base, tuned in zip(base_values, tuned_values)
        ]

    return pd.DataFrame(columns)


def generate_summary_markdown(
    baseline_scores: Dict[str, List[float]],
    finetuned_scores: Dict[str, List[float]],
    output_path: str
):
    """Write a Markdown report comparing baseline and fine-tuned scores.

    The report has a comparison table (mean ± std per model, difference of
    means, winner) followed by a per-metric "Detailed Statistics" section.

    Args:
        baseline_scores: Metric name -> scores of the baseline model.
        finetuned_scores: Metric name -> scores of the fine-tuned model; must
            contain every key of ``baseline_scores``.
        output_path: File the report is written to (UTF-8).
    """
    table_rows = []
    detail_sections = []

    # Statistics are computed once per metric and reused by both sections.
    for metric in baseline_scores.keys():
        bl_stats = compute_statistics(baseline_scores[metric])
        ft_stats = compute_statistics(finetuned_scores[metric])
        diff = ft_stats['mean'] - bl_stats['mean']
        winner = "Fine-tuned" if diff > 0 else "Baseline" if diff < 0 else "Tie"

        # A relative change is undefined for a zero baseline; report "n/a"
        # instead of dividing by zero.
        if bl_stats['mean'] == 0:
            relative = "n/a"
        else:
            relative = f"{(diff / bl_stats['mean'] * 100):.2f}%"

        table_rows.append(
            f"| {metric} | {bl_stats['mean']:.4f} ± {bl_stats['std']:.4f} | "
            f"{ft_stats['mean']:.4f} ± {ft_stats['std']:.4f} | "
            f"{diff:+.4f} | {winner} |\n"
        )
        detail_sections.append(
            f"### {metric}\n\n"
            f"- **Baseline**: {bl_stats['mean']:.4f} ± {bl_stats['std']:.4f}\n"
            f"- **Fine-tuned**: {ft_stats['mean']:.4f} ± {ft_stats['std']:.4f}\n"
            f"- **Improvement**: {diff:.4f} ({relative})\n\n"
        )

    summary = "# RAGAS evaluation results\n\n"
    summary += "## Model comparison\n\n"
    summary += "| Metric | Baseline (Mean ± Std) | Fine-tuned (Mean ± Std) | Difference | Winner |\n"
    summary += "|--------|-----------------------|-------------------------|------------|--------|\n"
    summary += "".join(table_rows)
    summary += "\n## Detailed Statistics\n\n"
    summary += "".join(detail_sections)

    # Explicit UTF-8: the report contains "±", which not every locale
    # default encoding can represent.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(summary)

    print(f"Summary saved to {output_path}")

In [None]:
def plot_comparison(
    baseline_scores: Dict[str, List[float]],
    finetuned_scores: Dict[str, List[float]],
    output_path: str
):
    """Save a grouped bar chart of mean scores per metric for both models.

    matplotlib is imported lazily; if an ImportError is raised anywhere in the
    plotting code the chart is skipped and a notice is printed instead.

    Args:
        baseline_scores: Metric name -> scores of the baseline model.
        finetuned_scores: Metric name -> scores of the fine-tuned model.
        output_path: Destination of the saved figure.
    """
    try:
        import matplotlib.pyplot as plt

        metric_names = list(baseline_scores.keys())

        # Baseline means first, then fine-tuned means, one per metric.
        base_heights = []
        for name in metric_names:
            base_heights.append(compute_statistics(baseline_scores[name])['mean'])
        tuned_heights = []
        for name in metric_names:
            tuned_heights.append(compute_statistics(finetuned_scores[name])['mean'])

        positions = np.arange(len(metric_names))
        bar_width = 0.35

        fig, ax = plt.subplots(figsize=(10, 6))
        # The two series sit half a bar width left and right of each tick.
        ax.bar(positions - bar_width/2, base_heights, bar_width,
               label='Baseline', alpha=0.8)
        ax.bar(positions + bar_width/2, tuned_heights, bar_width,
               label='Fine-tuned', alpha=0.8)

        ax.set_xlabel('Metrics')
        ax.set_ylabel('Score')
        ax.set_title('RAGAS Metrics: Baseline vs Fine-tuned')
        ax.set_xticks(positions)
        ax.set_xticklabels(metric_names, rotation=15, ha='right')
        ax.legend()
        ax.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"Plot saved to {output_path}")
    except ImportError:
        print("Skipping plot generation.")

In [None]:
def _build_arg_parser():
    """Create the command-line parser for the evaluation script."""
    parser = argparse.ArgumentParser(
        description="Evaluate baseline and fine-tuned models using RAGAS metrics"
    )

    parser.add_argument('--data_csv', type=str, required=True,
                        help='Path to input CSV file')
    parser.add_argument('--mode', type=str, choices=['inference', 'precomputed'],
                        required=True, help='Evaluation mode')

    # inference mode arguments.
    parser.add_argument('--baseline_endpoint', type=str,
                        help='Baseline model API endpoint')
    parser.add_argument('--finetuned_endpoint', type=str,
                        help='Fine-tuned model API endpoint')
    parser.add_argument('--baseline_model', type=str, default='Llama-3.1-8B-Instruct',
                        help='Baseline model name')
    parser.add_argument('--finetuned_model', type=str,
                        help='Fine-tuned model name')
    parser.add_argument('--api_key', type=str,
                        help='API keys in format BASELINE_KEY:FINETUNED_KEY')

    # generation parameters.
    parser.add_argument('--max_new_tokens', type=int, default=256,
                        help='Maximum new tokens to generate')
    parser.add_argument('--temperature', type=float, default=0.0,
                        help='Sampling temperature')

    # output arguments.
    parser.add_argument('--output_dir', type=str, default='./eval_outputs',
                        help='Output directory for results')

    return parser


def main():
    """Command-line entry point: obtain answers, score them, write the reports.

    In ``inference`` mode answers are generated from both endpoints and the
    augmented dataset is saved to ``dataset_with_answers.csv``. In
    ``precomputed`` mode the CSV must already contain ``answer_baseline`` and
    ``answer_finetuned``. Both answer sets are then evaluated and a detailed
    CSV, a Markdown summary and a comparison plot are written to the output
    directory, followed by a console summary.

    Exits with status 1 when required arguments or columns are missing.
    """
    args = _build_arg_parser().parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    df = load_dataset(args.data_csv)

    if args.mode == 'inference':
        # --finetuned_model has no default, so it must be supplied too;
        # otherwise every request would be sent with model=None.
        if not all([args.baseline_endpoint, args.finetuned_endpoint,
                    args.finetuned_model, args.api_key]):
            print("Error: inference mode requires baseline_endpoint, "
                  "finetuned_endpoint, finetuned_model and api_key.")
            sys.exit(1)

        api_keys = args.api_key.split(':')
        if len(api_keys) != 2:
            print("Error: api_key must be in format BASELINE_KEY:FINETUNED_KEY.")
            sys.exit(1)

        baseline_key, finetuned_key = api_keys

        # baseline answers.
        df = generate_answers(
            df, args.baseline_endpoint, args.baseline_model,
            baseline_key, 'answer_baseline',
            args.max_new_tokens, args.temperature
        )

        # fine-tuned answers.
        df = generate_answers(
            df, args.finetuned_endpoint, args.finetuned_model,
            finetuned_key, 'answer_finetuned',
            args.max_new_tokens, args.temperature
        )

        output_csv = output_dir / 'dataset_with_answers.csv'
        df.to_csv(output_csv, index=False)
        print(f"Dataset with answers saved to {output_csv}")

    else:
        if 'answer_baseline' not in df.columns or 'answer_finetuned' not in df.columns:
            print("Error: precomputed mode requires answer_baseline and answer_finetuned columns in csv.")
            sys.exit(1)

    # evaluating the baseline model.
    baseline_scores = evaluate_model(df, 'answer_baseline', 'Baseline')

    # evaluating the fine-tuned model.
    finetuned_scores = evaluate_model(df, 'answer_finetuned', 'Fine-tuned')

    comparison_df = create_comparison_df(df, baseline_scores, finetuned_scores)
    comparison_csv = output_dir / 'detailed_comparison.csv'
    comparison_df.to_csv(comparison_csv, index=False)
    print(f"\n Detailed comparison saved to {comparison_csv}")

    summary_md = output_dir / 'summary.md'
    generate_summary_markdown(baseline_scores, finetuned_scores, summary_md)

    plot_path = output_dir / 'comparison_plot.png'
    plot_comparison(baseline_scores, finetuned_scores, plot_path)

    print("\n" + "-"*60)
    print("Evaluation Summary.")
    print("-"*60)

    for metric in baseline_scores.keys():
        bl_stats = compute_statistics(baseline_scores[metric])
        ft_stats = compute_statistics(finetuned_scores[metric])
        diff = ft_stats['mean'] - bl_stats['mean']

        # A relative change is undefined for a zero baseline; report "n/a"
        # instead of dividing by zero.
        if bl_stats['mean'] == 0:
            relative = "n/a"
        else:
            relative = f"{(diff/bl_stats['mean']*100):+.2f}%"

        print(f"\n{metric}:")
        print(f"  Baseline:    {bl_stats['mean']:.4f} ± {bl_stats['std']:.4f}")
        print(f"  Fine-tuned:  {ft_stats['mean']:.4f} ± {ft_stats['std']:.4f}")
        print(f"  Difference:  {diff:+.4f} ({relative})")

    print("\n" + "-"*60)
    print(f"Evaluation complete. Results saved to {output_dir}")


# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

# How to use it.

## Answers already in the CSV (precomputed mode; requires `answer_baseline` and `answer_finetuned` columns).
python evaluate_with_ragas.py \
  --data_csv ragdology.csv \
  --mode precomputed \
  --output_dir ./eval_outputs

## Generate answers from the model endpoints.
python evaluate_with_ragas.py \
  --data_csv ragdology.csv \
  --mode inference \
  --baseline_endpoint http://localhost:8000/v1 \
  --finetuned_endpoint http://localhost:9000/v1 \
  --baseline_model Llama-3.1-8B-Instruct \
  --finetuned_model my-llama-3.1-8b-instruct-finetuned \
  --api_key BASEKEY:FINEKEY \
  --output_dir ./eval_outputs

## Inference mode with custom generation parameters.
python evaluate_with_ragas.py \
  --data_csv ragdology.csv \
  --mode inference \
  --baseline_endpoint http://localhost:8000/v1 \
  --finetuned_endpoint http://localhost:9000/v1 \
  --baseline_model Llama-3.1-8B-Instruct \
  --finetuned_model my-llama-3.1-8b-instruct-finetuned \
  --api_key BASEKEY:FINEKEY \
  --max_new_tokens 512 \
  --temperature 0.1 \
  --output_dir ./eval_outputs