In [None]:
import os
import sys

# If benchmark_runner.py, scoring.py, and plot_results.py are in the
# SAME FOLDER as this notebook, no sys.path modification is needed:
# Python will find them automatically.
# If they live in a subdirectory (e.g. 'scripts'), add that directory to
# sys.path before the imports below.

from benchmark_runner import run_benchmark_on_dataset
from plot_results import plot_benchmark_results

# Dataset directories to benchmark. Further directories (for example
# "./data/abt_buy") can be appended to this list.
datasets = ["./data/dblp_scholar"]

# Model categories evaluated for every dataset, in list order.
# Currently disabled: "speed_critical" and "large_models" -- add them to the
# list to include them in a run.
model_categories = [
    "maximum_quality",
    "research_experimental",
    "production_balanced",
    "memory_constrained",
    "instruction_following",
    "fast_embedding",
]

# --- Run Benchmarks ---
# Enumerate every (dataset, category) combination up front -- datasets in the
# outer position, categories in the inner one -- then benchmark them in turn.
print("--- Starting all benchmarks ---")
benchmark_jobs = [
    (job_dataset, job_category)
    for job_dataset in datasets
    for job_category in model_categories
]
for dataset_path, category in benchmark_jobs:
    run_benchmark_on_dataset(dataset_path, category)
print("--- All benchmarks completed ---")

# --- Generate Plots ---
# Build one PDF report per (dataset, category) pair from the CSV files expected
# under ./results. Pairs whose metrics file is missing are skipped, and the
# closing message reports how many reports were generated versus skipped
# rather than always claiming that every plot was produced.
print("\n--- Starting plot generation ---")
generated_count = 0
skipped_count = 0
for dataset_path in datasets:
    # NOTE(review): os.path.basename returns "" for a path that ends in a
    # separator, so dataset paths are expected without a trailing slash.
    dataset_base_name = os.path.basename(dataset_path)
    for category in model_categories:
        metrics_file_path = f"./results/{dataset_base_name}_{category}_alethia_accuracy.csv"
        # Forwarded to plot_benchmark_results as its second argument; its
        # existence is not checked here.
        # NOTE(review): presumably read for the prediction-based plots --
        # confirm against plot_results.py.
        predictions_file_path = f"./results/{dataset_base_name}_{category}_alethia_results.csv"
        plots_output_directory = f"./plots/{dataset_base_name}_{category}_alethia_benchmark_plots"
        pdf_report_filename = f"{dataset_base_name}_{category}_alethia_benchmark_report.pdf"

        # Only plot combinations for which a metrics file is present.
        if os.path.exists(metrics_file_path):
            plot_benchmark_results(metrics_file_path, predictions_file_path, plots_output_directory, pdf_report_filename)
            generated_count += 1
        else:
            print(f"Skipping plotting for {metrics_file_path}: Metrics file not found. Benchmark might have skipped for this combination.")
            skipped_count += 1

# Keep the original closing message when nothing was skipped; otherwise say
# exactly what happened so a partial run is not mistaken for a complete one.
if skipped_count == 0:
    print("--- All plots generated ---")
else:
    print(f"--- Plot generation finished: {generated_count} generated, {skipped_count} skipped ---")

--- Starting all benchmarks ---

--- Starting benchmark for Dataset: dblp_scholar, Model Category: maximum_quality ---
Loading data from: ./data/dblp_scholar
Data loaded: 3207 queries, 3207 references, 64263 candidates.

🔍 Evaluating baseline fuzzy matcher (RapidFuzz)...
--- DEBUG: rapidfuzz (Data Alignment) ---
Total references: 3207, References found in candidate_pool: 3207
RapidFuzz evaluation completed.

🚀 Evaluating embedding models...

⚙️ Evaluating model: Salesforce/SFR-Embedding-2_R
--- DEBUG: Salesforce/SFR-Embedding-2_R (Data Alignment) ---
Total references: 3207, References found in candidate_pool: 3207
DEBUG: Attempting to load model: Salesforce/SFR-Embedding-2_R


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.22s/it]


Successfully loaded LLM: Salesforce/SFR-Embedding-2_R
Evaluation for Salesforce/SFR-Embedding-2_R completed.

⚙️ Evaluating model: Salesforce/SFR-Embedding-Mistral
--- DEBUG: Salesforce/SFR-Embedding-Mistral (Data Alignment) ---
Total references: 3207, References found in candidate_pool: 3207
DEBUG: Attempting to load model: Salesforce/SFR-Embedding-Mistral


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.27s/it]


Successfully loaded LLM: Salesforce/SFR-Embedding-Mistral
Evaluation for Salesforce/SFR-Embedding-Mistral completed.

⚙️ Evaluating model: Alibaba-NLP/gte-Qwen2-7B-instruct
--- DEBUG: Alibaba-NLP/gte-Qwen2-7B-instruct (Data Alignment) ---
Total references: 3207, References found in candidate_pool: 3207
DEBUG: Attempting to load model: Alibaba-NLP/gte-Qwen2-7B-instruct


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:05<00:00,  1.21it/s]


Successfully loaded LLM: Alibaba-NLP/gte-Qwen2-7B-instruct
Evaluation for Alibaba-NLP/gte-Qwen2-7B-instruct completed.

⚙️ Evaluating model: Qwen/Qwen2-7B-Instruct
--- DEBUG: Qwen/Qwen2-7B-Instruct (Data Alignment) ---
Total references: 3207, References found in candidate_pool: 3207
DEBUG: Attempting to load model: Qwen/Qwen2-7B-Instruct


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.35it/s]


Successfully loaded LLM: Qwen/Qwen2-7B-Instruct
Evaluation for Qwen/Qwen2-7B-Instruct completed.

⚙️ Evaluating model: intfloat/e5-mistral-7b-instruct
--- DEBUG: intfloat/e5-mistral-7b-instruct (Data Alignment) ---
Total references: 3207, References found in candidate_pool: 3207
DEBUG: Attempting to load model: intfloat/e5-mistral-7b-instruct


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.44s/it]


Successfully loaded LLM: intfloat/e5-mistral-7b-instruct
Evaluation for intfloat/e5-mistral-7b-instruct completed.

⚙️ Evaluating model: microsoft/Phi-4-mini-instruct
--- DEBUG: microsoft/Phi-4-mini-instruct (Data Alignment) ---
Total references: 3207, References found in candidate_pool: 3207
DEBUG: Attempting to load model: microsoft/Phi-4-mini-instruct


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.56it/s]


Successfully loaded SentenceTransformer model: microsoft/Phi-4-mini-instruct


Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 101/101 [00:49<00:00,  2.04it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 2009/2009 [17:16<00:00,  1.94it/s]


Evaluation for microsoft/Phi-4-mini-instruct completed.

📄 All predictions saved to: ./results/dblp_scholar_maximum_quality_alethia_results.csv
📊 All metrics saved to: ./results/dblp_scholar_maximum_quality_alethia_accuracy.csv

✅ Benchmark completed for dblp_scholar and maximum_quality.
--- Benchmark for dblp_scholar finished ---

--- Starting benchmark for Dataset: dblp_scholar, Model Category: research_experimental ---
Loading data from: ./data/dblp_scholar
Data loaded: 3207 queries, 3207 references, 64263 candidates.

🔍 Evaluating baseline fuzzy matcher (RapidFuzz)...
--- DEBUG: rapidfuzz (Data Alignment) ---
Total references: 3207, References found in candidate_pool: 3207
RapidFuzz evaluation completed.

🚀 Evaluating embedding models...

⚙️ Evaluating model: GritLM/GritLM-7B
--- DEBUG: GritLM/GritLM-7B (Data Alignment) ---
Total references: 3207, References found in candidate_pool: 3207
DEBUG: Attempting to load model: GritLM/GritLM-7B


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:03<00:00,  1.10s/it]


Successfully loaded LLM: GritLM/GritLM-7B
Evaluation for GritLM/GritLM-7B completed.

📄 All predictions saved to: ./results/dblp_scholar_research_experimental_alethia_results.csv
📊 All metrics saved to: ./results/dblp_scholar_research_experimental_alethia_accuracy.csv

✅ Benchmark completed for dblp_scholar and research_experimental.
--- Benchmark for dblp_scholar finished ---

--- Starting benchmark for Dataset: dblp_scholar, Model Category: production_balanced ---
Loading data from: ./data/dblp_scholar
Data loaded: 3207 queries, 3207 references, 64263 candidates.

🔍 Evaluating baseline fuzzy matcher (RapidFuzz)...
--- DEBUG: rapidfuzz (Data Alignment) ---
Total references: 3207, References found in candidate_pool: 3207
RapidFuzz evaluation completed.

🚀 Evaluating embedding models...

⚙️ Evaluating model: mixedbread-ai/mxbai-embed-large-v1
--- DEBUG: mixedbread-ai/mxbai-embed-large-v1 (Data Alignment) ---
Total references: 3207, References found in candidate_pool: 3207
DEBUG: Attempt

Batches: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 101/101 [00:05<00:00, 17.62it/s]
Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 2009/2009 [02:01<00:00, 16.54it/s]


Evaluation for mixedbread-ai/mxbai-embed-large-v1 completed.

⚙️ Evaluating model: Alibaba-NLP/gte-Qwen2-1.5B-instruct
--- DEBUG: Alibaba-NLP/gte-Qwen2-1.5B-instruct (Data Alignment) ---
Total references: 3207, References found in candidate_pool: 3207
DEBUG: Attempting to load model: Alibaba-NLP/gte-Qwen2-1.5B-instruct


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.69s/it]


Successfully loaded LLM: Alibaba-NLP/gte-Qwen2-1.5B-instruct
Evaluation for Alibaba-NLP/gte-Qwen2-1.5B-instruct completed.

⚙️ Evaluating model: Linq-AI-Research/Linq-Embed-Mistral
--- DEBUG: Linq-AI-Research/Linq-Embed-Mistral (Data Alignment) ---
Total references: 3207, References found in candidate_pool: 3207
DEBUG: Attempting to load model: Linq-AI-Research/Linq-Embed-Mistral


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.25it/s]


Successfully loaded SentenceTransformer model: Linq-AI-Research/Linq-Embed-Mistral


Batches:  64%|██████████████████████████████████████████████████████████████████████▏                                      | 65/101 [01:30<00:41,  1.14s/it]

In [1]:
# --- Generate Plots ---

import os
import sys

# If benchmark_runner.py, scoring.py, and plot_results.py are in the
# SAME FOLDER as this notebook, no sys.path modification is needed:
# Python will find them automatically.
# If they live in a subdirectory (e.g. 'scripts'), add that directory to
# sys.path before the imports below.

from benchmark_runner import run_benchmark_on_dataset
from plot_results import plot_benchmark_results

# Dataset directories to plot. Further directories (for example
# "./data/abt_buy") can be appended to this list.
datasets = ["./data/dblp_scholar"]

# Model categories to plot for every dataset, in list order.
# Currently disabled: "speed_critical" and "large_models" -- add them to the
# list to include them.
model_categories = [
    "maximum_quality",
    "research_experimental",
]

# Build one PDF report per (dataset, category) pair from the CSV files expected
# under ./results. Pairs whose metrics file is missing are skipped, and the
# closing message reports how many reports were generated versus skipped
# rather than always claiming that every plot was produced.
print("\n--- Starting plot generation ---")
generated_count = 0
skipped_count = 0
for dataset_path in datasets:
    # NOTE(review): os.path.basename returns "" for a path that ends in a
    # separator, so dataset paths are expected without a trailing slash.
    dataset_base_name = os.path.basename(dataset_path)
    for category in model_categories:
        metrics_file_path = f"./results/{dataset_base_name}_{category}_alethia_accuracy.csv"
        # Forwarded to plot_benchmark_results as its second argument; its
        # existence is not checked here.
        # NOTE(review): presumably read for the prediction-based plots --
        # confirm against plot_results.py.
        predictions_file_path = f"./results/{dataset_base_name}_{category}_alethia_results.csv"
        plots_output_directory = f"./plots/{dataset_base_name}_{category}_alethia_benchmark_plots"
        pdf_report_filename = f"{dataset_base_name}_{category}_alethia_benchmark_report.pdf"

        # Only plot combinations for which a metrics file is present.
        if os.path.exists(metrics_file_path):
            plot_benchmark_results(metrics_file_path, predictions_file_path, plots_output_directory, pdf_report_filename)
            generated_count += 1
        else:
            print(f"Skipping plotting for {metrics_file_path}: Metrics file not found. Benchmark might have skipped for this combination.")
            skipped_count += 1

# Keep the original closing message when nothing was skipped; otherwise say
# exactly what happened so a partial run is not mistaken for a complete one.
if skipped_count == 0:
    print("--- All plots generated ---")
else:
    print(f"--- Plot generation finished: {generated_count} generated, {skipped_count} skipped ---")

  from .autonotebook import tqdm as notebook_tqdm



--- Starting plot generation ---
Loading metrics from: ./results/dblp_scholar_maximum_quality_alethia_accuracy.csv
Loading predictions from: ./results/dblp_scholar_maximum_quality_alethia_results.csv
Added Top-1 Accuracy plot to PDF.
Added Mean Reciprocal Rank (MRR) plot to PDF.
Added Average Inference Time plot to PDF.
Added Memory Usage plot to PDF.
Added Accuracy vs. Inference Time plot to PDF.
Skipped Match Score Distribution plot as requested.
Added Correct vs Incorrect Predictions plot to PDF.

All requested plots generated and saved to './plots/dblp_scholar_maximum_quality_alethia_benchmark_plots/dblp_scholar_maximum_quality_alethia_benchmark_report.pdf'.
Loading metrics from: ./results/dblp_scholar_research_experimental_alethia_accuracy.csv
Loading predictions from: ./results/dblp_scholar_research_experimental_alethia_results.csv
Added Top-1 Accuracy plot to PDF.
Added Mean Reciprocal Rank (MRR) plot to PDF.
Added Average Inference Time plot to PDF.
Added Memory Usage plot to 