# Evaluation Test Notebook

This notebook tests all evaluation functions and workflows.

## How to Run

1. Press "Run All" to execute all tests
2. Check that every cell finishes without an `AssertionError`; together the cells verify metrics computation, statistical tests, and robustness analysis


In [1]:
# Setup
import sys
import os
import pandas as pd
import numpy as np
from pathlib import Path

# Notebook is located at notebook/test.ipynb, so the project root is normally
# the parent of the working directory. Searching upwards for src/eval keeps the
# notebook usable when the kernel is started elsewhere (e.g. in the project root).
def find_project_root(start):
    """Return the nearest directory at or above ``start`` that contains ``src/eval``.

    Args:
        start: Directory (a ``pathlib.Path``) from which the upward search begins.

    Returns:
        The first directory among ``start`` and its ancestors holding ``src/eval``;
        ``start.parent`` (the original behaviour) when no such directory is found.
    """
    for candidate in (start, *start.parents):
        if (candidate / 'src' / 'eval').is_dir():
            return candidate
    return start.parent


project_root = find_project_root(Path.cwd())

# Keep src/ first on sys.path without stacking a duplicate entry each time the
# cell is re-run.
src_dir = str(project_root / 'src')
if src_dir in sys.path:
    sys.path.remove(src_dir)
sys.path.insert(0, src_dir)

from eval import (
    compute_metrics_from_files,
    load_run_file,
    load_qrels_file,
    compute_all_metrics,
    compare_runs,
    compute_query_slices,
    load_vocabulary
)
from eval.utils import (
    ensure_directory,
    create_dummy_run_file,
    create_dummy_qrels_file,
    create_dummy_vocab_file
)

print("Setup complete!")


Setup complete!


## Test 1: Create Test Data


In [2]:
# All artefacts produced by this notebook live under <project_root>/data/test.
test_dir = project_root / 'data' / 'test'
ensure_directory(test_dir)

# Locations of the three synthetic input files reused by the later test cells.
test_run_path, test_qrels_path, test_vocab_path = (
    test_dir / file_name
    for file_name in ('test_run.csv', 'test_qrels.csv', 'test_vocab.txt')
)

# Generate the dummy run, relevance judgements and vocabulary.
create_dummy_run_file(
    str(test_run_path),
    num_queries=5,
    num_docs_per_query=20,
)
create_dummy_qrels_file(
    str(test_qrels_path),
    num_queries=5,
    num_relevant_per_query=3,
)
create_dummy_vocab_file(
    str(test_vocab_path),
    num_tokens=1000,
)

# Report where each file was written (one line per message).
print(
    "Test data created!",
    f"Run file: {test_run_path}",
    f"Qrels file: {test_qrels_path}",
    f"Vocab file: {test_vocab_path}",
    sep="\n",
)


Test data created!
Run file: c:\Users\Lenovo\CodeProjects\VSC\domain-specific-query-expansion-with-llms\data\test\test_run.csv
Qrels file: c:\Users\Lenovo\CodeProjects\VSC\domain-specific-query-expansion-with-llms\data\test\test_qrels.csv
Vocab file: c:\Users\Lenovo\CodeProjects\VSC\domain-specific-query-expansion-with-llms\data\test\test_vocab.txt


## Test 2: Load Files and Compute Metrics


In [3]:
# Load the run and relevance judgements written by the "Create Test Data" cell
run = load_run_file(str(test_run_path))
qrels = load_qrels_file(str(test_qrels_path))

print(f"Loaded {len(run)} queries in run")
print(f"Loaded {len(qrels)} queries in qrels")

# An empty run or qrels would make every check below meaningless.
assert len(run) > 0, "Run file should contain at least one query"
assert len(qrels) > 0, "Qrels file should contain at least one query"

# Compute all metrics
metrics = compute_all_metrics(run, qrels, k=10)

print("\nMetrics:")
for metric_name, value in metrics.items():
    print(f"  {metric_name}: {value:.4f}")

# all() is True for an empty iterable, so first require at least one metric;
# otherwise the range check could pass without anything having been computed.
assert len(metrics) > 0, "compute_all_metrics should return at least one metric"
# NaN fails this comparison as well, so a NaN metric is reported as a failure.
assert all(0 <= v <= 1 for v in metrics.values()), "Metrics should be in [0, 1]"
print("\nMetrics computation test passed!")


Loaded 5 queries in run
Loaded 5 queries in qrels

Metrics:
  ndcg@10: 1.0000
  map: 1.0000
  recall@100: 1.0000
  mrr: 1.0000

Metrics computation test passed!


## Test 3: Statistical Tests


In [4]:
# Create second run for comparison
test_run2_path = test_dir / 'test_run2.csv'
create_dummy_run_file(str(test_run2_path), num_queries=5, num_docs_per_query=20)

# Compare runs
stats = compare_runs(
    str(test_run_path),
    str(test_run2_path),
    str(test_qrels_path),
    metric='ndcg@10',
    k=10
)

# Fail with a readable message if the result lacks a field reported below.
required_keys = (
    'baseline_mean', 'system_mean', 'mean_difference',
    'p_value', 'ci_lower', 'ci_upper',
)
missing_keys = [key for key in required_keys if key not in stats]
assert not missing_keys, f"compare_runs result is missing keys: {missing_keys}"

print("Statistical Comparison:")
print(f"  Baseline mean: {stats['baseline_mean']:.4f}")
print(f"  System mean:   {stats['system_mean']:.4f}")
print(f"  Difference:    {stats['mean_difference']:.4f}")
print(f"  p-value:       {stats['p_value']:.4f}")
print(f"  95% CI:        [{stats['ci_lower']:.4f}, {stats['ci_upper']:.4f}]")

# NOTE(review): in the recorded output both dummy runs score identically, giving
# a zero difference and a NaN p-value, so NaN is tolerated here. Any non-NaN
# p-value must be a valid probability.
p_value = stats['p_value']
assert np.isnan(p_value) or 0 <= p_value <= 1, "p-value should be NaN or in [0, 1]"

# Written as a negation so that NaN bounds do not trigger a failure.
assert not (stats['ci_lower'] > stats['ci_upper']), "CI lower bound should not exceed upper bound"

print("\nStatistical tests passed!")


Statistical Comparison:
  Baseline mean: 1.0000
  System mean:   1.0000
  Difference:    0.0000
  p-value:       nan
  95% CI:        [0.0000, 0.0000]

Statistical tests passed!


## Test 4: Robustness Slices


In [5]:
# Load vocabulary
vocab = load_vocabulary(str(test_vocab_path), top_n=1000)

# Create queries dict. This notebook generates no query texts, so each query ID
# from `run` (loaded in the "Load Files and Compute Metrics" cell) doubles as
# its own text.
queries = {qid: qid for qid in run.keys()}

# Single definition of the output location, reused for the call, the check and
# the report below.
slices_path = test_dir / 'test_slices.csv'

# Compute slices
slices = compute_query_slices(
    queries,
    str(test_run_path),
    vocab=vocab,
    output_file=str(slices_path)
)

# Back the "passed" message with real checks: one slice per query and the
# requested output file actually written.
assert len(slices) == len(queries), "Should have one slice per query"
assert slices_path.exists(), "Slices file should exist"

# Every slice not labelled 'familiar' is counted as unfamiliar.
familiar = sum(1 for s in slices.values() if s['label'] == 'familiar')
unfamiliar = len(slices) - familiar

print(f"Familiar queries: {familiar}")
print(f"Unfamiliar queries: {unfamiliar}")
print(f"Slices saved to: {slices_path}")

print("\nRobustness slices test passed!")


Familiar queries: 0
Unfamiliar queries: 5
Slices saved to: c:\Users\Lenovo\CodeProjects\VSC\domain-specific-query-expansion-with-llms\data\test\test_slices.csv

Robustness slices test passed!


## Test 5: End-to-End Workflow


In [6]:
# Test complete workflow: load → compute → save
from eval.compute_metrics import compute_and_save_metrics

output_metric_path = test_dir / 'test_metrics.csv'

# Start from a clean slate so the cell can be re-run. A metrics file left over
# from a previous execution would break the one-row check below if
# compute_and_save_metrics appends instead of overwriting (not visible from
# this notebook).
if output_metric_path.exists():
    output_metric_path.unlink()

computed = compute_and_save_metrics(
    str(test_run_path),
    str(test_qrels_path),
    str(output_metric_path),
    dataset='test',
    method='test',
    retrieval='test',
    k=10
)

print("Computed and saved metrics:")
for metric_name, value in computed.items():
    print(f"  {metric_name}: {value:.4f}")

print(f"\nSaved to: {output_metric_path}")

# Verify file exists and holds exactly the row written by this cell
assert output_metric_path.exists(), "Metrics file should exist"
df = pd.read_csv(output_metric_path)
assert len(df) == 1, "Should have one row"
print("\nEnd-to-end workflow test passed!")

print("\n" + "="*50)
print("All tests passed!")
print("="*50)


Computed and saved metrics:
  ndcg@10: 1.0000
  map: 1.0000
  recall@100: 1.0000
  mrr: 1.0000

Saved to: c:\Users\Lenovo\CodeProjects\VSC\domain-specific-query-expansion-with-llms\data\test\test_metrics.csv

End-to-end workflow test passed!

All tests passed!
