In [None]:
# EVALUATION NOTEBOOK
import pandas as pd
from src.models.recommender import SHLRecommender
from src.evaluation.metrics import recall_at_k, mean_average_precision

# Benchmark queries, each paired with the assessments known to be relevant
# for it. Every entry is a dict with a "query" string and a "relevant" list.
benchmarks = [
    {
        "query": (
            "Hiring for Java developers who can collaborate with business teams, "
            "need 40-min assessment"
        ),
        "relevant": [
            "Java Programming Test",
            "Inductive Reasoning",
            "Situational Judgement Test",
        ],
    },
    # TODO: add more benchmark queries
]

# Rank cut-off shared by both metrics. The result column names and the printed
# labels are derived from it so they cannot drift apart from the `k` actually
# passed to the metric functions.
TOP_K = 3
RECALL_COL = f"recall@{TOP_K}"
MAP_COL = f"map@{TOP_K}"

recommender = SHLRecommender()
results = []

# Score each benchmark query against its list of relevant assessments.
for benchmark in benchmarks:
    recommendations = recommender.get_recommendations(benchmark["query"])
    # NOTE(review): the benchmark "relevant" lists hold assessment names, yet
    # the values compared against them are read from the "description" field.
    # Confirm that this field really carries the assessment name; if it holds
    # free-text descriptions, both metrics will be 0 for every query.
    rec_names = [r["description"] for r in recommendations]

    recall = recall_at_k(benchmark["relevant"], rec_names, k=TOP_K)
    map_score = mean_average_precision(benchmark["relevant"], rec_names, k=TOP_K)

    results.append({
        "query": benchmark["query"],
        RECALL_COL: recall,
        MAP_COL: map_score,
    })

# Explicit columns keep the metric lookups below valid even when `benchmarks`
# is empty; without them an empty frame has no columns and indexing raises
# KeyError.
results_df = pd.DataFrame(results, columns=["query", RECALL_COL, MAP_COL])
print(f"Mean Recall@{TOP_K}: {results_df[RECALL_COL].mean()}")
print(f"Mean MAP@{TOP_K}: {results_df[MAP_COL].mean()}")
