# News Recommendation System Evaluation

This notebook evaluates the performance of our news recommendation system across multiple dimensions:
- Search Quality (Retrieval)
- Recommendation Accuracy
- Personalization Effectiveness
- System Performance

## Setup


In [None]:
import sys
import os
sys.path.append('src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from typing import List, Dict, Tuple, Optional
import time
import json

# Core system imports
from storage import ArticleDB
from embeddings import EmbeddingSystem
from retrieval import MultiRAGRetriever
from scoring import ScoringEngine
from reranker import RerankingEngine
from recommendation_learner import AIRecommender
from data_models import Article, UserProfile, SearchQuery
from config import RAGConfig, EmbeddingModelConfig

# Set up plotting
# The 'seaborn-v0_8' style sheet name only exists on Matplotlib >= 3.6; older
# releases ship the same sheet under the name 'seaborn'. Pick whichever is
# installed so this cell does not fail on an older environment.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")

print("Imports successful")


## 1. System Initialization


In [None]:
# Build every component used by the evaluation cells below, in dependency order.
print(" Initializing system components...")

# Article store; the reported count comes from fetching every article once.
db = ArticleDB()
article_count = len(db.get_all_articles())
print(f" Database: {article_count} articles")

# Embedding system; report which model it exposes via `model_name`.
embeddings = EmbeddingSystem()
loaded_model = embeddings.model_name
print(f" Embeddings: {loaded_model} loaded")

# The retriever is constructed from the database and the embedding system.
retriever = MultiRAGRetriever(db, embeddings)
print(" MultiRAG Retriever ready")

# Scoring engine, driven by a default-constructed RAG configuration.
config = RAGConfig()
scoring_engine = ScoringEngine(config)
print(" Scoring engine ready")

# The recommender receives the same database and embedding system as the retriever.
recommender = AIRecommender(db, embeddings)
print(" Recommendation system ready")

print("\n System initialization complete!")


## 2. Experiment 1: Search Quality Evaluation

**Objective**: Evaluate retrieval performance using Precision@5, Precision@10, Recall@10, Mean Reciprocal Rank (MRR), and per-query latency.


In [None]:
def evaluate_search_quality(retriever: "MultiRAGRetriever", test_queries: List[Dict]) -> Dict:
    """
    Evaluate search quality using test queries with known relevant articles.

    For every query the retriever is asked for its top 10 results, and the
    ranked article IDs are compared against the expected (relevant) IDs.

    Args:
        retriever: Object exposing ``search(query, limit=...)`` that returns a
            ranked sequence of results, each with an ``article_id`` attribute.
        test_queries: List of dicts with a ``'query'`` entry and an
            ``'expected_articles'`` iterable of relevant article IDs.

    Returns:
        Dictionary with the averaged metrics ``avg_precision_at_5``,
        ``avg_precision_at_10``, ``avg_recall_at_10``, ``avg_mrr``,
        ``avg_query_time`` (seconds) and the integer ``total_queries``.
        When ``test_queries`` is empty every average is ``0.0`` instead of NaN.
    """
    # np.mean([]) yields NaN plus a RuntimeWarning; report zeros explicitly instead.
    if not test_queries:
        return {
            'avg_precision_at_5': 0.0,
            'avg_precision_at_10': 0.0,
            'avg_recall_at_10': 0.0,
            'avg_mrr': 0.0,
            'avg_query_time': 0.0,
            'total_queries': 0
        }

    results = {
        'precision_at_5': [],
        'precision_at_10': [],
        'recall_at_10': [],
        'mrr': [],  # Mean Reciprocal Rank
        'query_times': []
    }

    for query_data in test_queries:
        expected_articles = set(query_data['expected_articles'])

        # perf_counter is monotonic and high resolution, unlike wall-clock time.time().
        start_time = time.perf_counter()
        search_results = retriever.search(query_data['query'], limit=10)
        query_time = time.perf_counter() - start_time

        # Ranked IDs, capped at 10 so the @10 metrics stay valid even if the
        # retriever returns more results than the requested limit.
        ranked_ids = [result.article_id for result in search_results][:10]

        hits_top_5 = set(ranked_ids[:5]) & expected_articles
        hits_top_10 = set(ranked_ids) & expected_articles

        # Precision@K divides by K (not by the number of results returned).
        precision_5 = len(hits_top_5) / 5
        precision_10 = len(hits_top_10) / 10

        # Recall@10; defined as 0 when a query has no expected articles.
        recall_10 = len(hits_top_10) / len(expected_articles) if expected_articles else 0

        # Reciprocal rank of the first relevant result, 0 when none is relevant.
        mrr = next(
            (1.0 / rank
             for rank, article_id in enumerate(ranked_ids, start=1)
             if article_id in expected_articles),
            0
        )

        results['precision_at_5'].append(precision_5)
        results['precision_at_10'].append(precision_10)
        results['recall_at_10'].append(recall_10)
        results['mrr'].append(mrr)
        results['query_times'].append(query_time)

    # Calculate averages
    return {
        'avg_precision_at_5': np.mean(results['precision_at_5']),
        'avg_precision_at_10': np.mean(results['precision_at_10']),
        'avg_recall_at_10': np.mean(results['recall_at_10']),
        'avg_mrr': np.mean(results['mrr']),
        'avg_query_time': np.mean(results['query_times']),
        'total_queries': len(test_queries)
    }

# Load SPICED dataset for proper evaluation
import pandas as pd
import json

def load_spiced_dataset(path: str = 'evaluation/spiced.csv') -> Optional[pd.DataFrame]:
    """Load SPICED dataset for evaluation.

    Args:
        path: Location of the SPICED CSV file. Defaults to the file this
            notebook has always read, so existing zero-argument calls behave
            the same.

    Returns:
        The loaded DataFrame, or ``None`` when the file cannot be read or has
        no ``Type`` column (the failure reason is printed, not raised).
    """
    try:
        df = pd.read_csv(path)
        print(f"SPICED dataset loaded: {len(df)} pairs")
        # Accessing 'Type' inside the try block means a CSV without that column
        # is reported as a load failure rather than breaking later cells.
        print(f"Topics: {df['Type'].unique()}")
        return df
    except Exception as e:
        # Deliberately best-effort: callers check for None and continue.
        print(f"Failed to load SPICED dataset: {e}")
        return None

# Load SPICED dataset
spiced_data = load_spiced_dataset()

if spiced_data is not None:
    print("\nDataset Statistics:")
    print(f"Total pairs: {len(spiced_data)}")
    # map(str, ...) keeps the join from failing if a 'Type' cell is missing (NaN).
    print(f"Topics: {', '.join(map(str, spiced_data['Type'].unique()))}")
    print("Topic distribution:")
    print(spiced_data['Type'].value_counts())

    print("\nSample SPICED pairs:")
    for pair_number, (_, row) in enumerate(spiced_data.head(3).iterrows(), start=1):
        print(f"\nPair {pair_number} ({row['Type']}):")
        # str(...) guards the slice against non-string cells such as NaN.
        print(f"  Text 1: {str(row['text_1'])[:100]}...")
        print(f"  Text 2: {str(row['text_2'])[:100]}...")
        print(f"  URL 1: {row['URL_1']}")
        print(f"  URL 2: {row['URL_2']}")
else:
    print("SPICED dataset not available. Using fallback evaluation.")

# NOTE(review): no earlier cell in this notebook defines `test_queries`, so on a
# fresh kernel the evaluation call below used to raise NameError. Until labelled
# queries (dicts with 'query' and 'expected_articles') are supplied, skip the
# evaluation explicitly instead of crashing.
try:
    test_queries
except NameError:
    test_queries = []

if test_queries:
    print("Running search quality evaluation...")
    search_metrics = evaluate_search_quality(retriever, test_queries)

    print("\n Search Quality Results:")
    for metric, value in search_metrics.items():
        if metric != 'total_queries':
            print(f"  {metric}: {value:.3f}")
        else:
            print(f"  {metric}: {value}")
else:
    search_metrics = {}
    print("No labelled test queries defined (`test_queries`); skipping search quality evaluation.")


In [None]:
# Run comprehensive evaluation pipeline
import subprocess
import json

def run_evaluation_pipeline(script_path: str = 'evaluation/evaluation_pipeline.py',
                            results_path: str = 'evaluation/evaluation_results.json'):
    """Run the comprehensive evaluation pipeline.

    Executes the pipeline script in a child process, echoes its stdout, then
    loads the JSON results file.

    Args:
        script_path: Pipeline script to execute. Defaults to the script this
            notebook has always run.
        results_path: JSON file to load after the script finishes. Defaults to
            the file this notebook has always read.

    Returns:
        The parsed JSON results, or ``None`` when the script exits with a
        non-zero status or any other error occurs (the reason is printed).
    """
    try:
        print("Running comprehensive evaluation pipeline...")
        # Use the kernel's own interpreter so the pipeline sees the same
        # environment and packages; a bare 'python' may resolve to another install.
        interpreter = sys.executable or 'python'
        result = subprocess.run([interpreter, script_path],
                                capture_output=True, text=True, check=True)
        print("Pipeline output:")
        print(result.stdout)

        # Load results
        with open(results_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except subprocess.CalledProcessError as e:
        print(f"Pipeline failed: {e}")
        print(f"Error output: {e.stderr}")
        return None
    except Exception as e:
        # Covers a missing or invalid results file and a missing interpreter.
        print(f"Error running pipeline: {e}")
        return None

# Run evaluation
evaluation_results = run_evaluation_pipeline()

if evaluation_results:
    print("\nEvaluation Results Summary:")
    print("=" * 40)

    # Fetched once and reused for both the header and the dataset statistics.
    metadata = evaluation_results.get('metadata', {})
    # map(str, ...) keeps the join from failing on non-string topic entries.
    topics_label = ', '.join(map(str, metadata.get('topics', [])))
    print(f"Evaluation Date: {metadata.get('evaluation_date', 'N/A')}")
    print(f"Total Queries: {metadata.get('total_queries', 0)}")
    print(f"SPICED Pairs: {metadata.get('spiced_pairs', 0)}")
    print(f"Topics: {topics_label}")

    # Overall performance: (label, results key) pairs printed in a fixed order.
    overall = evaluation_results.get('overall_performance', {})
    overall_rows = (
        ("Similarity Score", 'similarity_score'),
        ("Topic Relevance", 'topic_relevance'),
        ("Diversity Score", 'diversity_score'),
        ("Intratopic Score", 'intratopic_score'),
        ("Intertopic Score", 'intertopic_score'),
        ("Hard Examples Score", 'hard_examples_score'),
        ("Overall Score", 'overall_score'),
    )
    print("\nOverall Performance:")
    for label, key in overall_rows:
        print(f"  {label}: {overall.get(key, 0.0):.3f}")

    # Difficulty-based performance
    print("\nDifficulty-Based Performance:")
    if 'intratopic_performance' in evaluation_results:
        intratopic = evaluation_results['intratopic_performance']
        print(f"  Intratopic (Same Topic): {intratopic.get('intratopic_score', 0.0):.3f} ({intratopic.get('total_pairs', 0)} pairs)")

    if 'intertopic_performance' in evaluation_results:
        intertopic = evaluation_results['intertopic_performance']
        print(f"  Intertopic (Cross Topic): {intertopic.get('intertopic_score', 0.0):.3f} ({intertopic.get('total_pairs', 0)} pairs)")

    if 'hard_examples_performance' in evaluation_results:
        hard_examples = evaluation_results['hard_examples_performance']
        hard_pair_count = hard_examples.get('total_pairs', 0)
        if hard_pair_count > 0:
            print(f"  Hard Examples: {hard_examples.get('hard_examples_score', 0.0):.3f} ({hard_pair_count} pairs)")
        else:
            print("  Hard Examples: No hard examples available")

    # Topic-specific performance
    if 'topic_retrieval' in evaluation_results and 'topic_metrics' in evaluation_results['topic_retrieval']:
        print("\nTopic Performance:")
        for topic, metrics in evaluation_results['topic_retrieval']['topic_metrics'].items():
            # .get with defaults, like every other lookup in this report, so a
            # topic entry missing a field does not abort the whole summary.
            print(f"  {topic}: {metrics.get('mean_relevance', 0.0):.3f} ({metrics.get('query_count', 0)} queries)")

    # Baseline comparison
    if 'baseline_comparison' in evaluation_results:
        print("\nBaseline Comparison:")
        for baseline_name, baseline_data in evaluation_results['baseline_comparison'].items():
            method = baseline_data.get('method', baseline_name)
            score = baseline_data.get('avg_score', 0.0)
            queries = baseline_data.get('total_queries', 0)
            print(f"  {method}: {score:.3f} ({queries} queries)")

    # Dataset statistics
    print("\nDataset Statistics:")
    print(f"  Train Pairs: {metadata.get('train_pairs', 0)}")
    print(f"  Test Pairs: {metadata.get('test_pairs', 0)}")
    print(f"  Total Queries: {metadata.get('total_queries', 0)}")
    print(f"  Topics: {topics_label}")
else:
    print("Evaluation pipeline failed or no results available.")


## SPICED Dataset Integration

This evaluation uses the **SPICED (Similarity Detection in News)** dataset with proper train/test splits and difficulty-based evaluation:

### Dataset Structure
- **Combined Dataset**: 977 pairs total (679 train, 298 test)
- **Intratopic Pairs**: Same topic, similar content (easier)
- **Intertopic Pairs**: Different topics, similar content (harder)  
- **Hard Examples**: Challenging pairs for robustness testing

### Evaluation Metrics
- **Similarity Detection**: How well the system finds similar articles
- **Topic Retrieval**: Accuracy of topic-based recommendations
- **Diversity Assessment**: Coverage across different topics
- **Difficulty-Based Performance**: Performance on different complexity levels

### Ground Truth
The SPICED dataset provides human-verified similarity pairs as ground truth, ensuring reliable evaluation of the recommendation system's performance.


## 3. User Profiles for Personalization Testing


In [None]:
# Convert user profiles data to UserProfile objects
# NOTE(review): no earlier cell in this notebook defines `user_profiles_data`,
# so on a fresh kernel this cell used to raise NameError. Until the raw profile
# records (dicts with 'id', 'preferred_topics', 'reading_history' and
# 'preferred_sources') are supplied, continue with an empty list and say so.
try:
    user_profiles_data
except NameError:
    user_profiles_data = []
    print("`user_profiles_data` is not defined; continuing with no user profiles.")

user_profiles = [
    UserProfile(
        id=profile_data['id'],
        preferred_topics=profile_data['preferred_topics'],
        reading_history=profile_data['reading_history'],
        preferred_sources=profile_data['preferred_sources']
    )
    for profile_data in user_profiles_data
]

print(f"👤 Created {len(user_profiles)} user profiles for personalization testing")
print("\nUser Profiles:")
for profile in user_profiles:
    print(f"  {profile.id}: {', '.join(profile.preferred_topics)}")
