# Testing

In [41]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict, Union

### Ranking of recommendation lists

The **`MetricsCalculator`** class evaluates the recommendation system's results using the **Precision@k**, **Recall@k**, **MAP**, and **NDCG** metrics.

In [42]:
class MetricsCalculator:
    """Ranking-quality metrics for recommendation lists.

    Every method is static and works on plain lists of hashable item
    identifiers (movie titles in this notebook).  ``relevant_items`` is the
    ground truth for one query and ``recommended_items`` is the ranked output
    of a recommender, best match first.

    Duplicates are handled defensively: a relevant item is credited at most
    once (at its first occurrence in the ranking) and repeated entries in
    ``relevant_items`` do not inflate denominators, so every metric stays
    within ``[0, 1]`` for binary relevance.
    """

    @staticmethod
    def _mean(values: List[float]) -> float:
        """Return the arithmetic mean of *values*, or 0.0 for an empty list."""
        # np.mean([]) would emit a RuntimeWarning and return NaN.
        return float(np.mean(values)) if values else 0.0

    @staticmethod
    def precision_at_k(relevant_items: List[str], recommended_items: List[str], k: int) -> float:
        """Return the fraction of the top-*k* recommendations that are relevant.

        The denominator is always ``k`` (not the number of items actually
        recommended).  Returns 0.0 when ``k`` is not positive.
        """
        if k <= 0:
            return 0.0
        hits = len(set(recommended_items[:k]) & set(relevant_items))
        return hits / k

    @staticmethod
    def recall_at_k(relevant_items: List[str], recommended_items: List[str], k: int) -> float:
        """Return the fraction of distinct relevant items found in the top *k*.

        Returns 0.0 when there are no relevant items or ``k`` is not positive.
        """
        relevant = set(relevant_items)
        if k <= 0 or not relevant:
            return 0.0
        hits = len(set(recommended_items[:k]) & relevant)
        return hits / len(relevant)

    @staticmethod
    def average_precision(relevant_items: List[str], recommended_items: List[str]) -> float:
        """Return the average precision of one ranked list.

        Precision is sampled at the rank of every relevant hit and the sum is
        divided by the number of distinct relevant items, so relevant items
        that were never recommended lower the score.
        """
        remaining = set(relevant_items)
        num_relevant = len(remaining)
        if num_relevant == 0:
            return 0.0

        hits = 0
        precision_sum = 0.0
        for position, item in enumerate(recommended_items, start=1):
            if item in remaining:
                # Credit each relevant item once, even if it is recommended twice.
                remaining.discard(item)
                hits += 1
                precision_sum += hits / position

        return precision_sum / num_relevant

    @staticmethod
    def mean_average_precision(relevant_items_list: List[List[str]], recommended_items_list: List[List[str]]) -> float:
        """Return the mean of ``average_precision`` over all queries.

        The two lists are paired positionally.  Returns 0.0 when there are no
        queries.
        """
        ap_scores = [
            MetricsCalculator.average_precision(relevant, recommended)
            for relevant, recommended in zip(relevant_items_list, recommended_items_list)
        ]
        return MetricsCalculator._mean(ap_scores)

    @staticmethod
    def ndcg_at_k(relevant_items: List[str], recommended_items: List[str], k: int, relevance_scores: Dict[str, float] = None) -> float:
        """Return the normalised discounted cumulative gain of the top *k*.

        Args:
            relevant_items: Ground-truth items; they define the ideal ranking.
            recommended_items: Ranked recommendations, best first.
            k: Cut-off rank.
            relevance_scores: Optional graded relevance per item.  When
                omitted, every relevant item has gain 1.0 (binary relevance).
                Items absent from the mapping have gain 0.

        Returns:
            DCG@k divided by the ideal DCG@k, or 0.0 when the ideal DCG is
            zero or ``k`` is not positive.
        """
        if k <= 0:
            return 0.0
        if relevance_scores is None:
            relevance_scores = {item: 1.0 for item in relevant_items}

        dcg = 0.0
        seen = set()
        for rank, item in enumerate(recommended_items[:k]):
            if item in seen:
                # A repeated recommendation still occupies a rank but earns no gain.
                continue
            seen.add(item)
            dcg += relevance_scores.get(item, 0) / np.log2(rank + 2)

        # dict.fromkeys de-duplicates while keeping the original order.
        ideal_relevance = sorted(
            (relevance_scores.get(item, 0) for item in dict.fromkeys(relevant_items)),
            reverse=True,
        )[:k]
        idcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(ideal_relevance))

        return float(dcg / idcg) if idcg > 0 else 0.0

    @staticmethod
    def calculate_all_metrics(
        relevant_items_list: List[List[str]],
        recommended_items_list: List[List[str]],
        k_values: List[int] = None,
        relevance_scores_list: List[Dict[str, float]] = None
    ) -> Dict[str, Union[float, Dict[int, float]]]:
        """Compute MAP plus Precision/Recall/NDCG at every requested cut-off.

        Args:
            relevant_items_list: Ground-truth items, one list per query.
            recommended_items_list: Ranked recommendations, one list per query,
                paired positionally with ``relevant_items_list``.
            k_values: Cut-off ranks to evaluate; defaults to ``[5]``.
            relevance_scores_list: Per-query relevance mappings.  NDCG is only
                computed when this is provided and non-empty; otherwise
                ``'NDCG@k'`` stays an empty dict.

        Returns:
            ``{'MAP': float, 'Precision@k': {k: float}, 'Recall@k': {k: float},
            'NDCG@k': {k: float}}`` with every value averaged over the queries
            (0.0 when there are no queries).
        """
        if k_values is None:
            # Avoid a shared mutable default argument.
            k_values = [5]

        pairs = list(zip(relevant_items_list, recommended_items_list))
        results = {
            'MAP': MetricsCalculator.mean_average_precision(relevant_items_list, recommended_items_list),
            'Precision@k': {},
            'Recall@k': {},
            'NDCG@k': {}
        }

        for k in k_values:
            results['Precision@k'][k] = MetricsCalculator._mean([
                MetricsCalculator.precision_at_k(relevant, recommended, k)
                for relevant, recommended in pairs
            ])

            results['Recall@k'][k] = MetricsCalculator._mean([
                MetricsCalculator.recall_at_k(relevant, recommended, k)
                for relevant, recommended in pairs
            ])

            if relevance_scores_list:
                results['NDCG@k'][k] = MetricsCalculator._mean([
                    MetricsCalculator.ndcg_at_k(relevant, recommended, k, rel_scores)
                    for (relevant, recommended), rel_scores in zip(pairs, relevance_scores_list)
                ])

        return results

In [None]:
import json
import time
from typing import List, Tuple
from annoy import AnnoyIndex
import faiss
from rank_bm25 import BM25Okapi

class SearchEvaluator:
    """Benchmark several retrieval back-ends against a labelled validation set.

    The evaluator loads a JSON validation file (a list of objects with
    ``query`` and ``relevant_movies`` keys), a CSV movie dataset with a
    ``title`` column, and one pre-computed embedding matrix per sentence
    embedding model.  Each query can then be answered with BM25 over the
    titles or with cosine similarity over the embeddings using brute-force
    NumPy, FAISS (flat / HNSW) or Annoy.

    All embedding-based searches return cosine similarities (higher is
    better), so scores are comparable across back-ends.  Normalised embedding
    matrices and ANN indexes are built lazily once per model and cached,
    rather than being rebuilt for every query.
    """

    # Methods whose scores are cosine similarities over sentence embeddings.
    _EMBEDDING_METHODS = ('cosine', 'faiss_flat', 'faiss_hnsw', 'annoy')

    def __init__(self, data_path: str, dataset_path: str = "final_dataset.csv"):
        """Load validation data, the movie dataset, embeddings and models.

        Args:
            data_path: Path to the JSON validation set.
            dataset_path: Path to the CSV movie dataset (needs a ``title`` column).
        """
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(data_path, 'r', encoding='utf-8') as f:
            self.validation_data = json.load(f)

        # Load movie dataset
        self.dataset = pd.read_csv(dataset_path)
        self.movie_titles = self.dataset['title'].tolist()
        # For duplicated titles the last row wins.
        self.title_to_index = {title: idx for idx, title in enumerate(self.movie_titles)}

        # Prepare queries and relevant movies
        self.queries = [item['query'] for item in self.validation_data]
        self.relevant_items_list = [item['relevant_movies'] for item in self.validation_data]

        # Load embeddings and initialize models
        self.embeddings = {}
        self.models = {
            'all_mpnet_base_v2': None,
            'all_MiniLM_L12_v2': None,
            'multi_qa_distilbert_cos_v1': None
        }
        # Lazily-filled cache of per-model structures, keyed by (kind, model_name).
        self._index_cache = {}

        for model_name in self.models:
            emb_path = f"embeddings/embeddings_{model_name.replace('-', '_')}.npy"
            self.embeddings[model_name] = np.load(emb_path)
            # File names use underscores; the model identifiers use dashes.
            self.models[model_name] = SentenceTransformer(model_name.replace("_", "-"))

        # Initialize BM25
        tokenized_titles = [title.lower().split() for title in self.movie_titles]
        self.bm25 = BM25Okapi(tokenized_titles)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _normalize_rows(matrix) -> np.ndarray:
        """Return *matrix* with every row scaled to unit L2 norm.

        A 1-D input is treated as a single row.  All-zero rows are left
        unchanged instead of producing NaNs.
        """
        matrix = np.asarray(matrix)
        if matrix.ndim == 1:
            matrix = matrix.reshape(1, -1)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return matrix / norms

    def _cached(self, kind: str, model_name: str, build):
        """Return the cached object for ``(kind, model_name)``, building it on first use."""
        # setdefault keeps this usable on instances created without __init__.
        cache = self.__dict__.setdefault('_index_cache', {})
        key = (kind, model_name)
        if key not in cache:
            cache[key] = build()
        return cache[key]

    def _normalized_embeddings(self, model_name: str) -> np.ndarray:
        """Return the row-normalised embedding matrix of *model_name* (cached)."""
        return self._cached(
            'normalized', model_name,
            lambda: self._normalize_rows(self.embeddings[model_name]),
        )

    def _faiss_matrix(self, model_name: str) -> np.ndarray:
        """Return the normalised embeddings as the contiguous float32 array FAISS expects."""
        return np.ascontiguousarray(self._normalized_embeddings(model_name), dtype='float32')

    def _encode_query(self, query: str, model_name: str) -> np.ndarray:
        """Encode *query* with the given model; returns a unit-norm ``(1, dim)`` array."""
        return self._normalize_rows(self.models[model_name].encode([query]))

    @staticmethod
    def _top_indices(scores: np.ndarray, top_k: int) -> np.ndarray:
        """Return the indices of the *top_k* highest scores, best first."""
        # Reverse-then-slice also behaves for top_k == 0, where the
        # slice-then-reverse form ``[-0:]`` would return every index.
        return np.argsort(scores)[::-1][:max(top_k, 0)]

    def _collect_hits(self, indices, scores) -> Tuple[List[str], List[float]]:
        """Map index/score pairs to ``(titles, scores)``, skipping ``-1`` padding."""
        titles, kept_scores = [], []
        for idx, score in zip(indices, scores):
            if idx < 0:
                # FAISS pads with -1 when it finds fewer than k neighbours.
                continue
            titles.append(self.movie_titles[idx])
            kept_scores.append(float(score))
        return titles, kept_scores

    @staticmethod
    def _copy_metrics(metrics: dict) -> dict:
        """Return a copy of a metrics dict whose nested dicts are copied too."""
        return {
            key: dict(value) if isinstance(value, dict) else value
            for key, value in metrics.items()
        }

    def _get_similarity_scores(self, query: str, movies: List[str], method: str, model_name: str = None) -> List[float]:
        """Get similarity scores for movies using the specified method.

        Args:
            query: Free-text query.
            movies: Titles to score.
            method: ``'bm25'`` or one of the embedding methods
                (``'cosine'``, ``'faiss_flat'``, ``'faiss_hnsw'``, ``'annoy'``),
                which all score by exact cosine similarity.
            model_name: Embedding model key; required for embedding methods.

        Returns:
            One score per entry in *movies*; titles that are not in the
            dataset get ``nan`` so a diagnostic printout cannot abort a run.

        Raises:
            ValueError: If *method* is not recognised.
        """
        if method == 'bm25':
            # Score the whole corpus once instead of once per requested movie.
            all_scores = self.bm25.get_scores(query.lower().split())

            def score_at(idx):
                return float(all_scores[idx])
        elif method in self._EMBEDDING_METHODS:
            query_vec = self._encode_query(query, model_name)[0]
            emb_matrix = self._normalized_embeddings(model_name)

            def score_at(idx):
                return float(np.dot(emb_matrix[idx], query_vec))
        else:
            raise ValueError(f"Unknown method: {method}")

        scores = []
        for movie in movies:
            idx = self.title_to_index.get(movie)
            scores.append(float('nan') if idx is None else score_at(idx))
        return scores

    def _print_recommendations(self, query: str, recommendations: List[str], rec_scores: List[float],
                             relevant_movies: List[str], method: str, model_name: str = None):
        """Print recommendations with scores and relevant movies with their scores."""
        print(f"\nQuery: '{query}'")

        # Get scores for relevant movies using the same metric
        relevant_scores = self._get_similarity_scores(query, relevant_movies, method, model_name)

        print("\nRelevant movies with scores:")
        for movie, score in zip(relevant_movies, relevant_scores):
            print(f"  - {movie} (score: {score:.4f})")

        relevant_set = set(relevant_movies)
        print("\nRecommended movies with scores:")
        for rank, (movie, score) in enumerate(zip(recommendations, rec_scores), 1):
            relevant_flag = " [RELEVANT]" if movie in relevant_set else ""
            print(f"  {rank}. {movie} (score: {score:.4f}){relevant_flag}")

    # ------------------------------------------------------------------
    # Search back-ends: each returns (titles, scores), best match first
    # ------------------------------------------------------------------
    def bm25_search(self, query: str, top_k: int = 5) -> Tuple[List[str], List[float]]:
        """Rank titles by BM25 score against the lower-cased, whitespace-split query."""
        scores = np.asarray(self.bm25.get_scores(query.lower().split()))
        top_indices = self._top_indices(scores, top_k)
        return [self.movie_titles[i] for i in top_indices], [float(scores[i]) for i in top_indices]

    def cosine_search(self, query: str, model_name: str, top_k: int = 5) -> Tuple[List[str], List[float]]:
        """Exact cosine-similarity search computed with NumPy."""
        query_emb = self._encode_query(query, model_name)
        scores = np.dot(self._normalized_embeddings(model_name), query_emb.T).flatten()

        top_indices = self._top_indices(scores, top_k)
        return [self.movie_titles[i] for i in top_indices], [float(scores[i]) for i in top_indices]

    def faiss_flat_search(self, query: str, model_name: str, top_k: int = 5) -> Tuple[List[str], List[float]]:
        """Exact cosine-similarity search with a FAISS inner-product index."""
        if top_k <= 0:
            return [], []

        def build():
            emb_matrix = self._faiss_matrix(model_name)
            index = faiss.IndexFlatIP(emb_matrix.shape[1])
            index.add(emb_matrix)
            return index

        index = self._cached('faiss_flat', model_name, build)
        query_emb = np.ascontiguousarray(self._encode_query(query, model_name), dtype='float32')
        # Inner product of unit vectors is the cosine similarity.
        similarities, indices = index.search(query_emb, top_k)
        return self._collect_hits(indices[0], similarities[0])

    def faiss_hnsw_search(self, query: str, model_name: str, top_k: int = 5) -> Tuple[List[str], List[float]]:
        """Approximate cosine-similarity search with a FAISS HNSW index."""
        if top_k <= 0:
            return [], []

        def build():
            emb_matrix = self._faiss_matrix(model_name)
            index = faiss.IndexHNSWFlat(emb_matrix.shape[1], 32)
            index.add(emb_matrix)
            return index

        index = self._cached('faiss_hnsw', model_name, build)
        query_emb = np.ascontiguousarray(self._encode_query(query, model_name), dtype='float32')
        squared_l2, indices = index.search(query_emb, top_k)
        # The index uses the L2 metric and reports squared distances.  For
        # unit vectors ||a - b||^2 = 2 - 2*cos, hence cos = 1 - d^2 / 2.
        similarities = 1.0 - squared_l2[0] / 2.0
        return self._collect_hits(indices[0], similarities)

    def annoy_search(self, query: str, model_name: str, top_k: int = 5) -> Tuple[List[str], List[float]]:
        """Approximate cosine-similarity search with an Annoy angular index."""
        if top_k <= 0:
            return [], []

        def build():
            emb_matrix = self.embeddings[model_name]
            annoy_index = AnnoyIndex(emb_matrix.shape[1], 'angular')
            for i, emb in enumerate(emb_matrix):
                annoy_index.add_item(i, emb)
            annoy_index.build(10)
            return annoy_index

        annoy_index = self._cached('annoy', model_name, build)
        query_vec = self._encode_query(query, model_name)[0]
        indices, distances = annoy_index.get_nns_by_vector(query_vec, top_k, include_distances=True)
        # Annoy's angular distance is sqrt(2 * (1 - cos)), so cos = 1 - d^2 / 2.
        scores = [(1 - (d**2) / 2) for d in distances]
        return [self.movie_titles[i] for i in indices], scores

    # ------------------------------------------------------------------
    # Evaluation
    # ------------------------------------------------------------------
    def _evaluate_approach(self, approach_name: str, search, search_args: tuple,
                           model_name: str, show_examples: int, top_k: int) -> dict:
        """Run one search back-end over every validation query and score it.

        ``search`` is called as ``search(query, *search_args)``.  Only the
        search calls are timed; example printing and metric computation are
        excluded from ``time_per_query``.
        """
        recommended_items_list = []
        search_time = 0.0

        for i, (query, relevant) in enumerate(zip(self.queries, self.relevant_items_list)):
            started = time.perf_counter()
            recs, rec_scores = search(query, *search_args)
            search_time += time.perf_counter() - started

            recommended_items_list.append(recs)

            if i < show_examples:
                self._print_recommendations(
                    query, recs, rec_scores,
                    relevant, approach_name, model_name
                )

        # Binary relevance for NDCG: 1 for relevant items, 0 (implicit) otherwise.
        relevance_scores_list = [
            {item: 1.0 for item in relevant}
            for relevant in self.relevant_items_list
        ]

        metrics = MetricsCalculator.calculate_all_metrics(
            self.relevant_items_list,
            recommended_items_list,
            [top_k],
            relevance_scores_list
        )
        metrics['time_per_query'] = search_time / len(self.queries) if self.queries else 0.0
        return metrics

    def evaluate_all_approaches(self, output_file: str = "results.json", show_examples: int = 3,
                                top_k: int = 5):
        """Evaluate every back-end for every embedding model and save the metrics.

        Args:
            output_file: Path of the JSON file the results are written to.
            show_examples: Number of leading queries whose recommendations
                are printed for each approach.
            top_k: Number of recommendations requested per query and the
                cut-off used for Precision/Recall/NDCG.

        Returns:
            ``{model_name: {approach_name: metrics}}``.  BM25 does not depend
            on the embedding model, so it is evaluated once and its metrics
            are copied under every model.  The first query of each ANN
            approach includes the one-off index build in its timing.
        """
        embedding_searches = {
            'cosine': self.cosine_search,
            'faiss_flat': self.faiss_flat_search,
            'faiss_hnsw': self.faiss_hnsw_search,
            'annoy': self.annoy_search,
        }

        results = {}
        bm25_metrics = None

        for model_name in self.models:
            if bm25_metrics is None:
                print(f"\n{'='*50}\nEvaluating BM25...\n{'='*50}")
                bm25_metrics = self._evaluate_approach(
                    'bm25', self.bm25_search, (top_k,), None, show_examples, top_k
                )
            results[model_name] = {'bm25': self._copy_metrics(bm25_metrics)}

            for approach_name, search in embedding_searches.items():
                print(f"\n{'='*50}\nEvaluating {model_name} with {approach_name}...\n{'='*50}")
                results[model_name][approach_name] = self._evaluate_approach(
                    approach_name, search, (model_name, top_k),
                    model_name, show_examples, top_k
                )

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

        return results

In [None]:
if __name__ == "__main__":
    # Build the evaluator from the validation queries and the movie dataset,
    # then benchmark every search approach, printing the first three queries
    # of each as worked examples.
    evaluator = SearchEvaluator(
        data_path="validation_set.json",
        dataset_path="final_dataset.csv",
    )
    results = evaluator.evaluate_all_approaches(show_examples=3)