In [1]:
import math

import csv
from typing import Dict, List

import import_ipynb

import sys
sys.path.append('../')  
from query_processing.Query_processer import Query_processer

importing Jupyter notebook from D:\GitHub\query_performance\..\query_processing\Query_processer.ipynb
importing Jupyter notebook from D:\GitHub\query_performance\..\pre_processing\TextProcessor.ipynb
importing Jupyter notebook from D:\GitHub\query_performance\..\utilities\General_Utilities.ipynb
importing Jupyter notebook from D:\GitHub\query_performance\..\query_processing\DAAT.ipynb
importing Jupyter notebook from D:\GitHub\query_performance\..\structures\DocumentIndex.ipynb
importing Jupyter notebook from D:\GitHub\query_performance\..\structures\DocumentIndexRow.ipynb
importing Jupyter notebook from D:\GitHub\query_performance\..\structures\Lexicon.ipynb
importing Jupyter notebook from D:\GitHub\query_performance\..\structures\LexiconRow.ipynb
importing Jupyter notebook from D:\GitHub\query_performance\..\building_data_structures\CollectionStatistics.ipynb
importing Jupyter notebook from D:\GitHub\query_performance\..\structures\PostingListHandler.ipynb
importing Jupyter notebook f

In [2]:
# Relative path of the directory that holds the evaluation files.
query_performance_path = "../query_performance/msmarco-test2019-queries"
# Name of the tab-separated query file (query ID, query text) passed to load_queries.
query_file = "msmarco-test2019-queries.tsv"
# Name of the relevance-judgment (qrels) file passed to load_relevance.
relevance_file = "2019qrels-pass.txt"

In [3]:
def load_queries(file_path: str) -> Dict[int, str]:
    """
    Load the evaluation queries from a tab-separated file.

    Every non-empty row must hold the query ID in its first column and the
    query text in its second column. The whole file is kept in memory, which
    is fine because the query set is small (about 200 queries).

    Args:
        file_path (str): Path of the TSV file to read.

    Returns:
        Dict[int, str]: Mapping from query ID to query text.

    Raises:
        ValueError: If the first column of a row is not an integer.
        IndexError: If a non-empty row has fewer than two columns.
    """
    queries = {}
    # newline="" lets the csv module do its own line-ending handling, as its documentation recommends.
    with open(file_path, "r", encoding="utf-8", newline="") as file:
        reader = csv.reader(file, delimiter="\t")
        for row in reader:
            # A blank line (typically a trailing one) is parsed as an empty row: skip it
            # instead of failing on row[0].
            if not row:
                continue
            query_id, query_text = int(row[0]), row[1]
            queries[query_id] = query_text
    return queries

def load_relevance(file_path: str) -> Dict[int, Dict[int, int]]:
    """
    Load the relevance judgments (qrels) from a whitespace-separated file.

    Every non-empty line must hold at least four fields; the first is the
    query ID, the second is ignored, the third is the document ID and the
    fourth is the relevance score. Fields may be separated by any run of
    spaces or tabs.

    Args:
        file_path (str): Path of the qrels file to read.

    Returns:
        Dict[int, Dict[int, int]]: For each query ID, a mapping from document ID
        to relevance score.

    Raises:
        ValueError: If a query ID, document ID or score is not an integer.
        IndexError: If a non-empty line has fewer than four fields.
    """
    relevance: Dict[int, Dict[int, int]] = {}
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            # str.split() with no argument collapses repeated spaces/tabs and drops the newline,
            # so it accepts both space- and tab-separated qrels.
            fields = line.split()
            if not fields:
                continue  # blank line
            query_id, document_id, relevance_score = int(fields[0]), int(fields[2]), int(fields[3])
            relevance.setdefault(query_id, {})[document_id] = relevance_score
    return relevance

def precision_at_k(query_results: list, relevance_data: dict, k: int) -> float:
    """
    Calculate precision at k: the fraction of the first k returned documents
    that are relevant.

    A document counts as relevant when its score in ``relevance_data`` is
    greater than 0; documents missing from ``relevance_data`` count as not
    relevant. The denominator is always k, even when fewer than k documents
    were returned, so the missing positions count as misses.

    Args:
        query_results (list): Ranked list of returned document IDs.
        relevance_data (dict): Dictionary mapping document IDs to their relevance scores.
        k (int): The number of top results to consider.

    Returns:
        float: Precision at k, or 0.0 when there are no results or k is not positive.
    """
    # No results, or a non-positive cutoff (k == 0 would otherwise divide by zero).
    if k <= 0 or len(query_results) == 0:
        return 0.0

    # Slicing stops at the end of the list, so k may exceed the number of results.
    relevant = sum(1 for doc_id in query_results[:k] if relevance_data.get(doc_id, 0) > 0)

    return relevant / k

# Suppose a query returned these documents as the most relevant ones: [2071723,8412682,2874503,527690]
# The judgments are looked up in `relevance` by that query's ID, say 19335 -> relevance[19335] = {1017759: 0, 1082489: 1, ... }
# The call would then be:  average_precision([2071723,8412682,2874503,527690], relevance[19335])
def average_precision(query_results: List[int], relevance_data: dict) -> float:
    """
    Calculate the Average Precision (AP) for a ranked list of query results.

    AP is the sum of precision@i over every rank i that holds a relevant
    document, divided by the total number of relevant documents for the query.
    A document is relevant when its score in ``relevance_data`` is greater
    than 0; documents missing from ``relevance_data`` are not relevant.

    Args:
        query_results (List[int]): List of document IDs representing the order of query results.
        relevance_data (dict): Dictionary mapping document IDs to their relevance scores.

    Returns:
        float: Average Precision, or 0.0 when there are no results or no relevant documents.
    """
    # Check if the list of query results is empty
    if len(query_results) == 0:
        return 0.0

    # The denominator counts every judged-relevant document, retrieved or not,
    # so relevant documents that were missed lower the score.
    total_relevant = sum(1 for score in relevance_data.values() if score > 0)
    if total_relevant == 0:
        return 0.0

    hits = 0
    precision_sum = 0.0
    for rank, doc_id in enumerate(query_results, start=1):
        # Only ranks that hold a relevant document contribute their precision@rank
        if relevance_data.get(doc_id, 0) > 0:
            hits += 1
            precision_sum += hits / rank

    return precision_sum / total_relevant

def mean_average_precision(query_results_list: List[List[int]], relevance_data: List[dict]) -> float:
    """
    Compute the Mean Average Precision (MAP) over several queries.

    The two arguments are parallel sequences: the judgments used for the
    i-th ranked list are ``relevance_data[i]``.

    Args:
        query_results_list (List[List[int]]): One ranked list of document IDs per query.
        relevance_data (List[dict]): One dictionary per query, mapping document IDs
            to their relevance scores, in the same order as ``query_results_list``.

    Returns:
        float: The mean of the per-query average precision, or 0 when no query is given.
    """
    num_queries = len(query_results_list)
    if num_queries == 0:
        return 0

    # Pair each ranked list with its judgments by position and add up the per-query AP
    ap_total = sum(
        average_precision(query_results_list[idx], relevance_data[idx])
        for idx in range(num_queries)
    )

    return ap_total / num_queries

def recall_at_k(query_results: List[int], relevance_data: dict, k: int) -> float:
    """
    Calculate recall at k: how many of the query's relevant documents show up
    among the first k returned documents, over the total number of relevant
    documents for that query.

    A document is relevant when its score in ``relevance_data`` is greater than 0.

    Args:
        query_results (list): Ranked list of returned document IDs.
        relevance_data (dict): Dictionary mapping document IDs to their relevance scores.
        k (int): The number of top results to consider.

    Returns:
        float: Recall at k; 0 when there are no results or no relevant documents.
    """
    # Nothing was returned: recall is zero by convention
    if not len(query_results):
        return 0

    # Size of the relevant set for this query
    relevant_total = 0
    for score in relevance_data.values():
        if score > 0:
            relevant_total += 1

    # Never look past the end of the result list
    cutoff = min(k, len(query_results))

    # Relevant documents found within the cutoff
    hits = 0
    for position in range(cutoff):
        if relevance_data.get(query_results[position], 0) > 0:
            hits += 1

    if relevant_total > 0:
        return hits / relevant_total
    return 0

def discounted_cumulative_gain(b: int, k: int, query_results: List[int], relevance_data: dict) -> float:
    """
    Calculate Discounted Cumulative Gain (DCG) for a list of query results.

    The gain of the document at rank i is its relevance score divided by
    max(1, log_b(i)), so ranks up to b are not discounted. Documents missing
    from ``relevance_data`` contribute a gain of 0.

    Args:
        b (int): Base of the logarithm.
        k (int): The number of top results to consider.
        query_results (List[int]): List of document IDs representing the order of query results.
        relevance_data (dict): Dictionary mapping document IDs to their relevance scores.
            Any iterable of (document ID, score) pairs, such as ``dict.items()``, is accepted too.

    Returns:
        float: Discounted Cumulative Gain (DCG).
    """
    # Build the lookup table once, not on every iteration; a real dict is used as is.
    judgments = relevance_data if isinstance(relevance_data, dict) else dict(relevance_data)

    dcg = 0.0
    # max(k, 0) keeps a negative k from being read as "all but the last |k| results"
    for rank, doc_id in enumerate(query_results[:max(k, 0)], start=1):
        # Discounted gain of the document at this rank
        dcg += judgments.get(doc_id, 0) / max(1, math.log(rank, b))

    return dcg

def evaluate_queries(queries, relevance, flag: bool, scoring_function: str, alg: str) -> Dict[int, List[int]]:
    """
    Run every judged query through the query processor, print its DCG and
    collect the returned rankings.

    Queries without any relevance judgment are skipped, since no metric can be
    computed for them.

    Args:
        queries: Mapping from query ID to query text.
        relevance: Mapping from query ID to a {document ID: relevance score} dictionary.
        flag (bool): Forwarded unchanged to the ``Query_processer`` constructor.
        scoring_function (str): Forwarded unchanged to ``process_query`` (e.g. "bm25").
        alg (str): Forwarded unchanged to ``process_query`` (e.g. "daat").

    Returns:
        Dict[int, List[int]]: For every judged query ID, the value returned by
        ``process_query`` for that query.
    """
    results = {}
    query_processer = Query_processer(flag)

    for query_id_to_check, query_text in queries.items():
        judgments = relevance.get(query_id_to_check, {})
        print("Quelli giusti erano: ", judgments.items())
        if len(judgments) == 0:
            continue

        # NOTE(review): 7 is presumably the number of results requested (it matches the DCG
        # cutoff below) and False an option flag of process_query - confirm against Query_processer.
        result = query_processer.process_query(query_text, scoring_function, alg, 7, False)
        print("Trovati: ", result)
        print(query_id_to_check)

        print("discounted_cumulative_gain: ", discounted_cumulative_gain(2, 7, result, judgments))

        # Keep the ranking so that the caller actually receives the results
        results[query_id_to_check] = result

    return results

# def write_sorted_queries_to_file(queries: Dict[int, str], output_file: str) -> None:
#     sorted_queries = sorted(queries.items(), key=lambda x: x[0])

#     with open(output_file, "w", encoding="utf-8", newline="") as file:
#         writer = csv.writer(file, delimiter="\t")
#         for query_id, query_text in sorted_queries:
#             writer.writerow([query_id, query_text])



# Load the query set; it is small, so it is kept entirely in memory.
# (A sorted copy could be written back with a helper such as write_sorted_queries_to_file.)
queries = load_queries(f"{query_performance_path}/{query_file}")

# Load the relevance judgments.
# Note from a previous run: only 43 of the 200 query IDs have judgments.
relevance = load_relevance(f"{query_performance_path}/{relevance_file}")

# Manual spot checks that were used while developing the metrics:
# print(relevance[19335])
# print(mean_average_precision([[2071723, 8412682, 2874503, 527690], [1720388, 4379804, 8151642, 527690]],
#                              [relevance[19335], relevance[131843]]))
# print(discounted_cumulative_gain(2, 5, [8412682, 2046505], relevance[19335]))
# print(Query_processer(False).process_query(queries[19335], "bm25", "daat", 7, False))

# Evaluate every query with BM25 scoring and the DAAT algorithm.
query_results = evaluate_queries(queries, relevance, False, "bm25", "daat")

# To inspect the outcome:
# for query_id, result in query_results.items():
#     print(f"Query {query_id}: Relevant Documents {result}")

Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([])
Quelli giusti erano:  dict_items([(1034435, 1), (1101607, 2), (1113693, 0), (1171591, 0), (117188, 0), (1172283, 0), (1172287, 0), (1277720, 2), (1277721, 2), (1277722, 2), (1277723, 2), (1277724, 2), (1277725, 2), (1277726, 2), (1277727, 2), (1277728, 2), (13

KeyboardInterrupt: 