# Ranking and Recall Data Analysis

This notebook analyzes ranking and recall data for the MIRCV project. It evaluates the performance of the developed ranking system by computing Mean Average Precision (MAP) and Mean Reciprocal Rank (MRR) from data extracted from text files. The custom functions below read, process, and prepare that data for detailed analysis.

## Notebook Structure
- **Reading Functions**: Defines the functions that read data from text files and organize it into dictionaries.
- **Data Loading**: Loads ranking and recall data from specific files for analysis.
- **Data Access and Analysis**: Displays and analyzes specific parts of the data to gain initial insights into the performance of the ranking system.

In [None]:
import os

In [None]:
# Reading Functions

# read_data: Reads ranking data from a file and organizes it into a dictionary
def read_data(file_path):
    """Read a ranking file and group its results by query.

    Every non-blank line is split on whitespace. Field 0 is used as the
    query id, field 2 as the document id and field 3 as the relevance
    value; field 1 is ignored. Blank or whitespace-only lines (such as a
    trailing empty line) are skipped.

    Args:
        file_path (str): Path of the file to read.

    Returns:
        dict: Maps each query id (str) to a list of
            ``(doc_id, relevance)`` tuples in file order. ``doc_id`` is a
            string normalised through ``int`` (so ``"0042"`` becomes
            ``"42"``) and ``relevance`` is an ``int``.

    Raises:
        IndexError: If a non-blank line has fewer than four fields.
        ValueError: If field 2 or field 3 is not an integer.
    """
    system_ranking = {}
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.split()
            if not parts:
                # Blank line: nothing to parse, and indexing would fail.
                continue
            qid = parts[0]
            doc_id = str(int(parts[2]))
            relevance = int(parts[3])
            system_ranking.setdefault(qid, []).append((doc_id, relevance))

    return system_ranking

# read_data_recall_base: Reads base recall data from a file and organizes it into a dictionary
def read_data_recall_base(file_path):
    """Read the per-query recall base from a file.

    Every non-blank line is split on whitespace. Field 0 is used as the
    query id and field 1 as its integer recall base. Blank or
    whitespace-only lines are skipped. If a query id appears more than
    once, the last value wins.

    Args:
        file_path (str): Path of the file to read.

    Returns:
        dict: Maps each query id (str) to its recall base (int).

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
        ValueError: If field 1 is not an integer.
    """
    recalls = {}
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.split()
            if not parts:
                # Blank line: nothing to parse, and indexing would fail.
                continue
            recalls[parts[0]] = int(parts[1])

    return recalls

In [None]:
def calculate_map(system_ranking, recall_bases, cutoff=10):
    """Print the per-query Average Precision and the resulting MAP.

    For each query only the first ``cutoff`` results are considered. A
    result counts as relevant when its relevance value (index 1 of the
    element) is greater than 0. The sum of precision values at the ranks
    of the relevant results is divided by the query's recall base.

    Args:
        system_ranking (dict): Maps a query id to an ordered sequence of
            elements whose index 1 is a numeric relevance value.
        recall_bases (dict): Maps a query id to its recall base.
        cutoff (int): Number of top results evaluated per query.
            Defaults to 10.

    Returns:
        float: The mean of the per-query AP values, or 0.0 when
            ``system_ranking`` is empty.
    """
    ap_scores = []

    for query, ranking in system_ranking.items():
        hits = 0
        precision_sum = 0.0

        for rank, element in enumerate(ranking, start=1):
            if rank > cutoff:
                # Nothing past the cutoff contributes; stop scanning.
                break
            if element[1] > 0:
                hits += 1
                precision_sum += hits / rank

        # A query that is missing from the recall bases, or whose base is
        # not positive, has no relevant documents to retrieve: score it 0
        # instead of failing with KeyError / ZeroDivisionError.
        rb = recall_bases.get(query, 0)
        ap = precision_sum / rb if rb > 0 else 0.0

        ap_scores.append(ap)
        print(f"AP {query}: {ap}")

    # Guard the mean against an empty ranking dictionary.
    map_score = sum(ap_scores) / len(ap_scores) if ap_scores else 0.0
    print(f"MAP: {map_score}")
    return map_score

In [None]:
def calculate_mrr(system_ranking, cutoff=10):
    """Print the per-query Reciprocal Rank and the resulting MRR.

    For each query the first ``cutoff`` results are scanned for the first
    element whose relevance value (index 1) is greater than 0; the
    reciprocal of its 1-based rank is the query's RR. A query with no
    relevant result inside the cutoff scores 0, including queries that
    returned fewer than ``cutoff`` results, so every query contributes
    exactly one value to the mean.

    Args:
        system_ranking (dict): Maps a query id to an ordered sequence of
            elements whose index 1 is a numeric relevance value.
        cutoff (int): Number of top results evaluated per query.
            Defaults to 10.

    Returns:
        float: The mean of the per-query RR values, or 0.0 when
            ``system_ranking`` is empty.
    """
    rr_scores = []

    for query, ranking in system_ranking.items():
        rr = 0
        for rank, element in enumerate(ranking, start=1):
            if rank > cutoff:
                break
            if element[1] > 0:
                rr = 1 / rank
                break

        # Recorded unconditionally: a short result list without any
        # relevant document must still count as a miss in the mean.
        rr_scores.append(rr)
        print(f"RR {query}: {rr}")

    # Guard the mean against an empty ranking dictionary.
    mrr = sum(rr_scores) / len(rr_scores) if rr_scores else 0.0
    print(f"MRR: {mrr}")
    return mrr

In [None]:
# Data Loading

# Input files, given relative to the current working directory:
# the ranking file is parsed by read_data and the recall-base file by
# read_data_recall_base.
system_rank_file = "./relevance_file/DAATBM25withRelevance.txt"
recall_base_file = "./qrel_file/id_counts.txt"

# system_ranking: query id -> list of (doc_id, relevance) tuples.
# recall_bases:   query id -> integer recall base.
# Both are module-level names that the later cells read.
system_ranking = read_data(system_rank_file)
recall_bases = read_data_recall_base(recall_base_file)

In [None]:
# Data Access and Analysis

# Look up one query in system_ranking as a quick sanity check.
# [0] selects the first (doc_id, relevance) tuple listed for the query and
# the second [0] selects its doc_id, so the value shown is a single
# document id, not the whole ranking.
# Raises KeyError if the query id is not present in the loaded file.
sample_query_id = '19335'
sample_ranking_data = system_ranking[sample_query_id][0][0]

# Print that document id together with the query id.
print(f"Dati di Ranking per la Query {sample_query_id}: {sample_ranking_data}")

In [None]:
# Compute AP/MAP for every ranking file (*.txt) in the folder.
folder_path = "relevance_file"

# os.listdir returns names in arbitrary order; sorting keeps the output
# reproducible from one run to the next.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".txt"):
        continue
    path = os.path.join(folder_path, filename)

    # Name the file first so the AP/MAP lines printed below can be
    # attributed to the ranking run they belong to.
    print(f"=== {filename} ===")

    # NOTE: this rebinds the module-level system_ranking on every pass.
    system_ranking = read_data(path)
    calculate_map(system_ranking, recall_bases)

In [None]:
# Compute RR/MRR for every ranking file (*.txt) in the folder.
folder_path = "relevance_file"

# os.listdir returns names in arbitrary order; sorting keeps the output
# reproducible from one run to the next.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".txt"):
        continue
    path = os.path.join(folder_path, filename)

    # Name the file first so the RR/MRR lines printed below can be
    # attributed to the ranking run they belong to.
    print(f"=== {filename} ===")

    # NOTE: this rebinds the module-level system_ranking on every pass.
    system_ranking = read_data(path)
    calculate_mrr(system_ranking)