# Ranking and Recall Data Analysis

This notebook is dedicated to the analysis of ranking and recall data related to the MIRCV project. It examines the performance of the developed ranking system using data extracted from text files. The following custom functions are used to read, process, and prepare the data for detailed analysis.

## Notebook Structure
- **Reading Functions**: Defines the functions that read data from text files and organize it into useful data structures.
- **Data Loading**: Loads ranking and recall data from specific files for analysis.
- **Data Access and Analysis**: Displays and analyzes specific parts of the data to gain initial insights into the performance of the ranking system.

In [None]:
import os
import math

In [None]:
# Reading Functions

# read_data: Reads ranking data from a file and organizes it into a dictionary
def read_data(file_path):
    """Reads ranking data (document ID and relevance) from a file.

    Each non-empty line is split on whitespace. Field 0 is the query ID,
    field 2 is the document ID and field 3 is the relevance; field 1 is
    ignored. The document ID is normalised through ``int`` so that leading
    zeros are dropped (``"0012"`` becomes ``"12"``).

    Args:
        file_path (str): Path of the file to read.

    Returns:
        dict: Maps each query ID to a list of ``(doc_id, relevance)`` tuples
            in the order the lines appear in the file.

    Raises:
        IndexError: If a non-empty line has fewer than four fields.
        ValueError: If the document ID or the relevance is not an integer.
    """
    system_ranking = {}
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.split()
            # Blank lines (e.g. a trailing newline) carry no data: skip them
            # instead of failing on parts[0].
            if not parts:
                continue
            qid = parts[0]
            doc_id = str(int(parts[2]))
            relevance = int(parts[3])
            system_ranking.setdefault(qid, []).append((doc_id, relevance))

    return system_ranking

# read_data_recall_base: Reads base recall data from a file and organizes it into a dictionary
def read_data_recall_base(file_path):
    """Reads base recall data from a file.

    Each non-empty line is split on whitespace. Field 0 is the query ID and
    field 1 is an integer stored as that query's recall base. If a query ID
    appears more than once, the last value wins.

    Args:
        file_path (str): Path of the file to read.

    Returns:
        dict: Maps each query ID to its integer recall base.

    Raises:
        IndexError: If a non-empty line has fewer than two fields.
        ValueError: If the recall base is not an integer.
    """
    recalls = {}
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.split()
            # Ignore blank lines rather than failing on parts[0].
            if not parts:
                continue
            recalls[parts[0]] = int(parts[1])

    return recalls

# read_data_2: Reads ranking data (document IDs only, without relevance) from a file and organizes it into a dictionary
def read_data_2(file_path):
    """Reads ranking data (document IDs only) from a file.

    Each non-empty line is split on whitespace. Field 0 is the query ID and
    field 2 is the document ID; the other fields are ignored. The document
    ID is normalised through ``int`` so that leading zeros are dropped.

    Args:
        file_path (str): Path of the file to read.

    Returns:
        dict: Maps each query ID to the list of its document IDs (strings)
            in the order the lines appear in the file.

    Raises:
        IndexError: If a non-empty line has fewer than three fields.
        ValueError: If the document ID is not an integer.
    """
    system_ranking = {}
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.split()
            # Blank lines carry no data: skip them instead of failing.
            if not parts:
                continue
            qid = parts[0]
            doc_id = str(int(parts[2]))
            system_ranking.setdefault(qid, []).append(doc_id)

    return system_ranking

# read_data_ground_true: Reads ground truth data from a file and organizes it into a dictionary
def read_data_ground_true(file_path, keys):
    """Reads ground truth data from a file, restricted to the given queries.

    Each non-empty line is split on whitespace. Field 0 is the query ID,
    field 2 is the document ID and field 3 is the relevance. Lines whose
    query ID is not in ``keys`` are skipped. The document ID is normalised
    through ``int`` so that leading zeros are dropped.

    Args:
        file_path (str): Path of the file to read.
        keys (iterable): Query IDs to consider (list, set, dict keys, ...).

    Returns:
        dict: Maps each selected query ID to a ``{doc_id: relevance}``
            dictionary, in file order. Queries from ``keys`` that never
            appear in the file are absent from the result.

    Raises:
        IndexError: If a selected line has fewer than four fields.
        ValueError: If a selected line has a non-integer document ID or
            relevance.
    """
    # Build the lookup set once so membership tests are O(1) per line even
    # when the caller passes a list.
    wanted = set(keys)

    ground_true = {}
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.split()
            # Skip blank lines and lines for queries we do not care about.
            if not parts or parts[0] not in wanted:
                continue
            qid = parts[0]
            doc_id = str(int(parts[2]))
            relevance = int(parts[3])
            ground_true.setdefault(qid, {})[doc_id] = relevance
    return ground_true

In [None]:
# Function to calculate Mean Average Precision (MAP)
def calculate_map(system_ranking, recall_bases, k=10):
    """Prints the per-query Average Precision and the resulting MAP.

    For each query only the first ``k`` ranked elements are inspected. Every
    element whose relevance is greater than 0 contributes the precision at
    its rank (relevant elements seen so far / rank). The sum is divided by
    the query's recall base to obtain the AP.

    Args:
        system_ranking (dict): Maps each query ID to a list of
            ``(doc_id, relevance)`` tuples in ranking order.
        recall_bases (dict): Maps each query ID to the value used as the AP
            denominator.
        k (int): Number of top-ranked elements to consider. Defaults to 10.

    Returns:
        float: The mean of the per-query AP values, or 0.0 when
            ``system_ranking`` is empty.

    Raises:
        KeyError: If a query of ``system_ranking`` is missing from
            ``recall_bases``.
    """
    map_scores = []

    for query, ranking in system_ranking.items():
        precision_sum = 0
        relevant_seen = 0

        # Slicing stops at the cutoff instead of walking the whole ranking.
        for rank, element in enumerate(ranking[:max(k, 0)], start=1):
            if element[1] > 0:
                relevant_seen += 1
                precision_sum += relevant_seen / rank

        rb = recall_bases[query]

        # A recall base of 0 means there is nothing to retrieve: define the
        # AP as 0.0 instead of dividing by zero.
        ap = precision_sum / rb if rb else 0.0
        map_scores.append(ap)
        print(f"AP {query}: {ap}")

    # No queries at all: report 0.0 rather than dividing by zero.
    map_avg = sum(map_scores) / len(map_scores) if map_scores else 0.0
    print(f"MAP: {map_avg}")
    return map_avg

In [None]:
# Function to calculate Mean Reciprocal Rank (MRR)
def calculate_mrr(system_ranking, k=10):
    """Prints the per-query Reciprocal Rank and the resulting MRR.

    For each query the first ``k`` ranked elements are scanned for the first
    one whose relevance is greater than 0; its reciprocal rank (1 / rank) is
    the query's score. A query with no relevant element in the top ``k``
    scores 0, including queries that have fewer than ``k`` results, so every
    query contributes to the mean.

    Args:
        system_ranking (dict): Maps each query ID to a list of
            ``(doc_id, relevance)`` tuples in ranking order.
        k (int): Number of top-ranked elements to consider. Defaults to 10.

    Returns:
        float: The mean of the per-query reciprocal ranks, or 0.0 when
            ``system_ranking`` is empty.
    """
    mrr_scores = []
    for query, ranking in system_ranking.items():
        # Stays 0 when no relevant element is found within the cutoff, even
        # if the ranking is shorter than k.
        rr = 0
        for rank, element in enumerate(ranking[:max(k, 0)], start=1):
            if element[1] > 0:
                rr = 1 / rank
                break

        mrr_scores.append(rr)
        print(f"RR {query}: {rr}")

    # No queries at all: report 0.0 rather than dividing by zero.
    mrr_avg = sum(mrr_scores) / len(mrr_scores) if mrr_scores else 0.0
    print(f"MRR: {mrr_avg}")
    return mrr_avg

In [None]:
# dcg: Calculates the Discounted Cumulative Gain
def dcg(relevance, k) -> float:
    """
    Calculates the Discounted Cumulative Gain (DCG).

    The first score is taken undiscounted; the score at 1-based rank
    ``r >= 2`` is divided by ``log2(r)``. Only the first ``k`` scores are
    considered.

    Args:
        relevance (list): List of relevance scores in ranking order.
        k (int): The depth of ranking to consider.

    Returns:
        float: The calculated DCG, or 0.0 when ``relevance`` is empty or
            ``k`` is not positive.
    """
    top = relevance[:k] if k > 0 else []
    # Nothing to accumulate: avoid indexing into an empty list.
    if not top:
        return 0.0

    # math.log2 is used rather than math.log(x, 2) for accuracy.
    discounted = sum(
        score / math.log2(rank) for rank, score in enumerate(top[1:], start=2)
    )
    return float(top[0] + discounted)

# ndcg: Calculates the Normalized Discounted Cumulative Gain
def ndcg(system_ranking, ground_true, k):
    """
    Calculates the Normalized Discounted Cumulative Gain (nDCG).

    The DCG of the system ranking is divided by the DCG of the ideal
    ranking, i.e. the ground truth relevance scores sorted from highest to
    lowest.

    Args:
        system_ranking (list): List of ranked document IDs.
        ground_true (dict): Maps document IDs to ground truth relevance
            scores. Documents missing from it count as relevance 0.
        k (int): The depth of ranking to consider.

    Returns:
        float: The calculated nDCG, or 0.0 when the ranking is empty, there
            is no ground truth, or the ideal DCG is 0.
    """
    # Get relevance scores for items in the system ranking
    relevances = [ground_true.get(doc_id, 0) for doc_id in system_ranking]

    # The ideal ranking lists the most relevant documents first. Sort
    # explicitly instead of relying on the order the ground truth was read.
    base_ideal = sorted(ground_true.values(), reverse=True)

    # Nothing retrieved or nothing judged: the score is 0 by definition.
    if not relevances or not base_ideal:
        return 0.0

    ideal_dcg = dcg(base_ideal, k)
    # No relevant document within the cutoff: avoid dividing by zero.
    if ideal_dcg == 0:
        return 0.0

    # Calculate and return nDCG
    return dcg(relevances, k) / ideal_dcg

In [None]:
# Data Loading

# Input files: the per-query recall bases and the ground truth judgements
recall_base_file = "./qrel_file/id_counts.txt"
ground_true_file = "./qrel_file/ordered-qrel-msmarco-2019.txt"

# Load the recall bases once (query ID -> integer used as the AP denominator)
recall_bases = read_data_recall_base(recall_base_file)

# Folder holding the ranking files to evaluate, and folder receiving the nDCG files
folder_path = "relevance_file"
ndcg_path = "NDCG_file"

In [None]:
# Calculate and write nDCG scores to files

# open(..., 'w') does not create missing folders, so make sure the output
# folder exists before writing into it.
os.makedirs(ndcg_path, exist_ok=True)

for filename in os.listdir(folder_path):
    # Only the .txt ranking files are evaluated
    if not filename.endswith(".txt"):
        continue

    path = os.path.join(folder_path, filename)
    # The result file keeps the name of the ranking file it was computed from
    path_ndcg = os.path.join(ndcg_path, filename)

    system_ranking = read_data_2(path)
    # Load the judgements only for the queries present in this ranking file
    ground_true = read_data_ground_true(ground_true_file, system_ranking.keys())

    with open(path_ndcg, 'w') as file:
        # One "<qid>\t<nDCG@10>" line per query
        for qid, ranked_docs in system_ranking.items():
            score = ndcg(ranked_docs, ground_true[qid], 10)
            file.write(f"{qid}\t{score}\n")

In [None]:
# Evaluate every ranking file in the folder: print its per-query AP and its MAP
for filename in os.listdir(folder_path):
    # Anything that is not a .txt ranking file is ignored
    if not filename.endswith(".txt"):
        continue

    path = os.path.join(folder_path, filename)
    system_ranking = read_data(path)
    calculate_map(system_ranking, recall_bases)

In [None]:
# Evaluate every ranking file in the folder: print its per-query RR and its MRR
for filename in os.listdir(folder_path):
    # Anything that is not a .txt ranking file is ignored
    if not filename.endswith(".txt"):
        continue

    path = os.path.join(folder_path, filename)
    system_ranking = read_data(path)
    calculate_mrr(system_ranking)

MAP: 0.0890959203131625 daat bm25

MAP: 0.0826614187414304 daat tfidf

MAP: 0.08909592031316252 dynamicpruning bm25

MAP: 0.08266141874143042 dynamicpruning tfidf


MRR: 0.712015503875969 daat bm25

MRR: 0.6866925064599484 daat tfidf

MRR: 0.7120155038759689 dynamicpruning bm25

MRR: 0.6866925064599484 dynamicpruning tfidf