# **Text, Web, & Media Analytics Assignment 2**

# Setup

In [1]:
# NOTE: ORDER ME PLEASE 🙇‍♂️
import os
import math

# import sklearn
# import matplotlib.pyplot as plt
# import numpy as np

from ir_models import BM25, JM_LM
from ir_tools import write_scores_to_file, term_specificity, score_normalisation
from parsing_functions import parse_stop_words, parse_collection, parse_query, parse_query_set, parse_evaluations

from data_structures import bow_document_collection

In [2]:
# Parse in stop words (reused below by both the collection parser and the query parser)
stop_words = parse_stop_words('common-english-words.txt')

# Load the document set: one parsed collection per entry of the input folder.
# The dictionary key is whatever follows the first '_C' in the entry name, so
# an entry whose name contains no '_C' raises IndexError on the split below.
document_set = {}
input_path = 'Data_Collection'
for collection_path in os.listdir(input_path):
    data_key = collection_path.split('_C', 1)[1]
    document_set[data_key] = parse_collection(stop_words, os.path.join(input_path, collection_path))

# Parse in evaluation benchmarks (indexed below by query number)
evaluations = parse_evaluations('EvaluationBenchmark/')

# Parse in query set, then derive two extra columns per query:
#   'parsed_titles'  - the query title run through parse_query with the stop words
#   'weighted_terms' - output of term_specificity for that query
# Both document_set and evaluations are indexed with row['number'], so a query
# number missing from either mapping raises KeyError.
# NOTE(review): the trailing 0.0 and 0.5 are positional arguments of
# ir_tools.term_specificity; their meaning is not visible here - confirm.
query_frame = parse_query_set('the50Queries.txt')
query_frame['parsed_titles'] = query_frame['title'].apply(lambda row: parse_query(row, stop_words))
query_frame['weighted_terms'] =\
    query_frame.apply(lambda row: 
        term_specificity(
            document_set[row['number']], row['parsed_titles'], 
            evaluations[row['number']], 0.0, 0.5
            ), 
        axis = 1)

# Task 1: BM25 ✔️

# Task 2: Jelinek-Mercer Language Model ✔️

# Task 3: Pseudo-Relevance Model

In [15]:
def cosine_similarity(vector1: list, vector2: list) -> float:
    """
    Return the cosine of the angle between two numeric vectors.

    The value is the dot product of the vectors divided by the product of
    their Euclidean norms. For vectors with only non-negative components the
    result lies in [0, 1].

    Args:
        vector1: first sequence of numbers.
        vector2: second sequence of numbers.

    Returns:
        The cosine similarity, or 0.0 when either vector has a Euclidean norm
        of zero (which includes empty vectors).

    Note:
        The dot product pairs components with ``zip``, so when the vectors
        differ in length the extra components of the longer one contribute to
        its norm but not to the dot product.
    """

    # Dot product: sum of the element-wise products
    inner_product = sum(left * right for left, right in zip(vector1, vector2))

    # Euclidean length of each vector
    length1 = math.sqrt(sum(component ** 2 for component in vector1))
    length2 = math.sqrt(sum(component ** 2 for component in vector2))

    # A zero-length vector has no direction, so the similarity is defined as 0.0
    if length1 == 0 or length2 == 0:
        return 0.0

    return inner_product / (length1 * length2)

def Vector_Space_Model(collection: bow_document_collection, query: dict) -> dict:
    """
    Rank the documents of a collection against a query with a tf-idf
    Vector Space Model and cosine similarity.

    Every document is turned into a vector with one component per term of the
    collection vocabulary, weighted as ``tf * idf`` with
    ``idf = log10(N / collection.term_frequency[term])``. The query is turned
    into a vector over the same vocabulary using the weights supplied in
    ``query`` (0 for terms the query does not contain). Each document is then
    scored by the cosine similarity between its vector and the query vector.

    Args:
        collection: the bow_document_collection to rank.
        query: mapping of query term -> query weight.

    Returns:
        dict of {document id: cosine similarity}, ordered by descending score.

    Raises:
        TypeError: if collection is not a bow_document_collection or query is
            not a dict.
        AttributeError: if the collection contains no documents.
    """

    # Type check to ensure collection is a bow_document_collection
    if not isinstance(collection, bow_document_collection):
        raise TypeError("collection: must be a bow_document_collection object.")

    # If collection contains no documents, raise attribute error
    if len(collection.docs) == 0:
        raise AttributeError("collection: object contains no documents (bow_document objects).")

    # Type check to ensure query is a dict
    if not isinstance(query, dict):
        raise TypeError("query: must be a dict object.")

    # NOTE(review): the idf formula treats these values as document
    # frequencies; confirm that is what collection.term_frequency holds.
    collection_term_frequency = collection.term_frequency
    N = len(collection.docs)  # total number of documents in the collection

    # Fix the vocabulary order once so that query and document vectors share
    # the same dimension layout.
    all_terms = list(collection_term_frequency)

    # idf for every term of the vocabulary
    idf_component = {
        term: math.log(N / doc_freq, 10)
        for term, doc_freq in collection_term_frequency.items()
    }

    # Query as a vector over the collection vocabulary
    query_vector = [query.get(term, 0) for term in all_terms]

    document_similarity_scores = {}
    for doc_ID, doc in collection.docs.items():
        term_freq_dict = doc.get_term_list()

        # Build a NEW vector for each document. Re-using one shared list
        # (as was done previously) makes every document alias the same
        # object, so all documents end up with the last document's weights.
        doc_vector = [
            term_freq_dict.get(term, 0) * idf_component[term]
            for term in all_terms
        ]

        document_similarity_scores[doc_ID] = cosine_similarity(doc_vector, query_vector)

    # Return documents in descending order of similarity
    return dict(sorted(document_similarity_scores.items(), key=lambda item: item[1], reverse=True))


In [5]:
def w5(collection: bow_document_collection, evaluations: dict, theta: int | float = 0) -> dict:
    """
    Compute weight-5 (w5) term scores for a collection and keep the terms that
    score above the mean.

    For each term that occurs in at least one relevant document:

        w5 = ((r + 0.5) / (R - r + 0.5)) / ((n - r + 0.5) / (N - n - R + r + 0.5))

    where N is the number of documents in the collection, R the number of
    entries in ``evaluations`` labelled 1, n the number of documents that
    contain the term and r the number of relevant documents that contain it.

    Args:
        collection: the bow_document_collection whose terms are scored.
        evaluations: mapping of document id -> relevance label (1 = relevant).
        theta: offset added to the mean score to form the selection cut-off.

    Returns:
        dict of {term: w5 score} for the terms whose score is strictly greater
        than ``mean(w5 scores) + theta``. The mean is 0 when no term was scored.

    Raises:
        TypeError: if an argument has the wrong type.
        AttributeError: if the collection contains no documents.
        KeyError: if a document that has terms is missing from ``evaluations``.
    """

    # Type check(s)
    if not isinstance(collection, bow_document_collection):
        raise TypeError("collection: must be a bow_document_collection object.")
    if not isinstance(evaluations, dict):
        raise TypeError("evaluations: value must be a dict.")
    if not isinstance(theta, (int, float)):
        raise TypeError("theta: value must be an int or float.")

    # Collection check
    if len(collection.docs) == 0:
        raise AttributeError("collection: object contains no documents (bow_document objects).")

    N = len(collection.docs)  # total number of documents in the collection
    R = sum(1 for relevancy in evaluations.values() if relevancy == 1)  # total number of relevant documents

    # Count, per term, the documents containing it (total and relevant only)
    term_relevance_count = {}
    term_total_count = {}
    for doc_id, doc in collection.docs.items():
        for term in doc.get_term_list().keys():
            if evaluations[doc_id] == 1:
                term_relevance_count[term] = term_relevance_count.get(term, 0) + 1
            term_total_count[term] = term_total_count.get(term, 0) + 1

    # Calculate the weight-5 score for each term seen in a relevant document
    weight5_scores = {}
    for term, relevance_count in term_relevance_count.items():
        n_tk = term_total_count[term]
        score_numerator = (relevance_count + 0.5) / (R - relevance_count + 0.5)
        score_denominator = (n_tk - relevance_count + 0.5) / (N - n_tk - R + relevance_count + 0.5)
        weight5_scores[term] = score_numerator / score_denominator

    # Mean of the scores. Guard on the scores themselves rather than on R:
    # R can be positive while no term was scored (e.g. the relevant ids are
    # not part of this collection), which previously divided by zero.
    mean_weight5 = 0
    if weight5_scores:
        mean_weight5 = sum(weight5_scores.values()) / len(weight5_scores)

    # Select features based on the mean weight5-score and theta
    return {term: score for term, score in weight5_scores.items() if score > mean_weight5 + theta}

In [40]:
def My_PRM(weighting_function, collection: bow_document_collection, query: dict, threshold: int, theta: int) -> dict:
    """
    Pseudo-Relevance Model: re-rank a collection using feedback derived from
    an initial ranking instead of real relevance judgements.

    Steps:
      1. Score every document with ``weighting_function(collection, query)``.
      2. Label a document 1 (pseudo-relevant) when its score is strictly
         greater than ``threshold``, otherwise 0.
      3. Pass those labels to ``w5`` to obtain a set of weighted feature terms.
      4. Score each document as the sum over its terms of
         ``term frequency * feature weight`` (0 for non-feature terms).
      5. Sort by descending score and pass the result through
         ``score_normalisation``.

    Args:
        weighting_function: callable taking (collection, query) and returning
            {document id: score}, e.g. BM25 or Vector_Space_Model.
        collection: the bow_document_collection to rank.
        query: the query passed through to ``weighting_function``.
        threshold: score above which a document counts as pseudo-relevant.
        theta: offset forwarded to ``w5`` for feature selection.

    Returns:
        The value returned by ``score_normalisation`` for the sorted
        {document id: score} dictionary.

    Raises:
        TypeError: if ``weighting_function`` is not callable.
    """

    # Ensure that the weighting function is callable
    if not callable(weighting_function):
        raise TypeError("weighting_function must be a callable function.")

    # 1) Initial ranking from the supplied model
    initial_ranking = weighting_function(collection, query)

    # 2) Pseudo-relevance labels: 1 above the threshold, 0 otherwise
    pseudo_labels = {}
    for doc, score in initial_ranking.items():
        if score > threshold:
            pseudo_labels[doc] = 1
        else:
            pseudo_labels[doc] = 0

    # 3) Feature terms and their weights from the pseudo labels
    feature_weights = w5(collection=collection, evaluations=pseudo_labels, theta=theta)

    # 4) Re-score every document with the selected features
    rescored = {}
    for doc_ID, doc in collection.docs.items():
        running_total = 0
        for term, frequency in doc.get_term_list().items():
            running_total += frequency * feature_weights.get(term, 0)
        rescored[doc_ID] = running_total

    # 5) Order, normalise, and return results
    ranked = dict(sorted(rescored.items(), key=lambda item: item[1], reverse=True))
    return score_normalisation(ranked)

In [41]:
# Initialise result dicts: {query key: ranking returned by the model}
BM25_results = {}
JM_LM_results = {}
My_PRM_results = {}

# Each collection in document_set is keyed by the number of the query it
# belongs to, so iterating the collections also iterates the queries.
for query_key, collection in document_set.items():
    # Retrieve the weighted query terms for this query number
    query = query_frame.loc[query_frame['number'] == query_key, 'weighted_terms'].iloc[0]

    # Rank documents with each of the three models
    JM_LM_results[query_key] = JM_LM(collection=collection, query=query)
    BM25_results[query_key] = BM25(collection=collection, query=query)
    My_PRM_results[query_key] = My_PRM(weighting_function=BM25, collection=collection, query=query, threshold=0.7, theta=0)  # NOTE: GSCV threshold/theta?

    # Save results. All three models use the "<Model>_R<query>Ranking" name
    # required by the task description; the PRM name previously lacked "_R".
    write_scores_to_file(BM25_results[query_key], f"BM25_R{query_key}Ranking")
    write_scores_to_file(JM_LM_results[query_key], f"JM_LM_R{query_key}Ranking")
    write_scores_to_file(My_PRM_results[query_key], f"My_PRM_R{query_key}Ranking")

# Task 4: Model Testing

**Description:** Use Python to implement three models: `BM25`, `JM_LM`, and `My_PRM`, and **test them on the given 50 data collections for the corresponding 50 queries (topics)**. 

Design Python programs to implement these three models. You can use a .py file (or a .ipynb file) for each model.


For each long query, your python programs will produce ranked results and save them into .dat files. For example, for query R107, you can save the ranked results of three models into “BM25_R107Ranking.dat”, “JM_LM_R107Ranking.dat”, and “My_PRM_R107Ranking.dat”, respectively by using the following format:
- The first column is the document id (the itemid in the corresponding XML document)
- The second column is the document score (or probability).

**Describe:** 
- Python packages or modules (or any open-source software) you used
- The data structures used to represent a single document and a set of documents for each model (you can use different data structures for different models).


You also need to **test the three models on the given 50 data collections for the 50 queries (topics) by *printing out the top 15 documents* for each data collection (in descending order)**. The **output will also be put in the appendix of your final report**.

In [50]:
# NOTE: the outputs of the top-15 for each model goes in the appendix of the final report
# NOTE: Could have the sorting in the earlier code, before writing it into the file?
#       ✔️ Already done, doesn't hurt to sort again like we do here


def get_top15(model_results, k=15):
    """
    Print the highest-scoring documents of every query, best first.

    input:
    model_results (dict): {query number: {document id: document score}}
    k (int): how many documents to print per query (default 15). A query with
        fewer than k documents simply prints all of them.

    output:
    Nothing is returned. For each query a header line is printed, followed by
    one "<document id>: <score>" line per document in descending score order,
    and a blank line.
    """

    # Iterate over each {query: predictions} pair, where predictions is a dictionary of {doc_id: document weight}
    for query, predictions in model_results.items():
        print(f'Query{query} (DocID Weight):')  # print result header information

        # Sort by weight (descending) and keep the first k entries; slicing
        # is safe when there are fewer than k documents.
        top_ranked = sorted(predictions.items(), key=lambda item: item[1], reverse=True)[:k]

        # f-string formatting works for any id type; string concatenation
        # raised TypeError for non-string document ids.
        for doc_id, weight in top_ranked:
            print(f'{doc_id}: {weight}')  # print results data

        print()  # print linebreak for readability

In [51]:
# Print the top-ranked documents per query for the BM25 rankings computed earlier
get_top15(BM25_results)

Query101 (DocID Weight):
46547: 1.0
46974: 1.0
62325: 0.7341896585684348
6146: 0.60550764161471
22170: 0.5418384358126043
61329: 0.18114697633537039
61780: 0.15335920046666113
22513: 0.1204218983126723
82330: 0.09704349892160438
39496: 0.08339330638207847
18586: 0.0
26642: 0.0
26847: 0.0
27577: 0.0
30647: 0.0

Query102 (DocID Weight):
73038: 1.0
65414: 0.8585234553285452
26061: 0.7198133666237954
57914: 0.6718867897268519
33203: 0.666559745440119
76635: 0.6571770698872418
12769: 0.6462544842294775
12767: 0.6429722989844446
58476: 0.6055429131461574
25096: 0.5675513752492433
26611: 0.5485989282045247
82227: 0.5457606220472528
33172: 0.5388810732446038
29908: 0.5375754117144166
28662: 0.5368629255051036

Query103 (DocID Weight):
27426: 1.0
27106: 0.9587981841811211
59459: 0.9198469632250956
83370: 0.8924476974340898
20159: 0.8164035278386376
54533: 0.5798807360443394
14314: 0.5313123321086101
26258: 0.50357530859969
82912: 0.5021889116268063
56704: 0.5016396479068459
85889: 0.50142172831

# Task 5: Model Evaluation

**Description:** Use three effectiveness measures to evaluate the three models.

In this task, you need to **use the relevance judgments (EvaluationBenchmark.zip)** to **compare with the ranking outputs in the folder of “RankingOutputs” for the selected effectiveness metric** for the three models.


You need to use the following three different effectiveness measures to evaluate the document ranking results you saved in the folder “RankingOutputs”:
1) Average precision (and MAP)
2) Precision@10 (and their average)
3) Discounted cumulative gain at rank position 10 ($p = 10$), $DCG_{10}$ (and their average):  
    $DCG_p=rel_1+\sum_{i=2}^p\frac{rel_i}{\log_2(i)}$  
        $rel_i=1$ if the document at position $i$ is relevant; otherwise, it is 0.

Evaluation results can be summarized in tables or graphs. Examples are provided in the specification sheet.

### Getting evaluation benchmark set

In [54]:
# Re-load the relevance judgements from the benchmark folder. This rebinds the
# notebook-level name `evaluations` that the setup cell also assigns.
folder_path = 'EvaluationBenchmark'
evaluations = parse_evaluations(folder_path)
# Bare expression: displays the parsed mapping as the cell output
evaluations

{'101': {'6146': 0,
  '18586': 0,
  '22170': 0,
  '22513': 0,
  '26642': 0,
  '26847': 0,
  '27577': 0,
  '30647': 0,
  '39496': 1,
  '46547': 1,
  '46974': 1,
  '61329': 0,
  '61780': 0,
  '62325': 1,
  '63261': 1,
  '77909': 0,
  '80425': 0,
  '80950': 0,
  '81463': 0,
  '82330': 1,
  '82454': 1,
  '82912': 0,
  '83167': 0},
 '102': {'3265': 1,
  '3827': 1,
  '3828': 1,
  '3833': 1,
  '3834': 1,
  '3835': 1,
  '3837': 1,
  '3972': 0,
  '3976': 1,
  '4306': 1,
  '4310': 1,
  '4358': 1,
  '4395': 1,
  '4439': 1,
  '4881': 1,
  '4933': 0,
  '5862': 1,
  '6497': 1,
  '6498': 1,
  '6503': 1,
  '6534': 1,
  '6635': 0,
  '6636': 1,
  '6735': 1,
  '7115': 1,
  '7118': 1,
  '7502': 1,
  '7937': 0,
  '8327': 1,
  '8333': 1,
  '8455': 1,
  '9358': 0,
  '9479': 0,
  '9703': 1,
  '9726': 0,
  '9790': 1,
  '10182': 0,
  '11083': 0,
  '11481': 1,
  '11485': 1,
  '11922': 1,
  '11923': 1,
  '11930': 1,
  '11960': 0,
  '11979': 1,
  '12479': 1,
  '12767': 0,
  '12769': 0,
  '12812': 1,
  '12825': 0,


### Average Precision (MAP)

In [29]:
def calculate_avg_precision(rel_docs, model_results, threshold):
    """
    Calculate, for every topic, the precision of the documents the model
    "retrieved", where a document counts as retrieved when its model score is
    strictly greater than ``threshold``.

        precision = (retrieved documents that are relevant) / (retrieved documents)

    NOTE: this is a set-based precision over the thresholded documents; the
    rank positions of the documents are not taken into account.

    Inputs:
    rel_docs (dict): {topic : {docid : relevance}} from the benchmark, used as
        the ground truth (relevance 1 = relevant).
    model_results (dict): {topic : {docid : score}} produced by the model.
    threshold (float): a document is treated as retrieved when its score is
        greater than this value.

    Returns:
    coll_precisions (dict): {topic : {'precision' : value}} for every topic in
        rel_docs. The value is 0.0 when the model retrieved nothing for the
        topic and 0 when the topic is absent from model_results.
    """

    coll_precisions = {}

    for topic, rel_dict in rel_docs.items():
        # Topic without any model output: precision is reported as 0
        if topic not in model_results:
            coll_precisions[topic] = {
                'precision': 0
            }
            continue

        model_dict = model_results[topic]  # docID: score from model

        # Documents the model retrieved (score above the threshold)
        retrieved = [docID for docID, value in model_dict.items() if value > threshold]
        R1 = len(retrieved)

        # Correctly retrieved documents (true positives). A retrieved document
        # that has no benchmark judgement is treated as not relevant instead
        # of raising KeyError.
        RR1 = sum(1 for docID in retrieved if rel_dict.get(docID, 0) == 1)

        # Precision; 0.0 when nothing was retrieved
        if R1 > 0:
            p = float(RR1) / float(R1)
        else:
            p = 0.0

        # Store precision for each topic as {topic : {'precision' : p}}
        coll_precisions[topic] = {
            'precision': p
        }

    return coll_precisions



#function to calculate the averages
def calculate_map(coll_scores):
    """
    Average the per-topic precision values of a collection.

    input:
    coll_scores (dict): {topic : {name : value}}, as returned by
        calculate_avg_precision. Every value of every inner dictionary is
        added to the total.

    output:
    avg_score (float): the total divided by the number of topics, or 0.0 when
        coll_scores is empty.
    """
    # Nothing to average: avoid dividing by zero
    if not coll_scores:
        return 0.0

    total_score = 0.0
    for preds in coll_scores.values():      # iterate over each topic
        for score in preds.values():        # values of the inner {name : value} dict
            total_score += score

    return total_score / len(coll_scores)


#### MAP for BM_25

In [30]:
# Score cut-off: a document counts as retrieved by BM25 when its score is above this value
threshold = 0.25

# Calculate precision for each query
# NOTE(review): `relevance_scores` is not assigned in any cell visible here;
# presumably it holds the parsed benchmark (same shape as `evaluations`) - confirm.
bm25_precision = calculate_avg_precision(relevance_scores, BM25_results, threshold)

# Print the precision for each query
for topic, metric in bm25_precision.items():
    print(f" Topic {topic}: Precision: {metric['precision']}")

 Topic 101: Precision: 0.375
 Topic 102: Precision: 0.7878787878787878
 Topic 103: Precision: 0.1724137931034483
 Topic 104: Precision: 0.0
 Topic 105: Precision: 0.0
 Topic 106: Precision: 0.1875
 Topic 107: Precision: 0.25
 Topic 108: Precision: 0.10526315789473684
 Topic 109: Precision: 0.4
 Topic 110: Precision: 0.3125
 Topic 111: Precision: 0.17647058823529413
 Topic 112: Precision: 0.5454545454545454
 Topic 113: Precision: 0.43478260869565216
 Topic 114: Precision: 0.5
 Topic 115: Precision: 0.16666666666666666
 Topic 116: Precision: 0.25
 Topic 117: Precision: 0.0
 Topic 118: Precision: 0.25
 Topic 119: Precision: 0.0
 Topic 120: Precision: 0.2857142857142857
 Topic 121: Precision: 0.45161290322580644
 Topic 122: Precision: 0.6521739130434783
 Topic 123: Precision: 0.18181818181818182
 Topic 124: Precision: 0.0
 Topic 125: Precision: 0.5833333333333334
 Topic 126: Precision: 0.0
 Topic 127: Precision: 0.4166666666666667
 Topic 128: Precision: 0.09090909090909091
 Topic 129: Prec

In [31]:
# Average the per-topic BM25 precision values (displayed as the cell output)
calculate_map(bm25_precision)

0.2556163850770885

#### MAP for JM_LM

In [32]:
# Score cut-off: a document counts as retrieved by JM_LM when its score is above this value
threshold = 0.0001

# Calculate precision for each query
# NOTE(review): `relevance_scores` is not assigned in any cell visible here;
# presumably it holds the parsed benchmark (same shape as `evaluations`) - confirm.
jmlm_precision = calculate_avg_precision(relevance_scores, JM_LM_results, threshold)

# Print the precision for each query
for topic, metric in jmlm_precision.items():
    print(f" Topic {topic}: Precision: {metric['precision']}")


 Topic 101: Precision: 1.0
 Topic 102: Precision: 0.0
 Topic 103: Precision: 0.0
 Topic 104: Precision: 0.0
 Topic 105: Precision: 0.0
 Topic 106: Precision: 0.0
 Topic 107: Precision: 0.0
 Topic 108: Precision: 0.0
 Topic 109: Precision: 0.0
 Topic 110: Precision: 0.0
 Topic 111: Precision: 0.0
 Topic 112: Precision: 0.0
 Topic 113: Precision: 0.0
 Topic 114: Precision: 0.0
 Topic 115: Precision: 0.0
 Topic 116: Precision: 0.0
 Topic 117: Precision: 0.125
 Topic 118: Precision: 0.0
 Topic 119: Precision: 0.0
 Topic 120: Precision: 0.0
 Topic 121: Precision: 0.0
 Topic 122: Precision: 0.0
 Topic 123: Precision: 0.0
 Topic 124: Precision: 0.0
 Topic 125: Precision: 0.7142857142857143
 Topic 126: Precision: 0.6153846153846154
 Topic 127: Precision: 0.0
 Topic 128: Precision: 0.0
 Topic 129: Precision: 0.0
 Topic 130: Precision: 0.0
 Topic 131: Precision: 0.16666666666666666
 Topic 132: Precision: 0.0
 Topic 133: Precision: 0.0
 Topic 134: Precision: 0.0
 Topic 135: Precision: 0.0
 Topic 

In [33]:
# Average the per-topic JM_LM precision values (displayed as the cell output)
calculate_map(jmlm_precision)

0.06969946719946721

#### MAP for My_PRM

### Precision @ 10

In [34]:
# Functions

def precision_at_k(rel_docs, retrieved_docs, k, threshold):

    """
    Calculate a thresholded precision over the k highest-scoring documents of
    one query.

    The model scores are sorted from highest to lowest and only the first k
    are kept. Within those k documents:

        retrieved           = documents whose score is >= threshold
        correctly retrieved = retrieved documents whose benchmark relevance is 1
        precision           = correctly retrieved / retrieved

    NOTE: the denominator is the number of top-k documents that reach the
    threshold, not k itself.

    Inputs:
    rel_docs(dict): {docid : relevance} for the query; the ground truth
        (relevance 1 = relevant).
    retrieved_docs(dict): {docid : score} for the query; the model predictions.
    k(int): number of top-ranked documents to consider.
    threshold(float): minimum score for a document to count as retrieved.

    output:
    precision(float): the precision described above, or 0.0 when no top-k
        document reaches the threshold or none of them is relevant.

    """
    # Ground truth: ids of the documents the benchmark marks as relevant
    relevant_ids = {docid for docid, label in rel_docs.items() if label == 1}

    # Model output: the k best-scoring (docid, score) pairs, best first
    top_k = sorted(retrieved_docs.items(), key=lambda item: item[1], reverse=True)[:k]

    # Top-k documents the model considers relevant (score reaches the threshold)
    retrieved_ids = [docid for docid, score in top_k if score >= threshold]
    retrieved_count = len(retrieved_ids)

    # ... and how many of those the benchmark agrees with
    correct_count = len([docid for docid in retrieved_ids if docid in relevant_ids])

    # Without any retrieved or any correctly retrieved document the precision is 0
    if retrieved_count == 0 or correct_count == 0:
        return 0.0

    return float(correct_count) / float(retrieved_count)


def calculate_precision_at_10(rel_docs, model_results, threshold):
    """
    Calculate the precision at rank 10 for every topic of a collection.

    Each topic's benchmark judgements and model scores are handed to
    precision_at_k with k fixed at 10; the per-topic results are collected in
    a dictionary.

    input:
    rel_docs(dict): {topic : {docid : relevance}}; the ground truth.
    model_results(dict): {topic : {docid : score}}; the model predictions.
    threshold(float): forwarded to precision_at_k as the minimum score for a
        document to count as retrieved.

    output:
    precision_at_10_per_topic(dict): {topic : p10} for every topic in
        rel_docs; a topic that is missing from model_results gets 0.0.

    """
    return {
        topic: (
            precision_at_k(judgements, model_results[topic], 10, threshold)
            if topic in model_results
            else 0.0  # no predictions for this topic
        )
        for topic, judgements in rel_docs.items()
    }


In [35]:
#note: this can be used for any {topic:score} dict - so at the moment, both p10 and dcg
def calculate_avg_score(collection_scores):
    """
    Calculate the average of the values of a {topic : score} dictionary.

    input:
    collection_scores(dict): {ID : score}

    output:
    avg_score(float): the sum of the scores divided by the number of entries,
        or 0.0 when the dictionary is empty.

    """
    # Nothing to average: avoid dividing by zero
    if not collection_scores:
        return 0.0

    return sum(collection_scores.values()) / len(collection_scores)


#### BM_25 Precision @ 10 

In [36]:
# Score cut-off: a top-10 document counts as retrieved by BM25 when its score reaches this value
model_threshold = 0.25

# Calculate precision at 10
# NOTE(review): `relevance_scores` is not assigned in any cell visible here;
# presumably it holds the parsed benchmark (same shape as `evaluations`) - confirm.
bm_precision_at_10_per_topic = calculate_precision_at_10(relevance_scores, BM25_results, model_threshold)

# Print the precision at 10 for each topic, rounded to three decimals
for topic, precision in bm_precision_at_10_per_topic.items():
    print(f"Topic {topic}: Precision at 10: {precision:.3f}")


Topic 101: Precision at 10: 0.375
Topic 102: Precision at 10: 0.500
Topic 103: Precision at 10: 0.500
Topic 104: Precision at 10: 0.000
Topic 105: Precision at 10: 0.000
Topic 106: Precision at 10: 0.200
Topic 107: Precision at 10: 0.250
Topic 108: Precision at 10: 0.000
Topic 109: Precision at 10: 0.400
Topic 110: Precision at 10: 0.300
Topic 111: Precision at 10: 0.000
Topic 112: Precision at 10: 0.600
Topic 113: Precision at 10: 0.500
Topic 114: Precision at 10: 0.500
Topic 115: Precision at 10: 0.200
Topic 116: Precision at 10: 0.200
Topic 117: Precision at 10: 0.000
Topic 118: Precision at 10: 0.300
Topic 119: Precision at 10: 0.000
Topic 120: Precision at 10: 0.600
Topic 121: Precision at 10: 0.400
Topic 122: Precision at 10: 0.400
Topic 123: Precision at 10: 0.100
Topic 124: Precision at 10: 0.000
Topic 125: Precision at 10: 0.700
Topic 126: Precision at 10: 0.000
Topic 127: Precision at 10: 0.400
Topic 128: Precision at 10: 0.100
Topic 129: Precision at 10: 0.300
Topic 130: Pre

In [37]:
# Average the per-topic BM25 precision-at-10 values (displayed as the cell output)
calculate_avg_score(bm_precision_at_10_per_topic)

0.262

#### JM_LM Precision @ 10

In [38]:
# Score cut-off: a top-10 document counts as retrieved by JM_LM when its score reaches this value
threshold = 0.00001

# Calculate precision at 10
# NOTE(review): `relevance_scores` is not assigned in any cell visible here;
# presumably it holds the parsed benchmark (same shape as `evaluations`) - confirm.
jmlm_precision_at_10_per_topic = calculate_precision_at_10(relevance_scores, JM_LM_results,threshold)

# Print the precision at 10 for each topic, rounded to three decimals
for topic, precision in jmlm_precision_at_10_per_topic.items():
    print(f"Topic {topic}: Precision at 10: {precision:.3f}")

Topic 101: Precision at 10: 1.000
Topic 102: Precision at 10: 0.000
Topic 103: Precision at 10: 0.000
Topic 104: Precision at 10: 0.000
Topic 105: Precision at 10: 0.000
Topic 106: Precision at 10: 0.000
Topic 107: Precision at 10: 0.000
Topic 108: Precision at 10: 0.000
Topic 109: Precision at 10: 0.000
Topic 110: Precision at 10: 0.000
Topic 111: Precision at 10: 0.000
Topic 112: Precision at 10: 1.000
Topic 113: Precision at 10: 0.000
Topic 114: Precision at 10: 0.000
Topic 115: Precision at 10: 0.000
Topic 116: Precision at 10: 0.100
Topic 117: Precision at 10: 0.200
Topic 118: Precision at 10: 0.000
Topic 119: Precision at 10: 0.000
Topic 120: Precision at 10: 0.000
Topic 121: Precision at 10: 0.000
Topic 122: Precision at 10: 1.000
Topic 123: Precision at 10: 0.000
Topic 124: Precision at 10: 0.400
Topic 125: Precision at 10: 0.700
Topic 126: Precision at 10: 0.600
Topic 127: Precision at 10: 0.000
Topic 128: Precision at 10: 0.000
Topic 129: Precision at 10: 0.000
Topic 130: Pre

In [39]:
# Average the per-topic JM_LM precision-at-10 values (displayed as the cell output)
calculate_avg_score(jmlm_precision_at_10_per_topic)

0.14422222222222222

#### My_PRM Precision @ 10

In [40]:
#implement p@10 function for My_PRM model

### Discounted Cumulative Gain at rank position 10

In [41]:
# Function to calculate dcg at position k

def dcg_at_k(rel_docs, model_predictions, k):
    """
    This function calculates the discounted cumulative gain (DCG) at position k for a set of {docid:score} predictions.

    Documents are ranked by descending model score and only the top-k are considered.
    Relevance is binary: a ranked document contributes a gain of 1 only when its
    benchmark relevance in rel_docs is exactly 1; documents missing from rel_docs,
    or with any other value, contribute nothing.

    The gain at rank 1 is added undiscounted. The gain at every later rank i (i >= 2)
    is divided by log2(i), so rank 2 is also effectively undiscounted (log2(2) == 1):

        DCG@k = rel_1 + sum(rel_i / log2(i) for i in 2..k)

    Inputs:
    rel_docs (dict): A dictionary with the benchmark relevance for a given query. Stored in the form of {docid : relevance}
    model_predictions (dict): A dictionary of model predictions for a given query (topic). Stored in the form of {docid : score}
    k (int): The rank position DCG is calculated for.

    Output:
    dcg (float): Calculated dcg value for position k.
                 0.0 is returned when k is not positive or when there are no predictions.

    """

    #a non-positive cut-off selects no documents
    #(guarding here also stops a negative k from slicing off the tail of the ranking)
    if k <= 0:
        return 0.0

    #sort the model scores based on values, taking only top-k results
    #(an empty prediction set simply yields an empty ranking and a DCG of 0.0)
    top_k = sorted(model_predictions.items(), key=lambda item: item[1], reverse=True)[:k]

    #initialise dcg counter
    dcg = 0.0

    #walk the ranking with 1-based rank positions
    for rank, (docid, _) in enumerate(top_k, start=1):

        #only documents judged relevant (score of 1) in the benchmark set add gain
        if rel_docs.get(docid, 0) != 1:
            continue

        if rank == 1:
            dcg += 1                        #rank 1 is never discounted (log2(1) would be 0)
        else:
            dcg += 1 / math.log2(rank)      #discount the gain by log2 of the rank position

    #return calculated DCG value at position k
    return dcg


#Function to calculate dcg for a collection of topics

def calculate_dcg10(rel_docs, model_predictions):
    """
    Compute DCG at rank position 10 for every topic (query) in the benchmark collection.

    Each benchmark topic is scored by handing its {docid:relevance} judgements and the
    model's {docid:score} predictions for the same topic to dcg_at_k() with k = 10.
    A benchmark topic for which the model produced no predictions is scored 0.0.
    Topics that appear only in model_predictions are not included in the output.

    Inputs:
    rel_docs(dict): A dictionary where the keys are the topics, and values are a {docid:score} dictionary from the benchmark set (relevance docs)
    model_predictions(dict): A dictionary where the keys are the topics, and values are a {docid:score} dictionary for predictions from our model (retrieved docs)

    output:
    dcg10_scores(dict): A dictionary where the keys are the topics, and the values are the DCG10 values for the given model
    """

    #one entry per benchmark topic, kept in the benchmark's own iteration order
    return {
        topic_id: (
            dcg_at_k(judgements, model_predictions[topic_id], k = 10)
            if topic_id in model_predictions
            else 0.0        #the model returned nothing for this topic
        )
        for topic_id, judgements in rel_docs.items()
    }


#### BM_25 DCG10

In [42]:
# Compute DCG at rank 10 for every benchmark topic, using BM25_results as the model predictions
bm25_dcg10 = calculate_dcg10(relevance_scores, BM25_results)

# Print one line per topic with the DCG10 value formatted to three decimal places
for (topic, score) in bm25_dcg10.items():
    print(f"Topic {topic}: {score:.3f}")

Topic 101: 3.247
Topic 102: 2.749
Topic 103: 2.674
Topic 104: 3.868
Topic 105: 2.232
Topic 106: 1.387
Topic 107: 1.500
Topic 108: 0.000
Topic 109: 1.819
Topic 110: 1.320
Topic 111: 0.000
Topic 112: 2.398
Topic 113: 3.432
Topic 114: 3.018
Topic 115: 1.500
Topic 116: 0.732
Topic 117: 1.133
Topic 118: 1.220
Topic 119: 0.000
Topic 120: 3.189
Topic 121: 2.964
Topic 122: 2.005
Topic 123: 0.500
Topic 124: 1.667
Topic 125: 4.120
Topic 126: 2.948
Topic 127: 2.088
Topic 128: 0.333
Topic 129: 1.931
Topic 130: 1.764
Topic 131: 1.690
Topic 132: 1.631
Topic 133: 3.518
Topic 134: 0.764
Topic 135: 1.421
Topic 136: 0.887
Topic 137: 2.631
Topic 138: 0.315
Topic 139: 1.518
Topic 140: 4.868
Topic 141: 4.754
Topic 142: 0.301
Topic 143: 0.301
Topic 144: 0.688
Topic 145: 0.000
Topic 146: 1.000
Topic 147: 1.174
Topic 148: 0.000
Topic 149: 0.301
Topic 150: 1.877


In [43]:
# Summarise the per-topic BM25 DCG10 values as a single average score
# NOTE(review): calculate_avg_score is defined earlier in the notebook; presumably a mean over topics - confirm.
calculate_avg_score(bm25_dcg10)

1.7475467002197471

#### JM_LM DCG10

In [44]:
# Compute DCG at rank 10 for every benchmark topic, using JM_LM_results as the model predictions
jmlm_dcg10 = calculate_dcg10(relevance_scores, JM_LM_results)

# Print one line per topic with the DCG10 value formatted to three decimal places
for (topic, score) in jmlm_dcg10.items():
    print(f"Topic {topic}: {score:.3f}")

Topic 101: 3.247
Topic 102: 3.434
Topic 103: 1.818
Topic 104: 4.567
Topic 105: 2.754
Topic 106: 2.315
Topic 107: 1.431
Topic 108: 0.688
Topic 109: 4.196
Topic 110: 2.377
Topic 111: 0.000
Topic 112: 3.918
Topic 113: 3.597
Topic 114: 3.395
Topic 115: 1.062
Topic 116: 1.000
Topic 117: 1.301
Topic 118: 1.018
Topic 119: 0.000
Topic 120: 2.918
Topic 121: 4.134
Topic 122: 2.475
Topic 123: 1.000
Topic 124: 2.856
Topic 125: 3.453
Topic 126: 2.860
Topic 127: 2.392
Topic 128: 1.000
Topic 129: 3.178
Topic 130: 1.856
Topic 131: 1.301
Topic 132: 1.431
Topic 133: 3.562
Topic 134: 0.631
Topic 135: 3.059
Topic 136: 2.319
Topic 137: 2.631
Topic 138: 0.657
Topic 139: 2.000
Topic 140: 4.254
Topic 141: 3.820
Topic 142: 0.315
Topic 143: 0.000
Topic 144: 2.178
Topic 145: 0.000
Topic 146: 1.000
Topic 147: 2.377
Topic 148: 0.634
Topic 149: 0.616
Topic 150: 1.818


In [45]:
# Summarise the per-topic JM_LM DCG10 values as a single average score
# NOTE(review): calculate_avg_score is defined earlier in the notebook; presumably a mean over topics - confirm.
calculate_avg_score(jmlm_dcg10)

2.0968713213581576

#### My_PRM DCG10

In [46]:
# Implement DCG10 for My_PRM model

# Task 6: Recommendation

**Description:** Recommend a model based on significance test and your analysis. 

You need to conduct a significance test to compare models. You can choose a t-test to perform a significance test on the evaluation results (e.g., in Tables 1, 2 and 3). 

You can compare models between:
- **BM25** and **JM_LM**
- **BM25** and **My_PRM**
- **JM_LM** and **My_PRM**

Based on the $t$-test results ($p$-value and $t$-statistic), you can recommend a model (You ***want the proposed "My_PRM" to be the best because it is your own model***). You can perform the $t$-test using a single effectiveness measure or multiple measures. Generally, using more effectiveness measures provides stronger evidence against the null hypothesis. Note that if the $t$-test result is unsatisfactory, you can use the evaluation results to refine the **My_PRM** model. For example, you can adjust parameter settings or update your design and implementation.