# **Text, Web, & Media Analytics Assignment 2**

# Setup

In [1]:
# NOTE: ORDER ME PLEASE 🙇‍♂️

import nltk
import pandas as pd

import os
import regex as re
import string
import math
import csv

import sklearn
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Define the class for Bag-of-Word representation
class bow_document:
    def __init__(self, item_id: str):
        # Type check to ensure object is initialised correctly
        if not isinstance(item_id, str):
            raise TypeError("item_id: value must be a string.")
            # Technically could work with str or int indexing (for key in collection),
            # using *only* str ensures no double-up of pointers
            # (e.g. item_id '1' vs item_id 1)

        self.doc_id = item_id  # assigning doc_id from 'item_id'
        self.terms = {}  # dictionary for terms and their frequencies
        self._doc_len = 0  # document length, private attribute

    def add_term(self, term: str):
        """Add a term to the document or update its frequency if it already exists."""
        
        # Type check to ensure term is a str
        if not isinstance(term, str):
            raise TypeError("term: value must be a string.")
        
        self.doc_len += 1  # extend doc_len

        if term in self.terms:
            self.terms[term] += 1  # add frequency if the term exists
        else:
            self.terms[term] = 1  # if it doesn't exist, add it (setting frequency to 1)
        
    def get_doc_id(self) -> str:
        """Return the document ID."""
        return self.doc_id
    
    def get_term_list(self, sorted_by_freq: bool = None) -> dict:
        """
        Return a list of terms occurring in the document, optionally sorted by their frequency.
        If sorted_by_freq is True, the terms are returned sorted by their frequency in descending order.
        If sorted_by_freq is False or None (default), the terms are returned in arbitrary order.
        """

        # Type check to ensure sorted_by_freq is either None or a boolean
        if not isinstance(sorted_by_freq, (bool, type(None))):
            raise TypeError("sorted_by_freq: must be a boolean or None.")

        if sorted_by_freq:
            # If sorted_by_freq is True
            sorted_terms = sorted(self.terms.items(), key=lambda word: word[1], reverse=sorted_by_freq)  # generate a sorted list of terms by frequency
            return {term: freq for term, freq in sorted_terms}  # return key:value pairs based on sorted terms
        else:
            # If sorted_by_freq is False or None, return the terms as is (i.e., unsorted and as they are added in)
            return self.terms
        
    def get_bag_of_words(self, sorted_by_freq: bool = None) -> str:
        """Return full bag-of-words representation for bow_document object, including; doc_id, term_count, doc_len, and terms."""
        
        # Type check to ensure sorted_by_freq is either None or a boolean
        if not isinstance(sorted_by_freq, (bool, type(None))):
            raise TypeError("sorted_by_freq: must be a boolean or None.")

        # Defining formatted string for bag-of-word representation
        bag_of_words = f"""doc_id='{self.doc_id}',term_count={len(self.get_term_list())},doc_len={self.doc_len},terms={self.get_term_list(sorted_by_freq)}"""

        return bag_of_words  # return BOW representation; this kind of data can be stored and "unpacked" easily
    
    @property  # accessor (get) method for doc_len
    def doc_len(self) -> int:
        """The doc_len property getter method."""
        return self._doc_len

    @doc_len.setter  # mutator (setter) method for doc_len
    def doc_len(self, value: int):
        """The doc_len property setter method."""
        if not isinstance(value, int):
            raise TypeError("doc_len: must be an int.")
        if value < 0:
            raise ValueError("doc_len: must not be negative.")
        
        self._doc_len = value

# Define the class for collection of bow_document objects
class bow_document_collection:
    def __init__(self):
        self.docs = {}  # initialise dictionary to hold collection (dict) of doc_id:bow_document

        self.term_doc_count = {}  # initialise dictionary to track the number of documents each term appears in

    # Method to add a doc (bow_document object)
    def add_doc(self, doc: bow_document):
        """Add bow_document object to the collection, using doc_id as the key, and update the inverted index."""

        # Type check to ensure doc is a bow_document object
        if not isinstance(doc, bow_document):
            raise TypeError("doc: must be an instance of bow_document.")
        
        # Add to the docs dict; key as doc_id and value as bow_document object (doc_id:bow_document)
        self.docs[doc.get_doc_id()] = doc

        # Update term document count for each term
        for term in doc.terms:
            if term in self.term_doc_count:
                self.term_doc_count[term] += 1  # add one if the term exists in the corpus dictionary
            else:
                self.term_doc_count[term] = 1  # if it does not exist in the corpus dictionary, initialise by setting to 1
    
    def get_collection_ids(self) -> str:
        """Return list of document IDs present in the collection."""

        # Type check to ensure doc_id is a string
        if not len(self.docs) > 0:
            raise AttributeError("bow_document_collection object is empty, no IDs to return.")  # Corrected to match the check
        
        doc_ids_str = "'" + "', '".join(self.docs.keys()) + "'"  # create a string that lists doc_ids

        collection_ids = f"bow_document_collection(doc_ids: {doc_ids_str})"  # format the return variable

        return collection_ids

In [3]:
def stop_word_parser(stop_word_path: str) -> list:
    """Parse defined list of stop words (assumes txt file with words delimited with ',')."""

    # Type check to ensure stop_word_path is a str
    if not isinstance(stop_word_path, str):
        raise TypeError("stop_word_path: value must be a str.")
    
    # NOTE: need attribute check the path exists

    # Open file in read mode
    with open(stop_word_path, 'r') as file:
        stop_words = file.read()  # read text in given file into stop_words

    # We know what the format is ahead of time, so not a lot of processing needed;
    # i.e., assumes we don't need to make something more robust and that we're using the same txt
    stop_words = stop_words.lower().split(",")  # tokenize stop_words; delimited with ','
    stop_words = list(set(stop_words))  # reduce stop_words to uniques
    
    return stop_words  # return stop_words as a list object

def tokenization(words: str) -> list:
    """Tokenize input text by removing line breaks, numbers, punctuation, normalizing whitespace, stripping leading/trailing spaces, and splitting into lowercased words."""

    # Type check to ensure words is a str
    if not isinstance(words, str):
        raise TypeError("words: value must be a str.")

    words = words.replace("\n", "")  # don't want line breaks to contribute
    words = re.sub(r'\d', '', words)  # not interested in numbers for this particular task, remove
    words = re.sub(f'[{re.escape(string.punctuation)}]', ' ', words)  # not interested in punctuation, remove
    words = re.sub(r'\s+|\t+|\v+|\n+|\r+|\f+', ' ', words).strip()  # standardise the whitespaces, remove leading/trailing whitespace
    words = words.lower()  # standardise words as lower
    words = words.split()  # tokenize, deftault split on space

    # Filter out small words; can be important in some queries, usually in combinations, opting not to handle for simplicity.
    # For example, with no discrete management of apostrophes (indicating contractions or posession) aside from replacement 
    # of punctuation with a single space, we will get the following: "Amelia's" → ["Amelia", "s"] → ["Amelia"].
    # Unless they are actual words (e.g., "I" versus "s" or "t"), they won't be removed in stopping process.
    words = [word for word in words if len(word) >= 3]

    return words  # return list object of string words

def xml_parser(stop_words: list, xml_path: str) -> bow_document:
    """Parse a single XML file, process text, and return an bow_document object with term frequencies."""
    
    # Type check to ensure stop_words is a list of str
    if not isinstance(stop_words, list) or not all(isinstance(word, str) for word in stop_words):
        raise TypeError("stop_words: must be a list of strings.")
    
    # Type check to ensure xml_path is a str
    if not isinstance(xml_path, str):
        raise TypeError("xml_path: value must be a str.")
    
    # Check if provided xml_path is a valid xml file, raise AttributeError if it is not
    if not ((os.path.isfile(xml_path)) and (xml_path.lower().endswith(".xml"))):
        raise AttributeError(f"""xml_path: '{xml_path}' is not a valid xml file.""")
        # NOTE: check is included here for targeting single xml (wheras parse_rcv1v2() executes this check in loop)

    # DOCUMENT PARSING - recognition of the content and structure of text documents
    # Open file in read mode
    with open(xml_path, 'r') as file:
        xml = file.read()  # read xml in given file

    text = re.search(r'<text>\s*((?:<p>.*?</p>\s*)+)</text>', xml, re.DOTALL)  # find all text within the <text> tag

    # If no text found, raise attribute error; else return match group 1
    if not text:
        raise AttributeError(fr"""xml_path: '{xml_path}' did not contain any text, see text tag (expect match at '<text>\s*((?:<p>.*?</p>\s*)+)</text>' with re.DOTALL).""") 
    else:
        text = text.group(1)

    # Replace HTML entities with their corresponding characters
    html_entities = {"&lt;": "<", "&gt;": ">", "&amp;": "&", "&quot;": "\"", "&apos;": "'", "&nbsp;": " " }
    for entity, char in html_entities.items():
        text = text.replace(entity, char)
    
    text = re.sub(r'<.*?>', ' ', text).strip()  # remove any XML tags (p tags in our case)
    
    # TOKENIZING - forming words from sequence of characters; critically, generating a list of tokens
    words = tokenization(text)
    
    # POSTING - a collection of arbitrary data (including a pointer)
    item_id = re.search(r'<newsitem itemid="(\d+)"', xml)  # POINTER - a unique identifier of a document (item_id attribute from newsitem element in this case)

    if not item_id:
        # If no item_id found, raise attribute error
        raise AttributeError(f"""xml_path: '{xml_path}' did not contain pointer, see item_id attribute in newsitem tag (expect match at '<newsitem itemid="(\\d+)"').""") 
    else:
        item_id = item_id.group(1)  # otherwise, take group 1 of regex (just the \d+ match component)
        
    document = bow_document(item_id)  # initialise bow_document object with the pointer (item_id)

    # STOPPING - removing stop (function) words from the text being analysed; have little meaning on their own
    words = [word for word in words if word not in stop_words]
    
    # STEMMING - reducing words to their word stem, base or root form (remove morphological variations)
    stemmer = nltk.stem.PorterStemmer()  # Porter Stemmer: efficient for information retrieval and text processing tasks – can often create non-words in favour of faster speeds
    words = [stemmer.stem(word) for word in words] 
    
    # Iterate over each stemmed word
    for stemmed_word in words:
        document.add_term(stemmed_word)  # use method add_term to update the bow_document object (our arbitrary data)          

    return document  # return the bow_document object

def parse_rcv1v2(stop_words: list, input_path: str) -> bow_document_collection:
    """Parse XML documents in a directory, filter stop words, and return a collection of bow_document objects."""
    
    # Type check to ensure stop_words is a list of str
    if not isinstance(stop_words, list) or not all(isinstance(word, str) for word in stop_words):
        raise TypeError("stop_words: must be a list of strings.")
    
    # Type check to ensure input_path is a str
    if not isinstance(input_path, str):
        raise TypeError("input_path: value must be a str.")
    
    # NOTE: need to do attribute check to see if input_path exists

    collection = bow_document_collection()  # initialise bow_document_collection object (collection of bow_document objects)
    
    # Iterate through files in directory
    for xml_file in os.listdir(input_path):
        xml_path = os.path.join(input_path, xml_file)  # build path to files
        if ((os.path.isfile(xml_path)) and (xml_path.lower().endswith(".xml"))):
            doc = xml_parser(stop_words, xml_path)  # parse xml with xml_parser function
            collection.add_doc(doc)  # use method add_doc to update the bow_document_collection object

    # If no xmls parsed (i.e., collection length is 0), raise attribute error
    if len(collection.docs) == 0:
        raise AttributeError(f"""input_path: '{input_path}' did not contain any valid xml files.""")

    return collection  # return the bow_document_collection object

def parse_query(query: str, stop_words: list) -> dict:
    """Tokenize an input query, remove stop words, and return a dictionary of remaining word frequencies."""

    # Type check to ensure stop_words is a list of str
    if not isinstance(stop_words, list) or not all(isinstance(word, str) for word in stop_words):
        raise TypeError("stop_words: must be a list of strings.")
    
    # Type check to ensure query is a str
    if not isinstance(query, str):
        raise TypeError("query: value must be a string.")
    
    # TOKENIZING - forming words from sequence of characters; critically, generating a list of tokens
    words = tokenization(query)
    
    # STOPPING - removing stop (function) words from the text being analysed; have little meaning on their own
    words = [word for word in words if word not in stop_words]
    
    # STEMMING - reducing words to their word stem, base or root form (remove morphological variations)
    stemmer = nltk.stem.PorterStemmer()  # Porter Stemmer: efficient for information retrieval and text processing tasks – though can often create non-words in favour of faster speeds
    words = [stemmer.stem(word) for word in words]
    
    # Constrcut term:frequency dictionary by counting instances of each word (more efficient than for loop + if/else)
    query_term_frequency = {stemmed_word: words.count(stemmed_word) for stemmed_word in set(words)}

    return query_term_frequency  # return the dictionary containing word frequencies

In [4]:
def parse_queries(file_path: str) -> pd.DataFrame:
    # Type check to ensure the file_path is a string
    if not isinstance(file_path, str):
        raise TypeError("file_path: value must be a string.")
    
    with open(file_path, 'r') as file:
        data = file.read()
    
    # Define regex pattern to split queries
    query_pattern = re.compile(r'<Query>(.*?)</Query>', re.DOTALL)
    queries = query_pattern.findall(data)
    
    # Initialize lists for storing parsed data
    nums, titles, descriptions, narratives = [], [], [], []
    
    # Define regex patterns to extract individual fields
    num_pattern = re.compile(r'<num>\s*Number:\s*R(\w+)', re.MULTILINE)
    title_pattern = re.compile(r'<title>([\w\s,.-]*)', re.MULTILINE)
    desc_pattern = re.compile(r'<desc>\s*Description:\s*(.*?)\n\n', re.DOTALL)
    narr_pattern = re.compile(r'<narr>\s*Narrative:\s*(.*?)\n\n', re.DOTALL)
    
    for query in queries:
        # Extract data using regex patterns
        num_match = num_pattern.search(query)
        title_match = title_pattern.search(query)
        desc_match = desc_pattern.search(query)
        narr_match = narr_pattern.search(query)
        
        nums.append(num_match.group(1) if num_match else pd.NA)
        titles.append(title_match.group(1).strip() if title_match else pd.NA)
        descriptions.append(desc_match.group(1).strip() if desc_match else pd.NA)
        narratives.append(narr_match.group(1).strip() if narr_match else pd.NA)

    # Create a pandas DataFrame
    query_frame = pd.DataFrame({
        'Number': nums,
        'Title': titles,
        'Description': descriptions,
        'Narrative': narratives
    })
    
    return query_frame

def write_scores_to_file(scores: dict, filename: str):
    """
    Write the scores dictionary to a .dat file.
    """

    if not isinstance(scores, dict):
        raise TypeError("scores: value must be a dictionary.")
    
    if not all((isinstance(doc_id, str)) and (isinstance(score, (int, float))) for doc_id, score in scores.items()):
        raise ValueError("scores: dictionary must consist of string keys (for documents) and int/float values (for document scores).")

    if not isinstance(filename, str):
        raise TypeError("filename: value must be a string.")
    
    # Combine the directory and filename to form the full path
    directory = 'RankingOutputs'
    filepath = os.path.join(directory, filename)

    # Check if the directory exists, and create it if it doesn't
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(filepath, 'w', newline='') as file:
        writer = csv.writer(file, delimiter='\t')  # Using tab delimiter for .dat format
        for doc_id, score in scores.items():
            writer.writerow([doc_id, score])

# Task 1: BM25

**Description:** Design a BM25-based IR model (**BM25**) that ranks documents in each data collection using the corresponding topic (query) for all 50 data collections.


**Inputs:** 50 long queries (topics) in *the50Queries.txt* and the corresponding 50 data collections (*Data_C101, Data_C102, …, Data_C150*).


**Output:** 50 ranked document files (e.g., for Query *R107*, the output file name is “BM25_R107Ranking.dat”) for all 50 data collections and save them in the folder “RankingOutputs”.

For each long query (topic) $Q$, you need to use the following equation to calculate a score for each document $D$ in the corresponding data collection (dataset):

$\sum_{i \in Q} \log_{10}(\frac{(r_i + 0.5)/(R-r_i+0.5)}{(n_i-r_i+0.5)/(N-n_i-R+r_i+0.5)})\cdot\frac{(k_1+1)f_i}{K+f_i}\cdot\frac{(k_2+1)qf_i}{k_2+qf_i}$

- $Q$ is the title of the long query, 
- $k_1 = 1.2$
- $k_2=500$
- $b = 0.75$
- $dl = len(D)$
- $avdl$ is the average length of a document in the dataset. 
- $K = k1\cdot((1-b) + b\cdot dl /avdl)$
- The ***base of the log function is 10***. 

Note that *BM25 values can be negative*, and you may need to update the above equation to produce non-negative values but keep the resulting documents in the same rank order.

**Formally describe your design for BM25** in an algorithm to **rank documents in each data collection *using corresponding query* (topic) ***for all 50 data collections*****. When you use the BM25 score to rank the documents of each data collection, you also need to **answer what the query feature function and document feature function are**.

In [5]:
def BM25(collection: bow_document_collection, query: dict) -> dict:
    """
    BM25 ranking function for a collection of documents and a given query.
    Generates a score for a given documents term:frequency set.
    Incorporates term frequency (TF) and inverse document frequency (IDF) factors. 
    It accounts for term frequency saturation as well as document length bias.
    """
    
    # Type check to ensure coll is a bow_document_collection
    if not isinstance(collection, bow_document_collection):
        raise TypeError("collection: must be a bow_document_collection object.")
    
    # If no collection contains no documents, raise attribute error
    if len(collection.docs) == 0:
        raise AttributeError("bow_document_collectionection: object contains no documents (Rcv1Doc objects).")
    
    # Type check to ensure query is a dict
    if not isinstance(query, dict):
        raise TypeError("query: must be a dict object.")
    
    # Setting parameters
    k_1 = 1.2  # Controls non-linear term frequency normalization (saturation)
    k_2 = 500  # Controls non-linear term frequency normalization for query terms
    b = 0.75  # Controls to what degree document length normalizes tf values

    N = len(collection.docs)  # total number of documents in the collection
    R = 0  # number of relevant documents for this query; predefined by task
    r_i = 0  # number of relevant documents containing query term i; predefined by task

    # Calculate the average document length across the entire collection
    total_corpus_length = sum(doc.doc_len for doc in collection.docs.values())
    mean_doc_len = total_corpus_length / N
    
    doc_scores = {}  # initialize doc_score dictionary to store calculated scores

    # Loop through each term in the query.
    for query_term, query_frequency in query.items():
        n_i = collection.term_doc_count.get(query_term, 0)  # the number of documents containing term i (0 if not present)

        # Calculate the inverse document frequency for the term
        idf_component = math.log10(((r_i + 0.5)/(R - r_i + 0.5)) / ((n_i - r_i + 0.5) / (N - n_i - R + r_i + 0.5)))
        # idf_component = math.log10((N - n_i + 0.5) / (n_i + 0.5))  # NOTE: simplified, need feedback from Slack

        # Component measures the rarity of the term across the entire collection; 
        # term appearing in fewer documents will have a higher IDF, making it more influential.
        # Formula ensures that no division by zero occurs by introducing additive smoothing of 0.5 to the numerator and denominator.

        for doc_ID, doc in collection.docs.items():
            doc_len = doc.doc_len  # document length

            K = k_1 * ((1 - b) + b * doc_len / mean_doc_len)  # frequency normaliser
            
            document_term_frequency = doc.terms.get(query_term, 0)  # query term frequency within the document (0 if not present)
            
            # Calculate the term frequency normalization for the document term
            tf_component = ((k_1 + 1) * document_term_frequency) / (K + document_term_frequency)
            # This component adjusts the score based on the frequency of the term in the document.
            # The normalisation (denominator) prevents over-emphasis on terms that appear too frequently within a single document.
            # `k_1` controls the non-linear term frequency saturation, and `K` adjusts the weight based on document length.

            # Calculate the query term frequency normalization
            query_component = ((k_2 + 1) * query_frequency) / (k_2 + query_frequency)
            # Adjusts the score based on the query term's frequency.
            # Denominator prevents over-emphasis on query terms that appear frequently.
            
            score = idf_component * tf_component * query_component  # determine the score (can be non-negative, clamping used below to adjust)
            
            if doc_ID not in doc_scores:
                doc_scores[doc_ID] = 0  # initialize doc_score if not present
            
            doc_scores[doc_ID] += max(score, 0)  # update the document's score with the product of the IDF and TF components (clamping non-negatives to 0)
    
    doc_scores = dict(sorted(doc_scores.items(), key=lambda item: item[1], reverse=True))  # sort the results

    # Return the document score
    return doc_scores

In [6]:
stop_words = stop_word_parser('common-english-words.txt')

query_frame = parse_queries('the50Queries.txt')
query_frame['parsed_titles'] = query_frame['Title'].apply(lambda row: parse_query(row, stop_words))

In [7]:
document_set = {}

input_path = 'Data_Collection'
for collection_path in os.listdir(input_path):
    data_key = collection_path.split('_C', 1)[1]
    document_set[data_key] = parse_rcv1v2(stop_words, os.path.join(input_path, collection_path))

In [8]:
BM25_results = {}

for query_key, collection in document_set.items():
    query = query_frame.loc[query_frame['Number'] == query_key, 'parsed_titles'].iloc[0]
    BM25_results[query_key] = BM25(collection, query)
    write_scores_to_file(BM25_results[query_key], f"BM25_R{query_key}Ranking")

# Task 2: Jelinek-Mercer Language Model

**Description:** Design a Jelinek-Mercer based Language Model (**JM_LM**) that ranks documents in each data collection using the corresponding topic (query) for all 50 data collections.


**Inputs:** 50 long queries (topics) in *the50Queries.txt* and the corresponding 50 data collections (*Data_C101, Data_C102, …, Data_C150*).


**Output:** 50 ranked document files (e.g., for Query *R107*, the output file name is “JM_LM_R107Ranking.dat”) for all 50 data collections and save them in the folder RankingOutputs”.

For each long query (topic) $R_x$, you need to use the following equation to calculate a conditional probability for each document $D$ in the corresponding data collection (dataset):


$p(R_x|D)=\Pi_{i=1}^n ((1-\lambda)\cdot\frac{f_{q_i,D}}{|D|}+\lambda\cdot\frac{c_{q_i}}{|C|})$

- $f_{q_i,D}$ is the number of times query word $q_i$ occurs in document $D$
- $|D|$ is the number of word occurrences in $D$
- $c_{q_i}$ is the number of times query word $q_i$ occurs in the data collection $C$
- $|C|$ is the total number of word occurrences in data collection $C$
- `λ = 0.4`

**Formally describe your design for JM_LM** in an algorithm to **rank documents in each data collection *using corresponding query* (topic) ***for all 50 data collections*****. When you use the probabilities to rank the documents of each data collection, you also need to **answer what the query feature function and document feature function are**.

In [9]:
def JM_LM(collection, query):
    """
    Calculate the conditional probability of each document given a query using the Jelinek-Mercer smoothing Language Model.
    """
    
    # Type check to ensure collection is an bow_document_collection object
    if not isinstance(collection, bow_document_collection):
        raise TypeError("collection: must be a bow_document_collection object.")
    
    # Check if the collection contains any documents
    if len(collection.docs) == 0:
        raise AttributeError("bow_document_collection: object contains no documents (Rcv1Doc objects).")
    
    # Validate that the query is a dictionary
    if not isinstance(query, dict):
        raise TypeError("query: must be a dict object.")
    
    # Set lambda parameter for Jelinek-Mercer smoothing
    lambda_val = 0.4
    
    # Calculate the total length of the corpus by summing the lengths of all documents
    total_corpus_length = sum(doc.doc_len for doc in collection.docs.values())
    
    # Initialize an empty dictionary to store the scores for each document
    doc_scores = {}

    # Iterate through each term in the query
    for query_term in query:
        # Get the frequency of the query term in the entire collection
        c_qi = collection.term_doc_count.get(query_term, 0)
        
        # Iterate through each document in the collection
        for doc_ID, doc in collection.docs.items():
            # Get the frequency of the query term in the current document
            f_qi_D = doc.terms.get(query_term, 0)

            # Calculate the probability of the term occurring in the document
            p_doc = (f_qi_D / doc.doc_len) if doc.doc_len > 0 else 0
            
            # Calculate the probability of the term occurring in the whole collection
            p_coll = (c_qi / total_corpus_length) if total_corpus_length > 0 else 0

            # Calculate the smoothed score for the term using Jelinek-Mercer smoothing
            score = (1 - lambda_val) * p_doc + lambda_val * p_coll

            # Initialize the score for the document if not already done
            if doc_ID not in doc_scores:
                doc_scores[doc_ID] = 1  # Multiplicative identity

            # Multiply the score to the cumulative product if it's greater than zero
            if score > 0:
                doc_scores[doc_ID] *= score

    # Sort the documents by their score in descending order and return the sorted dictionary
    doc_scores = dict(sorted(doc_scores.items(), key=lambda item: item[1], reverse=True))

    # Return the document scores
    return doc_scores

In [10]:
JM_LM_results = {}

for query_key, collection in document_set.items():
    query = query_frame.loc[query_frame['Number'] == query_key, 'parsed_titles'].iloc[0]
    JM_LM_results[query_key] = JM_LM(collection, query)
    write_scores_to_file(JM_LM_results[query_key], f"JM_LM_R{query_key}Ranking")    

# Task 3: Pseudo-Relevance Model

**Description:** Based on the knowledge you gained from this unit, design a pseudo-relevance model (My_PRM) to rank documents in each data collection using the corresponding topic (query) for all 50 data collections.


**Inputs:** 50 long queries (topics) in the50Queries.txt and the corresponding 50 data collections (Data_C101, Data_C102, …, Data_C150).


**Output:** 50 ranked document files (e.g., for Query R107, the output file name is “My_PRM_R107Ranking.dat”) for all 50 data collections and save them in the folder RankingOutputs”.

**Formally describe your design for My_PRM** in an algorithm to **rank documents in each data collection *using corresponding query* (topic) ***for all 50 data collections*****.Your *approach should be generic*; that means it is feasible to be used for other topics (queries). You also need to **discuss the differences between My_PRM and the other two models (BM25 and JM_LM)**.

In [11]:
def My_PRM():
    
    
    return

# Task 4: Model Testing

**Description:** Use Python to implement three models: `BM25`, `JM_LM`, and `My_PRM`, and **test them on the given 50 data collections for the corresponding 50 queries (topics)**. 

Design Python programs to implement these three models. You can use a .py file (or a .ipynb file) for each model.


For each long query, your python programs will produce ranked results and save them into .dat files. For example, for query R107, you can save the ranked results of three models into “BM25_R107Ranking.dat”, “JM_LM_R107Ranking.dat”, and “My_PRM_R107Ranking.dat”, respectively by using the following format:
- The first column is the document id (the itemid in the corresponding XML document)
- The second column is the document score (or probability).

**Describe:** 
- Python packages or modules (or any open-source software) you used
- The data structures used to represent a single document and a set of documents for each model (you can use different data structures for different models).


You also need to **test the three models on the given 50 data collections for the 50 queries (topics) by *printing out the top 15 documents* for each data collection (in descending order)**. The **output will also be put in the appendix of your final report**.

In [12]:
## bm25 and jm_lm models have already been implemented & made .dat files
#need to implement my_prm model

In [13]:
#note: the outputs of the top-15 for each model goes in the appendix of the final report
##Note: Could have the sorting in the earlier code, before writing it into the file?
def get_top15(model_results):
    """
    Takes the model results, prints out the top-15 sorted by weights

    input:
    model results (dict), in the form of {query number, {documentID, document_score}}

    output:
    Prints top-15 list of documents for each query in descending order
    """
    #iterating over each set of {query:predictions}, where predictions is a dictionary of {docid : document weight}
    for(query, predictions) in model_results.items():
        print('Query' + str(query) + ' (DocID Weight):')        #printing query number + header for query

        #For the given query, sort the document weights and take the top 15 scores
        sorted_weights_top15 = {k: v for k, v in sorted(predictions.items(), key=lambda item: item[1], reverse=True)[:15]}        #sorting weights for query, selecting the top 15

        #iterating over each docid:weight for the predictions, for the given query
        for (docid, weight) in sorted_weights_top15.items():   #selecting each pairing
            print(docid + ': ' + str(weight))                  #printing the {docid:weight}



In [14]:
get_top15(BM25_results)

Query101 (DocID Weight):
46547: 1.3847669516515282
46974: 1.3847669516515282
62325: 1.0292957490094417
6146: 0.6957418000337314
22170: 0.6225844609564782
61329: 0.45712969204233117
61780: 0.3870064270429357
22513: 0.3038881818756373
82330: 0.2448921073604549
39496: 0.21044544731596362
18586: 0.0
26642: 0.0
26847: 0.0
27577: 0.0
30647: 0.0
Query102 (DocID Weight):
73038: 1.9654062144138527
26061: 1.8836675215878933
58476: 1.8505211923545217
57914: 1.827692660880318
78836: 1.6213729984378435
76635: 1.5728476262157196
12769: 1.524628521363549
65414: 1.4816271036527424
12767: 1.480572707240055
82227: 1.425411928703988
25096: 1.3785970499333264
24515: 1.2560857660546911
26611: 1.1950064608823974
33172: 1.1008705051298882
29908: 1.0882226835128364
Query103 (DocID Weight):
14314: 1.6183004908199734
81463: 1.2189516428505964
54533: 1.210884710930984
27426: 1.1885990930321604
27106: 1.1196289764976377
59459: 1.0198826799855123
83370: 0.949694942444308
20159: 0.7414845093806344
80988: 0.42531718

In [15]:
get_top15(JM_LM_results)

Query101 (DocID Weight):
46547: 0.0003686566682181676
46974: 0.0003686566682181676
62325: 5.67909896341436e-05
61329: 6.6829612013916405e-06
6146: 5.343522519648502e-06
61780: 3.12372078516988e-06
22170: 2.8915792887255432e-06
22513: 2.256512809426008e-06
82330: 1.4024817829811885e-06
39496: 1.1054548156956005e-06
18586: 2.972651605231867e-07
26642: 2.972651605231867e-07
26847: 2.972651605231867e-07
27577: 2.972651605231867e-07
30647: 2.972651605231867e-07
Query102 (DocID Weight):
78836: 5.193880353327135e-07
58476: 3.6238288552765587e-07
26061: 1.6922824849415993e-07
57914: 1.608537217534995e-07
76635: 1.2952021368676933e-07
73038: 9.530510823220392e-08
12769: 8.424995473011488e-08
12767: 6.59522374868332e-08
25096: 3.631100986911571e-08
24515: 2.9826489089815515e-08
82227: 2.8819290409952096e-08
65414: 1.4717139870081546e-08
26611: 1.4701832095644448e-08
33172: 9.634466041875688e-09
29908: 9.116744171101311e-09
Query103 (DocID Weight):
81463: 5.179167414484945e-07
14314: 4.7799827406

In [16]:
# get the top 15 from my_prm when implemented
## get_top15(My_PRM_results)

# Task 5: Model Evaluation

**Description:** Use three effectiveness measures to evaluate the three models.

In this task, you need to **use the relevance judgments (EvaluationBenchmark.zip)** to **compare with the ranking outputs in the folder of “RankingOutputs” for the selected effectiveness metric** for the three models.


You need to use the following three different effectiveness measures to evaluate the document ranking results you saved in the folder “RankingOutputs”:
1) Average precision (and MAP)
2) Precision@10 (and their average)
3) Discounted cumulative gain at rank position 10 ($p = 10$), $DCG_{10}$ (and their average):  
    $DCG_p=rel_i+\sum_{i=2}^p\frac{rel_i}{log_2(i)}$  
        $rel_i=1$ if the document at position $i$ is releveant; otherwise, it is 0.

Evaluation results can be summarized in tables or graphs. Examples are provided in the sepcification sheet.

### Getting evaluation benchmark set

In [17]:
#
def read_file_scores(file_path):
    """
    This function takes the file path to a .txt document in the eval_benchmark dataset, and returns a dict of {docid:relevance score}

    Input:
    file_path(str): name of a .txt file in the eval_benchmark folder

    output:
    topic_scores(dict): a dictionary of {docid:relevance_score}
    
    """
    topic_scores = {}       #initialise empty dictionary

    #iterate over each line in the file
    for line in open(file_path):
        line = line.strip()     #strip whitespaces
        line1 = line.split()    #split into [topic, docid, relevance_score]
        topic_scores[line1[1]] = int(float(line1[2]))   #add the {docid:relevance_score} to the dictionary

    return topic_scores


def get_rel_scores(folder_path):
    
    """
    This function gets the relevance scores for a collection of queries, by passing each text document to the read_file_scores() function.

    Input:
    folder_path(str): path to the evaluation_benchmark folder

    output:
    relevance_scores(dict): A dictionary in the form of {topic: {docid:relevance_score}}
    """

    relevance_scores = {}

    #for each .txt document in the folder
    for fn in os.listdir(folder_path):

        #get filepath/filename for .txt doc
        file_path = os.path.join(folder_path, fn)

        #changing name to just query - from 'Dataset101.txt' to '101' for the dict
        cleaned_fn = re.sub(r'\D', '', fn)
        
        #call read_file_scores function, getting each document score for the query
        relevance_scores[cleaned_fn] = read_file_scores(file_path)

    return relevance_scores


In [18]:
folder_path = 'EvaluationBenchmark'

relevance_scores = get_rel_scores(folder_path)

relevance_scores

{'101': {'6146': 0,
  '18586': 0,
  '22170': 0,
  '22513': 0,
  '26642': 0,
  '26847': 0,
  '27577': 0,
  '30647': 0,
  '39496': 1,
  '46547': 1,
  '46974': 1,
  '61329': 0,
  '61780': 0,
  '62325': 1,
  '63261': 1,
  '77909': 0,
  '80425': 0,
  '80950': 0,
  '81463': 0,
  '82330': 1,
  '82454': 1,
  '82912': 0,
  '83167': 0},
 '102': {'3265': 1,
  '3827': 1,
  '3828': 1,
  '3833': 1,
  '3834': 1,
  '3835': 1,
  '3837': 1,
  '3972': 0,
  '3976': 1,
  '4306': 1,
  '4310': 1,
  '4358': 1,
  '4395': 1,
  '4439': 1,
  '4881': 1,
  '4933': 0,
  '5862': 1,
  '6497': 1,
  '6498': 1,
  '6503': 1,
  '6534': 1,
  '6635': 0,
  '6636': 1,
  '6735': 1,
  '7115': 1,
  '7118': 1,
  '7502': 1,
  '7937': 0,
  '8327': 1,
  '8333': 1,
  '8455': 1,
  '9358': 0,
  '9479': 0,
  '9703': 1,
  '9726': 0,
  '9790': 1,
  '10182': 0,
  '11083': 0,
  '11481': 1,
  '11485': 1,
  '11922': 1,
  '11923': 1,
  '11930': 1,
  '11960': 0,
  '11979': 1,
  '12479': 1,
  '12767': 0,
  '12769': 0,
  '12812': 1,
  '12825': 0,


In [19]:
#sorting by document ID so that they're in the same order
def sort_by_docid(model_results):
    """
    This function sorts a dictionary in the form of {topic : {docid:relevance_score}}, by docid for each topic

    Input:
    model_results(dict): Dictionary of {topic : {docid:relevance_score}}

    Output:
    sorted_dict(dict): Dictionary of {topic : {docid:relevance_score}}, where the values {docid:relevance_score} are sorted by docid
    """


    sorted_dict = {}
    for key, value in model_results.items():
        # Sorting the inner dictionary by document ID in descending order
        sorted_inner_dict = dict(sorted(value.items(), key=lambda item: item[0], reverse=False))
        sorted_dict[key] = sorted_inner_dict
    return sorted_dict

relevance_scores = sort_by_docid(relevance_scores)
BM25_results = sort_by_docid(BM25_results)
JM_LM_results = sort_by_docid(JM_LM_results)

relevance_scores

{'101': {'18586': 0,
  '22170': 0,
  '22513': 0,
  '26642': 0,
  '26847': 0,
  '27577': 0,
  '30647': 0,
  '39496': 1,
  '46547': 1,
  '46974': 1,
  '61329': 0,
  '6146': 0,
  '61780': 0,
  '62325': 1,
  '63261': 1,
  '77909': 0,
  '80425': 0,
  '80950': 0,
  '81463': 0,
  '82330': 1,
  '82454': 1,
  '82912': 0,
  '83167': 0},
 '102': {'10182': 0,
  '11083': 0,
  '11481': 1,
  '11485': 1,
  '11922': 1,
  '11923': 1,
  '11930': 1,
  '11960': 0,
  '11979': 1,
  '12479': 1,
  '12767': 0,
  '12769': 0,
  '12812': 1,
  '12825': 0,
  '13313': 1,
  '13314': 1,
  '13320': 1,
  '13332': 0,
  '13335': 1,
  '13342': 1,
  '13345': 1,
  '14707': 1,
  '14713': 1,
  '15077': 1,
  '15082': 1,
  '15200': 1,
  '16390': 0,
  '16575': 0,
  '16897': 1,
  '16899': 1,
  '16922': 1,
  '16954': 0,
  '17517': 1,
  '17587': 1,
  '17609': 0,
  '17619': 0,
  '19033': 1,
  '19599': 1,
  '19601': 1,
  '19615': 1,
  '19634': 0,
  '19781': 0,
  '20127': 1,
  '20133': 1,
  '20156': 0,
  '20160': 1,
  '20199': 0,
  '202

### Average Precision (MAP)

In [20]:
def calculate_avg_precision(rel_docs, model_results, threshold):

    """"
    This function calculates the mean average precision (MAP) for each topic in a collection

    MAP is calculated by checking whether the document deemed relevant by the model (score exceeds the threshold) is considered relevant by the evaluation benchmark.


    Inputs:
    rel_docs (dict): A dictionary of {topic : {docid:relevance_score}} for the benchmark sets: used as the 'ground truth' scores
    model_results (dict): A dictionary of predictions for doc relevance, in the form of {topic : {docid:relevance_score}} from the model 
    threshold (float): A defined value, where if the model predicts a score above the threshold, the document has predicted the document is 'relevant'

    Returns:
    coll_precisions(dict): A dictionary of {topic : precision} for each topic in the collection
    """

    coll_precisions = {}

    for topic in rel_docs.keys():
        if topic in model_results:

           #getting the inner {docID:score} dictionaries
            rel_dict = rel_docs[topic]        #docID:relevance from benchmark
            model_dict = model_results[topic]  #docID: score from model - retrieved docs

            # Calculate the number of relevant docs
            R = 0
            for value in rel_dict.values():
                if value == 1:
                    R = R + 1


            # Calculate the number of retrieved docs
            R1 = 0
            for value in model_dict.values():
                if value > threshold:
                    R1 = R1 + 1
            

            # Calculate the number of correctly retrived docs (true positives)
            RR1 = 0
            for (docID, value) in model_dict.items():
                if (value > threshold) and (rel_dict[docID] == 1):
                    RR1 = RR1 + 1


            #calculate precision
           
            #if statement to handle any instances where there were no retrieved documents - returns score of 0
            if R1 > 0.0:
                p = float(RR1)/float(R1)
            else:
                p = 0.0


            #storing precision for each topic as {topic : precision}
            ####Note##### Can have R, R1, RR1 here to get the relevant docs, retrieved docs, and correctly retrieved docs if needed for checking outputs.
            coll_precisions[topic] = {
                'precision' : p
            }
        else:
            coll_precisions[topic] = {
                'precision' : 0
            }

    return coll_precisions



#function to calculate the averages
def calculate_map(coll_scores):
    """
    This function calculates the mean average precision, by averaging the precision for each query in a collection

    input:
    collection_scores(dict): A dict of {topic : scores}
    
    output:
    avg_score(float) : an average score for the collection
    """
    total_score = 0.0       #initialise score

    for topic, preds in coll_scores.items():    #iterate over each topic
        for score in preds.values():            #get the {docid:precision}
            total_score += score

    num_scores = len(coll_scores)

    avg_score = total_score / num_scores

    return avg_score


#### MAP for BM_25

In [21]:
# Define the threshold
threshold = 0.25

# Calculate precision for each query
bm25_precision = calculate_avg_precision(relevance_scores, BM25_results, threshold)

# Print the precision for each query
for topic, metric in bm25_precision.items():
    print(f" Topic {topic}: Precision: {metric['precision']}")

 Topic 101: Precision: 0.375
 Topic 102: Precision: 0.7878787878787878
 Topic 103: Precision: 0.1724137931034483
 Topic 104: Precision: 0.0
 Topic 105: Precision: 0.0
 Topic 106: Precision: 0.1875
 Topic 107: Precision: 0.25
 Topic 108: Precision: 0.10526315789473684
 Topic 109: Precision: 0.4
 Topic 110: Precision: 0.3125
 Topic 111: Precision: 0.17647058823529413
 Topic 112: Precision: 0.5454545454545454
 Topic 113: Precision: 0.43478260869565216
 Topic 114: Precision: 0.5
 Topic 115: Precision: 0.16666666666666666
 Topic 116: Precision: 0.25
 Topic 117: Precision: 0.0
 Topic 118: Precision: 0.25
 Topic 119: Precision: 0.0
 Topic 120: Precision: 0.2857142857142857
 Topic 121: Precision: 0.45161290322580644
 Topic 122: Precision: 0.6521739130434783
 Topic 123: Precision: 0.18181818181818182
 Topic 124: Precision: 0.0
 Topic 125: Precision: 0.5833333333333334
 Topic 126: Precision: 0.0
 Topic 127: Precision: 0.4166666666666667
 Topic 128: Precision: 0.09090909090909091
 Topic 129: Prec

In [22]:
# Get the mean average precision for bm_25
calculate_map(bm25_precision)

0.2556163850770885

#### MAP for JM_LM

In [23]:
# Define the threshold
threshold = 0.0001

# Calculate precision for each query
jmlm_precision = calculate_avg_precision(relevance_scores, JM_LM_results, threshold)

# Print the precision for each query
for topic, metric in jmlm_precision.items():
    print(f" Topic {topic}: Precision: {metric['precision']}")


 Topic 101: Precision: 1.0
 Topic 102: Precision: 0.0
 Topic 103: Precision: 0.0
 Topic 104: Precision: 0.0
 Topic 105: Precision: 0.0
 Topic 106: Precision: 0.0
 Topic 107: Precision: 0.0
 Topic 108: Precision: 0.0
 Topic 109: Precision: 0.0
 Topic 110: Precision: 0.0
 Topic 111: Precision: 0.0
 Topic 112: Precision: 0.0
 Topic 113: Precision: 0.0
 Topic 114: Precision: 0.0
 Topic 115: Precision: 0.0
 Topic 116: Precision: 0.0
 Topic 117: Precision: 0.125
 Topic 118: Precision: 0.0
 Topic 119: Precision: 0.0
 Topic 120: Precision: 0.0
 Topic 121: Precision: 0.0
 Topic 122: Precision: 0.0
 Topic 123: Precision: 0.0
 Topic 124: Precision: 0.0
 Topic 125: Precision: 0.7142857142857143
 Topic 126: Precision: 0.6153846153846154
 Topic 127: Precision: 0.0
 Topic 128: Precision: 0.0
 Topic 129: Precision: 0.0
 Topic 130: Precision: 0.0
 Topic 131: Precision: 0.16666666666666666
 Topic 132: Precision: 0.0
 Topic 133: Precision: 0.0
 Topic 134: Precision: 0.0
 Topic 135: Precision: 0.0
 Topic 

In [24]:
#Calculate map for jm_lm
calculate_map(jmlm_precision)

0.06969946719946721

#### MAP for My_PRM

### Precision @ 10

In [25]:
# Functions

def precision_at_k(rel_docs, retrieved_docs, k, threshold):

    """
    This function calculates the precision at k for a given set of relevant and retrieved items.
    It sorts values by the scores, from highest to lowest, taking the top-k scores.
    If the model predicts that the document is relevant (score > threshold), and the document exists in the rel_docs (it is actually relevant), then the score is added. 

    Precision is calculated by (number of retrieved docs / number of retrieved relevant docs)
    
    Inputs:
    rel_docs(dict): a dictionary of {docid:relevance_score} for a given query. This is the 'ground truth' used to check if the document is relevant or not
    retrieved_docs(dict): A dictionary of {docid:relevance_score} for a given query. This is the model predictions 
    k(int) : The rank position up to which precision is calculated
    threshold(float): A threshold to determine relevance. If the model's score for that document exceeds the threshold, the model considers that document relevant

    output:
    precision(float): the precision at rank k for the model

    """
    #sort the model's scores in descending order, taking the top 'k' scores
    sorted_predictions = sorted(retrieved_docs.items(), key=lambda item: item[1], reverse=True)[:k]


    #Make dictionary of {docID : relevant_score} using  rel_docs
    relevant_items = {}     #initialise dictionary

    for (docid, score) in rel_docs.items():     #for each document_id:score in the dict
        if score == 1:              #if the score is equal to or above threshold (1 in this case from eval_benchmark)
            relevant_items[docid] = score       #add relevant documentID, and score to the dictionary

    
    #count number of retrieved docs
    R1 = 0
    for (docid, score) in sorted_predictions:
        if (score >= threshold):
            R1 = R1 + 1

    #calculating the number of documents in the retrieved doc collection that are correctly retrieved, using the top-k predictions
    RR1 = 0      #initialise count 

    for (docid, score) in sorted_predictions:       #for each document in the model predictions
        if docid in relevant_items:                 #if the document retrieved is correctly in the 'relevant documents' dictionary
            if score >= threshold:            #if the score given to the model by the document is equal to or greater than the specified threshold
                RR1 = RR1 + 1                        #increases the count by 1
                

    #calculate precision at k

    #if statement to handle any instances where there were no retrieved documents - returns score of 0
    if R1 > 0.0 and RR1 > 0.0:
        precision = float(RR1) / float(R1)      #calculate precision (number of correctly retrieved docs / number of retrieved docs)
    else:
        precision = 0.0

    return precision


def calculate_precision_at_10(rel_docs, model_results, threshold):
    """
    This function calculates the precision at rank 10 for a collection of topics.
    It does this by passing each {docid:query} for a given topic to the precision_at_k function, and adding each result to a dictionary

    input:
    rel_docs(dict): a dictionary of {topic : {docid:relevance_score}}. This is the 'ground truth' used to check if the document is relevant or not
    model_results(dict): a dictionary of {topic : {docid:relevance_score}}. This is the model predictions, or 'retrieved documents'
    threshold(float): documents with a score/weight that exceeds this threshold are considered 'retrieved docs', ie., documents the model has predicted are relevant.

    output:
    precision_at_10_per_topic(dict) : A dictionary of {topic:p10}, where each topic is a string, and each 'p10' is a float value of its precision at rank 10


    """
    precision_at_10_per_topic = {}      #initialise dictionary

    #iterate over each topic in the dict
    for topic in rel_docs.keys():
        if topic in model_results:    #ensuring the topic exists in the predictions
            #passing to the precision_at_k() function
            precision_at_10 = precision_at_k(rel_docs[topic], model_results[topic], 10, threshold)
            #assigning the value for that topic in the dictionary  
            precision_at_10_per_topic[topic] = precision_at_10
        else:
            precision_at_10_per_topic[topic] = 0.0  #otherwise score = 0
    return precision_at_10_per_topic


In [26]:
#note: this can be used for any {topic:score} dict - so at the moment, both p10 and dcg
def calculate_avg_score(collection_scores):
    """
    This function calculates the average score for a collection of {topic:score}. It sums up each value in the dictionary, then divides by the number of entries

    input: 
    collection_scores(dict): A collection of {ID:scores} 

    output:
    avg_score(float): a value for the average score in the 'values' part of the dictionary
    
    """

    total_score = 0.0 #initialise score
    total_score = sum(collection_scores.values())

    num_scores = len(collection_scores)

    avg_score = total_score / num_scores

    return avg_score 


#### BM_25 Precision @ 10 

In [27]:
# Define the threshold
model_threshold = 0.25

# Calculate precision at 10
bm_precision_at_10_per_topic = calculate_precision_at_10(relevance_scores, BM25_results, model_threshold)

# Print the precision at 10 for each group
for topic, precision in bm_precision_at_10_per_topic.items():
    print(f"Topic {topic}: Precision at 10: {precision:.3f}")


Topic 101: Precision at 10: 0.375
Topic 102: Precision at 10: 0.500
Topic 103: Precision at 10: 0.500
Topic 104: Precision at 10: 0.000
Topic 105: Precision at 10: 0.000
Topic 106: Precision at 10: 0.200
Topic 107: Precision at 10: 0.250
Topic 108: Precision at 10: 0.000
Topic 109: Precision at 10: 0.400
Topic 110: Precision at 10: 0.300
Topic 111: Precision at 10: 0.000
Topic 112: Precision at 10: 0.600
Topic 113: Precision at 10: 0.500
Topic 114: Precision at 10: 0.500
Topic 115: Precision at 10: 0.200
Topic 116: Precision at 10: 0.200
Topic 117: Precision at 10: 0.000
Topic 118: Precision at 10: 0.300
Topic 119: Precision at 10: 0.000
Topic 120: Precision at 10: 0.600
Topic 121: Precision at 10: 0.400
Topic 122: Precision at 10: 0.400
Topic 123: Precision at 10: 0.100
Topic 124: Precision at 10: 0.000
Topic 125: Precision at 10: 0.700
Topic 126: Precision at 10: 0.000
Topic 127: Precision at 10: 0.400
Topic 128: Precision at 10: 0.100
Topic 129: Precision at 10: 0.300
Topic 130: Pre

In [28]:
calculate_avg_score(bm_precision_at_10_per_topic)

0.26199999999999996

#### JM_LM Precision @ 10

In [29]:
# Define the threshold
threshold = 0.00001

# Calculate precision at 10
jmlm_precision_at_10_per_topic = calculate_precision_at_10(relevance_scores, JM_LM_results,threshold)

# Print the precision at 10 for each group
for topic, precision in jmlm_precision_at_10_per_topic.items():
    print(f"Topic {topic}: Precision at 10: {precision:.3f}")

Topic 101: Precision at 10: 1.000
Topic 102: Precision at 10: 0.000
Topic 103: Precision at 10: 0.000
Topic 104: Precision at 10: 0.000
Topic 105: Precision at 10: 0.000
Topic 106: Precision at 10: 0.000
Topic 107: Precision at 10: 0.000
Topic 108: Precision at 10: 0.000
Topic 109: Precision at 10: 0.000
Topic 110: Precision at 10: 0.000
Topic 111: Precision at 10: 0.000
Topic 112: Precision at 10: 1.000
Topic 113: Precision at 10: 0.000
Topic 114: Precision at 10: 0.000
Topic 115: Precision at 10: 0.000
Topic 116: Precision at 10: 0.100
Topic 117: Precision at 10: 0.200
Topic 118: Precision at 10: 0.000
Topic 119: Precision at 10: 0.000
Topic 120: Precision at 10: 0.000
Topic 121: Precision at 10: 0.000
Topic 122: Precision at 10: 1.000
Topic 123: Precision at 10: 0.000
Topic 124: Precision at 10: 0.400
Topic 125: Precision at 10: 0.700
Topic 126: Precision at 10: 0.600
Topic 127: Precision at 10: 0.000
Topic 128: Precision at 10: 0.000
Topic 129: Precision at 10: 0.000
Topic 130: Pre

In [30]:
calculate_avg_score(jmlm_precision_at_10_per_topic)

0.14422222222222222

#### My_PRM Precision @ 10

In [31]:
#implement p@10 function for My_PRM model

### Discounted Cumulative Gain at rank position 10

In [32]:
## Functions ##

import math

# Function to calculate dcg at position k

def dcg_at_k(rel_docs, model_predictions, k):
    """
    This function calculates the discounted cumulative rank gain (dcg) at position k for a set of {docid:score} predictions.

    DCG is calculated by summing relevance scores, divided by the log(2) of their rank position.

    For each document in top-k, check whether it is relevant.
    Relevance is determined by whether the document has the score (1) in the relevance_docs set
        if document at position i is relevant, relevance score of 1 is assigned
        if document at position i is irrelevant, score is zero

    Inputs:
    rel_docs (dict): A dictionary with the benchmark relevance for a given query. Stored in the form of {docid : relevance}
    model_predictions (dict): A dictionary of model predictions for a given query (topic). Stored in the form of {docid : relevance}
    k (int): The rank position DCG is calculated for.

    Output:
    dcg (float): Calculated dcg value for position k 

    """

    #sort the model scores based on values, taking only top-k results
    sorted_preds = sorted(model_predictions.items(), key = lambda item : item[1], reverse=True)[:k]

    #initialise dcg counter
    dcg = 0.0

    #calculate dcg for position 1
    #check whether the document at that position is relevant (has a score of 1 in the rel_docs set)
    docid, score = sorted_preds[0]      #get the docid & score for the highest-ranked document
    if rel_docs.get(docid, 0) == 1:     #check if the document is relevant in the benchmark set
        relevance = 1                   #assign a value of 1
        dcg += relevance                #add relevance to the dcg 

    #iterate over the top-k sorted predictions
    for i in range (1, k):              #starting from position 2
        if i < len(sorted_preds):       #ensuring we haven't exceeded the number of predictions
            docid, score = sorted_preds[i]  #get the document ID and score for the current rank
            if rel_docs.get(docid, 0) == 1:     #check whether the score exists in the benchmark set
                relevance = 1
                discounted_rel = relevance / math.log2(i + 1)       #apply the dcg formula by dividing by log i 

                dcg += discounted_rel           #add to running total
    

    #return calculated DCG value at position k
    return dcg


#Function to calculate dcg for a collection of topics

def calculate_dcg10(rel_docs, model_predictions):
    """
    This calculates dcg at position 10 for each topic (query) in a collection of {topic : {docid:score}}
    
    For each query, it passes the {docid:relevance score} for the benchmark set, and the model predictions to the dcg_at_k() function 

    Inputs:
    rel_docs(dict): A dictionary where the keys are the topics, and values are a {docid:score} dictionary from the benchmark set (relevance docs)
    model_predictions(dict): A dictionary where the keys are the topics, and values are a {docid:score} dictionary for predictions from our model (retrieved docs)

    output:
    dcg10_scores(dict): A dictionary where the keys are the topics, and the values are the DCG10 values for the given model
    """

    #initialise a dictionary to store the DCG at 10 score for each topic
    dcg10_scores = {}

    #iterate over each topic in the relevance scores documents
    for topic in rel_docs.keys():

        #check if the topic also exists in the model predictions
        if topic in model_predictions:

            #calculate dcg @ 10 for this topic
            #pass to calculate dcg_at_k, using k = 10 
            dcg_at_10 = dcg_at_k(rel_docs[topic], model_predictions[topic], k = 10)

            #store the results in a dictionary
            dcg10_scores[topic] = dcg_at_10

        else:
            dcg10_scores[topic] = 0.0

    return dcg10_scores


#### BM_25 DCG10

In [33]:
bm25_dcg10 = calculate_dcg10(relevance_scores, BM25_results)

for (topic, score) in bm25_dcg10.items():
    print(f"Topic {topic}: {score:.3f}")

Topic 101: 3.247
Topic 102: 2.749
Topic 103: 2.674
Topic 104: 3.868
Topic 105: 2.232
Topic 106: 1.387
Topic 107: 1.500
Topic 108: 0.000
Topic 109: 1.819
Topic 110: 1.320
Topic 111: 0.000
Topic 112: 2.398
Topic 113: 3.432
Topic 114: 3.018
Topic 115: 1.500
Topic 116: 0.732
Topic 117: 1.133
Topic 118: 1.220
Topic 119: 0.000
Topic 120: 3.189
Topic 121: 2.964
Topic 122: 2.005
Topic 123: 0.500
Topic 124: 1.667
Topic 125: 4.120
Topic 126: 2.948
Topic 127: 2.088
Topic 128: 0.333
Topic 129: 1.931
Topic 130: 1.764
Topic 131: 1.690
Topic 132: 1.631
Topic 133: 3.518
Topic 134: 0.764
Topic 135: 1.421
Topic 136: 0.887
Topic 137: 2.631
Topic 138: 0.315
Topic 139: 1.518
Topic 140: 4.868
Topic 141: 4.754
Topic 142: 0.301
Topic 143: 0.301
Topic 144: 0.688
Topic 145: 0.000
Topic 146: 1.000
Topic 147: 1.174
Topic 148: 0.000
Topic 149: 0.301
Topic 150: 1.877


In [34]:
#calculate dcg average for bm_25 
calculate_avg_score(bm25_dcg10)

1.7475467002197471

#### JM_LM DCG10

In [35]:
jmlm_dcg10 = calculate_dcg10(relevance_scores, JM_LM_results)

for (topic, score) in jmlm_dcg10.items():
    print(f"Topic {topic}: {score:.3f}")

Topic 101: 3.247
Topic 102: 3.434
Topic 103: 1.818
Topic 104: 4.567
Topic 105: 2.754
Topic 106: 2.315
Topic 107: 1.431
Topic 108: 0.688
Topic 109: 4.196
Topic 110: 2.377
Topic 111: 0.000
Topic 112: 3.918
Topic 113: 3.597
Topic 114: 3.395
Topic 115: 1.062
Topic 116: 1.000
Topic 117: 1.301
Topic 118: 1.018
Topic 119: 0.000
Topic 120: 2.918
Topic 121: 4.134
Topic 122: 2.475
Topic 123: 1.000
Topic 124: 2.856
Topic 125: 3.453
Topic 126: 2.860
Topic 127: 2.392
Topic 128: 1.000
Topic 129: 3.178
Topic 130: 1.856
Topic 131: 1.301
Topic 132: 1.431
Topic 133: 3.562
Topic 134: 0.631
Topic 135: 3.059
Topic 136: 2.319
Topic 137: 2.631
Topic 138: 0.657
Topic 139: 2.000
Topic 140: 4.254
Topic 141: 3.820
Topic 142: 0.315
Topic 143: 0.000
Topic 144: 2.178
Topic 145: 0.000
Topic 146: 1.000
Topic 147: 2.377
Topic 148: 0.634
Topic 149: 0.616
Topic 150: 1.818


In [36]:
#calculate dcg average for jm_lm 
calculate_avg_score(jmlm_dcg10)

2.0968713213581576

#### My_PRM DCG10

In [37]:
# Implement DCG10 for My_PRM model

# Task 6: Recommendation

**Description:** Recommend a model based on significance test and your analysis. 

You need to conduct a significance test to compare models. You can choose a t-test to perform a significance test on the evaluation results (e.g., in Tables 1, 2 and 3). 

You can compare models between:
- **BM25** and **JM_LM**
- **BM25** and **My_PRM**
- **JM_LM** and **My_PRM**

Based on $t$-test results ($p$-value and $t$-statistic), you can recommend a model (You ***want the proposed "My_RPM" to be the best because it is your own model***). You can perform the $t$-test using a single effectiveness measure or multiple measures. Generally, using more effectiveness measures provides stronger evidence against the null hypothesis. Note that if the $t$-test is unsatisfactory, you can use the evaluation results to refine **My_PRM** mode. For example, you can adjust parameter settings or update your design and implementation.