In [None]:
import math

In [1]:
class BM25:
    """Scoring class for the BM25 method.

    Note:
        The method neglects the term that normalizes query term frequencies because the
        queries presented here are typically short. Also, average document length is
        computed relative to the entire collection, as opposed to the average length
        of the documents that contain one or more query terms. This makes the score
        generalisable to the whole collection.

    Attributes:
        index: pyndri index for the entire collection.
        inverted_index: dict keyed by term id; the size of each entry is used as the
            term's document frequency (presumably a per-document term-frequency
            mapping -- confirm against the code that builds it).
        k: tuning parameter that calibrates the document term frequency scaling.
        b: tuning parameter which calibrates the document length scaling.
        avg_len: average document length for the entire collection.
        col_size: number of documents in the collection, computed as
            ``index.maximum_document() - index.document_base()``.
    """

    def __init__(self, index, inverted_index, k: float, b: float, avg_len: float):
        """Initialize BM25 scoring method.

        Args:
            index: pyndri index for the entire collection.
            inverted_index: dict keyed by term id (see class docstring).
            k: tuning parameter that calibrates the document term frequency scaling.
            b: tuning parameter which calibrates the document length scaling.
            avg_len: average document length for the entire collection. Must be
                non-zero, otherwise length normalization divides by zero.
        """
        self.index = index
        self.inverted_index = inverted_index
        self.k = k
        self.b = b
        self.avg_len = avg_len
        self.col_size = index.maximum_document() - index.document_base()

    def score(self, int_doc_id: int, query_term_id: int, doc_term_freq: int) -> float:
        """Compute the score for a document and a query term.

        Args:
            int_doc_id: the document id.
            query_term_id: the query term id (assuming you have split the query to tokens).
            doc_term_freq: the document term frequency of the query term.

        Return:
            Product of the term frequency weight and the inverse document frequency.
        """
        wtf = self.wtf(int_doc_id, doc_term_freq)
        idf = self.idf(query_term_id)

        return wtf * idf

    def wtf(self, int_doc_id: int, doc_term_freq: int) -> float:
        """Compute the term frequency term in the score.

        Args:
            int_doc_id: the document id.
            doc_term_freq: the document term frequency of the query term.

        Return:
            Term frequency weight.
        """
        return self.num(doc_term_freq) / self.denom(int_doc_id, doc_term_freq)

    def num(self, doc_term_freq: int) -> float:
        """Numerator of the first term.

        Args:
            doc_term_freq: the document term frequency of the query term.

        Return:
            Term frequency scaled by the `k` parameter.
        """
        return (self.k + 1) * doc_term_freq

    def denom(self, int_doc_id: int, doc_term_freq: int) -> float:
        """Denominator of the first term.

        Args:
            int_doc_id: the document id.
            doc_term_freq: the document term frequency of the query term.

        Return:
            Term frequency normalized by document length according to parameters `k` and `b`.

        Raises:
            ZeroDivisionError: if `avg_len` is zero.
        """
        # The second element of the value returned by index.document() is
        # treated as the document's token sequence; its length is the
        # document length.
        doc_len = len(self.index.document(int_doc_id)[1])
        length_norm = (1 - self.b) + self.b * (doc_len / self.avg_len)
        return self.k * length_norm + doc_term_freq

    def df(self, query_term_id: int) -> int:
        """Calculate document frequency of query term.

        Args:
            query_term_id: pyndri query term id.

        Return:
            Length of the inverted index entry for the term, or 0 when the
            term has no entry in the inverted index.
        """
        try:
            postings = self.inverted_index[query_term_id]
        except KeyError:
            # A term that was never indexed occurs in no document.
            return 0
        return len(postings)

    def idf(self, query_term_id: int) -> float:
        """Calculate inverse document frequency.

        Args:
            query_term_id: pyndri query term id.

        Return:
            ``log(col_size) - log(df)``, or 0.0 when the term occurs in no
            document or the collection is empty (the logarithm would be
            undefined, and such a term cannot contribute to any score).
        """
        doc_freq = self.df(query_term_id)
        if doc_freq <= 0 or self.col_size <= 0:
            return 0.0
        return math.log(self.col_size) - math.log(doc_freq)