In [1]:
import pyndri as pn
import collections as cl
import math

In [2]:
class TFIDF:
    """Scoring class for the tf-idf method.

    The score of a (document, query term) pair is the product of a
    sublinearly transformed term frequency and the inverse document
    frequency of the term.

    Attributes:
        index: pyndri index for the entire collection.
        inverted_index: mapping from term id to a sized container with one
            entry per document containing the term (presumably a dict of
            per-document term frequencies; only its length is used here).
        col_freq: dict of term frequencies for the entire collection.
        col_size: number of documents in the collection.
        tf_transform: string denoting the sublinear tf transformation.
            Accepted values are: `log`.
    """

    def __init__(self, index: 'pn.Index', inverted_index: dict, col_freq: dict, tf_transform: str):
        """Initialize tf-idf scoring function.

        Args:
            index: pyndri index for the entire collection.
            inverted_index: mapping from term id to the documents that
                contain the term.
            col_freq: dict of term frequencies for the entire collection.
            tf_transform: string denoting the sublinear tf transformation.
                Accepted values are: `log`. The value is validated lazily,
                on the first call to `score`.
        """
        self.index = index
        self.inverted_index = inverted_index
        self.col_freq = col_freq
        # The collection size is taken as the width of the document id range
        # reported by the index.
        self.col_size = index.maximum_document() - index.document_base()
        self.tf_transform = tf_transform

    def score(self, int_doc_id: int, query_term_id: int, doc_term_freq: float) -> float:
        """Score a document against a single query term.

        Args:
            int_doc_id: the document id.
            query_term_id: the query term id (assuming the query has been
                split into tokens).
            doc_term_freq: the frequency of the query term in the document.

        Returns:
            The transformed term frequency multiplied by the idf of the term.

        Raises:
            ValueError: if `tf_transform` is not a supported transformation.
        """
        if self.tf_transform == 'log':
            # Must be looked up through the class/instance: a bare `log_tf`
            # is not in scope inside a method body.
            wtf = self.log_tf(doc_term_freq)
        else:
            raise ValueError('Unsupported term frequency transformation specified: {}'.format(self.tf_transform))
        idf = self.idf(int_doc_id, query_term_id)

        return wtf * idf

    @staticmethod
    def log_tf(doc_term_freq: float) -> float:
        """Return the log-scaled term frequency `1 + ln(tf)`.

        A non-positive frequency yields 0.0 instead of a math domain error,
        following the usual sublinear tf-scaling convention.
        """
        if doc_term_freq <= 0:
            return 0.0
        return 1 + math.log(doc_term_freq)

    def idf(self, int_doc_id: int, query_term_id: int) -> float:
        """Return the inverse document frequency `ln(N / df)` of a term.

        Args:
            int_doc_id: unused; kept for interface compatibility.
            query_term_id: the query term id.

        Returns:
            `ln(col_size) - ln(df)`, or 0.0 when the term occurs in no
            document (or the collection is empty), where the logarithm
            would be undefined.
        """
        doc_freq = self.df(query_term_id)
        if doc_freq <= 0 or self.col_size <= 0:
            return 0.0
        return math.log(self.col_size) - math.log(doc_freq)

    def df(self, query_term_id: int) -> int:
        """Return the number of documents that contain the given term.

        Uses `.get` so that looking up an unseen term does not insert an
        empty entry into a `defaultdict`-backed inverted index.
        """
        return len(self.inverted_index.get(query_term_id, ()))
    
    