In [2]:
import pyndri
import collections
import math

In [2]:
class TFIDF:
    """Scoring class for the tf-idf method.

    The score of a (document, query term) pair is the product of a
    sublinearly transformed term frequency and the inverse document
    frequency of the term.

    Note:
        The log sublinear transform of the term frequencies is used as a benchmark.

    Attributes:
        index: pyndri index for the entire collection.
        inverted_index: mapping from term id to a sized container of postings;
            only its length is used here, as the document frequency of the
            term (presumably ``{doc_id: term_frequency}`` -- confirm with the
            code that builds it).
        col_freq: dict of term frequencies for the entire collection
            (stored but not read by this class).
        col_size: number of documents in the collection, computed as
            ``index.maximum_document() - index.document_base()``.
        tf_transform: string denoting possible sublinear tf transformations. accepted values are: log
    """

    def __init__(self, index: 'pyndri.Index', inverted_index: dict, col_freq: dict, tf_transform: str):
        """Initialize tf-idf scoring function.

        Args:
            index: pyndri index for the entire collection.
            inverted_index: mapping from term id to the postings of that term.
            col_freq: dict of term frequencies for the entire collection.
            tf_transform: string denoting possible sublinear tf transformations. accepted values are: `log`
        """
        self.index = index
        self.inverted_index = inverted_index
        self.col_freq = col_freq
        self.col_size = index.maximum_document() - index.document_base()
        self.tf_transform = tf_transform

    def score(self, int_doc_id: int, query_term_id: int, doc_term_freq: int) -> float:
        """Scoring method for a document and a query term.

        Args:
            int_doc_id: the document id (not used by tf-idf itself; kept so
                the signature stays as callers expect).
            query_term_id: the query term id (assuming you have split the query to tokens).
            doc_term_freq: the document term frequency of the query term.

        Returns:
            The transformed term frequency multiplied by the inverse
            document frequency of the term.

        Raises:
            ValueError: if ``tf_transform`` is not a supported transformation.
        """
        if self.tf_transform == 'log':
            wtf = self.log_tf(doc_term_freq)
        else:
            raise ValueError('Unsupported term frequency transformation specified: {}'.format(self.tf_transform))
        idf = self.idf(query_term_id)

        return wtf * idf

    def log_tf(self, doc_term_freq: int) -> float:
        """Apply sublinear transformation to document query term frequency.

        Args:
            doc_term_freq: the document term frequency for the query term.

        Returns:
            ``1 + log(doc_term_freq)`` for a positive frequency, ``0.0``
            otherwise.
        """
        # log(0) is undefined; a term that does not occur gets zero weight
        # instead of raising a math domain error.
        if doc_term_freq <= 0:
            return 0.0
        return 1 + math.log(doc_term_freq)

    def idf(self, query_term_id: int) -> float:
        """Calculate inverse document frequency.

        Args:
            query_term_id: pyndri query term id.

        Returns:
            ``log(col_size) - log(df)``, or ``0.0`` when the term occurs in
            no document or the collection is empty.
        """
        doc_freq = self.df(query_term_id)
        # Either logarithm would be undefined here; such a term cannot
        # contribute to any document's score, so it is weighted as zero.
        if doc_freq <= 0 or self.col_size <= 0:
            return 0.0
        return math.log(self.col_size) - math.log(doc_freq)

    def df(self, query_term_id: int) -> int:
        """Calculate document frequency of query term.

        Args:
            query_term_id: pyndri query term id.

        Returns:
            Length of the inverted index entry for the term, or ``0`` when
            the term has no entry.
        """
        # Use .get() rather than indexing: subscripting a defaultdict would
        # insert an empty entry for every unseen term as a side effect.
        postings = self.inverted_index.get(query_term_id)
        if postings is None:
            return 0
        return len(postings)