

In [None]:
import math
from collections import defaultdict

class BM25:
    """Rank documents against a query with the Okapi BM25 formula.

    The index is read from a text file holding one term per line::

        term<TAB>doc_id:freq, doc_id:freq, ...

    Attributes:
        inversion_tree: Inverted index, ``term -> {doc_id: term frequency}``.
        documents: Forward index derived from ``inversion_tree``,
            ``doc_id -> {term: term frequency}``.
        document_count: Number of distinct documents in the index.
        avg_document_length: Mean number of term occurrences per document
            (``0.0`` for an empty index).
        k1: Term-frequency saturation parameter.
        b: Document-length normalisation parameter.
    """

    def __init__(self, inversion_tree_file, k1=1.2, b=.75):
        """Load the index and precompute corpus statistics.

        Args:
            inversion_tree_file: Path of the inverted-index file to load.
            k1: Term-frequency saturation parameter (default 1.2).
            b: Length-normalisation parameter (default 0.75).

        Raises:
            OSError: If the file cannot be opened.
            ValueError: If a non-blank line of the file is malformed.
        """
        self.inversion_tree = self.parse_inversion_tree(inversion_tree_file)
        # The file is keyed by term, so len(inversion_tree) is the vocabulary
        # size.  Document count and document lengths have to be derived from
        # a per-document view of the same postings.
        self.documents = self._build_forward_index(self.inversion_tree)
        self.document_count = len(self.documents)
        total_length = sum(
            sum(term_freqs.values()) for term_freqs in self.documents.values()
        )
        # Guard the division so that an empty index file does not crash.
        self.avg_document_length = (
            total_length / self.document_count if self.document_count else 0.0
        )
        self.k1 = k1
        self.b = b

    @staticmethod
    def _build_forward_index(inversion_tree):
        """Invert ``term -> {doc_id: freq}`` into ``doc_id -> {term: freq}``."""
        documents = {}
        for word, postings in inversion_tree.items():
            for doc_id, word_freq in postings.items():
                documents.setdefault(doc_id, {})[word] = word_freq
        return documents

    def parse_inversion_tree(self, inversion_tree_file):
        """Parse the inverted-index file.

        Blank lines are skipped.  Every other line must have the form
        ``term<TAB>doc_id:freq, doc_id:freq, ...``.

        Args:
            inversion_tree_file: Path of the file to read.

        Returns:
            A ``defaultdict(dict)`` mapping each term to ``{doc_id: freq}``.

        Raises:
            ValueError: If a line has no tab separator, or a posting is not
                of the form ``doc_id:integer``.
        """
        inversion_tree = defaultdict(dict)
        with open(inversion_tree_file, 'r') as file:
            for line_number, line in enumerate(file, start=1):
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                word, separator, postings = line.partition('\t')
                if not separator:
                    raise ValueError(
                        f"line {line_number}: expected 'term<TAB>postings', got {line!r}"
                    )
                for posting in postings.split(', '):
                    doc_id, word_freq = posting.split(':')
                    inversion_tree[word][doc_id] = int(word_freq)
        return inversion_tree

    def calculate_idf(self, term, query_terms):
        """Return the IDF of ``term`` weighted by its frequency in the query.

        Args:
            term: The query term.
            query_terms: Mapping of query term to its count in the query.

        Returns:
            ``log((N - n + 0.5) / (n + 0.5) + 1) * query_terms[term]`` where
            ``N`` is the document count and ``n`` is the number of documents
            containing ``term``.
        """
        # Use .get() instead of []: the index is a defaultdict, and indexing
        # it with an unknown term would silently add that term to the index.
        document_with_term_count = len(self.inversion_tree.get(term, ()))
        query_term_freq = query_terms[term]
        return math.log((self.document_count - document_with_term_count + 0.5) / (document_with_term_count + 0.5) + 1) * query_term_freq

    def calculate_bm25_score(self, query, document, doc_id):
        """Compute the BM25 score of one document for ``query``.

        Args:
            query: Iterable of query terms (repeats increase a term's weight).
            document: Mapping ``term -> frequency`` for the document; the sum
                of its values is used as the document length.
            doc_id: Identifier of the document in the inverted index.

        Returns:
            The score as a float; ``0.0`` if no query term occurs in the
            document.
        """
        score = 0.0
        document_length = sum(document.values())
        query_terms = defaultdict(int)
        for term in query:
            query_terms[term] += 1

        # Loop-invariant length normalisation; avoid dividing by zero when
        # the index is empty.
        if self.avg_document_length:
            length_ratio = document_length / self.avg_document_length
        else:
            length_ratio = 0.0
        length_norm = self.k1 * (1 - self.b + self.b * length_ratio)

        for term in query_terms:
            postings = self.inversion_tree.get(term)
            if not postings:
                continue
            term_frequency = postings.get(doc_id, 0)
            if not term_frequency:
                # A term absent from the document contributes nothing.
                continue
            idf = self.calculate_idf(term, query_terms)
            numerator = term_frequency * (self.k1 + 1)
            denominator = term_frequency + length_norm
            score += idf * (numerator / denominator)

        return score

    def rank_documents(self, query, output_file):
        """Score every indexed document and write the ranking to a file.

        Args:
            query: Iterable of query terms.
            output_file: Path of the text file to write, one
                ``Document ID: <id>, Score: <score>`` line per document,
                highest score first.

        Returns:
            The ranking as a list of ``(doc_id, score)`` tuples in the order
            written to ``output_file``.
        """
        # Materialise once so a one-shot iterable can be reused per document.
        query = list(query)
        # Each document is scored against its own term frequencies, so its
        # length is independent of which term listed it first.
        document_scores = {
            doc_id: self.calculate_bm25_score(query, term_freqs, doc_id)
            for doc_id, term_freqs in self.documents.items()
        }

        ranked_documents = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)

        # Write results to output file
        with open(output_file, 'w') as f:
            for doc_id, score in ranked_documents:
                f.write(f"Document ID: {doc_id}, Score: {score}\n")

        return ranked_documents

# Example usage: rank every indexed document for a sample query and write the
# ranking to a text file.  The guard keeps importing this file free of file
# I/O; when run as a script or notebook cell the statements execute as before
# and the names below remain module-level globals.
if __name__ == "__main__":
    inversion_tree_file = "/content/sample_data/output_project6.txt"  # Change to your inversion tree file
    output_file = "/content/sample_data/ranking_algo_output.txt"  # Change to the desired output file name
    bm25 = BM25(inversion_tree_file)
    query = ['apple', 'orange']
    bm25.rank_documents(query, output_file)
    print("Results written to", output_file)


Results written to /content/sample_data/ranking_algo_output.txt
