# Reading and Loading the dataset

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import os
import tarfile

# **Modified**: Using os.path.abspath to create an absolute path
# This ensures that the path is correct regardless of the current working directory.
tar_path = os.path.abspath('../time.tar.gz')
data_folder = os.path.abspath('../data')

# List of expected files after extraction
expected_files = ['TIME.ALL', 'TIME.QUE', 'TIME.REL']

# Step 1: Check if the 'data' folder exists, if not, create it
if not os.path.exists(data_folder):
    os.makedirs(data_folder)
    print(f"Created folder: {data_folder}")

# Step 2: Check if all expected files are already in the 'data' folder
files_present = all(os.path.exists(os.path.join(data_folder, f)) for f in expected_files)

if files_present:
    print("All files are already extracted.")
else:
    # Step 3: Open and extract the tar file if files are not present
    # Use 'r:' instead of 'r:gz' for non-gzip tar files
    try:
        with tarfile.open(tar_path, 'r:gz') as tar: # **Corrected mode to 'r:gz'**
            tar.extractall(path=data_folder)  # Extract the contents to the /data folder
            print(f"Extracted contents of {tar_path} to {data_folder}")
    except FileNotFoundError:
        print(f"Error: File not found at {tar_path}. Please check the path.")
        # You may want to add more detailed error handling here.

    # Verify that files have been extracted
    if all(os.path.exists(os.path.join(data_folder, f)) for f in expected_files):
        print("Files successfully extracted.")
    else:
        print("Error: Some files were not extracted successfully.")


Error: File not found at /time.tar.gz. Please check the path.
Error: Some files were not extracted successfully.


In [7]:
import os
import tarfile

In [8]:
import os
import tarfile
from collections import Counter, defaultdict
def parse_query_strings(strings):
    """
    Parse lines from TIME.QUE, which contains the search queries.

    Params:
      strings...A list of strings, one per line, from TIME.QUE
    Returns:
      A dict from query id to query text.
    """
    queries = {}
    query_id = None
    query_lines = []

    for line in strings:
        line = line.strip()
        if line.startswith('*FIND'):
            if query_id is not None:
                queries[query_id] = ' '.join(query_lines).strip()
            query_id = int(line.split()[1])
            query_lines = []
        elif line.startswith('*STOP'):
            if query_id is not None:
                queries[query_id] = ' '.join(query_lines).strip()
            query_id = None
        else:
            if query_id is not None:
                query_lines.append(line)

    return queries


In [9]:
def read_queries(fname):
    """ Read the queries from TIME.QUE. """
    with open(fname, 'r') as f:
        strings = f.readlines()
    return parse_query_strings(strings)

In [10]:
def parse_document_strings(strings):
    """
    Parse lines from TIME.ALL, which contains the documents.

    Params:
      strings...A list of strings, one per line, from TIME.ALL
    Returns:
      A list of documents (as strings).
    """
    documents = []
    doc_lines = []

    for line in strings:
        line = line.strip()
        if line.startswith('*TEXT'):
            if doc_lines:
                documents.append(' '.join(doc_lines).strip())
            doc_lines = []
        elif line.startswith('*STOP'):
            if doc_lines:
                documents.append(' '.join(doc_lines).strip())
        else:
            doc_lines.append(line)

    return documents

In [11]:
def read_documents(fname):
    """ Read the documents from TIME.ALL. """
    with open(fname, 'r') as f:
        strings = f.readlines()
    return parse_document_strings(strings)


In [12]:
def parse_relevance_strings(strings):
    """
    Parse lines from TIME.REL, which contains relevance judgements.

    Params:
      strings...A list of strings, one per line, from TIME.REL
    Returns:
      A dict from query id to the list of relevant document ids.
    """
    relevance_dict = {}
    for line in strings:
        tokens = line.split()
        if len(tokens) < 2:
            continue
        query_id = int(tokens[0])
        relevant_docs = list(map(int, tokens[1:]))
        relevance_dict[query_id] = relevant_docs
    return relevance_dict


In [13]:
def read_relevances(fname):
    """ Read the relevance judgements from TIME.REL. """
    with open(fname, 'r') as f:
        strings = f.readlines()
    return parse_relevance_strings(strings)

In [14]:
def read_data(data_dir):
    """ Read and parse the TIME dataset. """
    queries = read_queries(os.path.join(data_dir, 'TIME.QUE'))
    relevances = read_relevances(os.path.join(data_dir, 'TIME.REL'))
    documents = read_documents(os.path.join(data_dir, 'TIME.ALL'))

    print(f'Read {len(queries)} queries.')
    print(f'Read {len(relevances)} relevance judgements.')
    print(f'Read {len(documents)} documents.')

    return queries, relevances, documents

In [27]:
data_dir = "/content/drive/MyDrive/IR-Project-Samples/data"
queries, relevance, documents = read_data(data_dir)

Looking for queries file at: /content/drive/MyDrive/IR-Project-Samples/data/TIME.QUE
Read 83 queries.
Read 83 relevance judgements.
Read 423 documents.


# Preprocessing and Term Frequency Calculations

In [20]:
import math
import re
from collections import Counter, defaultdict

def preprocess(text):
    """ Tokenize and clean text by removing non-alphabetic characters. """
    return re.findall(r'\b\w+\b', text.lower())

def compute_tf_and_lengths(documents):
    """ Compute term frequencies (TF) and document lengths for each document. """
    tfs = {}
    doc_lengths = {}
    for doc_id, text in enumerate(documents):  # Ensure documents is a dictionary
        tokens = preprocess(text)
        tf = Counter(tokens)
        tfs[doc_id] = tf  # Store term frequencies as a dictionary
        doc_lengths[doc_id] = len(tokens)  # Store document lengths

    return tfs, doc_lengths


# Calculate Inverse Document Frequency

In [21]:
import math
from collections import defaultdict

def compute_idf(documents):
    """
    Compute inverse document frequency (IDF) for each term across all documents.

    Params:
      documents...A list of documents (as strings) or a dictionary of {doc_id: document}.

    Returns:
      A dictionary where the key is a term and the value is its IDF score.
    """
    N = len(documents)  # Total number of documents
    df = defaultdict(int)  # Document frequency of terms

    # Handle dictionary or list of documents
    if isinstance(documents, dict):
        doc_items = documents.values()  # For dictionary, get document contents
    else:
        doc_items = documents  # For list, documents are directly available

    # Count how many documents contain each term
    for doc in doc_items:
        terms = set(preprocess(doc))  # Use unique terms per document
        for term in terms:
            df[term] += 1

    # Compute IDF for each term
    idf = {}
    for term, freq in df.items():
        # Applying BM25 IDF formula: log((N - df + 0.5) / (df + 0.5) + 1)
        idf[term] = math.log((N - freq + 0.5) / (freq + 0.5) + 1)

    return idf



# BM25

In [22]:
def bm25_score(query, tfs, idf, doc_lengths, avg_doc_length, k1=1.5, b=0.75):
    """ Compute BM25 scores for each document based on a query. """
    query_terms = preprocess(query)
    scores = defaultdict(float)

    # Iterate over query terms and check if each term is in IDF dictionary
    for term in query_terms:
        if term not in idf:
            continue  # Skip terms not in the corpus

        # Iterate over each document and compute score
        for doc_id, tf in tfs.items():  # Ensure tfs is a dictionary
            term_freq = tf.get(term, 0)
            doc_len = doc_lengths[doc_id]
            numerator = idf[term] * term_freq * (k1 + 1)
            denominator = term_freq + k1 * (1 - b + b * (doc_len / avg_doc_length))
            scores[doc_id] += numerator / denominator

    return scores


In [23]:
def rank_documents_bm25(data_dir):
    """ BM25 ranking for TIME dataset. """
    # Read data
    queries, relevances, documents = read_data(data_dir)

    # Preprocess and compute term frequencies and document lengths
    tfs, doc_lengths = compute_tf_and_lengths(documents)
    avg_doc_length = sum(doc_lengths.values()) / len(documents)

    # Compute IDF for the document collection
    idf = compute_idf(documents)

    results = {}
    # Score each query against the documents
    for query_id, query in queries.items():
        print(f"Ranking documents for Query {query_id}...")

        # Compute BM25 scores
        scores = bm25_score(query, tfs, idf, doc_lengths, avg_doc_length)

        # Ensure sorting based on scores and that results are a list of tuples (doc_id, score)
        ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)

        results[query_id] = ranked_docs

        # Display top 5 results for each query
        print(f"Query {query_id} results:")
        for doc, score in ranked_docs[:5]:
            print(f"Document: {doc}, Score: {score}")
        print("\n")

    return results


In [26]:
# Example Usage
data_dir = "/content/drive/MyDrive/IR-Project-Samples/data"
results = rank_documents_bm25(data_dir)

Looking for queries file at: /content/drive/MyDrive/IR-Project-Samples/data/TIME.QUE
Read 83 queries.
Read 83 relevance judgements.
Read 423 documents.
Ranking documents for Query 1...
Query 1 results:
Document: 256, Score: 22.71176201731724
Document: 303, Score: 22.4315088278261
Document: 307, Score: 21.84549783684643
Document: 287, Score: 21.598730015953684
Document: 267, Score: 21.387080481128336


Ranking documents for Query 2...
Query 2 results:
Document: 325, Score: 32.78804008023718
Document: 322, Score: 32.76526919669497
Document: 333, Score: 31.88247587464462
Document: 382, Score: 30.7031265936503
Document: 303, Score: 29.810892549071077


Ranking documents for Query 3...
Query 3 results:
Document: 11, Score: 24.905664671622116
Document: 348, Score: 24.61997324042201
Document: 307, Score: 22.25697608719704
Document: 35, Score: 20.795141674726167
Document: 210, Score: 20.354745379184454


Ranking documents for Query 4...
Query 4 results:
Document: 210, Score: 26.70356474234012


In [28]:
def rank_documents_bm25_with_log(data_dir, output_file='bm25_results.txt'):
    """ BM25 ranking for TIME dataset. Writes the top 5 results to a .txt file in the same format as printed. """
    # Read data
    queries, relevances, documents = read_data(data_dir)

    # Preprocess and compute term frequencies and document lengths
    tfs, doc_lengths = compute_tf_and_lengths(documents)
    avg_doc_length = sum(doc_lengths.values()) / len(documents)

    # Compute IDF for the document collection
    idf = compute_idf(documents)

    results = {}

    # Open the output file in write mode
    with open(output_file, 'w') as f:
        # Score each query against the documents
        for query_id, query in queries.items():
            print(f"Ranking documents for Query {query_id}...")

            # Write the query and header for ranked results to the file
            f.write(f"Query {query_id}: {query}\n")
            f.write("Top 5 Ranked Documents:\n")

            # Compute BM25 scores
            scores = bm25_score(query, tfs, idf, doc_lengths, avg_doc_length)

            # Ensure sorting based on scores and get the top 5 documents
            ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:5]

            results[query_id] = ranked_docs

            # Display and write the top 5 results
            for rank, (doc_id, score) in enumerate(ranked_docs, 1):
                result_str = f"Document: {doc_id}, Score: {score:.4f}"
                print(result_str)  # Print to console
                f.write(f"{rank}. {result_str}\n")  # Write to file

            # Write a newline to separate queries in the output file
            f.write("\n")
            print("\n")

    print(f"Top 5 results for each query have been written to {output_file}")
    return results



In [29]:
results = rank_documents_bm25_with_log(data_dir)

Looking for queries file at: /content/drive/MyDrive/IR-Project-Samples/data/TIME.QUE
Read 83 queries.
Read 83 relevance judgements.
Read 423 documents.
Ranking documents for Query 1...
Document: 256, Score: 22.7118
Document: 303, Score: 22.4315
Document: 307, Score: 21.8455
Document: 287, Score: 21.5987
Document: 267, Score: 21.3871


Ranking documents for Query 2...
Document: 325, Score: 32.7880
Document: 322, Score: 32.7653
Document: 333, Score: 31.8825
Document: 382, Score: 30.7031
Document: 303, Score: 29.8109


Ranking documents for Query 3...
Document: 11, Score: 24.9057
Document: 348, Score: 24.6200
Document: 307, Score: 22.2570
Document: 35, Score: 20.7951
Document: 210, Score: 20.3547


Ranking documents for Query 4...
Document: 210, Score: 26.7036
Document: 307, Score: 25.9246
Document: 382, Score: 25.3929
Document: 267, Score: 25.1050
Document: 325, Score: 24.9041


Ranking documents for Query 5...
Document: 382, Score: 13.0332
Document: 394, Score: 12.6572
Document: 303, Sc

# Evaluation

In [30]:
def calculate_precision_at_k(relevant_docs, ranked_docs, k):
    """
    Calculate precision at k for a single query.

    Params:
      relevant_docs...A set of relevant document IDs for the query.
      ranked_docs.....A list of ranked document IDs for the query.
      k...............The cutoff for precision.

    Returns:
      Precision at k (float).
    """
    if not ranked_docs:
        return 0.0
    top_k_docs = ranked_docs[:k]
    relevant_retrieved = sum(1 for doc_id, _ in top_k_docs if doc_id in relevant_docs)
    return relevant_retrieved / k

In [31]:
def calculate_average_precision(relevant_docs, ranked_docs):
    """
    Calculate Average Precision (AP) for a single query.

    Params:
      relevant_docs...A set of relevant document IDs for the query.
      ranked_docs.....A list of ranked document IDs for the query.

    Returns:
      Average Precision (float).
    """
    if not relevant_docs:
        return 0.0

    relevant_docs = set(relevant_docs)
    num_relevant = len(relevant_docs)

    score = 0.0
    num_hits = 0

    for i, (doc_id, _) in enumerate(ranked_docs):
        if doc_id in relevant_docs:
            num_hits += 1
            score += num_hits / (i + 1)  # Precision at each relevant doc position

    return score / num_relevant

In [32]:
def calculate_map(queries, relevances, results):
    """
    Calculate Mean Average Precision (MAP) over all queries.

    Params:
      queries.....A dict from query_id to query string.
      relevances..A dict from query_id to a list of relevant document IDs.
      results.....A dict from query_id to the list of ranked (doc_id, score) tuples.

    Returns:
      Mean Average Precision (MAP) (float).
    """
    average_precisions = []

    for query_id in queries:
        relevant_docs = relevances.get(query_id, [])
        ranked_docs = results.get(query_id, [])

        if relevant_docs:
            ap = calculate_average_precision(relevant_docs, ranked_docs)
            average_precisions.append(ap)

    if not average_precisions:
        return 0.0

    return sum(average_precisions) / len(average_precisions)

In [33]:
def rank_documents_bm25_with_evaluation(data_dir, output_file='bm25_results.txt'):
    """ BM25 ranking for TIME dataset and evaluation with Precision and MAP. """
    # Read data
    queries, relevances, documents = read_data(data_dir)

    # Preprocess and compute term frequencies and document lengths
    tfs, doc_lengths = compute_tf_and_lengths(documents)
    avg_doc_length = sum(doc_lengths.values()) / len(documents)

    # Compute IDF for the document collection
    idf = compute_idf(documents)

    results = {}

    with open(output_file, 'w') as f:
        for query_id, query in queries.items():
            print(f"Ranking documents for Query {query_id}...")
            f.write(f"Query {query_id}: {query}\n")
            f.write("Top 5 Ranked Documents:\n")

            # Compute BM25 scores
            scores = bm25_score(query, tfs, idf, doc_lengths, avg_doc_length)
            ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:5]
            results[query_id] = ranked_docs

            for rank, (doc_id, score) in enumerate(ranked_docs, 1):
                result_str = f"Document: {doc_id}, Score: {score:.4f}"
                print(result_str)
                f.write(f"{rank}. {result_str}\n")

            f.write("\n")
            print("\n")

    # Evaluate Precision@5 and MAP
    k = 5
    precision_at_5 = []
    for query_id, ranked_docs in results.items():
        relevant_docs = relevances.get(query_id, [])
        precision = calculate_precision_at_k(relevant_docs, ranked_docs, k)
        precision_at_5.append(precision)
        print(f"Precision@{k} for Query {query_id}: {precision:.4f}")

    mean_precision_at_5 = sum(precision_at_5) / len(precision_at_5)
    print(f"\nMean Precision@{k}: {mean_precision_at_5:.4f}")

    map_score = calculate_map(queries, relevances, results)
    print(f"\nMean Average Precision (MAP): {map_score:.4f}")

    return results


In [34]:
results = rank_documents_bm25_with_evaluation(data_dir)

Looking for queries file at: /content/drive/MyDrive/IR-Project-Samples/data/TIME.QUE
Read 83 queries.
Read 83 relevance judgements.
Read 423 documents.
Ranking documents for Query 1...
Document: 256, Score: 22.7118
Document: 303, Score: 22.4315
Document: 307, Score: 21.8455
Document: 287, Score: 21.5987
Document: 267, Score: 21.3871


Ranking documents for Query 2...
Document: 325, Score: 32.7880
Document: 322, Score: 32.7653
Document: 333, Score: 31.8825
Document: 382, Score: 30.7031
Document: 303, Score: 29.8109


Ranking documents for Query 3...
Document: 11, Score: 24.9057
Document: 348, Score: 24.6200
Document: 307, Score: 22.2570
Document: 35, Score: 20.7951
Document: 210, Score: 20.3547


Ranking documents for Query 4...
Document: 210, Score: 26.7036
Document: 307, Score: 25.9246
Document: 382, Score: 25.3929
Document: 267, Score: 25.1050
Document: 325, Score: 24.9041


Ranking documents for Query 5...
Document: 382, Score: 13.0332
Document: 394, Score: 12.6572
Document: 303, Sc

# Vector Space Model

## Computing TF*IDF vectors

In [35]:
def compute_tf_idf(tfs, idf):
    """
    Compute the TF-IDF vector for each document.

    Params:
      tfs...........Term frequencies for each document.
      idf...........Inverse document frequencies.

    Returns:
      A dict from document ID to a dictionary of {term: tf-idf score}.
    """
    tf_idf = {}
    for doc_id, term_freqs in tfs.items():
        doc_tf_idf = {}
        for term, tf in term_freqs.items():
            doc_tf_idf[term] = tf * idf.get(term, 0)
        tf_idf[doc_id] = doc_tf_idf
    return tf_idf

In [36]:
def compute_query_tf_idf(query, idf):
    """
    Compute the TF-IDF vector for the query.

    Params:
      query.........A query string.
      idf...........Inverse document frequencies.

    Returns:
      A dictionary of {term: tf-idf score} for the query.
    """
    query_terms = preprocess(query)
    term_freqs = Counter(query_terms)
    query_tf_idf = {}

    for term, tf in term_freqs.items():
        query_tf_idf[term] = tf * idf.get(term, 0)

    return query_tf_idf

## Cosine Similarity Score

In [37]:
def cosine_similarity(vec1, vec2):
    """
    Compute the cosine similarity between two TF-IDF vectors.

    Params:
      vec1....A dictionary representing the first vector {term: tf-idf score}.
      vec2....A dictionary representing the second vector {term: tf-idf score}.

    Returns:
      Cosine similarity (float).
    """
    # Dot product
    common_terms = set(vec1.keys()).intersection(set(vec2.keys()))
    dot_product = sum(vec1[term] * vec2[term] for term in common_terms)

    # Magnitudes of the vectors
    magnitude1 = math.sqrt(sum(val ** 2 for val in vec1.values()))
    magnitude2 = math.sqrt(sum(val ** 2 for val in vec2.values()))

    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0  # Avoid division by zero

    return dot_product / (magnitude1 * magnitude2)


In [38]:
def rank_documents_vsm(data_dir):
    """ VSM ranking for TIME dataset. """
    # Read data
    queries, relevances, documents = read_data(data_dir)

    # Preprocess and compute term frequencies and document lengths
    tfs, doc_lengths = compute_tf_and_lengths(documents)

    # Compute IDF for the document collection
    idf = compute_idf(documents)

    # Compute TF-IDF vectors for documents
    doc_tf_idf = compute_tf_idf(tfs, idf)

    results = {}

    for query_id, query in queries.items():
        print(f"Ranking documents for Query {query_id} using VSM...")

        # Compute TF-IDF vector for the query
        query_tf_idf = compute_query_tf_idf(query, idf)

        # Compute cosine similarity between the query and all documents
        scores = {}
        for doc_id, doc_vector in doc_tf_idf.items():
            score = cosine_similarity(query_tf_idf, doc_vector)
            scores[doc_id] = score

        # Rank documents based on cosine similarity
        ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)

        results[query_id] = ranked_docs

        # Display top 5 results for each query
        print(f"Query {query_id} results (VSM):")
        for doc, score in ranked_docs[:5]:
            print(f"Document: {doc}, Score: {score}")
        print("\n")

    return results


In [39]:
def compare_bm25_vsm(data_dir):
    # Run BM25 Ranking
    queries, relevances, documents = read_data(data_dir)
    print("Running BM25 Ranking...")
    bm25_results = rank_documents_bm25(data_dir)

    # Run VSM Ranking
    print("\nRunning VSM Ranking...")
    vsm_results = rank_documents_vsm(data_dir)

    # Optionally, evaluate both using Precision@5 or MAP
    # Precision@5 for BM25
    k = 5
    precision_bm25 = []
    for query_id, ranked_docs in bm25_results.items():
        relevant_docs = relevances.get(query_id, [])
        precision = calculate_precision_at_k(relevant_docs, ranked_docs, k)
        precision_bm25.append(precision)

    mean_precision_bm25 = sum(precision_bm25) / len(precision_bm25)
    print(f"\nMean Precision@{k} (BM25): {mean_precision_bm25:.4f}")

    # Precision@5 for VSM
    precision_vsm = []
    for query_id, ranked_docs in vsm_results.items():
        relevant_docs = relevances.get(query_id, [])
        precision = calculate_precision_at_k(relevant_docs, ranked_docs, k)
        precision_vsm.append(precision)

    mean_precision_vsm = sum(precision_vsm) / len(precision_vsm)
    print(f"\nMean Precision@{k} (VSM): {mean_precision_vsm:.4f}")

    # Calculate MAP for BM25
    map_bm25 = calculate_map(queries, relevances, bm25_results)
    print(f"\nMean Average Precision (MAP) (BM25): {map_bm25:.4f}")

    # Calculate MAP for VSM
    map_vsm = calculate_map(queries, relevances, vsm_results)
    print(f"\nMean Average Precision (MAP) (VSM): {map_vsm:.4f}")


In [40]:
results_compare = compare_bm25_vsm(data_dir)

Looking for queries file at: /content/drive/MyDrive/IR-Project-Samples/data/TIME.QUE
Read 83 queries.
Read 83 relevance judgements.
Read 423 documents.
Running BM25 Ranking...
Looking for queries file at: /content/drive/MyDrive/IR-Project-Samples/data/TIME.QUE
Read 83 queries.
Read 83 relevance judgements.
Read 423 documents.
Ranking documents for Query 1...
Query 1 results:
Document: 256, Score: 22.71176201731724
Document: 303, Score: 22.4315088278261
Document: 307, Score: 21.84549783684643
Document: 287, Score: 21.598730015953684
Document: 267, Score: 21.387080481128336


Ranking documents for Query 2...
Query 2 results:
Document: 325, Score: 32.78804008023718
Document: 322, Score: 32.76526919669497
Document: 333, Score: 31.88247587464462
Document: 382, Score: 30.7031265936503
Document: 303, Score: 29.810892549071077


Ranking documents for Query 3...
Query 3 results:
Document: 11, Score: 24.905664671622116
Document: 348, Score: 24.61997324042201
Document: 307, Score: 22.256976087197

# Language Model with Laplace Smoothing

## Unigram Model

In [41]:
def compute_unigram_model(tfs, doc_lengths, vocab_size):
    """
    Compute unigram language models for each document with Laplace smoothing.

    Params:
      tfs............Term frequencies for each document.
      doc_lengths....Lengths of each document.
      vocab_size.....The size of the vocabulary.

    Returns:
      A dict from document ID to a dict of {term: smoothed probability}.
    """
    unigram_models = {}

    for doc_id, term_freqs in tfs.items():
        doc_length = doc_lengths[doc_id]
        model = {}

        for term, freq in term_freqs.items():
            model[term] = (freq + 1) / (doc_length + vocab_size)  # Laplace smoothing

        # Add probabilities for terms that don't appear in the document
        model['<UNK>'] = 1 / (doc_length + vocab_size)

        unigram_models[doc_id] = model

    return unigram_models

In [42]:
def score_documents_unigram(query, unigram_models, doc_lengths, vocab_size):
    """
    Compute the likelihood of a query given each document's unigram language model.

    Params:
      query..........The query string.
      unigram_models.The unigram language model for each document.
      doc_lengths....Lengths of each document.
      vocab_size.....The size of the vocabulary.

    Returns:
      A dict from document ID to the score (log-likelihood).
    """
    query_terms = preprocess(query)  # Tokenize and preprocess query
    scores = {}

    for doc_id, model in unigram_models.items():
        log_likelihood = 0

        for term in query_terms:
            # Use smoothed probability or the UNK (unknown) term probability
            prob = model.get(term, model['<UNK>'])
            log_likelihood += math.log(prob)  # Sum of log-probabilities

        scores[doc_id] = log_likelihood

    return scores

In [43]:
def rank_documents_unigram(data_dir):
    """ Unigram Language Model ranking for TIME dataset. """
    # Read data
    queries, relevances, documents = read_data(data_dir)

    # Preprocess and compute term frequencies and document lengths
    tfs, doc_lengths = compute_tf_and_lengths(documents)

    # Build the vocabulary
    all_terms = set()
    for doc_id, term_freqs in tfs.items():
        all_terms.update(term_freqs.keys())

    vocab_size = len(all_terms)

    # Compute Unigram language models for documents
    unigram_models = compute_unigram_model(tfs, doc_lengths, vocab_size)

    results = {}

    # Score and rank each query
    for query_id, query in queries.items():
        print(f"Ranking documents for Query {query_id} using Unigram Model...")

        # Compute scores using Unigram Model
        scores = score_documents_unigram(query, unigram_models, doc_lengths, vocab_size)

        # Rank documents based on the scores
        ranked_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)

        results[query_id] = ranked_docs

        # Display top 5 results for each query
        print(f"Query {query_id} results (Unigram Model):")
        for doc, score in ranked_docs[:5]:
            print(f"Document: {doc}, Score: {score}")
        print("\n")

    return results

In [44]:
def compare_bm25_vsm_unigram(data_dir):
    queries, relevances, documents = read_data(data_dir)
    # Run BM25 Ranking
    print("Running BM25 Ranking...")
    bm25_results = rank_documents_bm25(data_dir)

    # Run VSM Ranking
    print("\nRunning VSM Ranking...")
    vsm_results = rank_documents_vsm(data_dir)

    # Run Unigram Ranking
    print("\nRunning Unigram Model Ranking...")
    unigram_results = rank_documents_unigram(data_dir)

    # Optionally, evaluate all three using Precision@5 or MAP
    k = 5

    # Precision@5 for BM25
    precision_bm25 = []
    for query_id, ranked_docs in bm25_results.items():
        relevant_docs = relevances.get(query_id, [])
        precision = calculate_precision_at_k(relevant_docs, ranked_docs, k)
        precision_bm25.append(precision)

    mean_precision_bm25 = sum(precision_bm25) / len(precision_bm25)
    print(f"\nMean Precision@{k} (BM25): {mean_precision_bm25:.4f}")

    # Precision@5 for VSM
    precision_vsm = []
    for query_id, ranked_docs in vsm_results.items():
        relevant_docs = relevances.get(query_id, [])
        precision = calculate_precision_at_k(relevant_docs, ranked_docs, k)
        precision_vsm.append(precision)

    mean_precision_vsm = sum(precision_vsm) / len(precision_vsm)
    print(f"\nMean Precision@{k} (VSM): {mean_precision_vsm:.4f}")

    # Precision@5 for Unigram
    precision_unigram = []
    for query_id, ranked_docs in unigram_results.items():
        relevant_docs = relevances.get(query_id, [])
        precision = calculate_precision_at_k(relevant_docs, ranked_docs, k)
        precision_unigram.append(precision)

    mean_precision_unigram = sum(precision_unigram) / len(precision_unigram)
    print(f"\nMean Precision@{k} (Unigram): {mean_precision_unigram:.4f}")

    # Calculate MAP for BM25
    map_bm25 = calculate_map(queries, relevances, bm25_results)
    print(f"\nMean Average Precision (MAP) (BM25): {map_bm25:.4f}")

    # Calculate MAP for VSM
    map_vsm = calculate_map(queries, relevances, vsm_results)
    print(f"\nMean Average Precision (MAP) (VSM): {map_vsm:.4f}")

    # Calculate MAP for Unigram
    map_unigram = calculate_map(queries, relevances, unigram_results)
    print(f"\nMean Average Precision (MAP) (Unigram): {map_unigram:.4f}")

In [45]:
results_compare_uni = compare_bm25_vsm_unigram(data_dir)

Looking for queries file at: /content/drive/MyDrive/IR-Project-Samples/data/TIME.QUE
Read 83 queries.
Read 83 relevance judgements.
Read 423 documents.
Running BM25 Ranking...
Looking for queries file at: /content/drive/MyDrive/IR-Project-Samples/data/TIME.QUE
Read 83 queries.
Read 83 relevance judgements.
Read 423 documents.
Ranking documents for Query 1...
Query 1 results:
Document: 256, Score: 22.71176201731724
Document: 303, Score: 22.4315088278261
Document: 307, Score: 21.84549783684643
Document: 287, Score: 21.598730015953684
Document: 267, Score: 21.387080481128336


Ranking documents for Query 2...
Query 2 results:
Document: 325, Score: 32.78804008023718
Document: 322, Score: 32.76526919669497
Document: 333, Score: 31.88247587464462
Document: 382, Score: 30.7031265936503
Document: 303, Score: 29.810892549071077


Ranking documents for Query 3...
Query 3 results:
Document: 11, Score: 24.905664671622116
Document: 348, Score: 24.61997324042201
Document: 307, Score: 22.256976087197