In [9]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict
import os

DATASET_PATH = r"C:\Users\Acer\Desktop\Week 6 IR\bbc-news-data.csv"
QUERY_FILE_PATH = r"C:\Users\Acer\Desktop\Week 6 IR\Query.txt.txt"
TEXT_COLUMN = "content"  

# Preprocessing
def tokenize(text):
    """Convert text to lowercase and split into words."""
    return re.findall(r'\b\w+\b', str(text).lower())

# Load documents
def load_documents(csv_path, text_col):
    """Load documents safely, skipping bad lines. Use tab separator."""
    df = pd.read_csv(csv_path, sep='\t', encoding="cp1252", on_bad_lines='skip')
    
    if text_col not in df.columns:
        raise ValueError(f"Column '{text_col}' not found. Available columns: {list(df.columns)}")
    
    df = df[[text_col]].dropna()
    documents = {i: row[text_col] for i, row in df.iterrows()}
    return documents

# Build inverted index
def build_index(docs):
    index = defaultdict(list)
    doc_lengths = {}
    total_terms = 0

    for doc_id, text in docs.items():
        words = tokenize(text)
        doc_lengths[doc_id] = len(words)
        total_terms += len(words)

        freq = defaultdict(int)
        for word in words:
            freq[word] += 1
        for word, count in freq.items():
            index[word].append((doc_id, count))

    avg_doc_len = total_terms / len(docs)
    return index, doc_lengths, avg_doc_len

# BM25 scoring
def compute_BM25(query, index, doc_lengths, avg_dl, k1=1.5, b=0.75):
    scores = defaultdict(float)
    N = len(doc_lengths)
    query_words = tokenize(query)

    for word in query_words:
        if word not in index:
            continue
        df = len(index[word])
        idf = np.log((N - df + 0.5) / (df + 0.5) + 1)
        for doc_id, freq in index[word]:
            denom = freq + k1 * (1 - b + b * (doc_lengths[doc_id] / avg_dl))
            score = idf * ((freq * (k1 + 1)) / denom)
            scores[doc_id] += score
    return scores

# Display results 
def show_results_table(query, scores, docs, top_n=6):
    sorted_docs = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    
    data = []
    for rank, (doc_id, score) in enumerate(sorted_docs, start=1):
        preview = docs[doc_id][:120].replace("\n", " ")
        if len(docs[doc_id]) > 120:
            preview += "..."
        data.append([rank, doc_id, round(score, 4), preview])
    
    df = pd.DataFrame(data, columns=["Rank", "Doc ID", "Score", "Content Preview"])
    print(f"\nQuery: {query}\n")
    print(df)

# Main program
if __name__ == "__main__":
    # Load documents
    documents = load_documents(DATASET_PATH, TEXT_COLUMN)
    index, doc_lengths, avg_dl = build_index(documents)

    # Load queries from file and process
    if os.path.exists(QUERY_FILE_PATH):
        with open(QUERY_FILE_PATH, 'r', encoding="cp1252") as f:
            queries = [line.strip() for line in f if line.strip()]
        
        for q in queries:
            scores = compute_BM25(q, index, doc_lengths, avg_dl)
            show_results_table(q, scores, documents)
    else:
        print("Query file not found.")



Query: politics UK government election

   Rank  Doc ID    Score                                                                                      Content Preview
0     1     972  13.3285   Chancellor Gordon Brown will deliver his Budget to the House of Commons on 16 March, the Treasu...
1     2    1249  12.9129   Chancellor Gordon Brown will deliver his Budget to the House of Commons on 16 March, the Treasu...
2     3    1218  12.8240   Voters' "pent up passion" could confound predictions of a low turnout in the coming general ele...
3     4    1221  12.6064   So who, if anyone, is playing politics with the security of the nation?  Michael Howard has no ...
4     5    1228  12.5939   Ex-chat show host Robert Kilroy-Silk is to contest the Derbyshire seat of Erewash at the next g...
5     6    1112  12.0865   Record numbers of schools across the UK are to take part in a mock general election backed by t...

Query: business stock market growth

   Rank  Doc ID    Score               

In [11]:

# EVALUATION METRICS 
def precision_at_k(retrieved, relevant, k):
    return sum(1 for d in retrieved[:k] if d in relevant) / k

def recall_at_k(retrieved, relevant, k):
    if len(relevant) == 0:
        return 0
    return sum(1 for d in retrieved[:k] if d in relevant) / len(relevant)

def average_precision(retrieved, relevant, k=10):
    score = 0.0
    hits = 0
    if len(relevant) == 0:
        return 0

    for i, d in enumerate(retrieved[:k], start=1):
        if d in relevant:
            hits += 1
            score += hits / i

    return score / len(relevant)

def dcg_at_k(retrieved, relevant, k):
    dcg = 0.0
    for i, d in enumerate(retrieved[:k], start=1):
        rel = 1 if d in relevant else 0
        dcg += rel / math.log2(i + 1)
    return dcg

def ndcg_at_k(retrieved, relevant, k):
    ideal_rel = min(len(relevant), k)
    idcg = sum(1 / math.log2(i + 1) for i in range(1, ideal_rel + 1))
    if idcg == 0:
        return 0
    return dcg_at_k(retrieved, relevant, k) / idcg

#  RELEVANCE HEURISTIC 
def get_relevant_docs(query, documents):
    query_tokens = tokenize(query)
    relevant = set()

    for doc_id, text in documents.items():
        text_tokens = tokenize(text)
        match_ratio = sum(token in text_tokens for token in query_tokens) / len(query_tokens)
        if match_ratio >= 0.5:  
            relevant.add(doc_id)

    return relevant

#  MAIN PROGRAM 
if __name__ == "__main__":
    documents = load_documents(DATASET_PATH, TEXT_COLUMN)
    index, doc_lengths, avg_dl = build_index(documents)

    K = 10
    results = []

    if os.path.exists(QUERY_FILE_PATH):
        with open(QUERY_FILE_PATH, 'r', encoding="cp1252") as f:
            queries = [line.strip() for line in f if line.strip()]

        for q in queries:
            scores = compute_BM25(q, index, doc_lengths, avg_dl)
            retrieved = [doc_id for doc_id, _ in sorted(scores.items(), key=lambda x: x[1], reverse=True)]
            relevant = get_relevant_docs(q, documents)

            results.append({
                "query": q,
                "precision@10": round(precision_at_k(retrieved, relevant, K), 4),
                "recall@10": round(recall_at_k(retrieved, relevant, K), 4),
                "AP": round(average_precision(retrieved, relevant, K), 4),
                "nDCG@10": round(ndcg_at_k(retrieved, relevant, K), 4)
            })

        results_df = pd.DataFrame(results)
        pd.set_option('display.max_colwidth', 100)
        pd.set_option('display.width', 200)

        print("\nEvaluation Results\n")
        print(results_df.to_string(index=False))

        print("\nSummary Metrics\n")
        print(f"MAP:          {results_df['AP'].mean():.4f}")
        print(f"Mean nDCG@10: {results_df['nDCG@10'].mean():.4f}")

    else:
        print("Query file not found.")



Evaluation Results

                                 query  precision@10  recall@10     AP  nDCG@10
       politics UK government election           1.0     0.0361 0.0361   1.0000
          business stock market growth           1.0     0.0418 0.0418   1.0000
             sports football world cup           1.0     0.0781 0.0781   1.0000
    entertainment movie release review           1.0     0.2381 0.2381   1.0000
            tech new smartphone launch           0.9     0.0928 0.0893   0.9266
 climate change global warming effects           0.6     1.0000 0.9762   0.9931
health coronavirus vaccine development           0.9     0.5294 0.4159   0.7799
        international diplomacy summit           0.3     1.0000 0.6806   0.8520
          economy unemployment rate UK           1.0     0.0704 0.0704   1.0000

Summary Metrics

MAP:          0.2918
Mean nDCG@10: 0.9502
