## Import Libraries and Download NLTK Data
Import required libraries and download NLTK data for tokenization and stopwords.

In [1]:
import math
import os
from collections import defaultdict
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from joblib import Parallel, delayed
from gensim import corpora, models
import pandas as pd
from itertools import product
from nltk.corpus import wordnet

# Download required NLTK data
nltk_data_path = os.path.expanduser('~/nltk_data')
try:
    nltk.download('punkt', download_dir=nltk_data_path, quiet=True)
    nltk.download('stopwords', download_dir=nltk_data_path, quiet=True)
except Exception as e:
    print(f"Error downloading NLTK data: {e}")

stop_words = set(stopwords.words('english'))


## Define Helper Functions
Define functions for tokenization, file reading, and custom TF, IDF, and normalization methods.

In [2]:
# Tokenization function
def tokenize(text):
    lemmatizer = WordNetLemmatizer()
    if not isinstance(text, str):
        text = str(text)
    try:
        tokens = word_tokenize(text.lower())
        return [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
    except Exception as e:
        print(f"Tokenization failed for text: {text[:100]}\\nError: {e}")
        return []


# File reading function for CSV files
def read_csv_files(doc_file, qrel_file, query_file):
    try:
        doc_df = pd.read_csv(doc_file)
        if not {'DocID', 'Text'}.issubset(doc_df.columns):
            raise ValueError("The document file contains the columns 'DocID' and 'Text'.")
        docs = {int(doc_id): text for doc_id, text in zip(doc_df['DocID'], doc_df['Text'])}
    except Exception as e:
        print(f"Error reading the documentes file: {e}")
        docs = {}
    try:
        qrel_df = pd.read_csv(qrel_file)
        if not {'QueryID', 'DocID'}.issubset(qrel_df.columns):
            raise ValueError("The rels file must contain the columns 'QueryID', 'DocID'.")
        qrels = defaultdict(set)
        for _, row in qrel_df.iterrows():
            query_id = int(row['QueryID'])
            doc_id = int(row['DocID'])
            if query_id > 0 and doc_id > 0:
                qrels[query_id].add(doc_id)
    except Exception as e:
        print(f"Error reading rels file: {e}")
        qrels = defaultdict(set)
    try:
        query_df = pd.read_csv(query_file)
        if not {'QueryID', 'Text'}.issubset(query_df.columns):
            raise ValueError("The query file must contain the 'QueryID' and 'Text' columns.")
        queries = {int(query_id): text for query_id, text in zip(query_df['QueryID'], query_df['Text'])}
    except Exception as e:
        print(f"Error reading query file: {e}")
        queries = {}
    return docs, qrels, queries

# Load collection from CSV files
def load_collection(base_path, dataset_name):
    doc_file = f"{base_path}/{dataset_name}_docs.csv"
    qrel_file = f"{base_path}/{dataset_name}_qrels.csv"
    query_file = f"{base_path}/{dataset_name}_queries.csv"
    docs, qrels, queries = read_csv_files(doc_file, qrel_file, query_file)
    if not docs or not queries or not qrels:
        print(f"Warning: Incomplete data for {dataset_name}")
    return docs, queries, qrels

# Custom TF methods
def compute_natural_tf(tokens, dictionary):
    bow = dictionary.doc2bow(tokens)
    return [(term_id, count) for term_id, count in bow]

def compute_augmented_tf(tokens, dictionary):
    bow = dictionary.doc2bow(tokens)
    max_tf = max([count for _, count in bow], default=1)
    return [(term_id, 0.5 + 0.5 * (count / max_tf)) for term_id, count in bow]

def compute_boolean_tf(tokens, dictionary):
    bow = dictionary.doc2bow(tokens)
    return [(term_id, 1) for term_id, _ in bow]

def compute_log_avg_tf(tokens, dictionary):
    bow = dictionary.doc2bow(tokens)
    avg_tf = np.mean([count for _, count in bow]) if bow else 1
    return [(term_id, 1 + math.log(count / avg_tf) if count > 0 else 0) for term_id, count in bow]

def compute_logarithmic_tf(tokens, dictionary):
    bow = dictionary.doc2bow(tokens)
    return [(term_id, 1 + math.log(count) if count > 0 else 0) for term_id, count in bow]

# Custom IDF methods
def compute_standard_idf(df, N):
    return {term: math.log(N / (df.get(term, 1))) for term in df}

def compute_probabilistic_idf(df, N):
    return {term: math.log((N - df.get(term, 0)) / (df.get(term, 1))) for term in df}

def compute_none_idf(df, N):
    return {term: 1 for term in df}

# Custom normalization methods
def apply_normalization(vector, norm_method, doc_length=None):
    if not vector:
        return vector
    if norm_method == 'cosine':
        norm = np.sqrt(sum(weight ** 2 for _, weight in vector))
        return [(term_id, weight / norm if norm != 0 else weight) for term_id, weight in vector]
    elif norm_method == 'pivoted':
        pivot = 0.3
        norm = sum(weight for _, weight in vector)
        return [(term_id, weight * (1 - pivot + pivot * doc_length / norm) if norm != 0 else weight) 
                for term_id, weight in vector]
    elif norm_method == 'byte':
        norm = doc_length if doc_length and doc_length > 0 else 1
        return [(term_id, weight / norm) for term_id, weight in vector]
    return vector

## Compute TF-IDF Vectors
Compute TF-IDF vectors for documents and queries using custom TF, IDF, and normalization methods.

In [3]:
def compute_tfidf_gensim(docs, queries, tf_method_name, idf_method_name, norm_method):
    doc_tokens = [tokenize(doc) for doc in docs.values()]
    query_tokens = {qid: tokenize(query) for qid, query in queries.items()}
    doc_lengths = [len(tokens) for tokens in doc_tokens]
    
    dictionary = corpora.Dictionary(doc_tokens)
    
    df = defaultdict(int)
    for tokens in doc_tokens:
        for term in set(tokens):
            df[term] += 1
    
    tf_methods = {
        'natural': compute_natural_tf,
        'augmented': compute_augmented_tf,
        'boolean': compute_boolean_tf,
        'log_avg': compute_log_avg_tf,
        'logarithmic': compute_logarithmic_tf
    }
    tf_method = tf_methods[tf_method_name]
    
    corpus = [tf_method(tokens, dictionary) for tokens in doc_tokens]
    
    idf_methods = {
        'standard': compute_standard_idf,
        'probabilistic': compute_probabilistic_idf,
        'none': compute_none_idf
    }
    idf_method = idf_methods[idf_method_name]
    N = len(docs)
    idf = idf_method(df, N)
    
    doc_vectors = []
    for i, bow in enumerate(corpus):
        tfidf_vec = [(term_id, weight * idf.get(dictionary[term_id], 0)) for term_id, weight in bow]
        tfidf_vec = apply_normalization(tfidf_vec, norm_method, doc_lengths[i])
        doc_vectors.append((list(docs.keys())[i], tfidf_vec))
    
    query_vectors = {}
    for qid, tokens in query_tokens.items():
        q_bow = tf_method(tokens, dictionary)
        q_tfidf = [(term_id, weight * idf.get(dictionary[term_id], 0)) for term_id, weight in q_bow]
        q_tfidf = apply_normalization(q_tfidf, norm_method, len(tokens))
        query_vectors[qid] = q_tfidf
    
    return doc_vectors, query_vectors

## Evaluate Metrics
Evaluate the system using Precision@10, Recall, and MAP for each query, using dot product for similarity to preserve normalization effects.

In [4]:
def evaluate_metrics_gensim(doc_vectors, query_vector, qrels, k=10):
    k = min(k, len(doc_vectors))
    similarities = []
    for doc_id, doc_vec in doc_vectors:
        doc_vec_dict = {term_id: weight for term_id, weight in doc_vec}
        query_vec_dict = {term_id: weight for term_id, weight in query_vector}
        dot_product = sum(doc_vec_dict.get(term_id, 0) * query_vec_dict.get(term_id, 0) 
                         for term_id in set(doc_vec_dict) & set(query_vec_dict))
        similarity = dot_product  # Use dot product to preserve normalization effects
        similarities.append((doc_id, similarity))
    
    # Sort documents by similarity (descending)
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_k_ids = [doc_id for doc_id, _ in similarities[:k]]
    
    # Compute Precision@10 and Recall
    top_k_relevant = np.array([1 if did in qrels else 0 for did in top_k_ids])
    precision_at_k = np.mean(top_k_relevant) if k else 0.0
    recall = np.sum(top_k_relevant) / len(qrels) if len(qrels) else 0.0
    
    # Compute Average Precision (AP)
    ap = 0.0
    if len(qrels) > 0:
        relevant_count = 0
        precision_sum = 0.0
        for rank, doc_id in enumerate(top_k_ids, 1):
            if doc_id in qrels:
                relevant_count += 1
                precision = relevant_count / rank
                precision_sum += precision
        ap = precision_sum / len(qrels) if relevant_count > 0 else 0.0
    
    return {'Precision@10': precision_at_k, 'Recall': recall, 'MAP': ap}

def evaluate_collection_gensim(docs, queries, qrels, tf_method, idf_method, norm_method):
    doc_vectors, query_vectors = compute_tfidf_gensim(docs, queries, tf_method, idf_method, norm_method)
    results = Parallel(n_jobs=-1)(delayed(evaluate_metrics_gensim)(
        doc_vectors, query_vectors[qid], qrels[qid], k=10) for qid in queries)
    return {
        'Precision@10': np.mean([r['Precision@10'] for r in results]),
        'Recall': np.mean([r['Recall'] for r in results]),
        'MAP': np.mean([r['MAP'] for r in results])
    }

## Main Execution
Evaluate all combinations of TF, IDF, and normalization methods on the datasets and save results to a CSV file.

In [None]:
tf_methods = {
    'natural': compute_natural_tf,
    'augmented': compute_augmented_tf,
    'boolean': compute_boolean_tf,
    'log_avg': compute_log_avg_tf,
    'logarithmic': compute_logarithmic_tf
}
idf_methods = {
    'standard': compute_standard_idf,
    'probabilistic': compute_probabilistic_idf,
    'none': compute_none_idf
}
norm_methods = [None, 'cosine', 'pivoted', 'byte']
datasets = ['MED','CISI', 'cran', 'npl']
base_path = '../data/processed'

all_results = []
for dataset in datasets:
    print(f"Processing {dataset}...")
    docs, queries, qrels = load_collection(base_path, dataset)
    if not docs or not queries or not qrels:
        continue
    combinations = list(product(tf_methods.keys(), idf_methods.keys(), norm_methods))
    results = Parallel(n_jobs=-1)(delayed(evaluate_collection_gensim)(
        docs, queries, qrels, tf_m, idf_m, norm_m) for tf_m, idf_m, norm_m in combinations)
    all_results.extend(zip([dataset]*len(results), 
                          [tf_m for tf_m, _, _ in combinations], 
                          [idf_m for _, idf_m, _ in combinations], 
                          [norm_m if norm_m else 'none' for _, _, norm_m in combinations], 
                          results))

results_df = pd.DataFrame([(d, tf, idf, n, r['Precision@10'], r['Recall'], r['MAP']) 
                          for d, tf, idf, n, r in all_results],
                          columns=['Dataset', 'TF', 'IDF', 'Norm', 'Precision@10', 'Recall', 'MAP'])
results_df.to_csv(f'../tfidf_results_gensim.csv', index=False)
print(f"Results saved to ../tfidf_results_gensim.csv")

Processing MED...
Processing CISI...
Processing cran...
Processing npl...
Results saved to ./collections/tfidf_results_gensim.csv


## Display Best Configurations
Display the best TF, IDF, and normalization combination for each dataset and the overall best combination across all datasets based on MAP.

In [6]:
import pandas as pd

# Part 1: Best configuration for each dataset
print("\nBest Configurations for Each Dataset:")
best_configs = []
for dataset in datasets:
    subset = results_df[results_df['Dataset'] == dataset]
    if not subset.empty:
        best = subset.loc[subset['MAP'].idxmax()]
        best_configs.append({
            'Dataset': dataset,
            'TF': best['TF'],
            'IDF': best['IDF'],
            'Norm': best['Norm'],
            'MAP': best['MAP'],
            'Precision@10': best['Precision@10'],
            'Recall': best['Recall']
        })
        
# Display as a table
best_configs_df = pd.DataFrame(best_configs)
print("\nTable of Best Configurations for Each Dataset:")
display(best_configs_df)

best_config_all = []
# Part 2: Overall best configuration across all datasets
if not results_df.empty:
    # Calculate mean MAP for each combination of TF, IDF, and Norm across all datasets
    overall_best = results_df.groupby(['TF', 'IDF', 'Norm'])['MAP'].mean().reset_index()
    best_comb = overall_best.loc[overall_best['MAP'].idxmax()]
    best_config_all.append({
            'TF': best_comb['TF'],
            'IDF': best_comb['IDF'],
            'Norm': best_comb['Norm'],
            'MAP': best_comb['MAP'],})
    best_config_all_df = pd.DataFrame(best_config_all)
    print(f"\nBest Configuration for all Dataset:")
    display(best_config_all_df)


Best Configurations for Each Dataset:

Table of Best Configurations for Each Dataset:


Unnamed: 0,Dataset,TF,IDF,Norm,MAP,Precision@10,Recall
0,MED,logarithmic,standard,cosine,0.264143,0.65,0.319586
1,CISI,logarithmic,standard,cosine,0.059436,0.20625,0.081888
2,cran,boolean,standard,pivoted,0.002599,0.008,0.006137
3,npl,augmented,standard,pivoted,0.132711,0.317204,0.192412



Best Configuration for all Dataset:


Unnamed: 0,TF,IDF,Norm,MAP
0,logarithmic,probabilistic,pivoted,0.108332
