<H3>PRI 2023/24: first project delivery</H3>

**GROUP 1**
- Amanda Tofthagen, 113124
- Tora Kristine Løtveit, 112927
- Tuva Grønvold Natvig, 113107

<H3>Part I: demo of facilities</H3>

In [83]:
import json
import os
import time
import xml.etree.ElementTree as ET
from collections import defaultdict
from string import punctuation
import nltk  
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from rank_bm25 import BM25Okapi
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ndcg_score, average_precision_score
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/amandatofthagen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/amandatofthagen/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amandatofthagen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/amandatofthagen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Preprocessing function and loading data:

In [84]:
def preprocess_text(text):
    # Convert text to lower case and tokenize
    tokens = word_tokenize(text.lower())
    # Remove punctuation
    tokens = [token for token in tokens if token not in punctuation]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

# Load metadata as both dataframe and list
def load_metadata(file_path):
    df = pd.read_csv(file_path, low_memory=False)
    df = df[['cord_uid', 'title', 'abstract']].dropna()  # Keep only required columns
    df['title'] = df['title'].astype(str)
    df['abstract'] = df['abstract'].astype(str)

    # Store as a list of "title + abstract" for ranking models
    doc_list = df.apply(lambda x: f"{x['title']} {x['abstract']}", axis=1).tolist()
    
    return df, doc_list  # Return both formats


# Load qrels
def load_qrels(file_path):
    qrels = defaultdict(dict)
    with open(file_path, 'r') as f:
        for line in f:
            topic_id, _, doc_id, relevance = line.strip().split()
            qrels[topic_id][doc_id] = int(relevance)
    return qrels

def load_queries(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    queries = {}
    for topic in root.findall('topic'):
        topic_number = topic.get('number')
        query_text = preprocess_text(topic.find('query').text)
        queries[topic_number] = " ".join(query_text)  # Ensure consistency
    return queries

metadata_path = "data/metadata.csv"
qrels_path = "data/qrels.txt"
queries_path = "data/topics.xml"

D, D_list = load_metadata(metadata_path)
qrels = load_qrels(qrels_path)
queries = load_queries(queries_path)

A) **Indexing** (preprocessing and indexing options)

In [85]:
def indexing(D):
    start_time = time.time()  # Start timing
    
    # Initialize the inverted index
    inverted_index = defaultdict(dict)

    # Process each document
    for index, row in D.iterrows():
        # Combine title and abstract for indexing
        document_text = f"{row['title']} {row['abstract']}"
        # Preprocess text
        tokens = preprocess_text(document_text)

        # Build the index
        for term in tokens:
            if index in inverted_index[term]:
                inverted_index[term][index] += 1
            else:
                inverted_index[term][index] = 1

    # Calculate time and space used
    indexing_time = time.time() - start_time
    index_size = sum(sum(freq.values()) for freq in inverted_index.values())  # Calculate the size of the index

    # Return the inverted index, time taken, and estimated size of the index
    return inverted_index, indexing_time, index_size

def save_index_to_json(inverted_index, file_name='inverted_index.json'):
    with open(file_name, 'w') as f:
        json.dump(inverted_index, f, indent=4)

def save_index_to_json(inverted_index, directory='output_data', file_name='inverted_index.json'):
    # Check if the directory exists, if not create it
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Full path to save the file
    path_to_file = os.path.join(directory, file_name)

    # Save the JSON file
    with open(path_to_file, 'w') as f:
        json.dump(inverted_index, f, indent=4)

# Run it:
inverted_index, time_taken, size = indexing(D)
save_index_to_json(inverted_index)

Boolean query:

In [86]:
def boolean_ir_model(q, k, I, queries, D):
    """
    Performs Boolean Information Retrieval (IR) to rank documents.

    Parameters:
        q (str): The query/topic identifier.
        k (int): The number of top documents to return.
        I (dict): The inverted index.
        queries (dict): Dictionary of preprocessed queries.
        D (pd.DataFrame): Dataframe containing document information.

    Returns:
        list: Ordered list of document identifiers.
    """
    query_terms = queries[q].split()

    # Collect sets of document IDs for each term in the query
    doc_sets = [set(I[term].keys()) for term in query_terms if term in I]

    # If no terms from the query are in the index, return an empty list
    if not doc_sets:
        return []

    # Boolean AND: Intersection of document sets containing all query terms
    relevant_docs = set.intersection(*doc_sets)

    # Rank documents based on term frequencies
    doc_scores = {}
    for doc_id in relevant_docs:
        score = sum(I[term].get(doc_id, 0) for term in query_terms)
        cord_uid = D.iloc[int(doc_id)]['cord_uid'] if int(doc_id) < len(D) else None
        doc_scores[cord_uid] = score

    # Sort documents by score in descending order
    sorted_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)

    # Return top k documents
    top_docs = [doc_id for doc_id, _ in sorted_docs[:k]]
    return top_docs

B) **Summarization**

*B.1 Summarization solution: results for a given document*

In [87]:
def summarize_document(text, num_sentences=10):
    if not isinstance(text, str) or text.strip() == "":
        return "No valid text to summarize."
    
    sentences = sent_tokenize(text)
    if not sentences:
        return "No content available for summarization."
    
    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform(sentences)
    except ValueError:
        return "Not enough content for summarization."
    
    scores = np.sum(tfidf_matrix.toarray(), axis=1)
    ranked_sentences = [sentences[i] for i in np.argsort(scores)[-num_sentences:][::-1]]
    
    return ' '.join(ranked_sentences)


*B.2 IR models (TF-IDF, BM25 and EBRT)*

In [88]:
def compute_tfidf(D):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(D)
    return tfidf_matrix, vectorizer

def compute_bm25(D):
    tokenized_corpus = [preprocess_text(doc) for doc in D]
    bm25 = BM25Okapi(tokenized_corpus)
    return bm25

def ranking(q, p, I, method='tfidf', precomputed_bm25=None, precomputed_tfidf=None):
    """
    Rank documents using TF-IDF, BM25, or BERT.
    
    :param q: Query string
    :param p: Number of top documents to return
    :param I: Inverted index (only used for TF-IDF)
    :param method: Retrieval method ('tfidf', 'bm25', 'bert')
    :param precomputed_bm25: Precomputed BM25 model (optional)
    :param precomputed_tfidf: Tuple (TF-IDF matrix, vectorizer) if precomputed
    :return: List of (document_id, score) pairs, sorted by relevance
    """
    query_terms = preprocess_text(q)

    if method == 'tfidf':
        if precomputed_tfidf is None:
            tfidf_matrix, vectorizer = compute_tfidf(D_list)
        else:
            tfidf_matrix, vectorizer = precomputed_tfidf
        query_vector = vectorizer.transform([" ".join(query_terms)])
        scores = np.dot(tfidf_matrix, query_vector.T).toarray().flatten()
        doc_scores = {doc_id: score for doc_id, score in zip(D['cord_uid'], scores)}

    elif method == 'bm25':
        if precomputed_bm25 is None:
            bm25 = compute_bm25(D_list)
        else:
            bm25 = precomputed_bm25
        scores = bm25.get_scores(query_terms)  
        doc_scores = {doc_id: score for doc_id, score in zip(D['cord_uid'], scores)}

    else:
        raise ValueError("Invalid ranking method: Choose 'tfidf', 'bm25', or 'bert'")

    ranked_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:p]
    return ranked_docs

*B.3 Reciprocal rank funsion*

In [89]:
def reciprocal_rank_fusion(rankings, k=50):
    fusion_scores = defaultdict(float)
    for rank_list in rankings:
        for rank, (doc, _) in enumerate(rank_list):
            fusion_scores[doc] += 1 / (k + rank + 1)
    return sorted(fusion_scores.items(), key=lambda x: x[1], reverse=True)

# Example usage:
tfidf_results = ranking(queries["1"], 5, inverted_index, 'bm25')
bm25_results = ranking(queries["1"], 5, inverted_index, 'bm25')

fusion_results = reciprocal_rank_fusion([tfidf_results, bm25_results])
print("Fusion Results:", fusion_results)


Fusion Results: [('75773gwg', 0.0392156862745098), ('kn2z7lho', 0.038461538461538464), ('4fb291hq', 0.03773584905660377), ('hl967ekh', 0.037037037037037035), ('8ccl9aui', 0.03636363636363636)]


*B.4 Maximal Marginal Relevance*

In [90]:
def maximal_marginal_relevance(query, doc_vectors, vectorizer, lambda_param=0.5, top_n=5):
    query_vector = vectorizer.transform([query])
    doc_similarities = cosine_similarity(doc_vectors, query_vector).flatten()
    
    selected = []
    remaining = list(range(len(doc_similarities)))

    selected_similarities = np.zeros(len(doc_similarities))

    for _ in range(top_n):
        if not remaining:
            break

        mmr_scores = lambda_param * doc_similarities[remaining] - (1 - lambda_param) * selected_similarities[remaining]

        next_idx = remaining[np.argmax(mmr_scores)]
        selected.append(next_idx)
        remaining.remove(next_idx)

        if remaining:
            candidate_sims = cosine_similarity(doc_vectors[remaining], doc_vectors[next_idx]).flatten()
            selected_similarities[remaining] = np.maximum(selected_similarities[remaining], candidate_sims)

    return selected

C) **Keyword extraction**

In [91]:
def extract_keywords_tfidf(documents, top_n=10):
    # Initialize TF-IDF Vectorizer 
    vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
    # Transform documents into TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(documents)
    # Get list of all feature names (words)
    feature_names = vectorizer.get_feature_names_out()
   
    top_keywords = {}
    # Get the top N tf-idf scores for each document
    for i, doc in enumerate(documents):
        feature_index = tfidf_matrix[i,:].nonzero()[1]
        tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])
        sorted_items = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)[:top_n]
        top_keywords[i] = [(feature_names[idx], score) for idx, score in sorted_items]
    
    return top_keywords

def extract_keywords_tfidf(documents, top_n=10):
    # Initialize TF-IDF Vectorizer 
    vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
    # Transform documents into TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(documents)
    # Get list of all feature names (words)
    feature_names = vectorizer.get_feature_names_out()
   
    top_keywords = {}
    # Get the top N tf-idf scores for each document
    for i, doc in enumerate(documents):
        feature_index = tfidf_matrix[i,:].nonzero()[1]
        tfidf_scores = zip(feature_index, [tfidf_matrix[i, x] for x in feature_index])
        sorted_items = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)[:top_n]
        top_keywords[i] = [(feature_names[idx], score) for idx, score in sorted_items]
    
    return top_keywords

D) **Evaluation**

Precomputations for the evalution:

In [92]:
# Precompution for evaluation
precomputed_bm25 = compute_bm25(D_list)  
tfidf_matrix, tfidf_vectorizer = compute_tfidf(D_list)


docs_text = (D['title'].fillna('') + " " + D['abstract'].fillna('')).tolist()
mmr_vectorizer = TfidfVectorizer(stop_words='english')
doc_vectors = mmr_tfidf_matrix = mmr_vectorizer.fit_transform(docs_text)

In [93]:
def evaluation(Q, R, D, inverted_index, precomputed_bm25, precomputed_tfidf, precomputed_vectorizer, precomputed_doc_vectors):
    metrics = {}
    num_queries_missing_relevant_docs = 0

    for method in ['bm25', 'tfidf', 'rrf', 'boolean', 'mmr']:
        ndcg_10_vals, ndcg_1000_vals, p_10_vals, ap_vals = [], [], [], []

        for qid, q_text in Q.items():
            if method == 'bm25':
                ranked_docs = ranking(q_text, 1000, inverted_index, method='bm25', precomputed_bm25=precomputed_bm25)
            
            elif method == 'tfidf':
                ranked_docs = ranking(q_text, 1000, inverted_index, method='tfidf', precomputed_tfidf=precomputed_tfidf)
            
            elif method == 'rrf':
                tfidf_results = ranking(q_text, 1000, inverted_index, method='tfidf', precomputed_tfidf=precomputed_tfidf)
                bm25_results = ranking(q_text, 1000, inverted_index, method='bm25', precomputed_bm25=precomputed_bm25)
                ranked_docs = reciprocal_rank_fusion([tfidf_results, bm25_results]) if tfidf_results and bm25_results else []
            
            elif method == 'boolean':
                ranked_doc_ids = boolean_ir_model(qid, 1000, inverted_index, Q, D)
                ranked_docs = [(doc_id, 1) for doc_id in ranked_doc_ids]
            
            elif method == 'mmr':
                ranked_doc_indices = maximal_marginal_relevance(
                    query=q_text,
                    doc_vectors=mmr_tfidf_matrix,
                    vectorizer=mmr_vectorizer,
                    lambda_param=0.7,
                    top_n=100
                )
                ranked_docs = [(D.iloc[idx]['cord_uid'], 1) for idx in ranked_doc_indices]

            relevant_docs = [1 if doc[0] in R.get(qid, {}) else 0 for doc in ranked_docs]
            scores = [doc[1] for doc in ranked_docs]

            if sum(relevant_docs) == 0:
                num_queries_missing_relevant_docs += 1
                continue

            ndcg_10_vals.append(ndcg_score([relevant_docs], [scores], k=10))
            ndcg_1000_vals.append(ndcg_score([relevant_docs], [scores], k=1000))
            ap_vals.append(average_precision_score(relevant_docs, scores))
            p_10_vals.append(sum(relevant_docs[:10]) / min(10, len(relevant_docs)))

        metrics[method] = {
            "MAP": np.mean(ap_vals) if ap_vals else 0,
            "nDCG@10": np.mean(ndcg_10_vals) if ndcg_10_vals else 0,
            "P@10": np.mean(p_10_vals) if p_10_vals else 0,
            "nDCG@1000": np.mean(ndcg_1000_vals) if ndcg_1000_vals else 0
        }

    if num_queries_missing_relevant_docs > 0:
        print(f"\n⚠️ WARNING: {num_queries_missing_relevant_docs} queries have no relevant documents in qrels.")

    return metrics


# Evaluate with precomputed vectors
eval_metrics = evaluation(
    queries, qrels, D, inverted_index, precomputed_bm25, 
    (tfidf_matrix, tfidf_vectorizer), mmr_vectorizer, mmr_tfidf_matrix
)

# Print results
print("\nEvaluation Metrics:")
for method, scores in eval_metrics.items():
    print(f"{method.upper()}: MAP={scores['MAP']:.4f}, nDCG@10={scores['nDCG@10']:.4f}, P@10={scores['P@10']:.4f}, nDCG@1000={scores['nDCG@1000']:.4f}")



Evaluation Metrics:
BM25: MAP=0.6022, nDCG@10=0.9379, P@10=0.9220, nDCG@1000=0.9058
TFIDF: MAP=0.5395, nDCG@10=0.8086, P@10=0.7920, nDCG@1000=0.8777
RRF: MAP=0.5534, nDCG@10=0.8908, P@10=0.8720, nDCG@1000=0.8002
BOOLEAN: MAP=0.0256, nDCG@10=0.0705, P@10=0.0259, nDCG@1000=0.2674
MMR: MAP=0.5446, nDCG@10=0.5446, P@10=0.7220, nDCG@1000=0.8138


<H3>END</H3>