<H3>PRI 2023/24: first project delivery</H3>

**GROUP 1**
- Amanda Tofthagen, 113124
- Tora Kristine Løtveit, 112927
- Tuva Grønvold Natvig, 113107

<H3>Part I: demo of facilities</H3>

In [36]:
import time
from collections import defaultdict
import json
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from tomlkit import string
from nltk.stem import WordNetLemmatizer
import xml.etree.ElementTree as ET
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, util
from scipy.spatial.distance import cosine
from sklearn.metrics import ndcg_score, average_precision_score

import string 
import numpy as np
import os

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/torakristinelotveit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/torakristinelotveit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/torakristinelotveit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Preprocessing function and loading data:

In [37]:
def preprocess_text(text):
    # Convert text to lower case and tokenize
    tokens = word_tokenize(text.lower())
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

# Load Metadata as both dataframe and list
def load_metadata(file_path):
    df = pd.read_csv(file_path)
    df = df[['cord_uid', 'title', 'abstract']].dropna()  # Keep only required columns
    df['title'] = df['title'].astype(str)
    df['abstract'] = df['abstract'].astype(str)

    # Store as a list of "title + abstract" for ranking models
    doc_list = df.apply(lambda x: f"{x['title']} {x['abstract']}", axis=1).tolist()
    
    return df, doc_list  # Return both formats


# Load Qrels
def load_qrels(file_path):
    qrels = defaultdict(dict)
    with open(file_path, 'r') as f:
        for line in f:
            topic_id, _, doc_id, relevance = line.strip().split()
            qrels[topic_id][doc_id] = int(relevance)
    return qrels

def load_queries(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    queries = {}
    for topic in root.findall('topic'):
        topic_number = topic.get('number')
        query_text = preprocess_text(topic.find('query').text)
        queries[topic_number] = " ".join(query_text)  # Ensure consistency
    return queries

metadata_path = "data/metadata.csv"
qrels_path = "data/qrels.txt"
queries_path = "data/topics.xml"

D, D_list = load_metadata(metadata_path)
qrels = load_qrels(qrels_path)
queries = load_queries(queries_path)

  df = pd.read_csv(file_path)


A) **Indexing** (preprocessing and indexing options)

In [38]:

def indexing(D):
    start_time = time.time()  # Start timing

    # Initialize the inverted index
    inverted_index = defaultdict(dict)

    # Process each document
    for index, row in D.iterrows():
        # Combine title and abstract for indexing
        document_text = f"{row['title']} {row['abstract']}"
        # Preprocess text
        tokens = preprocess_text(document_text)

        # Build the index
        for term in tokens:
            if index in inverted_index[term]:
                inverted_index[term][index] += 1
            else:
                inverted_index[term][index] = 1

    # Calculate time and space used
    indexing_time = time.time() - start_time
    index_size = sum(sum(freq.values()) for freq in inverted_index.values())  # Calculate the size of the index

    # Return the inverted index, time taken, and estimated size of the index
    return inverted_index, indexing_time, index_size

def save_index_to_json(inverted_index, file_name='inverted_index.json'):
    with open(file_name, 'w') as f:
        json.dump(inverted_index, f, indent=4)



def save_index_to_json(inverted_index, directory='output_data', file_name='inverted_index.json'):
    # Check if the directory exists, if not create it
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Full path to save the file
    path_to_file = os.path.join(directory, file_name)

    # Save the JSON file
    with open(path_to_file, 'w') as f:
        json.dump(inverted_index, f, indent=4)

# Example usage:
inverted_index, time_taken, size = indexing(D)
save_index_to_json(inverted_index)

Boolean query:

In [None]:
def boolean_query(q, k, I, args=None):
    """
    Perform a Boolean search on the inverted index.
    q: Query string (text).
    k: Number of top terms to consider.
    I: Inverted index (dictionary).
    args: Additional arguments (e.g., Boolean mode: 'AND' or 'OR').
    """
    query_terms = preprocess_text(q)[:k]  # Use only the top-k terms
    if not query_terms:
        return []

    boolean_mode = args.get('mode', 'AND') if args else 'AND'  # Default: AND search

    # Retrieve document sets for each term in the query
    doc_sets = [set(I[term].keys()) for term in query_terms if term in I]

    if not doc_sets:
        return []

    # Apply Boolean logic
    if boolean_mode == 'AND':
        relevant_docs = set.intersection(*doc_sets) if doc_sets else set()
    else:  # 'OR' mode (default if 'AND' fails)
        relevant_docs = set.union(*doc_sets) if doc_sets else set()

    return sorted(relevant_docs)[:k]  # Return up to `k` document IDs

# Example usage:
boolean_results = boolean_query(queries["1"], 5, inverted_index, {'mode': 'OR'})
print("Boolean Query Results:", boolean_results)


Boolean Query Results: [16, 22, 80, 93, 134]


B) **Summarization**

*B.1 Summarization solution: results for a given document*

In [41]:
def summarize_document(text, num_sentences=10):
    if not isinstance(text, str) or text.strip() == "":
        return "No valid text to summarize."
    
    sentences = sent_tokenize(text)
    if not sentences:
        return "No content available for summarization."
    
    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = vectorizer.fit_transform(sentences)
    except ValueError:
        return "Not enough content for summarization."
    
    scores = np.sum(tfidf_matrix.toarray(), axis=1)
    ranked_sentences = [sentences[i] for i in np.argsort(scores)[-num_sentences:][::-1]]
    
    return ' '.join(ranked_sentences)

#Example usage:

print("Summarization:")
print(summarize_document(D_list[1], num_sentences=3))

Summarization:
On the basis of biochemical evidence, it is often presumed that such NO• -dependent oxidations are due to the formation of the oxidant peroxynitrite, although alternative mechanisms involving the phagocyte-derived heme proteins myeloperoxidase and eosinophil peroxidase might be operative during conditions of inflammation. Inflammatory diseases of the respiratory tract are commonly associated with elevated production of nitric oxide (NO•) and increased indices of NO• -dependent oxidative stress. Although NO• is known to have anti-microbial, anti-inflammatory and anti-oxidant properties, various lines of evidence support the contribution of NO• to lung injury in several disease models.


*B.2 IR models (TF-IDF, BM25 and EBRT)*

In [52]:
def compute_tfidf(D):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(D)
    return tfidf_matrix, vectorizer

def compute_bm25(D):
    tokenized_corpus = [preprocess_text(doc) for doc in D]
    bm25 = BM25Okapi(tokenized_corpus)
    return bm25

# Load Sentence-BERT model
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

def compute_bert(D):
    embeddings = sbert_model.encode(D, convert_to_tensor=True)
    return embeddings

def ranking(q, p, I, method='tfidf', precomputed_bm25=None):
    """
    Rank documents using TF-IDF, BM25, or BERT.
    
    :param q: Query string
    :param p: Number of top documents to return
    :param I: Inverted index (only used for TF-IDF)
    :param method: Retrieval method ('tfidf', 'bm25', 'bert')
    :param precomputed_bm25: Precomputed BM25 model (optional)
    :return: List of (document_id, score) pairs, sorted by relevance
    """
    query_terms = preprocess_text(q)

    if method == 'tfidf':
        tfidf_matrix, vectorizer = compute_tfidf(D_list)
        query_vector = vectorizer.transform([" ".join(query_terms)])
        scores = np.dot(tfidf_matrix, query_vector.T).toarray().flatten()
        doc_scores = {doc_id: score for doc_id, score in zip(D['cord_uid'], scores)}

    elif method == 'bm25':
        if precomputed_bm25 is None:
            bm25 = compute_bm25(D_list)
        else:
            bm25 = precomputed_bm25
        scores = bm25.get_scores(query_terms)  # Use precomputed model
        doc_scores = {doc_id: score for doc_id, score in zip(D['cord_uid'], scores)}

    elif method == 'bert':
        doc_embeddings = compute_bert(D_list)
        query_embedding = sbert_model.encode(q, convert_to_tensor=True)
        scores = util.pytorch_cos_sim(query_embedding, doc_embeddings)[0].cpu().numpy()
        doc_scores = {doc_id: score for doc_id, score in zip(D['cord_uid'], scores)}

    else:
        raise ValueError("Invalid ranking method: Choose 'tfidf', 'bm25', or 'bert'")

    ranked_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)[:p]
    return ranked_docs

# Example usage:
bm25 = compute_bm25(D_list) 

ranked_docs = ranking(queries["1"], 5, inverted_index, 'bm25')
print("BM25 Ranking:", ranked_docs)

KeyboardInterrupt: 

*B.3 Reciprocal rank funsion*

In [44]:
def reciprocal_rank_fusion(rankings, k=50):
    fusion_scores = defaultdict(float)
    for rank_list in rankings:
        for rank, (doc, _) in enumerate(rank_list):
            fusion_scores[doc] += 1 / (k + rank + 1)
    return sorted(fusion_scores.items(), key=lambda x: x[1], reverse=True)

# Example usage:
tfidf_results = ranking(queries["1"], 5, inverted_index, 'bm25')
bm25_results = ranking(queries["1"], 5, inverted_index, 'bm25')

fusion_results = reciprocal_rank_fusion([tfidf_results, bm25_results])
print("Fusion Results:", fusion_results)


Fusion Results: [('75773gwg', 0.0392156862745098), ('kn2z7lho', 0.038461538461538464), ('4fb291hq', 0.03773584905660377), ('hl967ekh', 0.037037037037037035), ('8ccl9aui', 0.03636363636363636)]


*B.4 Maximal Marginal Relevance*

In [None]:
def maximal_marginal_relevance(query, documents, lambda_param=0.5, top_n=5):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(documents + [query])
    query_vector = tfidf_matrix[-1].toarray()[0]
    doc_vectors = tfidf_matrix[:-1].toarray()
    selected = []
    remaining = list(range(len(documents)))
    for _ in range(top_n):
        if not remaining:
            break
        relevance_scores = [1 - cosine(query_vector, doc_vectors[i]) for i in remaining]
        diversity_scores = [max([1 - cosine(doc_vectors[i], doc_vectors[j]) for j in selected] + [0]) for i in remaining]
        mmr_scores = [lambda_param * rel - (1 - lambda_param) * div for rel, div in zip(relevance_scores, diversity_scores)]
        best_doc = remaining[np.argmax(mmr_scores)]
        selected.append(best_doc)
        remaining.remove(best_doc)
    return [documents[i] for i in selected]

# Example usage:
test_query = queries["1"]  
test_documents = D_list[:10]  
mmr_results = maximal_marginal_relevance(test_query, test_documents, lambda_param=0.7, top_n=5)

# Print Results
print("Maximal Marginal Relevance Results:")
for i, doc in enumerate(mmr_results, 1):
    print(f"{i}. {doc[:200]}...")  # Print only first 200 characters for readability

Maximal Marginal Relevance Results:
1. Clinical features of culture-proven Mycoplasma pneumoniae infections at King Abdulaziz University Hospital, Jeddah, Saudi Arabia OBJECTIVE: This retrospective chart review describes the epidemiology a...
2. Role of endothelin-1 in lung disease Endothelin-1 (ET-1) is a 21 amino acid peptide with diverse biological activity that has been implicated in numerous diseases. ET-1 is a potent mitogen regulator o...
3. Sequence requirements for RNA strand transfer during nidovirus discontinuous subgenomic RNA synthesis Nidovirus subgenomic mRNAs contain a leader sequence derived from the 5′ end of the genome fused t...
4. The 21st International Symposium on Intensive Care and Emergency Medicine, Brussels, Belgium, 20-23 March 2001 The 21st International Symposium on Intensive Care and Emergency Medicine was dominated b...
5. Technical Description of RODS: A Real-time Public Health Surveillance System This report describes the design and implementation of t

C) **Keyword extraction**

In [46]:
#code, statistics and/or charts here

D) **Evaluation**

In [53]:
def evaluation(Q, R, D, precomputed_bm25):
    ndcg_values, map_values, precision_10 = [], [], []

    for qid, q_text in Q.items():
        ranked_docs = ranking(q_text, 10, inverted_index, method='bm25', precomputed_bm25=precomputed_bm25)


        # Generate relevance labels (1 if doc is relevant, else 0)
        relevant_docs = [1 if doc[0] in R.get(qid, {}) else 0 for doc in ranked_docs]

        # Get document scores
        scores = [doc[1] for doc in ranked_docs]

        if len(relevant_docs) == 0 or len(scores) == 0:
            continue  # Skip empty cases to avoid errors

        # Ensure both lists have the same shape
        ndcg_values.append(ndcg_score([relevant_docs], [scores]))  # No extra []
        map_values.append(average_precision_score(relevant_docs, scores))  # 1D lists
        precision_10.append(sum(relevant_docs[:10]) / min(10, len(relevant_docs)))  # Handle shorter lists

    return {
        "MAP": np.mean(map_values) if map_values else 0,
        "nDCG@10": np.mean(ndcg_values) if ndcg_values else 0,
        "P@10": np.mean(precision_10) if precision_10 else 0
    }

# Precompute BM25 Once
precomputed_bm25 = compute_bm25(D_list)

# Run Evaluation
eval_metrics = evaluation(queries, qrels, D, precomputed_bm25)
print("Evaluation Metrics:", eval_metrics)

Evaluation Metrics: {'MAP': 0.967641975308642, 'nDCG@10': 0.9873204812903524, 'P@10': 0.9219999999999999}


<H3>Part II: questions materials (optional)</H3>

**(1)** Corpus *D* and summaries *S* description.

In [None]:
#code, statistics and/or charts here

**(2)** Summarization performance for the overall and category-conditional corpora.

In [None]:
#code, statistics and/or charts here

**...** (additional questions with empirical results)

<H3>END</H3>