<H3>PRI 2023/24: second
    project delivery</H3>

**GROUP 1**
- Amanda Tofthagen, 113124
- Tora Kristine Løtveit, 112927
- Tuva Grønvold Natvig, 113107

<H2>Main facilities</H2>

#### Loading and preprocessing data (using functions made in project 1)

In [2]:
import json
import os
import time
import xml.etree.ElementTree as ET
from collections import defaultdict
from string import punctuation
import nltk  
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ndcg_score, average_precision_score
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/amandatofthagen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/amandatofthagen/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amandatofthagen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/amandatofthagen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# NLTK resources shared by every preprocess_text call. They are built lazily on
# first use so that defining the function does not require the corpora yet.
_STOP_WORDS = None
_LEMMATIZER = None


def _preprocessing_resources():
    """Return the cached (stop-word set, lemmatizer) pair, creating it on first use."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def preprocess_text(text):
    """Turn raw text into a list of normalised tokens.

    The text is lower-cased and tokenised with NLTK's ``word_tokenize``; tokens
    made up solely of punctuation characters and English stop words are
    dropped, and the remaining tokens are lemmatised with WordNet.

    Args:
        text: The string to preprocess.

    Returns:
        A list of lemmatised tokens, in their original order.
    """
    stop_words, lemmatizer = _preprocessing_resources()

    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        # `token in punctuation` would be a substring test against the
        # punctuation string, which lets multi-character punctuation tokens
        # (e.g. the quote tokens emitted by the tokenizer, "...", "--")
        # through; test every character instead.
        if not all(char in punctuation for char in token)
        and token not in stop_words
    ]

# Load metadata as both dataframe and list
def load_metadata(file_path):
    """Load the collection metadata as a DataFrame and as a list of document texts.

    Only the ``cord_uid``, ``title`` and ``abstract`` columns are kept, and rows
    missing any of them are dropped. The index is then reset so that row label
    ``i`` of the DataFrame and position ``i`` of the list refer to the same
    document.

    Args:
        file_path: Path to the metadata CSV file.

    Returns:
        A tuple ``(df, doc_list)`` where ``df`` is the filtered DataFrame and
        ``doc_list`` holds one ``"<title> <abstract>"`` string per row of ``df``.
    """
    df = (
        pd.read_csv(file_path, low_memory=False)[['cord_uid', 'title', 'abstract']]
        .dropna()  # Keep only complete rows of the required columns
        .astype({'title': str, 'abstract': str})
        # dropna() leaves gaps in the row labels; without this the labels
        # would not line up with positions in doc_list.
        .reset_index(drop=True)
    )

    # Store as a list of "title + abstract" for ranking models
    doc_list = (df['title'] + ' ' + df['abstract']).tolist()

    return df, doc_list  # Return both formats


# Load qrels
def load_qrels(file_path):
    """Read relevance judgements from a whitespace-separated qrels file.

    Each non-blank line must hold four fields: topic id, an ignored field,
    document id and an integer relevance. Blank lines are skipped.

    Args:
        file_path: Path to the qrels file.

    Returns:
        A ``defaultdict(dict)`` mapping topic id -> {document id -> relevance}.

    Raises:
        ValueError: If a non-blank line does not have exactly four fields or
            its relevance is not an integer.
    """
    qrels = defaultdict(dict)
    with open(file_path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            if len(fields) != 4:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 4 fields "
                    f"(topic, iteration, doc id, relevance), got {len(fields)}"
                )
            topic_id, _, doc_id, relevance = fields
            qrels[topic_id][doc_id] = int(relevance)
    return qrels

def load_queries(file_path):
    """Parse the topics XML file into a mapping of topic number to query string.

    For every ``<topic>`` element directly under the root, the text of its
    ``<query>`` child is run through ``preprocess_text`` and the resulting
    tokens are joined with single spaces.

    Args:
        file_path: Path to the topics XML file.

    Returns:
        A dict mapping each topic's ``number`` attribute to its preprocessed
        query string.
    """
    topics = ET.parse(file_path).getroot().findall('topic')
    return {
        # Joining the tokens keeps queries in the same string form as documents.
        topic.get('number'): " ".join(preprocess_text(topic.find('query').text))
        for topic in topics
    }

# Input files for the collection; these relative paths are resolved against
# the notebook's current working directory.
metadata_path = "data2/metadata.csv"
qrels_path = "data2/qrels.txt"
queries_path = "data2/topics.xml"

# D is the metadata DataFrame; D_list is the list of "title abstract" strings.
D, D_list = load_metadata(metadata_path)
# qrels maps topic id -> {document id -> integer relevance}.
qrels = load_qrels(qrels_path)
# queries maps topic number -> preprocessed query string.
queries = load_queries(queries_path)

def indexing(D):
    """Build an inverted index over the title and abstract of every document.

    Args:
        D: DataFrame with ``title`` and ``abstract`` columns; its row labels
            are used as document identifiers.

    Returns:
        A tuple ``(inverted_index, indexing_time, index_size)`` where
        ``inverted_index`` maps term -> {row label -> term frequency},
        ``indexing_time`` is the elapsed wall-clock time in seconds, and
        ``index_size`` is the sum of all stored term frequencies.
    """
    started_at = time.time()

    inverted_index = defaultdict(dict)

    for doc_id, record in D.iterrows():
        # Title and abstract are indexed together as one text.
        text = f"{record['title']} {record['abstract']}"

        # Count the terms of this document first...
        term_counts = {}
        for term in preprocess_text(text):
            term_counts[term] = term_counts.get(term, 0) + 1

        # ...then merge the counts into the postings of each term.
        for term, count in term_counts.items():
            postings = inverted_index[term]
            postings[doc_id] = postings.get(doc_id, 0) + count

    # Timing stops before the size is computed.
    indexing_time = time.time() - started_at
    index_size = sum(
        count
        for postings in inverted_index.values()
        for count in postings.values()
    )

    return inverted_index, indexing_time, index_size

def save_index_to_json(inverted_index, directory='output_data', file_name='inverted_index.json'):
    """Write the inverted index to ``<directory>/<file_name>`` as indented JSON.

    The target directory is created if it does not exist. An existing file at
    the target path is overwritten.

    Args:
        inverted_index: JSON-serialisable mapping to save.
        directory: Directory to write into; an empty string means the current
            working directory.
        file_name: Name of the JSON file inside ``directory``.
    """
    # exist_ok avoids a separate existence check; the guard skips makedirs for
    # an empty directory, which os.makedirs would reject.
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Full path to save the file
    path_to_file = os.path.join(directory, file_name)

    # Save the JSON file
    with open(path_to_file, 'w') as f:
        json.dump(inverted_index, f, indent=4)

# Run it: build the inverted index over the whole collection, keeping the
# elapsed time (seconds) and the summed term frequencies, then save the index
# as JSON (output_data/inverted_index.json with the default arguments).
inverted_index, time_taken, size = indexing(D)
save_index_to_json(inverted_index)

<h3>Part I: clustering</h3>

*A) Clustering*

In [None]:
#code, statistics and/or charts here

*B) Summarization*

In [None]:
#code, statistics and/or charts here

*C) Keyword extraction*

In [3]:
#code, statistics and/or charts here

*D) Evaluation*

In [None]:
#code, statistics and/or charts here

<h3>Part II: classification</h3>

*A) Feature extraction*

In [None]:
#code and statistics here

*B) Classification*

In [4]:
#code here

*C) Ranking extension*

In [4]:
#code here

*D) Evaluation*

In [5]:
#code, statistics and/or charts here

<H2>Question materials (optional)</H2>

<H3>Part I: clustering</H3>

**(1)** Does clustering-guided summarization alter the behavior and efficacy of the IR system?

In [None]:
#code, statistics and/or charts here

**(2)** How do sentence representations, clustering choices, and ranking criteria impact summarization?

In [None]:
#code, statistics and/or charts here

**...** (additional questions with empirical results)

<H3>END</H3>