<H3>PRI 2023/24: second
    project delivery</H3>

**GROUP 1**
- Amanda Tofthagen, 113124
- Tora Kristine Løtveit, 112927
- Tuva Grønvold Natvig, 113107

<H2>Main facilities</H2>

#### Loading and preproccesing data (using with functions made in project 1)

In [1]:
import nltk  
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from collections import defaultdict
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics.pairwise import euclidean_distances


nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>
[nltk_data] Error loading punkt_tab: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1007)>


False

In [35]:
# Setup and data preparation 
def preprocess_text(text):
    # Convert text to lower case and tokenize
    tokens = word_tokenize(text.lower())
    # Remove punctuation
    tokens = [token for token in tokens if token not in punctuation]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

# Load metadata as both dataframe and list
def load_metadata(file_path):
    df = pd.read_csv(file_path, low_memory=False)
    required_columns = df[['cord_uid', 'title', 'abstract', 'authors', 'journal', 'publish_time']]  
    for col in required_columns:
        if col not in df.columns:
            df[col] = ""  # fill with blank if column is missing
    
    # Drop rows missing both title and abstract
    df = df.dropna(subset=['title', 'abstract'])

    # Ensure all fields are strings
    for col in required_columns:
        df[col] = df[col].astype(str)
   
    # Combine fields into a single text string per document
    docs = df.apply(lambda x: f"{x['title']} {x['abstract']} {x['authors']} {x['journal']} {x['publish_time']}", axis=1).tolist()
    return df, docs
    

# Load qrels
def load_qrels(file_path):  
    qrels = defaultdict(dict)
    with open(file_path, 'r') as f:
        for line in f:
            topic_id, _, doc_id, relevance = line.strip().split()
            qrels[topic_id][doc_id] = int(relevance)
    return qrels

def load_queries(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    queries = {}
    for topic in root.findall('topic'):
        topic_number = topic.get('number')
        query_text = preprocess_text(topic.find('query').text)
        queries[topic_number] = " ".join(query_text)  # Ensure consistency
    return queries

metadata_path = "data2/metadata.csv"
qrels_path = "data2/qrels.txt"
queries_path = "data2/topics.xml"

D, D_list = load_metadata(metadata_path)
qrels = load_qrels(qrels_path)
queries = load_queries(queries_path)

<h3>Part I: clustering</h3>

*A) Clustering*

In [36]:
# Vectorize the documents using BM25 (feature extraction)
class BM25Vectorizer:
    def __init__(self, k1=1.5, b=0.75):
        self.k1 = k1
        self.b = b
        self.vectorizer = None
        self.bm25 = None
        self.terms = None
    
    # Function that transform documents to BM25-weighted vectors
    def transform(self, docs): 
        doc_vectors = []
        for doc in docs:
            doc_tokens = self.vectorizer.build_tokenizer()(doc)
            doc_scores = self.bm25.get_scores(doc_tokens)
            doc_vectors.append(doc_scores)
        return np.array(doc_vectors)

    # Function that fit the BM25-vectorizer to the documents and transform the documents to BM25-weighted vectors
    def fit_transform(self, docs):
        self.vectorizer = TfidfVectorizer()
        self.terms = self.vectorizer.fit(docs).get_feature_names_out()
        tokenized_docs = [self.vectorizer.build_tokenizer()(doc) for doc in docs]
        self.bm25 = BM25Okapi(tokenized_docs, k1=self.k1, b=self.b)
        vectors = self.transform(docs)  
        return vectors

In [39]:
# Function to determine the best number of clusters, based on elbow point and silhouette score
def determine_number_of_clusters(D, max_k):
    # First step: Use elbow method to narrow down the range of potential values for k
    distortions = []
    for k in range(1, max_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(D)
        distortions.append(kmeans.inertia_)
    
    accelerations = np.diff(distortions, 2) # Compute second derivative (accelerations)
    elbow_point = np.argmin(accelerations) # find the "elbow" by looking at the second derivative of the distortions list 

    # Second step: Calculate silhouette scores for values up to the elbow point
    k_range = range(2, elbow_point + 1) 
    silhouette_scores = []
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(D)
        score = silhouette_score(D, labels)
        silhouette_scores.append(score)
    

    # Select the best k based on the highest silhouette score
    best_k = k_range[np.argmax(silhouette_scores)]
    return best_k

In [40]:
def clustering(D, args=None):
    if args is None:
        args = {}

    # Extract algorithm and maximum number of clusters to evaluate from args:
    algorithm = args.get('algorithm', 'kmeans') # Default algorithm is k-means
    max_k = args.get('max_k', 10) # Default maximum k to evaluate
    vectorizer = args.get('vectorizer', BM25Vectorizer())

    # Prepare document vectors
    document_vectors = vectorizer.fit_transform(D)

    # Determine the best number of clusters (if not provided)
    if 'num_clusters' not in args:
        num_clusters = determine_number_of_clusters(document_vectors, max_k)
    else:
        num_clusters = args['num_clusters']

    # Apply the chosen clustering algorithm
    if algorithm == 'kmeans':
        model = KMeans(n_clusters=num_clusters, random_state=42)
        labels = model.fit_predict(document_vectors)
        centroids = model.cluster_centers_
    else:
        # Add more clustering algorithms as needed
        raise ValueError("Unsupported algorithm")

    # Group documents by cluster
    clusters = {i: [] for i in range(num_clusters)}
    for idx, label in enumerate(labels):
        clusters[label].append(D[idx])

    # Generate output
    clustering_solution = []
    for i in range(num_clusters):
        cluster_info = {
            'centroid': centroids[i] if 'centroids' in locals() else None,
            'documents': clusters[i]
        }
        clustering_solution.append(cluster_info)

    return clustering_solution


# Example usage (with sample size):
sample_size = 100
sample_indices = random.sample(range(len(D_list)), sample_size)
D_sample = [D_list[i] for i in sample_indices]
sample_cluster_result = clustering(D_sample)
print(sample_cluster_result)

[{'centroid': array([59.78863701, 74.76574295, 68.52878377, 63.72637782, 65.12322976,
       68.1211229 , 69.94606546, 65.69611635, 68.50003379, 66.28727029,
       75.03706071, 70.57192694, 75.25434229, 70.81862914, 69.29581143,
       67.79247696, 70.38571615, 58.70114026, 70.39157188, 74.62852314,
       76.73715201, 47.44178257, 70.94953659, 65.95849216, 53.86882202,
       57.84978771, 61.7607585 , 44.33123069, 81.55683566, 67.03068664,
       66.12899226, 72.24863784, 69.35851054, 70.85087283, 63.83687302,
       67.37943647, 13.35621811, 68.93344368, 71.40345197, 67.49016961,
       66.82816967, 76.96252029, 72.93465872, 45.69963455, 63.50828731,
       67.46893709, 70.86616053, 64.79561861, 72.57457455, 68.43123912,
       72.77670873, 64.16197509, 71.2037945 , 65.11192114, 64.19586239,
       72.29451266, 61.43150098, 70.73569193, 62.97977306, 66.66413203,
       67.81509064, 74.43361234, 70.19989996, 30.6225804 , 71.29876257,
       73.0223302 , 72.43954557, 61.44403606, 74.1

In [41]:
def interpret(cluster, D, args=None):
    if args is None:
        args = {}
    
    # Retrieve indices of documents in the cluster from the clustering output
    cluster_documents = cluster['documents']

    # Prepare document vectors using BM25Vectorizer
    vectorizer = BM25Vectorizer()
    document_vectors = vectorizer.fit_transform(cluster_documents)

    # Compute the median vector
    median_vector = np.median(document_vectors, axis=0)

    # Compute the medoid
    if len(document_vectors) > 1:
        distance_matrix = euclidean_distances(document_vectors, document_vectors)
        medoid_index = np.argmin(np.sum(distance_matrix, axis=0))
        medoid_document_content = cluster_documents[medoid_index]
    else:
        medoid_document_content = cluster_documents[0]  # In case there's only one document in the cluster

    # Description based on median and medoid
    description = {
        'median_vector': median_vector,
        'medoid_document_content': medoid_document_content
    }

    return description

for cluster in sample_cluster_result:
    description = interpret(cluster, D_sample)  
    print("Cluster Description:", description)

Cluster Description: {'median_vector': array([46.60948128, 59.58432154, 53.92566662, 53.24163151, 59.7976743 ,
       50.85234839, 50.83158772, 53.44917897, 53.06500808, 37.20118165,
       54.24784526, 57.252426  , 40.70816365, 32.62023541, 60.06038788,
       47.41615035, 50.17572652, 44.36774677, 11.614916  , 48.23869627,
       38.79302917, 52.48358533, 46.07016413, 52.92175482, 54.12099588,
       48.76676405, 53.09013818, 50.11778957, 58.89727478, 47.06060547,
       57.07035423, 51.51018081, 24.09414177, 53.18717415, 53.59593388,
       42.47977251, 58.68213328, 53.77939426, 58.24404084, 53.16958505,
       59.47800462, 56.72349514, 55.81574924, 56.45165432, 55.8350752 ,
       56.95836544, 49.28326076, 58.12088351, 51.30016237, 54.74782323,
       58.199566  , 53.22863517, 48.2128579 , 53.21684014, 60.71313397,
       25.56115103, 41.0477968 ]), 'medoid_document_content': 'The challenge of emerging and re-emerging infectious diseases Infectious diseases have for centuries ranke

In [None]:
#code, statistics and/or charts here

*B) Summarization*

In [None]:
#code, statistics and/or charts here

*C) Keyword extraction*

In [None]:
#code, statistics and/or charts here

*D) Evaluation*

In [None]:
#code, statistics and/or charts here

<h3>Part II: classification</h3>

*A) Feature extraction*

In [None]:
#code and statistics here

*B) Classification*

In [None]:
#code here

*C) Ranking extension*

In [None]:
#code here

*D) Evaluation*

In [None]:
#code, statistics and/or charts here

<H2>Question materials (optional)</H2>

<H3>Part I: clustering</H3>

**(1)** Do clustering-guided summarization alters the behavior and efficacy of the IR system?

In [None]:
#code, statistics and/or charts here

**(2)** How sentence representations, clustering choices, and rank criteria impact summarization?

In [None]:
#code, statistics and/or charts here

**...** (additional questions with empirical results)

<H3>END</H3>