In [1]:
import os
# Move to Thesis directory (two levels up)
os.chdir(os.path.abspath(os.path.join("..", "..")))

# Move to model/src if it exists
model_dir = os.path.join(os.getcwd(), "model", "src")
if os.path.exists(model_dir):
    os.chdir(model_dir)

print("Current Directory:", os.getcwd())

Current Directory: c:\Users\1176153\Downloads\github\Thesis\model\src


In [2]:
from libs import data_handeling as dh
from libs.settings import data_catalog as dc
from libs import data_retrievers as dr
from langchain.vectorstores import Chroma

In [3]:
documents_chunked_with_ids = dh.load_documents_from_pickle(dc.DOCUMENTS_CHUNKED_WITH_IDS)
documents_chunked_with_ids

✅ Loaded 1017 documents from ..\..\data\Preprocessing_text\all_programs_chunked\documents_chunks_with_ids_without_metadata_embedded.pkl


[Document(id='07185832-3c4f-493a-b1be-db87a61f89a2', metadata={'source': 'bachelor_data-science_teaching-staff_text.txt', 'degree': 'bachelor', 'doc_type': 'teaching_staff', 'course_name': 'Data Science'}, page_content="Teaching Staff en Programs Bachelor's\nDegrees Data Science Teaching Staff Américo Rio Invited Assistant Professor\namerico.rio@novaims.unl.pt\n\nAna Cristina\nCosta Associate Professor\ncristina@novaims.unl.pt\n\nArtur Varanda\nAdjunct Lecturer\navaranda@novaims.unl.pt\n\nAugusto Santos\nAssistant Professor\najrsantos@novaims.unl.pt\n\nBruno Damásio\nAssistant Professor\nbdamasio@novaims.unl.pt\n\nCarina Albuquerque\nAssistant Professor\ncalbuquerque@novaims.unl.pt\n\nCarolina Maria\nShaul Adjunct Lecturer\ncshaul@novaims.unl.pt\n\nCarolina Santos\nMaximiano Adjunct Lecturer\ncmaximiano@novaims.unl.pt\n\nCarolina Vasconcelos\nInvited Teaching Assistant\ncvasconcelos@novaims.unl.pt\n\nCatarina Neves\nAssistant Professor\ncneves@novaims.unl.pt\n\nCatarina Palha\nInvited 

# Printing text content from original documents

In [57]:
from typing import List, Optional
from langchain_core.documents import Document

def print_text_context_from_program_documents(
    data: List[Document],  # Changed to accept a list of Document objects
    course_names_to_include: Optional[List[str]] = None,
    doc_types_to_include: Optional[List[str]] = None,
    words_per_chunk: int = 10  # Added parameter to control the number of words per chunk
):
    """
    Prints the text content of the documents based on the specified filters, along with their document IDs.
    Displays the text content vertically in chunks of specified words (default 50).

    Parameters:
    - data (list): List of Document objects, each containing 'id', 'metadata', and 'text'.
    - course_names_to_include (list, optional): Filter by specific course names (case insensitive).
    - doc_types_to_include (list, optional): Filter by document types ('teaching_staff', 'study_plan', 'main_info').
    - words_per_chunk (int, optional): Number of words to display per vertical chunk (default is 50).
    """
    filtered_data = []

    for doc in data:
        course_name = doc.metadata.get("course_name", "").lower()
        doc_type = doc.metadata.get("doc_type", "").lower()

        # Filter by course name
        if course_names_to_include and course_name not in [name.lower() for name in course_names_to_include]:
            continue

        # Filter by doc type
        if doc_types_to_include and doc_type not in [dt.lower() for dt in doc_types_to_include]:
            continue

        filtered_data.append(doc)

    if not filtered_data:
        print("⚠️ No documents matched the filters.")
        return

    # Print the text content of the filtered documents in chunks of words
    for doc in filtered_data:
        doc_id = doc.id  # Accessing the 'id' field directly
        print(f"\n--- Document ID: {doc_id} ---")
        print(f"Course Name: {doc.metadata.get('course_name')}")
        print(f"Document Type: {doc.metadata.get('doc_type')}")
        print("\nText Content:")

        # Split the content into words
        words = doc.page_content.split()

        # Process the content in chunks of 'words_per_chunk' words
        for i in range(0, len(words), words_per_chunk):
            chunk = " ".join(words[i:i + words_per_chunk])  # Join the chunk of words
            print(chunk)  # Print the chunk of words

        print("\n" + "-"*50)




In [63]:
print_text_context_from_program_documents(
    data=documents_chunked_with_ids,
    course_names_to_include=['Master Degree In Statistics And Information Management With A Specialization In Data Analytics'],  # Pass as a list
    doc_types_to_include=['study_plan']  # Pass as a list
)


--- Document ID: f0e0659d-2a5c-4567-af0a-d2c43f0b7ec0 ---
Course Name: Master Degree In Statistics And Information Management With A Specialization In Data Analytics
Document Type: study_plan

Text Content:
1 st year - Fall Semester Course Units Type Duration
ECTS Forecasting Methods Mandatory Quarterly 3,5 Multivariate Analytics Mandatory Semester
7,5 Statistics for Data Analytics Mandatory Semester 7,5 Time Series
Analysis Mandatory Quarterly 4 Applied Network Analysis Elective Semester 7,5
Banking and Insurance Economics Elective Quarterly 3,5 Business Intelligence I
Elective Semester 7,5 Business Process Management Elective Semester 7,5 Change
Management Elective Quarterly 4 Computational Statistics I Elective Semester 6
Data Governance Elective Quarterly 3,5 Data Management and Storage Elective
Quarterly 4 Data Mining I Elective Semester 7,5 Data Privacy,
Security and Ethics Elective Quarterly 4 DataBase Management Systems Elective
Semester 7,5 Fixed Income Securities Elective Qu

In [48]:
tfidf_retriever = dr.load_sparse_retriever(retriever_type="TF-IDF", documents_chunked=documents_chunked_with_ids, top_k=4)
query = "Postgraduate Program in Enterprise Data Science & Analytics"
print(tfidf_retriever.invoke(query))

[Document(metadata={'source': 'postgraduate-program-enterprise-data-science-analytics_main_course.txt', 'degree': 'postgraduate', 'doc_type': 'main_info', 'course_name': 'Postgraduate Program Enterprise Data Science Analytics'}, page_content='Postgraduate in Enterprise Data Science & Analytics Postgraduate Enterprise Data Science & Analytics Developed in partnership with Microsoft, the Postgraduate in Enterprise Data Science & Analytics will present the methodologies and tools that will transform data into information, on which enterprises can base strategic information on entering new markets, launching new product or service lines, optimizing processes, transforming business models and, generally, competing in a market increasingly driven by data-driven decisions'), Document(metadata={'source': 'master-degree-program-in-data-science-and-advanced-analytics-with-a-specialization-in-data-science_main_course.txt', 'degree': 'masters', 'doc_type': 'main_info', 'course_name': 'Master Degre

In [32]:
bm25_retriever = dr.load_sparse_retriever(retriever_type="BM25", documents_chunked=documents_chunked_with_ids, top_k=5)
chroma_retriever = dr.load_vector_retriever(collection_name="documents_without_metadata_embedded", top_k=5)

hybrid_retriever = dr.load_hybrid_retriever(bm25_retriever, chroma_retriever, weight_sparse=0.5, weight_vector=0.5)

  embedding_model = HuggingFaceEmbeddings(
  vector_store = Chroma(


In [45]:
query = "Master's in Data Science"

# Perform the retrieval
retrieval_results = hybrid_retriever.invoke(query)

# Print the entire document vertically, including all attributes
print("Query:", query)
for idx, doc in enumerate(retrieval_results, 1):
    print(f"Document {idx}:")
    
    # Print the ID of the document
    print(f"ID: {doc.id}")
    
    # Print the page content in chunks of 20 characters
    print("Page Content:")
    chunk_size = 100
    for i in range(0, len(doc.page_content), chunk_size):
        print(doc.page_content[i:i+chunk_size])  # Print each 20-character chunk
    
    # If metadata is available, print it as well
    if doc.metadata:
        print("Metadata:")
        for key, value in doc.metadata.items():
            print(f"  {key}: {value}")
    
    print("\n" + "-"*50 + "\n")  # Separator between documents


Query: Master's in Data Science
Document 1:
ID: None
Page Content:
Master's Degree in Data Science and Advanced Analytics, with a specialization in Data Science Master
 Degree in Data Science and Advanced Analytics, with a specialization in Data Science The Master Deg
ree in Data Science and Advanced Analytics, with a specialization in Data Science, is aimed at techn
ically oriented people with solid scientific background, who want to strengthen and deepen their ski
lls on the most used paradigms and environments for software development , and apply them to solve c
omplex real-world problems involving vast amounts of data
Metadata:
  source: master-degree-program-in-data-science-and-advanced-analytics-with-a-specialization-in-data-science_main_course.txt
  degree: masters
  doc_type: main_info
  course_name: Master Degree Program In Data Science And Advanced Analytics With A Specialization In Data Science
  id: d6c3c806-7850-456d-a4d7-b8c532299c02

-------------------------------------

# Setting up the evaluater

In [4]:
from typing import List, Set, Dict, Union
import numpy as np
from langchain_core.documents import Document

# Helper function
def extract_doc_id(doc: Document) -> str:
    """Extract the ID from the document metadata."""
    return doc.metadata.get("id", "")

# Metric functions
def precision_at_k(retrieved_ids: List[str], relevant_ids: Set[str], k: int) -> float:
    retrieved_k = retrieved_ids[:k]
    relevant_retrieved = sum(1 for doc_id in retrieved_k if doc_id in relevant_ids)
    return relevant_retrieved / k if k else 0.0

def recall_at_k(retrieved_ids: List[str], relevant_ids: Set[str], k: int) -> float:
    if not relevant_ids:
        return 0.0
    retrieved_k = retrieved_ids[:k]
    relevant_retrieved = sum(1 for doc_id in retrieved_k if doc_id in relevant_ids)
    return relevant_retrieved / len(relevant_ids)

def mrr_at_k(retrieved_ids: List[str], relevant_ids: Set[str], k: int) -> float:
    retrieved_k = retrieved_ids[:k]
    for idx, doc_id in enumerate(retrieved_k, start=1):
        if doc_id in relevant_ids:
            return 1.0 / idx
    return 0.0

# Updated Evaluation Loop
def evaluate_retrieval_system(
    retrieved_docs_per_query: Dict[str, List[Document]],  # <-- Now list of Documents
    relevant_docs_per_query: Dict[str, Set[str]],
    k_values: List[int] = [5]
) -> Dict[str, Dict[int, float]]:
    """
    Args:
        retrieved_docs_per_query: mapping from query_id to list of retrieved Documents.
        relevant_docs_per_query: mapping from query_id to set of ground-truth relevant document IDs.
        k_values: list of cutoff values for Precision@K, Recall@K, MRR@K.

    Returns:
        metrics: nested dictionary with structure metrics[metric_name][k] = average_score
    """
    metrics = {
        "Precision@K": {k: [] for k in k_values},
        "Recall@K": {k: [] for k in k_values},
        "MRR@K": {k: [] for k in k_values},
    }

    for query_id, retrieved_docs in retrieved_docs_per_query.items():
        # Extract doc IDs from Documents
        retrieved_ids = [extract_doc_id(doc) for doc in retrieved_docs]
        relevant_ids = relevant_docs_per_query.get(query_id, set())
        
        for k in k_values:
            precision = precision_at_k(retrieved_ids, relevant_ids, k)
            recall = recall_at_k(retrieved_ids, relevant_ids, k)
            mrr = mrr_at_k(retrieved_ids, relevant_ids, k)

            metrics["Precision@K"][k].append(precision)
            metrics["Recall@K"][k].append(recall)
            metrics["MRR@K"][k].append(mrr)

    # Aggregate (mean)
    averaged_metrics = {
        metric: {k: np.mean(values) if values else 0.0 for k, values in ks.items()}
        for metric, ks in metrics.items()
    }
    
    return averaged_metrics


In [5]:
import json
from typing import Dict, Set

def load_ground_truth(filepath: str) -> Dict[str, Set[str]]:
    """
    Load ground-truth relevant documents per query from the JSON file.
    
    Args:
        filepath: Path to the JSON file.

    Returns:
        Dictionary mapping from query string to set of relevant document IDs.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    gt = {}
    for item in data["evaluation_retrievers_dataset"]:
        query = item["query"]
        relevant_docs = set(item["relevant_docs"])
        gt[query] = relevant_docs

    return gt



## Problem documents support id, but retriever methods don't, we need to save the id parameter inside the metadata:

In [6]:
def move_id_to_metadata(documents):
    for doc in documents:
        if hasattr(doc, 'id') and doc.id is not None:
            doc.metadata['id'] = doc.id
    return documents

In [7]:
documents_chunked_with_ids = move_id_to_metadata(documents_chunked_with_ids)
documents_chunked_with_ids

[Document(id='07185832-3c4f-493a-b1be-db87a61f89a2', metadata={'source': 'bachelor_data-science_teaching-staff_text.txt', 'degree': 'bachelor', 'doc_type': 'teaching_staff', 'course_name': 'Data Science', 'id': '07185832-3c4f-493a-b1be-db87a61f89a2'}, page_content="Teaching Staff en Programs Bachelor's\nDegrees Data Science Teaching Staff Américo Rio Invited Assistant Professor\namerico.rio@novaims.unl.pt\n\nAna Cristina\nCosta Associate Professor\ncristina@novaims.unl.pt\n\nArtur Varanda\nAdjunct Lecturer\navaranda@novaims.unl.pt\n\nAugusto Santos\nAssistant Professor\najrsantos@novaims.unl.pt\n\nBruno Damásio\nAssistant Professor\nbdamasio@novaims.unl.pt\n\nCarina Albuquerque\nAssistant Professor\ncalbuquerque@novaims.unl.pt\n\nCarolina Maria\nShaul Adjunct Lecturer\ncshaul@novaims.unl.pt\n\nCarolina Santos\nMaximiano Adjunct Lecturer\ncmaximiano@novaims.unl.pt\n\nCarolina Vasconcelos\nInvited Teaching Assistant\ncvasconcelos@novaims.unl.pt\n\nCatarina Neves\nAssistant Professor\ncne

## Load the evaluation dataset

In [9]:
groundthruth_dataset = load_ground_truth(r'C:\Users\1176153\Downloads\github\Thesis\data\Preprocessing_text\retriever_evaluation_dataset.json')
groundthruth_dataset

{'What are the admission requirements for the Master Degree in Law and Financial Markets?': {'044839f6-fe8a-4bc4-9d50-a0e52100df1',
  'cfbf2fed-1f0f-46d2-b009-0b90315ca27c'},
 "What are the main professional opportunities of the Bachelor's degree in Information Systems?": {'56bd04db-ab9d-48bc-8fe7-82583c83853d',
  '6970a629-331d-40d2-85c7-04b99c219040'},
 "What is the study plan of the Master's in Data Science and advanced analytics?": {'3800e187-b87c-4902-b7e9-9c49cfcf8e87',
  '8978c024-442f-4cbb-aa67-517182f84770',
  'c9cd0f7e-e2bd-4a07-b4aa-28498940635c'},
 'How many ECTS are required to complete the Postgraduate Program in Information Management and Business Intelligence in Healthcare?': {'20e1ecce-8ce2-41e5-af90-c630e65c5ee7'},
 "Are there any specializations offered in the Master's in Data Driven Marketing?": {'598a030a-8552-45c9-a3cc-6dc24ea9be0b',
  '98189b9d-52ba-4778-97cc-36046cf30161',
  '9b8c3451-1513-42f2-a478-fe7f50216dd6',
  'fa3a6814-7368-4ee5-a9cf-d4ac28e80004'},
 "Wha

## Run retrievers

In [10]:
from typing import List, Dict, Union
from langchain_core.documents import Document

def run_retrieval(
    retriever: object,
    queries: List[str],
    top_k: int = 5
) -> Dict[str, List[Document]]:
    """
    Runs a retriever over a list of queries.

    Args:
        retriever: A retriever object that supports .invoke().
        queries: A list of query strings.
        top_k: Number of documents to retrieve per query. Passed dynamically if retriever supports it.

    Returns:
        Dict mapping each query string to its list of retrieved Documents.
    """
    retrieved_docs_per_query = {}

    for query in queries:
        # Some retrievers allow top_k passed dynamically (e.g., BM25, dense retrievers)
        try:
            retrieved_docs = retriever.invoke(query, top_k=top_k)
        except TypeError:
            # Some retrievers don't accept top_k (like Hybrid ensemble ones)
            retrieved_docs = retriever.invoke(query)
        
        retrieved_docs_per_query[query] = retrieved_docs

    return retrieved_docs_per_query


## Load query examples:

In [11]:
import json
from typing import List, Dict, Set

def load_queries_and_ground_truth(
    filepath: str
) -> (List[str], Dict[str, Set[str]]):
    """
    Loads queries and relevant document IDs from a JSON file.

    Args:
        filepath: Path to the JSON file.

    Returns:
        queries: List of query strings.
        relevant_docs_per_query: Mapping from query string to set of relevant document IDs.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Access the correct key to get the list of queries and relevant docs
    dataset = data["evaluation_retrievers_dataset"]

    queries = []
    relevant_docs_per_query = {}

    for item in dataset:
        query = item["query"]
        relevant_ids = set(item["relevant_docs"])

        queries.append(query)
        relevant_docs_per_query[query] = relevant_ids

    return queries, relevant_docs_per_query


In [12]:
# Load your data
queries, relevant_docs_per_query = load_queries_and_ground_truth(r'C:\Users\1176153\Downloads\github\Thesis\data\Preprocessing_text\retriever_evaluation_dataset.json')

In [13]:
queries

['What are the admission requirements for the Master Degree in Law and Financial Markets?',
 "What are the main professional opportunities of the Bachelor's degree in Information Systems?",
 "What is the study plan of the Master's in Data Science and advanced analytics?",
 'How many ECTS are required to complete the Postgraduate Program in Information Management and Business Intelligence in Healthcare?',
 "Are there any specializations offered in the Master's in Data Driven Marketing?",
 "What is the difference between the Bachelor's in Information Management and the Bachelor's in Information Systems?",
 'Can you give me the email of professor Bruno Jardim?',
 'Who is the program coordinator of the Postgraduate Program In Digital Transformation?',
 'Can you give me the email of professor Ivo Bernardo and Bruno Jardim?',
 'How is the curriculum for the Postgraduate Program in Business Intelligence?',
 "What are the main courses in the Master's Degree Program in Statistics and Informatio

In [14]:
relevant_docs_per_query

{'What are the admission requirements for the Master Degree in Law and Financial Markets?': {'044839f6-fe8a-4bc4-9d50-a0e52100df1',
  'cfbf2fed-1f0f-46d2-b009-0b90315ca27c'},
 "What are the main professional opportunities of the Bachelor's degree in Information Systems?": {'56bd04db-ab9d-48bc-8fe7-82583c83853d',
  '6970a629-331d-40d2-85c7-04b99c219040'},
 "What is the study plan of the Master's in Data Science and advanced analytics?": {'3800e187-b87c-4902-b7e9-9c49cfcf8e87',
  '8978c024-442f-4cbb-aa67-517182f84770',
  'c9cd0f7e-e2bd-4a07-b4aa-28498940635c'},
 'How many ECTS are required to complete the Postgraduate Program in Information Management and Business Intelligence in Healthcare?': {'20e1ecce-8ce2-41e5-af90-c630e65c5ee7'},
 "Are there any specializations offered in the Master's in Data Driven Marketing?": {'598a030a-8552-45c9-a3cc-6dc24ea9be0b',
  '98189b9d-52ba-4778-97cc-36046cf30161',
  '9b8c3451-1513-42f2-a478-fe7f50216dd6',
  'fa3a6814-7368-4ee5-a9cf-d4ac28e80004'},
 "Wha

# Evaluating TF-IDF retriever

## Running TF-IDF retriever

In [68]:
tfidf_retriever = dr.load_sparse_retriever(retriever_type="TF-IDF", documents_chunked=documents_chunked_with_ids, top_k=5)

In [69]:
# Run
retrieved_docs_tfidf = run_retrieval(tfidf_retriever, queries, top_k=5)

**Second precision approach that evaluates the k based on the k of the ground truth**

In [15]:
def precision_at_k(retrieved_ids: List[str], relevant_ids: Set[str], k: int) -> float:
    # Only consider the relevant documents in the ground truth
    retrieved_k = retrieved_ids[:k]
    relevant_retrieved = sum(1 for doc_id in retrieved_k if doc_id in relevant_ids)
    return relevant_retrieved / min(k, len(relevant_ids)) if len(relevant_ids) else 0.0

def recall_at_k(retrieved_ids: List[str], relevant_ids: Set[str], k: int) -> float:
    # If no relevant docs, return 0.0 for recall
    if not relevant_ids:
        return 0.0
    retrieved_k = retrieved_ids[:k]
    relevant_retrieved = sum(1 for doc_id in retrieved_k if doc_id in relevant_ids)
    return relevant_retrieved / len(relevant_ids)

def mrr_at_k(retrieved_ids: List[str], relevant_ids: Set[str], k: int) -> float:
    retrieved_k = retrieved_ids[:k]
    for idx, doc_id in enumerate(retrieved_k, start=1):
        if doc_id in relevant_ids:
            return 1.0 / idx
    return 0.0

# Updated Evaluation Loop with flexible ground-truth size
def evaluate_retrieval_system(
    retrieved_docs_per_query: Dict[str, List[Document]],  # <-- Now list of Documents
    relevant_docs_per_query: Dict[str, Set[str]],  # Set of relevant doc IDs
    k_values: List[int] = [5]
) -> Dict[str, Dict[int, float]]:
    """
    Args:
        retrieved_docs_per_query: mapping from query_id to list of retrieved Documents.
        relevant_docs_per_query: mapping from query_id to set of ground-truth relevant document IDs.
        k_values: list of cutoff values for Precision@K, Recall@K, MRR@K.

    Returns:
        metrics: nested dictionary with structure metrics[metric_name][k] = average_score
    """
    metrics = {
        "Precision@K": {k: [] for k in k_values},
        "Recall@K": {k: [] for k in k_values},
        "MRR@K": {k: [] for k in k_values},
    }

    for query_id, retrieved_docs in retrieved_docs_per_query.items():
        # Extract doc IDs from Documents
        retrieved_ids = [doc.metadata['id'] for doc in retrieved_docs]
        relevant_ids = relevant_docs_per_query.get(query_id, set())
        
        for k in k_values:
            precision = precision_at_k(retrieved_ids, relevant_ids, k)
            recall = recall_at_k(retrieved_ids, relevant_ids, k)
            mrr = mrr_at_k(retrieved_ids, relevant_ids, k)

            metrics["Precision@K"][k].append(precision)
            metrics["Recall@K"][k].append(recall)
            metrics["MRR@K"][k].append(mrr)

    # Aggregate (mean)
    averaged_metrics = {
        metric: {k: np.mean(values) if values else 0.0 for k, values in ks.items()}
        for metric, ks in metrics.items()
    }
    
    return averaged_metrics


In [70]:
# Evaluate
evaluation_tfidf = evaluate_retrieval_system(retrieved_docs_tfidf, relevant_docs_per_query, k_values=[5])
evaluation_tfidf

{'Precision@K': {5: np.float64(0.36666666666666664)},
 'Recall@K': {5: np.float64(0.3364285714285714)},
 'MRR@K': {5: np.float64(0.3208333333333333)}}

# Evaluation BM25 retriever

In [71]:
bm25_retriever = dr.load_sparse_retriever(retriever_type="BM25", documents_chunked=documents_chunked_with_ids, top_k=5)

In [72]:
# Run
retrieved_docs_bm25 = run_retrieval(bm25_retriever, queries, top_k=5)

In [73]:
# Evaluate
evaluation_bm25 = evaluate_retrieval_system(retrieved_docs_bm25, relevant_docs_per_query, k_values=[5])
evaluation_bm25

{'Precision@K': {5: np.float64(0.24)},
 'Recall@K': {5: np.float64(0.22666666666666666)},
 'MRR@K': {5: np.float64(0.2791666666666667)}}

# Evaluating Dense retriever

In [74]:
from langchain.embeddings import HuggingFaceEmbeddings
import torch

embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={"normalize_embeddings": True}  # Important for BGE
)


In [75]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="documents_with_ids_without_metadata_embedded_in_pagecontent",
    embedding_function=embedding_model,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [23]:
ids = vector_store.add_documents(documents=documents_chunked_with_ids)

In [76]:
chroma_retriever = dr.load_vector_retriever(collection_name="documents_with_ids_without_metadata_embedded_in_pagecontent", top_k=5)

In [77]:
# Run
retrieved_docs_chroma = run_retrieval(chroma_retriever, queries, top_k=5)

In [78]:
# Evaluate
evaluation_chroma = evaluate_retrieval_system(retrieved_docs_chroma, relevant_docs_per_query, k_values=[5])
evaluation_chroma

{'Precision@K': {5: np.float64(0.2975)},
 'Recall@K': {5: np.float64(0.2808333333333333)},
 'MRR@K': {5: np.float64(0.3583333333333333)}}

# Evaluating Hybrid retriever

In [79]:
tfidf_retriever = dr.load_sparse_retriever(retriever_type="TF-IDF", documents_chunked=documents_chunked_with_ids, top_k=6)
bm25_retriever = dr.load_sparse_retriever(retriever_type="BM25", documents_chunked=documents_chunked_with_ids, top_k=6)
chroma_retriever = dr.load_vector_retriever(collection_name="documents_with_ids_without_metadata_embedded_in_pagecontent", top_k=6)

In [80]:
hybrid_retriever = dr.load_hybrid_retriever(tfidf_retriever, chroma_retriever, weight_sparse=0.5, weight_vector=0.5)

In [81]:
# Run
retrieved_docs_hybrid = run_retrieval(hybrid_retriever, queries)

In [82]:
# Evaluate
evaluation_hybrid = evaluate_retrieval_system(retrieved_docs_hybrid, relevant_docs_per_query, k_values=[5])
evaluation_hybrid

{'Precision@K': {5: np.float64(0.39416666666666667)},
 'Recall@K': {5: np.float64(0.36464285714285716)},
 'MRR@K': {5: np.float64(0.3208333333333333)}}

In [48]:
# Iterate over the retrieval results dictionary
for query_text, documents in retrieved_docs_hybrid.items():
    # Print the query text (this is now the key in the retrieval_results dictionary)
    print(f"Query: {query_text}")
    
    # Iterate over the documents related to this query
    for idx, doc in enumerate(documents, 1):
        print(f"Document {idx}:")
        
        # Print the ID of the document
        print(f"ID: {doc.metadata['id']}")
        
        # Print the page content in chunks of 100 characters
        print("Page Content:")
        chunk_size = 100
        for i in range(0, len(doc.page_content), chunk_size):
            print(doc.page_content[i:i+chunk_size])  # Print each 100-character chunk
        
        # If metadata is available, print it as well
        if doc.metadata:
            print("Metadata:")
            for key, value in doc.metadata.items():
                print(f"  {key}: {value}")
        
        print("\n" + "-"*50 + "\n")  # Separator between documents


Query: What are the admission requirements for the Master Degree in Law and Financial Markets?
Document 1:
ID: b4a7a86b-4421-4dec-ad3e-cdb45d65e572
Page Content:
Master Degree in Law and Financial Markets Master Degree in Law and Financial Markets Consistently r
anked by Eduniversal among the top master's courses in Western Europe , the Master Degree in Law and
 Financial Markets is a product of an innovative partnership joining together NOVA Information Manag
ement School and NOVA School of Law and it is fully taught in English
Metadata:
  source: master-degree-in-law-and-financial-markets_main_course.txt
  degree: masters
  doc_type: main_info
  course_name: Master Degree In Law And Financial Markets
  id: b4a7a86b-4421-4dec-ad3e-cdb45d65e572

--------------------------------------------------

Document 2:
ID: 044839f6-fe8a-4bc4-9d50-a0e52100df1c
Page Content:
. Learn more Who is it for? The Master's Degree in Law and Financial Markets is presented as a tool 
to deepen the knowledge 

# Evaluating Hybrid Reranking retriever

In [83]:
hybrid_retriever_reranking = dr.get_reranking(hybrid_retriever, top_n=4)

In [84]:
# Run
retrieved_docs_hybrid_reranking = run_retrieval(hybrid_retriever_reranking, queries, top_k=5)

In [85]:
# Evaluate
evaluation_hybrid_reraking = evaluate_retrieval_system(retrieved_docs_hybrid_reranking, relevant_docs_per_query, k_values=[5])
evaluation_hybrid_reraking

{'Precision@K': {5: np.float64(0.4141666666666667)},
 'Recall@K': {5: np.float64(0.3789285714285714)},
 'MRR@K': {5: np.float64(0.4625)}}

In [33]:
retrieved_docs_hybrid_reranking

{'What are the admission requirements for the Master Degree in Law and Financial Markets?': [Document(metadata={'course_name': 'Postgraduate Program In Data Science For Finance', 'degree': 'postgraduate', 'doc_type': 'main_info', 'id': '0079105f-6597-496b-b193-791a6cded9c2', 'source': 'postgraduate-program-in-data-science-for-finance_main_course.txt'}, page_content='. Candidates with a first or upper second Class honors degree (or international equivalent) from a recognized university in a highly quantitative subject such as economics, mathematics, statistics, engineering, computation, science, or management pursuing a successful career in finance are the main target of this . Work experience is not essential, but you are strongly recommended to undertake relevant internships and to share your expertise in class'),
  Document(metadata={'source': 'master-degree-in-law-and-financial-markets_main_course.txt', 'degree': 'masters', 'doc_type': 'main_info', 'course_name': 'Master Degree In L

In [35]:
relevant_docs_per_query

{'What are the admission requirements for the Master Degree in Law and Financial Markets?': {'33d4937c-9329-46dd-b548-ff886c002a79',
  '384ce944-0324-4cb4-9c0c-c05872ad5f0f'},
 "What are the main professional opportunities of the Bachelor's degree in Information Systems?": {'6970a629-331d-40d2-85c7-04b99c219040'},
 "What is the study plan of the Master's in Data Science and advanced analytics?": {'3800e187-b87c-4902-b7e9-9c49cfcf8e87',
  '8978c024-442f-4cbb-aa67-517182f84770',
  'c9cd0f7e-e2bd-4a07-b4aa-28498940635c'},
 'How many credits are required to complete the Postgraduate Program in Information Management and Business Intelligence in Healthcare?': {'2fad2712-627c-4cbe-8d04-f8abcefdb6e8'},
 "Are there any specializations offered in the Master's in Data Driven Marketing?": {'598a030a-8552-45c9-a3cc-6dc24ea9be0b'},
 "What is the difference between the Bachelor's in Information Management and the Bachelor's in Information Systems?": {'1d26bc64-2e81-49de-a949-3f9cbcbe8f64',
  '2483d1

# IMPROVING DOCUMENT CHUNKS WITH EMBEDDED METADATA

In [3]:
documents_chunked_with_ids_and_metadata = dh.load_documents_from_pickle(dc.DOCUMENTS_CHUNKED_WITH_IDS_AND_METADATA)
documents_chunked_with_ids_and_metadata

✅ Loaded 1017 documents from ..\..\data\Preprocessing_text\all_programs_chunked\documents_chunks_with_ids_and_metadata_embedded.pkl


[Document(id='07185832-3c4f-493a-b1be-db87a61f89a2', metadata={'source': 'bachelor_data-science_teaching-staff_text.txt', 'degree': 'bachelor', 'doc_type': 'teaching_staff', 'course_name': 'Data Science'}, page_content="Data Science Teaching Staff en Programs Bachelor's\nDegrees Data Science Teaching Staff Américo Rio Invited Assistant Professor\namerico.rio@novaims.unl.pt\n\nAna Cristina\nCosta Associate Professor\ncristina@novaims.unl.pt\n\nArtur Varanda\nAdjunct Lecturer\navaranda@novaims.unl.pt\n\nAugusto Santos\nAssistant Professor\najrsantos@novaims.unl.pt\n\nBruno Damásio\nAssistant Professor\nbdamasio@novaims.unl.pt\n\nCarina Albuquerque\nAssistant Professor\ncalbuquerque@novaims.unl.pt\n\nCarolina Maria\nShaul Adjunct Lecturer\ncshaul@novaims.unl.pt\n\nCarolina Santos\nMaximiano Adjunct Lecturer\ncmaximiano@novaims.unl.pt\n\nCarolina Vasconcelos\nInvited Teaching Assistant\ncvasconcelos@novaims.unl.pt\n\nCatarina Neves\nAssistant Professor\ncneves@novaims.unl.pt\n\nCatarina Pa

## Adding id to metadata field

In [7]:
def move_id_to_metadata(documents):
    for doc in documents:
        if hasattr(doc, 'id') and doc.id is not None:
            doc.metadata['id'] = doc.id
    return documents

In [8]:
documents_chunked_with_ids_and_metadata = move_id_to_metadata(documents_chunked_with_ids_and_metadata)
documents_chunked_with_ids_and_metadata

[Document(id='07185832-3c4f-493a-b1be-db87a61f89a2', metadata={'source': 'bachelor_data-science_teaching-staff_text.txt', 'degree': 'bachelor', 'doc_type': 'teaching_staff', 'course_name': 'Data Science', 'id': '07185832-3c4f-493a-b1be-db87a61f89a2'}, page_content="Data Science Teaching Staff en Programs Bachelor's\nDegrees Data Science Teaching Staff Américo Rio Invited Assistant Professor\namerico.rio@novaims.unl.pt\n\nAna Cristina\nCosta Associate Professor\ncristina@novaims.unl.pt\n\nArtur Varanda\nAdjunct Lecturer\navaranda@novaims.unl.pt\n\nAugusto Santos\nAssistant Professor\najrsantos@novaims.unl.pt\n\nBruno Damásio\nAssistant Professor\nbdamasio@novaims.unl.pt\n\nCarina Albuquerque\nAssistant Professor\ncalbuquerque@novaims.unl.pt\n\nCarolina Maria\nShaul Adjunct Lecturer\ncshaul@novaims.unl.pt\n\nCarolina Santos\nMaximiano Adjunct Lecturer\ncmaximiano@novaims.unl.pt\n\nCarolina Vasconcelos\nInvited Teaching Assistant\ncvasconcelos@novaims.unl.pt\n\nCatarina Neves\nAssistant P

## Evaluating TF-IDF retriever

In [16]:
tfidf_retriever = dr.load_sparse_retriever(retriever_type="TF-IDF", documents_chunked=documents_chunked_with_ids_and_metadata, top_k=5)

In [17]:
retrieved_docs_tfidf = run_retrieval(tfidf_retriever, queries, top_k=5)

In [18]:
# Evaluate
evaluation_tfidf = evaluate_retrieval_system(retrieved_docs_tfidf, relevant_docs_per_query, k_values=[5])
evaluation_tfidf

{'Precision@K': {5: np.float64(0.3583333333333333)},
 'Recall@K': {5: np.float64(0.32476190476190475)},
 'MRR@K': {5: np.float64(0.3)}}

## Evaluating BM25 retriever

In [19]:
bm25_retriever = dr.load_sparse_retriever(retriever_type="BM25", documents_chunked=documents_chunked_with_ids_and_metadata, top_k=5)

In [20]:
# Run
retrieved_docs_bm25 = run_retrieval(bm25_retriever, queries, top_k=5)

In [21]:
# Evaluate
evaluation_bm25 = evaluate_retrieval_system(retrieved_docs_bm25, relevant_docs_per_query, k_values=[5])
evaluation_bm25

{'Precision@K': {5: np.float64(0.25166666666666665)},
 'Recall@K': {5: np.float64(0.23380952380952377)},
 'MRR@K': {5: np.float64(0.2683333333333333)}}

## Evaluating DENSE retriever

In [22]:
from langchain.embeddings import HuggingFaceEmbeddings
import torch

embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={"normalize_embeddings": True}  # Important for BGE
)


  embedding_model = HuggingFaceEmbeddings(


In [23]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="documents_with_ids_and_metadata_embedded_in_pagecontent",
    embedding_function=embedding_model,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [24]:
ids = vector_store.add_documents(documents=documents_chunked_with_ids_and_metadata)

In [25]:
chroma_retriever = dr.load_vector_retriever(collection_name="documents_with_ids_and_metadata_embedded_in_pagecontent", top_k=5)

  vector_store = Chroma(


In [26]:
# Run
retrieved_docs_chroma = run_retrieval(chroma_retriever, queries, top_k=5)

In [27]:
# Evaluate
evaluation_chroma = evaluate_retrieval_system(retrieved_docs_chroma, relevant_docs_per_query, k_values=[5])
evaluation_chroma

{'Precision@K': {5: np.float64(0.30333333333333334)},
 'Recall@K': {5: np.float64(0.28500000000000003)},
 'MRR@K': {5: np.float64(0.3583333333333333)}}

## Evaluating HYBRID retriever

In [56]:
hybrid_retriever = dr.load_hybrid_retriever(tfidf_retriever, chroma_retriever, weight_sparse=0.5, weight_vector=0.5)

In [57]:
# Run
retrieved_docs_hybrid = run_retrieval(hybrid_retriever, queries)

In [58]:
# Evaluate
evaluation_hybrid = evaluate_retrieval_system(retrieved_docs_hybrid, relevant_docs_per_query, k_values=[5])
evaluation_hybrid

{'Precision@K': {5: np.float64(0.3816666666666667)},
 'Recall@K': {5: np.float64(0.3542857142857143)},
 'MRR@K': {5: np.float64(0.34750000000000003)}}

## Evaluating HYBRID RERAKING retriever

In [64]:
hybrid_retriever_reranking = dr.get_reranking(hybrid_retriever, top_n=4)

In [65]:
# Run
retrieved_docs_hybrid_reranking = run_retrieval(hybrid_retriever_reranking, queries, top_k=4)

In [66]:
# Evaluate
evaluation_hybrid_reraking = evaluate_retrieval_system(retrieved_docs_hybrid_reranking, relevant_docs_per_query, k_values=[5])
evaluation_hybrid_reraking

{'Precision@K': {5: np.float64(0.4083333333333333)},
 'Recall@K': {5: np.float64(0.37476190476190474)},
 'MRR@K': {5: np.float64(0.425)}}

In [63]:
# Iterate over the retrieval results dictionary
for query_text, documents in retrieved_docs_hybrid_reranking.items():
    # Print the query text (this is now the key in the retrieval_results dictionary)
    print(f"Query: {query_text}")
    
    # Iterate over the documents related to this query
    for idx, doc in enumerate(documents, 1):
        print(f"Document {idx}:")
        
        # Print the ID of the document
        print(f"ID: {doc.metadata['id']}")
        
        # Print the page content in chunks of 100 characters
        print("Page Content:")
        chunk_size = 100
        for i in range(0, len(doc.page_content), chunk_size):
            print(doc.page_content[i:i+chunk_size])  # Print each 100-character chunk
        
        # If metadata is available, print it as well
        if doc.metadata:
            print("Metadata:")
            for key, value in doc.metadata.items():
                print(f"  {key}: {value}")
        
        print("\n" + "-"*50 + "\n")  # Separator between documents

Query: What are the admission requirements for the Master Degree in Law and Financial Markets?
Document 1:
ID: 044839f6-fe8a-4bc4-9d50-a0e52100df1c
Page Content:
Master Degree In Law And Financial Markets . Learn more Who is it for? The Master's Degree in Law an
d Financial Markets is presented as a tool to deepen the knowledge of those who already hold a law d
egree. Therefore, holders of a bachelor's degree (or equivalent) in Law will be given preference in 
admission. This aims to provide Law graduates a comprehensive and specialized training that enables 
them to enter into legal research, to embrace a legal profession in banking, insurance or capital ma
rkets, or practicing Law at a renowned law firm
Metadata:
  source: master-degree-in-law-and-financial-markets_main_course.txt
  degree: masters
  doc_type: main_info
  course_name: Master Degree In Law And Financial Markets
  id: 044839f6-fe8a-4bc4-9d50-a0e52100df1c

--------------------------------------------------

Document 2:
