In [1]:
import os

# Step 1: climb two directory levels from wherever the notebook was started,
# which is expected to land in the Thesis root.
os.chdir(os.path.join(os.pardir, os.pardir))

# Step 2: if the repository contains model/src, work from there
# (the `libs` package imported in the next cell presumably lives there — confirm).
model_dir = os.path.join(os.getcwd(), "model", "src")
if os.path.exists(model_dir):
    os.chdir(model_dir)

print("Current Directory:", os.getcwd())

Current Directory: c:\Users\1176153\Downloads\github\Thesis\model\src


In [2]:
from libs import data_handeling as dh
from libs.settings import data_catalog as dc
from libs import data_retrievers as dr
from langchain.vectorstores import Chroma

In [3]:
documents_chunked_with_ids = dh.load_documents_from_pickle(dc.DOCUMENTS_CHUNKED_WITH_IDS)
documents_chunked_with_ids

✅ Loaded 383 documents from ..\..\data\Preprocessing_text\all_programs_chunked\documents_chunks_with_ids_without_metadata_embedded.pkl


[Document(id='55edd98c-cb22-4814-b1e3-6045090824ce', metadata={'source': 'bachelor_data-science_teaching-staff_text.txt', 'degree': 'bachelor', 'doc_type': 'teaching_staff', 'course_name': 'Data Science'}, page_content='Teaching Staff\nAmérico Rio Invited Assistant Professor\namerico.rio@novaims.unl.pt\n\nAna Cristina\nCosta Associate Professor\ncristina@novaims.unl.pt\n\nArtur Varanda\nAdjunct Lecturer\navaranda@novaims.unl.pt\n\nAugusto Santos\nAssistant Professor\najrsantos@novaims.unl.pt\n\nBruno Damásio\nAssistant Professor\nbdamasio@novaims.unl.pt\n\nCarina Albuquerque\nAssistant Professor\ncalbuquerque@novaims.unl.pt\n\nCarolina Maria\nShaul Adjunct Lecturer\ncshaul@novaims.unl.pt\n\nCarolina Santos\nMaximiano Adjunct Lecturer\ncmaximiano@novaims.unl.pt\n\nCarolina Vasconcelos\nInvited Teaching Assistant\ncvasconcelos@novaims.unl.pt\n\nCatarina Neves\nAssistant Professor\ncneves@novaims.unl.pt\n\nCatarina Palha\nInvited Teaching Assistant\ncpalha@novaims.unl.pt\n\nDhruv Akshay\n

# Printing text content from original documents

In [3]:
from typing import List, Optional
from langchain_core.documents import Document

def print_text_context_from_program_documents(
    data: List[Document],  # Changed to accept a list of Document objects
    course_names_to_include: Optional[List[str]] = None,
    doc_types_to_include: Optional[List[str]] = None,
    words_per_chunk: int = 10  # Added parameter to control the number of words per chunk
):
    """
    Print the text content of the documents that pass the given filters, one block per document.

    Each block shows the document ID, course name and document type, followed by the page
    content re-flowed into lines of at most `words_per_chunk` whitespace-separated words.

    Parameters:
    - data (list): Document objects exposing `metadata` (dict) and `page_content` (str).
      The ID is read from the `id` attribute and falls back to `metadata['id']` when the
      attribute is missing or None.
    - course_names_to_include (list, optional): Keep only these course names (case insensitive).
      None or an empty list disables the filter.
    - doc_types_to_include (list, optional): Keep only these document types (case insensitive),
      e.g. 'teaching_staff', 'study_plan', 'main_info'. None or an empty list disables the filter.
    - words_per_chunk (int, optional): Number of words printed per line (default is 10).

    Raises:
    - ValueError: if `words_per_chunk` is not a positive number.
    """
    # Fail early with a clear message: a step of 0 makes range() raise a cryptic error,
    # and a negative step would silently print no content at all.
    if words_per_chunk <= 0:
        raise ValueError(f"words_per_chunk must be a positive integer, got {words_per_chunk}")

    # Lower-case the filters once instead of once per document.
    wanted_courses = (
        {name.lower() for name in course_names_to_include} if course_names_to_include else None
    )
    wanted_doc_types = (
        {dt.lower() for dt in doc_types_to_include} if doc_types_to_include else None
    )

    filtered_data = []

    for doc in data:
        # `or ""` also covers metadata keys that exist but hold None.
        course_name = str(doc.metadata.get("course_name") or "").lower()
        doc_type = str(doc.metadata.get("doc_type") or "").lower()

        # Filter by course name
        if wanted_courses is not None and course_name not in wanted_courses:
            continue

        # Filter by doc type
        if wanted_doc_types is not None and doc_type not in wanted_doc_types:
            continue

        filtered_data.append(doc)

    if not filtered_data:
        print("⚠️ No documents matched the filters.")
        return

    # Print the text content of the filtered documents in chunks of words
    for doc in filtered_data:
        # Prefer the `id` attribute; fall back to the copy stored in the metadata.
        doc_id = getattr(doc, "id", None)
        if doc_id is None:
            doc_id = doc.metadata.get("id")

        print(f"\n--- Document ID: {doc_id} ---")
        print(f"Course Name: {doc.metadata.get('course_name')}")
        print(f"Document Type: {doc.metadata.get('doc_type')}")
        print("\nText Content:")

        # Split on any whitespace, then emit `words_per_chunk` words per line
        words = doc.page_content.split()
        for i in range(0, len(words), words_per_chunk):
            print(" ".join(words[i:i + words_per_chunk]))

        print("\n" + "-"*50)




In [34]:
# NOTE: `documents_chunked_with_ids_and_metadata` is loaded in the
# "IMPROVING DOCUMENT CHUNKS WITH EMBEDDED METADATA" section further down;
# that load cell has to be run before this one.
print_text_context_from_program_documents(
    data=documents_chunked_with_ids_and_metadata,
    course_names_to_include=['Master Degree Program In Data Science And Advanced Analytics With A Specialization In Data Science'],  # Pass as a list
    doc_types_to_include=['study_plan']  # Pass as a list
)


--- Document ID: d0066a77-46b7-4077-be1c-0326307ada22 ---
Course Name: Master Degree Program In Data Science And Advanced Analytics With A Specialization In Data Science
Document Type: study_plan

Text Content:
Master Degree Program In Data Science And Advanced Analytics With
A Specialization In Data Science masters 1 st year -
Fall Semester Course Units Type Duration ECTS Data Mining Mandatory
Semester 7,5 Machine Learning Mandatory Semester 7,5 Programming for Data
Science Mandatory Quarterly 3,5 Statistics for Data Science Mandatory Semester
7,5 Storing and Retrieving Data Mandatory Quarterly 4Keywords: curriculum, syllabus,
program overview, academic plan, course structure, degree requirements, credit distribution,
module list, subject breakdown, learning outcomes, ECTS allocation, semester planning,
course roadmap, educational objectives, program outline, instructional content, course progression,
academic curriculum, study track, course catalog, study plan

---------------------

In [48]:
tfidf_retriever = dr.load_sparse_retriever(retriever_type="TF-IDF", documents_chunked=documents_chunked_with_ids, top_k=4)
query = "Postgraduate Program in Enterprise Data Science & Analytics"
print(tfidf_retriever.invoke(query))

[Document(metadata={'source': 'postgraduate-program-enterprise-data-science-analytics_main_course.txt', 'degree': 'postgraduate', 'doc_type': 'main_info', 'course_name': 'Postgraduate Program Enterprise Data Science Analytics'}, page_content='Postgraduate in Enterprise Data Science & Analytics Postgraduate Enterprise Data Science & Analytics Developed in partnership with Microsoft, the Postgraduate in Enterprise Data Science & Analytics will present the methodologies and tools that will transform data into information, on which enterprises can base strategic information on entering new markets, launching new product or service lines, optimizing processes, transforming business models and, generally, competing in a market increasingly driven by data-driven decisions'), Document(metadata={'source': 'master-degree-program-in-data-science-and-advanced-analytics-with-a-specialization-in-data-science_main_course.txt', 'degree': 'masters', 'doc_type': 'main_info', 'course_name': 'Master Degre

In [32]:
bm25_retriever = dr.load_sparse_retriever(retriever_type="BM25", documents_chunked=documents_chunked_with_ids, top_k=5)
chroma_retriever = dr.load_vector_retriever(collection_name="documents_without_metadata_embedded", top_k=5)

hybrid_retriever = dr.load_hybrid_retriever(bm25_retriever, chroma_retriever, weight_sparse=0.5, weight_vector=0.5)

  embedding_model = HuggingFaceEmbeddings(
  vector_store = Chroma(


In [None]:
query = "bruno jardim"

# Perform the retrieval
retrieval_results = hybrid_retriever.invoke(query)

# Print every retrieved document vertically, including all attributes
print("Query:", query)
chunk_size = 100  # number of characters of page content printed per line
for idx, doc in enumerate(retrieval_results, 1):
    print(f"Document {idx}:")

    # Retrieved documents can come back with `id` set to None, so fall back to
    # the copy of the id stored in the metadata (when one has been saved there).
    doc_id = getattr(doc, "id", None)
    if doc_id is None:
        doc_id = doc.metadata.get("id")
    print(f"ID: {doc_id}")

    # Print the page content in slices of `chunk_size` characters
    print("Page Content:")
    for i in range(0, len(doc.page_content), chunk_size):
        print(doc.page_content[i:i+chunk_size])

    # If metadata is available, print it as well
    if doc.metadata:
        print("Metadata:")
        for key, value in doc.metadata.items():
            print(f"  {key}: {value}")

    print("\n" + "-"*50 + "\n")  # Separator between documents


Query: bruno jardim
Document 1:
ID: None
Page Content:
Master Degree In Data Driven Marketing With A Specialization In Digital Marketing And Analytics Work
ing Hours Format masters Faculty Afshin
Ashofteh Assistant Professor
aashofteh@novaims.unl.pt

Alexa
ndra Variz
Adjunct Lecturer
avariz@novaims.unl.pt

Américo Rio
Invited Assistant Professor
americo.r
io@novaims.unl.pt

Ana Cristina
Costa Associate Professor
cristina@novaims.unl.pt

Ana Edmundo
Invit
ed Teaching Assistant
aedmundo@novaims.unl.pt

Ana Gonçalves
Research Assistant
agoncalves@novaims.u
nl.pt

Ana Pena
Adjunct Lecturer
apena@novaims.unl.pt

André Barriguinha
Professor of the Practice
a
barriguinha@novaims.unl.pt

Augusto Santos
Assistant Professor
ajrsantos@novaims.unl.pt

Bruno Amara
l
Adjunct Lecturer
bamaral@novaims.unl.pt

Bruno Damásio
Assistant Professor
bdamasio@novaims.unl.pt


Bruno Jardim
Assistant Professor
bjardim@novaims.unl.pt

Bruno Nunes
Adjunct Lecturer
bbras@novaim
s.unl.pt

Carlos Bispo
Adjunct Lectu

# Setting up the evaluator

In [4]:
from typing import List, Set, Dict, Union
import numpy as np
from langchain_core.documents import Document

# Helper function
def extract_doc_id(doc: Document) -> str:
    """Return the document's ID, preferring `metadata['id']` and falling back to `doc.id`.

    The fallback covers documents whose id has not been copied into the metadata yet.
    Returns an empty string when neither location holds an id.
    """
    doc_id = doc.metadata.get("id")
    if doc_id is None:
        doc_id = getattr(doc, "id", None)
    return "" if doc_id is None else doc_id

def precision_at_k(retrieved_ids: List[str], relevant_ids: Set[str], k: int) -> float:
    """
    Share of the achievable hits found among the first `k` retrieved IDs.

    The denominator is min(k, number of relevant IDs) rather than `k`, so a query
    with fewer than `k` relevant documents can still reach 1.0. This is a capped
    variant and differs from textbook Precision@K (hits / k).

    Returns 0.0 when there are no relevant IDs or when `k` is not positive
    (instead of dividing by zero or producing a negative score).
    """
    if k <= 0 or not relevant_ids:
        return 0.0

    retrieved_k = retrieved_ids[:k]
    relevant_retrieved = sum(1 for doc_id in retrieved_k if doc_id in relevant_ids)
    return relevant_retrieved / min(k, len(relevant_ids))

def recall_at_k(retrieved_ids: List[str], relevant_ids: Set[str], k: int) -> float:
    """Hits among the first `k` retrieved IDs divided by the number of relevant IDs.

    Yields 0.0 when `relevant_ids` is empty.
    """
    if relevant_ids:
        hits = len([doc_id for doc_id in retrieved_ids[:k] if doc_id in relevant_ids])
        return hits / len(relevant_ids)
    return 0.0

def mrr_at_k(retrieved_ids: List[str], relevant_ids: Set[str], k: int) -> float:
    """Reciprocal of the 1-based rank of the first relevant ID within the top `k`.

    Returns 0.0 when none of the first `k` retrieved IDs is relevant.
    """
    hit_ranks = (
        rank
        for rank, doc_id in enumerate(retrieved_ids[:k], start=1)
        if doc_id in relevant_ids
    )
    first_hit = next(hit_ranks, None)
    return 0.0 if first_hit is None else 1.0 / first_hit

# Updated Evaluation Loop with flexible ground-truth size
def evaluate_retrieval_system(
    retrieved_docs_per_query: Dict[str, List[Document]],  # <-- Now list of Documents
    relevant_docs_per_query: Dict[str, Set[str]],  # Set of relevant doc IDs
    k_values: List[int] = [5]
) -> Dict[str, Dict[int, float]]:
    """
    Average Precision@K, Recall@K and MRR@K over all queries.

    Args:
        retrieved_docs_per_query: mapping from query_id to list of retrieved Documents.
            Each document's ID is read from `metadata['id']`, falling back to the
            `id` attribute.
        relevant_docs_per_query: mapping from query_id to set of ground-truth relevant
            document IDs. A query missing from this mapping is scored against an empty
            set, so it contributes 0.0 to every metric.
        k_values: list of cutoff values for Precision@K, Recall@K, MRR@K.

    Returns:
        metrics: nested dictionary with structure metrics[metric_name][k] = average_score
        (plain Python floats; 0.0 when there were no queries).

    Raises:
        KeyError: if a retrieved document carries no ID in either location.
    """
    # Work on a private copy so the shared default list is never exposed or mutated.
    cutoffs = list(k_values)

    scorers = {
        "Precision@K": precision_at_k,
        "Recall@K": recall_at_k,
        "MRR@K": mrr_at_k,
    }
    per_query_scores = {metric: {k: [] for k in cutoffs} for metric in scorers}

    for query_id, retrieved_docs in retrieved_docs_per_query.items():
        # Extract doc IDs from Documents
        retrieved_ids = []
        for doc in retrieved_docs:
            doc_id = doc.metadata.get("id")
            if doc_id is None:
                doc_id = getattr(doc, "id", None)
            if doc_id is None:
                # Same exception type as the plain metadata['id'] lookup, but with context.
                raise KeyError(
                    f"Retrieved document for query {query_id!r} has no 'id' in its "
                    "metadata and no `id` attribute"
                )
            retrieved_ids.append(doc_id)

        relevant_ids = relevant_docs_per_query.get(query_id, set())

        for k in cutoffs:
            for metric, scorer in scorers.items():
                per_query_scores[metric][k].append(scorer(retrieved_ids, relevant_ids, k))

    # Aggregate (mean); float() keeps the result in line with the annotated return type.
    averaged_metrics = {
        metric: {k: float(np.mean(values)) if values else 0.0 for k, values in ks.items()}
        for metric, ks in per_query_scores.items()
    }

    return averaged_metrics


In [5]:
import json
from typing import Dict, Set

def load_ground_truth(filepath: str) -> Dict[str, Set[str]]:
    """
    Read the evaluation JSON file and build the query -> relevant-IDs lookup.

    The file must hold an "evaluation_retrievers_dataset" list whose items each
    provide a "query" string and a "relevant_docs" list. If the same query occurs
    more than once, the last occurrence wins.

    Args:
        filepath: Path to the JSON file.

    Returns:
        Dictionary mapping from query string to set of relevant document IDs.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    return {
        entry["query"]: set(entry["relevant_docs"])
        for entry in payload["evaluation_retrievers_dataset"]
    }



## Problem: documents have an `id` attribute, but the documents returned by the retrievers do not carry it, so we need to save the id inside the metadata:

In [7]:
def move_id_to_metadata(documents):
    """Copy each document's non-None `id` attribute into `metadata['id']`, in place.

    Documents without an `id` attribute, or whose `id` is None, are left untouched.
    Returns the same `documents` object that was passed in.
    """
    docs_with_id = (doc for doc in documents if getattr(doc, 'id', None) is not None)
    for doc in docs_with_id:
        doc.metadata['id'] = doc.id
    return documents

In [8]:
documents_chunked_with_ids = move_id_to_metadata(documents_chunked_with_ids)
documents_chunked_with_ids

[Document(id='db01edcd-03fa-4da0-8b70-be229accc8c9', metadata={'source': 'bachelor_data-science_teaching-staff_text.txt', 'degree': 'bachelor', 'doc_type': 'teaching_staff', 'course_name': 'Data Science', 'id': 'db01edcd-03fa-4da0-8b70-be229accc8c9'}, page_content="Teaching Staff en Programs Bachelor's\nDegrees Data Science Teaching Staff Américo Rio Invited Assistant Professor\namerico.rio@novaims.unl.pt\n\nAna Cristina\nCosta Associate Professor\ncristina@novaims.unl.pt\n\nArtur Varanda\nAdjunct Lecturer\navaranda@novaims.unl.pt\n\nAugusto Santos\nAssistant Professor\najrsantos@novaims.unl.pt\n\nBruno Damásio\nAssistant Professor\nbdamasio@novaims.unl.pt\n\nCarina Albuquerque\nAssistant Professor\ncalbuquerque@novaims.unl.pt\n\nCarolina Maria\nShaul Adjunct Lecturer\ncshaul@novaims.unl.pt\n\nCarolina Santos\nMaximiano Adjunct Lecturer\ncmaximiano@novaims.unl.pt\n\nCarolina Vasconcelos\nInvited Teaching Assistant\ncvasconcelos@novaims.unl.pt\n\nCatarina Neves\nAssistant Professor\ncne

## Load the evaluation dataset

In [5]:
# Resolve the dataset relative to the working directory set in the first cell
# (…/Thesis/model/src) instead of a user-specific absolute path, so the notebook
# also runs on other machines.
groundthruth_dataset = load_ground_truth(
    os.path.join("..", "..", "data", "Preprocessing_text", "retriever_evaluation_dataset.json")
)
groundthruth_dataset

{'What are the admission requirements for the Master Degree in Law and Financial Markets?': {'f2f8d7b5-6137-462c-967c-75e4e0dbbd82'},
 "What are the main professional opportunities of the Bachelor's degree in Information Systems?": {'9f473cfb-b50a-4a6e-8d5a-2c33bc5fa786'},
 "What are the course units of the Fall semesters for the Master's in Data Science and advanced analytics with specialization in Data Science?": {'d0066a77-46b7-4077-be1c-0326307ada22'},
 'How many ECTS are required to complete the Postgraduate Program in Information Management and Business Intelligence in Healthcare?': {'cc6bddeb-c3de-4521-a747-d44f3a8d7a08'},
 "Who is the Coordinator of the Master's in Data Driven Marketing With A Specialization In Data Science For Marketing Working?": {'bd1a7274-1c41-46d7-abd3-1bf3244f95d2'},
 "What is the difference between the Bachelor's in Information Management and the Bachelor's in Information Systems?": {'13dd9632-4c91-4920-a8a5-0a8015f6851a',
  '1ea6b17e-478b-41d9-8b33-8c31

## Run retrievers

In [6]:
from typing import List, Dict, Union
from langchain_core.documents import Document

def run_retrieval(
    retriever: object,
    queries: List[str],
    top_k: int = 5
) -> Dict[str, List[Document]]:
    """
    Runs a retriever over a list of queries.

    Args:
        retriever: A retriever object that supports .invoke().
        queries: A list of query strings. Repeated queries collapse into one entry
            (the result of the last run is kept).
        top_k: Number of documents to retrieve per query. Forwarded as a keyword
            argument only when the retriever's `invoke` signature can accept it
            (an explicit `top_k` parameter or `**kwargs`).

    Returns:
        Dict mapping each query string to its list of retrieved Documents.
    """
    import inspect  # local import keeps this cell independent of other import cells

    # Decide once, from the signature, whether `invoke` can take `top_k`.
    # Catching TypeError around the call itself would also hide genuine TypeErrors
    # raised inside the retriever and silently run the query a second time.
    try:
        invoke_params = inspect.signature(getattr(retriever, "invoke", None)).parameters.values()
    except (TypeError, ValueError):
        # Signature cannot be introspected: use the plain call.
        invoke_params = ()

    accepts_top_k = any(
        param.kind is inspect.Parameter.VAR_KEYWORD
        or (param.name == "top_k" and param.kind is not inspect.Parameter.POSITIONAL_ONLY)
        for param in invoke_params
    )

    retrieved_docs_per_query = {}

    for query in queries:
        if accepts_top_k:
            retrieved_docs = retriever.invoke(query, top_k=top_k)
        else:
            retrieved_docs = retriever.invoke(query)

        retrieved_docs_per_query[query] = retrieved_docs

    return retrieved_docs_per_query


## Load query examples:

In [7]:
import json
from typing import Dict, List, Set, Tuple

def load_queries_and_ground_truth(
    filepath: str
) -> Tuple[List[str], Dict[str, Set[str]]]:
    """
    Loads queries and relevant document IDs from a JSON file.

    The file must hold an "evaluation_retrievers_dataset" list whose items each
    provide a "query" string and a "relevant_docs" list.

    Args:
        filepath: Path to the JSON file.

    Returns:
        queries: List of query strings, in file order (repeated queries are kept).
        relevant_docs_per_query: Mapping from query string to set of relevant document IDs
            (for a repeated query the last entry wins).
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Access the correct key to get the list of queries and relevant docs
    dataset = data["evaluation_retrievers_dataset"]

    queries = []
    relevant_docs_per_query = {}

    for item in dataset:
        query = item["query"]
        relevant_ids = set(item["relevant_docs"])

        queries.append(query)
        relevant_docs_per_query[query] = relevant_ids

    return queries, relevant_docs_per_query


In [53]:
# Load your data. The path is resolved relative to the working directory set in
# the first cell (…/Thesis/model/src) rather than a user-specific absolute path.
queries, relevant_docs_per_query = load_queries_and_ground_truth(
    os.path.join("..", "..", "data", "Preprocessing_text", "retriever_evaluation_easy_queries.json")
)

In [54]:
queries

['What is the aim for the Master Degree In Data Driven Marketing With A Specialization In Digital Marketing And Analytics?',
 "What are the main professional opportunities of the Bachelor's degree in Information Management?",
 "What are the main professional opportunities of the Bachelor's degree in Data Science?",
 'What are the course units of the Fall semesters for the postgraduate program in business intelligence?',
 'What are the course units of the Fall semesters for the Master Degree Program In Data Science And Advanced Analytics With A Specialization In Business Analytics?',
 'What are the course units of the Spring semesters for the postgraduate program in digital enterprise management?',
 'What are the course units of the Spring semesters for the postgraduate program in digital transformation?',
 'What are the course units of the Spring semesters for the postgraduate program in marketing intelligence?',
 'What are the course units of the Spring semesters for the postgraduate 

In [55]:
relevant_docs_per_query

{'What is the aim for the Master Degree In Data Driven Marketing With A Specialization In Digital Marketing And Analytics?': {'bf118e2c-9761-4429-b21c-4d323b34dd8a'},
 "What are the main professional opportunities of the Bachelor's degree in Information Management?": {'32b065e4-a5eb-46c3-ad47-43d4aec04c0a'},
 "What are the main professional opportunities of the Bachelor's degree in Data Science?": {'ad86c5d1-c828-44dd-af1a-f98ff84878bc'},
 'What are the course units of the Fall semesters for the postgraduate program in business intelligence?': {'64bf66f3-e8b4-4582-8659-5c8de78a30cb'},
 'What are the course units of the Fall semesters for the Master Degree Program In Data Science And Advanced Analytics With A Specialization In Business Analytics?': {'c92481aa-1998-44a7-895b-5c18b5c91454'},
 'What are the course units of the Spring semesters for the postgraduate program in digital enterprise management?': {'aae9a081-ba65-4462-88d0-2320d21df95e'},
 'What are the course units of the Spring

# Evaluating TF-IDF retriever

## Running TF-IDF retriever

In [68]:
tfidf_retriever = dr.load_sparse_retriever(retriever_type="TF-IDF", documents_chunked=documents_chunked_with_ids, top_k=5)

In [69]:
# Run
retrieved_docs_tfidf = run_retrieval(tfidf_retriever, queries, top_k=5)

In [70]:
# Evaluate
evaluation_tfidf = evaluate_retrieval_system(retrieved_docs_tfidf, relevant_docs_per_query, k_values=[5])
evaluation_tfidf

{'Precision@K': {5: np.float64(0.36666666666666664)},
 'Recall@K': {5: np.float64(0.3364285714285714)},
 'MRR@K': {5: np.float64(0.3208333333333333)}}

# Evaluating BM25 retriever

In [71]:
bm25_retriever = dr.load_sparse_retriever(retriever_type="BM25", documents_chunked=documents_chunked_with_ids, top_k=5)

In [72]:
# Run
retrieved_docs_bm25 = run_retrieval(bm25_retriever, queries, top_k=5)

In [73]:
# Evaluate
evaluation_bm25 = evaluate_retrieval_system(retrieved_docs_bm25, relevant_docs_per_query, k_values=[5])
evaluation_bm25

{'Precision@K': {5: np.float64(0.24)},
 'Recall@K': {5: np.float64(0.22666666666666666)},
 'MRR@K': {5: np.float64(0.2791666666666667)}}

# Evaluating Dense retriever

In [74]:
from langchain.embeddings import HuggingFaceEmbeddings
import torch

embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={"normalize_embeddings": True}  # Important for BGE
)


In [75]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="documents_with_ids_without_metadata_embedded_in_pagecontent",
    embedding_function=embedding_model,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [23]:
ids = vector_store.add_documents(documents=documents_chunked_with_ids)

In [76]:
chroma_retriever = dr.load_vector_retriever(collection_name="documents_with_ids_without_metadata_embedded_in_pagecontent", top_k=5)

In [77]:
# Run
retrieved_docs_chroma = run_retrieval(chroma_retriever, queries, top_k=5)

In [78]:
# Evaluate
evaluation_chroma = evaluate_retrieval_system(retrieved_docs_chroma, relevant_docs_per_query, k_values=[5])
evaluation_chroma

{'Precision@K': {5: np.float64(0.2975)},
 'Recall@K': {5: np.float64(0.2808333333333333)},
 'MRR@K': {5: np.float64(0.3583333333333333)}}

# Evaluating Hybrid retriever

In [79]:
tfidf_retriever = dr.load_sparse_retriever(retriever_type="TF-IDF", documents_chunked=documents_chunked_with_ids, top_k=6)
bm25_retriever = dr.load_sparse_retriever(retriever_type="BM25", documents_chunked=documents_chunked_with_ids, top_k=6)
chroma_retriever = dr.load_vector_retriever(collection_name="documents_with_ids_without_metadata_embedded_in_pagecontent", top_k=6)

In [80]:
hybrid_retriever = dr.load_hybrid_retriever(tfidf_retriever, chroma_retriever, weight_sparse=0.5, weight_vector=0.5)

In [81]:
# Run
retrieved_docs_hybrid = run_retrieval(hybrid_retriever, queries)

In [82]:
# Evaluate
evaluation_hybrid = evaluate_retrieval_system(retrieved_docs_hybrid, relevant_docs_per_query, k_values=[5])
evaluation_hybrid

{'Precision@K': {5: np.float64(0.39416666666666667)},
 'Recall@K': {5: np.float64(0.36464285714285716)},
 'MRR@K': {5: np.float64(0.3208333333333333)}}

In [48]:
# Dump the hybrid retriever's results for manual inspection: for every query,
# list each retrieved document with its ID, page content and metadata.
for query_text, documents in retrieved_docs_hybrid.items():
    # The dictionary key is the query text itself
    print(f"Query: {query_text}")
    
    # Iterate over the documents retrieved for this query (numbered from 1)
    for idx, doc in enumerate(documents, 1):
        print(f"Document {idx}:")
        
        # The ID is read from the metadata; this raises KeyError for a document
        # whose metadata has no 'id' entry.
        print(f"ID: {doc.metadata['id']}")
        
        # Print the page content in chunks of 100 characters
        print("Page Content:")
        chunk_size = 100
        for i in range(0, len(doc.page_content), chunk_size):
            print(doc.page_content[i:i+chunk_size])  # Print each 100-character chunk
        
        # If metadata is available, print it as well, one "key: value" pair per line
        if doc.metadata:
            print("Metadata:")
            for key, value in doc.metadata.items():
                print(f"  {key}: {value}")
        
        print("\n" + "-"*50 + "\n")  # Separator between documents


Query: What are the admission requirements for the Master Degree in Law and Financial Markets?
Document 1:
ID: b4a7a86b-4421-4dec-ad3e-cdb45d65e572
Page Content:
Master Degree in Law and Financial Markets Master Degree in Law and Financial Markets Consistently r
anked by Eduniversal among the top master's courses in Western Europe , the Master Degree in Law and
 Financial Markets is a product of an innovative partnership joining together NOVA Information Manag
ement School and NOVA School of Law and it is fully taught in English
Metadata:
  source: master-degree-in-law-and-financial-markets_main_course.txt
  degree: masters
  doc_type: main_info
  course_name: Master Degree In Law And Financial Markets
  id: b4a7a86b-4421-4dec-ad3e-cdb45d65e572

--------------------------------------------------

Document 2:
ID: 044839f6-fe8a-4bc4-9d50-a0e52100df1c
Page Content:
. Learn more Who is it for? The Master's Degree in Law and Financial Markets is presented as a tool 
to deepen the knowledge 

# Evaluating Hybrid Reranking retriever

In [83]:
# NOTE(review): top_n=4 is passed here, while the evaluation cell for this retriever
# scores with k_values=[5] — confirm that the smaller top_n is intended.
hybrid_retriever_reranking = dr.get_reranking(hybrid_retriever, top_n=4)

In [84]:
# Run
retrieved_docs_hybrid_reranking = run_retrieval(hybrid_retriever_reranking, queries, top_k=5)

In [85]:
# Evaluate
evaluation_hybrid_reraking = evaluate_retrieval_system(retrieved_docs_hybrid_reranking, relevant_docs_per_query, k_values=[5])
evaluation_hybrid_reraking

{'Precision@K': {5: np.float64(0.4141666666666667)},
 'Recall@K': {5: np.float64(0.3789285714285714)},
 'MRR@K': {5: np.float64(0.4625)}}

In [33]:
retrieved_docs_hybrid_reranking

{'What are the admission requirements for the Master Degree in Law and Financial Markets?': [Document(metadata={'course_name': 'Postgraduate Program In Data Science For Finance', 'degree': 'postgraduate', 'doc_type': 'main_info', 'id': '0079105f-6597-496b-b193-791a6cded9c2', 'source': 'postgraduate-program-in-data-science-for-finance_main_course.txt'}, page_content='. Candidates with a first or upper second Class honors degree (or international equivalent) from a recognized university in a highly quantitative subject such as economics, mathematics, statistics, engineering, computation, science, or management pursuing a successful career in finance are the main target of this . Work experience is not essential, but you are strongly recommended to undertake relevant internships and to share your expertise in class'),
  Document(metadata={'source': 'master-degree-in-law-and-financial-markets_main_course.txt', 'degree': 'masters', 'doc_type': 'main_info', 'course_name': 'Master Degree In L

In [35]:
relevant_docs_per_query

{'What are the admission requirements for the Master Degree in Law and Financial Markets?': {'33d4937c-9329-46dd-b548-ff886c002a79',
  '384ce944-0324-4cb4-9c0c-c05872ad5f0f'},
 "What are the main professional opportunities of the Bachelor's degree in Information Systems?": {'6970a629-331d-40d2-85c7-04b99c219040'},
 "What is the study plan of the Master's in Data Science and advanced analytics?": {'3800e187-b87c-4902-b7e9-9c49cfcf8e87',
  '8978c024-442f-4cbb-aa67-517182f84770',
  'c9cd0f7e-e2bd-4a07-b4aa-28498940635c'},
 'How many credits are required to complete the Postgraduate Program in Information Management and Business Intelligence in Healthcare?': {'2fad2712-627c-4cbe-8d04-f8abcefdb6e8'},
 "Are there any specializations offered in the Master's in Data Driven Marketing?": {'598a030a-8552-45c9-a3cc-6dc24ea9be0b'},
 "What is the difference between the Bachelor's in Information Management and the Bachelor's in Information Systems?": {'1d26bc64-2e81-49de-a949-3f9cbcbe8f64',
  '2483d1

# IMPROVING DOCUMENT CHUNKS WITH EMBEDDED METADATA

In [8]:
documents_chunked_with_ids_and_metadata = dh.load_documents_from_pickle(dc.DOCUMENTS_CHUNKED_WITH_IDS_AND_METADATA)
documents_chunked_with_ids_and_metadata

✅ Loaded 383 documents from ..\..\data\Preprocessing_text\all_programs_chunked\documents_chunks_with_ids_and_metadata_embedded.pkl


[Document(id='55edd98c-cb22-4814-b1e3-6045090824ce', metadata={'source': 'bachelor_data-science_teaching-staff_text.txt', 'degree': 'bachelor', 'doc_type': 'teaching_staff', 'course_name': 'Data Science'}, page_content='Data Science bachelor Teaching Staff\nAmérico Rio Invited Assistant Professor\namerico.rio@novaims.unl.pt\n\nAna Cristina\nCosta Associate Professor\ncristina@novaims.unl.pt\n\nArtur Varanda\nAdjunct Lecturer\navaranda@novaims.unl.pt\n\nAugusto Santos\nAssistant Professor\najrsantos@novaims.unl.pt\n\nBruno Damásio\nAssistant Professor\nbdamasio@novaims.unl.pt\n\nCarina Albuquerque\nAssistant Professor\ncalbuquerque@novaims.unl.pt\n\nCarolina Maria\nShaul Adjunct Lecturer\ncshaul@novaims.unl.pt\n\nCarolina Santos\nMaximiano Adjunct Lecturer\ncmaximiano@novaims.unl.pt\n\nCarolina Vasconcelos\nInvited Teaching Assistant\ncvasconcelos@novaims.unl.pt\n\nCatarina Neves\nAssistant Professor\ncneves@novaims.unl.pt\n\nCatarina Palha\nInvited Teaching Assistant\ncpalha@novaims.un

## Adding id to metadata field

In [9]:
def move_id_to_metadata(documents):
    """Mirror each document's `id` attribute into its metadata dict under the 'id' key.

    Works in place and hands back the same `documents` object. Documents with no
    `id` attribute, or with `id` set to None, are skipped.
    """
    # Same helper as the one defined in the earlier "save the id inside the metadata" section.
    for doc in documents:
        doc_id = getattr(doc, 'id', None)
        if doc_id is None:
            continue
        doc.metadata['id'] = doc_id
    return documents

In [10]:
documents_chunked_with_ids_and_metadata = move_id_to_metadata(documents_chunked_with_ids_and_metadata)
documents_chunked_with_ids_and_metadata

[Document(id='55edd98c-cb22-4814-b1e3-6045090824ce', metadata={'source': 'bachelor_data-science_teaching-staff_text.txt', 'degree': 'bachelor', 'doc_type': 'teaching_staff', 'course_name': 'Data Science', 'id': '55edd98c-cb22-4814-b1e3-6045090824ce'}, page_content='Data Science bachelor Teaching Staff\nAmérico Rio Invited Assistant Professor\namerico.rio@novaims.unl.pt\n\nAna Cristina\nCosta Associate Professor\ncristina@novaims.unl.pt\n\nArtur Varanda\nAdjunct Lecturer\navaranda@novaims.unl.pt\n\nAugusto Santos\nAssistant Professor\najrsantos@novaims.unl.pt\n\nBruno Damásio\nAssistant Professor\nbdamasio@novaims.unl.pt\n\nCarina Albuquerque\nAssistant Professor\ncalbuquerque@novaims.unl.pt\n\nCarolina Maria\nShaul Adjunct Lecturer\ncshaul@novaims.unl.pt\n\nCarolina Santos\nMaximiano Adjunct Lecturer\ncmaximiano@novaims.unl.pt\n\nCarolina Vasconcelos\nInvited Teaching Assistant\ncvasconcelos@novaims.unl.pt\n\nCatarina Neves\nAssistant Professor\ncneves@novaims.unl.pt\n\nCatarina Palha\

## Evaluating TF-IDF retriever

In [11]:
tfidf_retriever = dr.load_sparse_retriever(retriever_type="TF-IDF", documents_chunked=documents_chunked_with_ids_and_metadata, top_k=5)

In [12]:
# Needs `queries` from the "Load query examples" section; the NameError recorded
# below this cell comes from running it before `queries` was loaded.
retrieved_docs_tfidf = run_retrieval(tfidf_retriever, queries, top_k=5)

NameError: name 'queries' is not defined

In [59]:
# Evaluate
evaluation_tfidf = evaluate_retrieval_system(retrieved_docs_tfidf, relevant_docs_per_query, k_values=[5])
evaluation_tfidf

{'Precision@K': {5: np.float64(0.6650943396226415)},
 'Recall@K': {5: np.float64(0.6580188679245284)},
 'MRR@K': {5: np.float64(0.439622641509434)}}

## Evaluating BM25 retriever

In [13]:
bm25_retriever = dr.load_sparse_retriever(retriever_type="BM25", documents_chunked=documents_chunked_with_ids_and_metadata, top_k=5)

In [61]:
# Run
retrieved_docs_bm25 = run_retrieval(bm25_retriever, queries, top_k=5)

In [62]:
# Evaluate
evaluation_bm25 = evaluate_retrieval_system(retrieved_docs_bm25, relevant_docs_per_query, k_values=[5])
evaluation_bm25

{'Precision@K': {5: np.float64(0.5349056603773585)},
 'Recall@K': {5: np.float64(0.5306603773584906)},
 'MRR@K': {5: np.float64(0.35597484276729563)}}

## Evaluating DENSE retriever

In [14]:
from langchain.embeddings import HuggingFaceEmbeddings
import torch

embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={"normalize_embeddings": True}  # Important for BGE
)


  embedding_model = HuggingFaceEmbeddings(


In [16]:
from langchain_chroma import Chroma
# documents_with_ids_and_metadata_embedded_in_pagecontent -> first try with embedded metadata without parent documents

# Chroma collection used by the dense retriever, embedded with `embedding_model`.
vector_store = Chroma(
    # Local directory where the data is saved; drop this argument if it is not needed.
    persist_directory="./chroma_langchain_db",
    embedding_function=embedding_model,
    collection_name="parent_documents_with_ids_and_metadata_embedded_v2",
)

In [17]:
# Add the chunked documents to the vector store and keep the value it returns.
ids = vector_store.add_documents(
    documents=documents_chunked_with_ids_and_metadata,
)

In [18]:
# Vector retriever over the current ("new") collection:
#   parent_documents_with_ids_and_metadata_embedded_v2
# Previous ("old") collection name, kept for reference:
#   parent_documents_with_ids_and_metadata_embedded_in_pagecontent
chroma_retriever = dr.load_vector_retriever(
    top_k=5,
    collection_name="parent_documents_with_ids_and_metadata_embedded_v2",
)

  vector_store = Chroma(


In [63]:
# Run the Chroma (dense) retriever on `queries` with top_k=5
retrieved_docs_chroma = run_retrieval(
    chroma_retriever,
    queries,
    top_k=5,
)

In [64]:
# Score the dense-retriever results against `relevant_docs_per_query` at k=5
evaluation_chroma = evaluate_retrieval_system(
    retrieved_docs_chroma,
    relevant_docs_per_query,
    k_values=[5],
)
evaluation_chroma

{'Precision@K': {5: np.float64(0.8852201257861635)},
 'Recall@K': {5: np.float64(0.8852201257861635)},
 'MRR@K': {5: np.float64(0.6358490566037737)}}

## Evaluating HYBRID retriever

In [19]:
# Hybrid retriever combining the TF-IDF and Chroma retrievers with equal weights.
hybrid_retriever = dr.load_hybrid_retriever(
    tfidf_retriever,
    chroma_retriever,
    weight_vector=0.5,
    weight_sparse=0.5,
)

In [67]:
# Run the hybrid retriever on `queries`; no top_k is passed in this call.
retrieved_docs_hybrid = run_retrieval(
    hybrid_retriever,
    queries,
)

In [68]:
# Evaluate
evaluation_hybrid = evaluate_retrieval_system(retrieved_docs_hybrid, relevant_docs_per_query, k_values=[5])
evaluation_hybrid

{'Precision@K': {5: np.float64(0.8823899371069183)},
 'Recall@K': {5: np.float64(0.8781446540880503)},
 'MRR@K': {5: np.float64(0.5830188679245283)}}

## Evaluating HYBRID RERANKING retriever

In [20]:
# Put reranking on top of the hybrid retriever, requesting top_n=5.
hybrid_retriever_reranking = dr.get_reranking(
    hybrid_retriever,
    top_n=5,
)

In [None]:
# Run
# top_k is 5 so that it agrees with the reranker's top_n=5 and with the
# k_values=[5] used when these results are evaluated; the previous value of 4
# did not match the cut-off used for the other retrievers.
retrieved_docs_hybrid_reranking = run_retrieval(hybrid_retriever_reranking, queries, top_k=5)

In [71]:
# Score the hybrid + reranking results against `relevant_docs_per_query` at k=5
evaluation_hybrid_reraking = evaluate_retrieval_system(
    retrieved_docs_hybrid_reranking,
    relevant_docs_per_query,
    k_values=[5],
)
evaluation_hybrid_reraking

{'Precision@K': {5: np.float64(0.9836477987421385)},
 'Recall@K': {5: np.float64(0.9779874213836478)},
 'MRR@K': {5: np.float64(0.9371069182389938)}}

In [21]:
# One-element list holding a single ad-hoc question for a manual spot check.
query = [
    'Is a background in statistics necessary to apply for the postgraduate program in geospatial data science?'
]
# Note: this rebinds `retrieved_docs_hybrid_reranking` to the result for this query only.
retrieved_docs_hybrid_reranking = run_retrieval(
    hybrid_retriever_reranking,
    query,
    top_k=4,
)

In [22]:
# Print every retrieved document, grouped by the query text that is the key of
# the retrieval-results dictionary.
chunk_size = 100  # number of characters printed per output line
for query_text, documents in retrieved_docs_hybrid_reranking.items():
    print(f"Query: {query_text}")

    for idx, doc in enumerate(documents, 1):
        print(f"Document {idx}:")

        # Read the id from the metadata when it is there, otherwise fall back to
        # the Document's own `id` attribute. Indexing metadata['id'] directly
        # would raise before the emptiness check on the metadata further down.
        metadata = doc.metadata or {}
        doc_id = metadata.get('id', getattr(doc, 'id', None))
        print(f"ID: {doc_id}")

        # Page content is emitted in fixed-width slices of `chunk_size` characters.
        print("Page Content:")
        for i in range(0, len(doc.page_content), chunk_size):
            print(doc.page_content[i:i+chunk_size])

        # Metadata is listed only when the document actually carries some.
        if metadata:
            print("Metadata:")
            for key, value in metadata.items():
                print(f"  {key}: {value}")

        print("\n" + "-"*50 + "\n")  # Separator between documents

Query: Is a background in statistics necessary to apply for the postgraduate program in geospatial data science?
Document 1:
ID: 299e7384-fbf4-4cb3-9553-bd51c74bedea
Page Content:
Postgraduate Program In Geospatial Data Science postgraduate Admissions and fees
NOVA IMS is looking
 for excellent students who have the potential to become good technical staff and excellent managers
. Qualities such as maturity, determination and motivation, both in the academic path as well in a p
rofessional career, are some of the required attributes. 3 rd Application phase From March 10 th and
 April 10 th , 2025. Admissions' Requirements The applicants to the Postgraduate in Geographical Sci
ences and Information Systems should hold a relevant bachelor's degree, or equivalent, in Statistics
 and Information Management, Geography, Economics, Engineering, Computer Sciences, Physics, Mathemat
ics, Architecture, Urban and Regional Planning, or in other relevant scientific fields to be approve
d by the Sci

In [20]:
# GET THE NAMES OF THE COURSES
import os

# Folder with the scraped "<course>_main_course.txt" files. The path is built
# relative to the working directory (Thesis/model/src, as set by the first
# cell) rather than hardcoding a user-specific absolute path.
folder_path = os.path.join(
    "..", "..", "data", "Webscrapping", "postgraduate_master_degrees", "maininfo"
)

# All .txt files in the folder; sorted because os.listdir returns entries in
# arbitrary order.
txt_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".txt"))

# Course name = file name without the trailing "_main_course.txt", with hyphens
# turned into spaces. The suffix is sliced off the end so that an identical
# substring elsewhere in the name is left untouched.
main_course_suffix = "_main_course.txt"
course_names = [
    filename[:-len(main_course_suffix)].replace("-", " ")
    for filename in txt_files
    if filename.endswith(main_course_suffix)
]

# Print the cleaned course names
for course in course_names:
    print(course)


european master of science in information systems management
master degree in data driven marketing with a specialization in data science for marketing working hours format
master degree in data driven marketing with a specialization in digital marketing and analytics working hours format
master degree in data driven marketing with a specialization in marketing intelligence working hours format
master degree in data driven marketing with a specialization in marketing research and crm
master degree in geographic information systems and science with a specialization in geographic information systems and science
master degree in geographic information systems and science with a specialization in geospatial data science
master degree in geospatial technologies
master degree in information management with a specialization in digital transformation
master degree in information management with a specialization in information systems management working hours format
master degree in law and fin