In [1]:
import os
# Move to Thesis directory (two levels up)
os.chdir(os.path.abspath(os.path.join("..", "..")))

# Move to model/src if it exists
model_dir = os.path.join(os.getcwd(), "model", "src")
if os.path.exists(model_dir):
    os.chdir(model_dir)

print("Current Directory:", os.getcwd())

Current Directory: c:\Users\1176153\Downloads\github\Thesis\model\src


In [2]:
from libs import data_handeling as dh
from libs.settings import data_catalog as dc
from libs import data_retrievers as dr
from langchain.vectorstores import Chroma

In [3]:
documents_chunked = dh.load_documents_from_pickle(dc.DOCUMENTS_CHUNKED)


✅ Loaded 383 documents from ..\..\data\Preprocessing_text\all_programs_chunked\docs_all_programs_chunked_without_metadata.pkl


# Add document with IDS

In [6]:
import uuid
import pickle
import os

# Function to add UUIDs and save to a pickle file in a specified folder
def add_ids_to_documents_and_save(documents, file_name):
    # Ensure the folder exists, create it if it doesn't
    folder_path = r"..\..\data\Preprocessing_text\all_programs_chunked"
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    
    # Add unique UUID to each document
    for doc in documents:
        if not doc.id:  # Check if the document already has an ID
            doc.id = str(uuid.uuid4())  # Generate and assign a new UUID
            
    # Define the full path for the pickle file
    pickle_file_path = os.path.join(folder_path, file_name)

    # Save the updated documents to a pickle file
    with open(pickle_file_path, 'wb') as f:
        pickle.dump(documents, f)
    print(f"Documents saved to {pickle_file_path}")

    return documents

# Example usage
documents_chunks_with_ids_without_metadata_embedded = add_ids_to_documents_and_save(documents_chunked, 
                                                                                    'documents_chunks_with_ids_without_metadata_embedded.pkl')


Documents saved to ..\..\data\Preprocessing_text\all_programs_chunked\documents_chunks_with_ids_without_metadata_embedded.pkl


In [7]:
documents_chunks_with_ids_without_metadata_embedded

[Document(id='55edd98c-cb22-4814-b1e3-6045090824ce', metadata={'source': 'bachelor_data-science_teaching-staff_text.txt', 'degree': 'bachelor', 'doc_type': 'teaching_staff', 'course_name': 'Data Science'}, page_content='Teaching Staff\nAmérico Rio Invited Assistant Professor\namerico.rio@novaims.unl.pt\n\nAna Cristina\nCosta Associate Professor\ncristina@novaims.unl.pt\n\nArtur Varanda\nAdjunct Lecturer\navaranda@novaims.unl.pt\n\nAugusto Santos\nAssistant Professor\najrsantos@novaims.unl.pt\n\nBruno Damásio\nAssistant Professor\nbdamasio@novaims.unl.pt\n\nCarina Albuquerque\nAssistant Professor\ncalbuquerque@novaims.unl.pt\n\nCarolina Maria\nShaul Adjunct Lecturer\ncshaul@novaims.unl.pt\n\nCarolina Santos\nMaximiano Adjunct Lecturer\ncmaximiano@novaims.unl.pt\n\nCarolina Vasconcelos\nInvited Teaching Assistant\ncvasconcelos@novaims.unl.pt\n\nCatarina Neves\nAssistant Professor\ncneves@novaims.unl.pt\n\nCatarina Palha\nInvited Teaching Assistant\ncpalha@novaims.unl.pt\n\nDhruv Akshay\n

# Sparse Search

In [35]:
from langchain.retrievers import BM25Retriever
from langchain.retrievers import TFIDFRetriever

bm25_retriever = BM25Retriever.from_documents(documents_chunks_with_ids_without_metadata_embedded, k=4)
tfidf_retriever = TFIDFRetriever.from_documents(documents_chunks_with_ids_without_metadata_embedded, k=4)

query = "data science bachelor"





In [36]:
# Get top 3 documents
top_docs_bm25 = bm25_retriever.invoke(query)
top_docs_bm25


[Document(id='6a24d95b-2237-4afb-b6b9-62438c7a103f', metadata={'source': 'postgraduate-program-enterprise-data-science-analytics_main_course.txt', 'degree': 'postgraduate', 'doc_type': 'main_info', 'course_name': 'Postgraduate Program Enterprise Data Science Analytics'}, page_content='. Goals The main goal of this is to train technical staff to use Big Data and Machine Learning tools and methodologies in the: Exploration and transformation of data; Creation of data models and data visualization; Application of statistical methods to data; Application of data science methodologies; Implementation and validation of Machine Learning models; Application of data science techniques to common scenarios in enterprise environments'),
 Document(id='0cdbf4cc-9ef5-44af-a6af-5c19ff1f56b7', metadata={'source': 'master-degree-in-data-driven-marketing-with-a-specialization-in-data-science-for-marketing-working-hours-format_main_course.txt', 'degree': 'masters', 'doc_type': 'main_info', 'course_name': 

In [37]:
# Get top 3 documents
top_docs_tfidf = tfidf_retriever.invoke(query)
top_docs_tfidf


[Document(metadata={'source': 'bachelor_data-science_main_course_text.txt', 'degree': 'bachelor', 'doc_type': 'main_info', 'course_name': 'Data Science'}, page_content="Text from https://www.novaims.unl.pt/en///bachelor-s-degrees/data-science/: Data Science Degree in Data Science en Bachelor's Degrees Data Science In the Bachelor´s Degree in Data Science, students learn the most modern techniques of artificial intelligence and machine learning to analyze large volumes of data (Big Data). They will become true data scientists - considered the sexiest profession of the 21 st century by the Harvard Business Review"),
 Document(metadata={'source': 'master-degree-program-in-data-science-and-advanced-analytics-with-a-specialization-in-data-science_main_course.txt', 'degree': 'masters', 'doc_type': 'main_info', 'course_name': 'Master Degree Program In Data Science And Advanced Analytics With A Specialization In Data Science'}, page_content="Master's Degree in Data Science and Advanced Analyti

# Semantic Search

In [21]:
from langchain.embeddings import HuggingFaceEmbeddings
import torch

embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={"normalize_embeddings": True}  # Important for BGE
)


In [22]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="documents_chunks_without_metadata_embedded",
    embedding_function=embedding_model,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [23]:
ids = vector_store.add_documents(documents=documents_chunks_with_ids_without_metadata_embedded)

In [24]:
ids

['07185832-3c4f-493a-b1be-db87a61f89a2',
 '00fc1236-6315-4e0e-a9bb-f3ef38024702',
 '447d3b80-9c50-4a38-b00c-d13210a66ee4',
 'd7fd9f9a-cdb4-4070-86fc-1541f7b23e38',
 '376b63cc-d867-425e-9ac2-37150fdb19dc',
 'b69a5ddf-78b3-4f0f-bc78-7064fc4d2fae',
 '7015535f-4f98-48c0-888a-d91f7cc7c80e',
 '746bf0f7-c33d-43f1-a89c-4d6c9e2795fb',
 'bd1a1468-8045-4fbc-89eb-42954ed417cf',
 '45f4177a-75a0-436f-8582-d540a5c9b218',
 'd549ecfe-44d3-4ba0-b99b-43520fa4ee4a',
 '659a9510-2423-4a0e-a424-10db3d2b550c',
 '9b06a471-ffbc-4f22-a0ee-0df00d2b11bb',
 '9daada0c-a354-48f0-a00e-d25452d3301d',
 '0f4b41f9-060c-4396-86f2-8b782435e340',
 '8d515f97-41d6-4800-8eff-def23863dc21',
 '0683685f-bc47-4f82-a82d-b49c0c6514f1',
 '7c9dc5e4-e47a-4a38-ae2a-910559d6c81f',
 '0591a941-aa07-48b6-ad8f-46f7ff8d5e0b',
 '45d91427-7428-448d-b082-42ad1dd45292',
 '4feec462-6e89-463b-a7ac-c17ed1d0e56d',
 '3ef2242a-8d86-42f0-9489-8cac30c0a8bc',
 '6eb8494f-3b9a-40b4-b852-12e860925e2a',
 '4b9840b9-ecf7-423f-88e1-f6be128fb13d',
 '5fd4960a-8201-

In [26]:
results = vector_store.similarity_search(query)

In [27]:
results

[Document(id='08ad9c30-b2b5-417f-8de2-705bd319be60', metadata={'course_name': 'Data Science', 'degree': 'bachelor', 'doc_type': 'main_info', 'source': 'bachelor_data-science_main_course_text.txt'}, page_content="Text from https://www.novaims.unl.pt/en///bachelor-s-degrees/data-science/: Data Science Degree in Data Science en Bachelor's Degrees Data Science In the Bachelor´s Degree in Data Science, students learn the most modern techniques of artificial intelligence and machine learning to analyze large volumes of data (Big Data). They will become true data scientists - considered the sexiest profession of the 21 st century by the Harvard Business Review"),
 Document(id='40c9ffe2-583f-4501-8282-a56fd1c96162', metadata={'course_name': 'Data Science', 'degree': 'bachelor', 'doc_type': 'main_info', 'source': 'bachelor_data-science_main_course_text.txt'}, page_content='. Duration 3 years (6 semesters) Timetable Daytime Start September 2025 Career Opportunities The Bachelor´s Degree in Data 

In [28]:
# Assuming your vector_store is a Chroma instance
Chroma_retriever = vector_store.as_retriever()

In [29]:
# Your query
query = "data science"

# Retrieve relevant documents
retrieved_docs = Chroma_retriever.invoke(query)

# Print the results
for doc in retrieved_docs:
    print(doc)

page_content='Text from https://www.novaims.unl.pt/en///bachelor-s-degrees/data-science/: Data Science Degree in Data Science en Bachelor's Degrees Data Science In the Bachelor´s Degree in Data Science, students learn the most modern techniques of artificial intelligence and machine learning to analyze large volumes of data (Big Data). They will become true data scientists - considered the sexiest profession of the 21 st century by the Harvard Business Review' metadata={'course_name': 'Data Science', 'degree': 'bachelor', 'doc_type': 'main_info', 'source': 'bachelor_data-science_main_course_text.txt'}
page_content='. In this context, the graduate in Data Science should: Understand the theoretical foundations of statistics, Machine Learning and Artificial Intelligence methods; Identify and understand the most efficient algorithm for each specific problem; Design and develop state-of-the-art data science algorithms; Work closely with IT specialists to integrate Data Science algorithms in

# Hybrid Search

In [30]:
from langchain.retrievers import EnsembleRetriever

ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, Chroma_retriever], weights=[0.5, 0.5]
)

In [31]:
ensemble_results = ensemble_retriever.invoke("Ivo Bernardo")
ensemble_results

[Document(id='00fc1236-6315-4e0e-a9bb-f3ef38024702', metadata={'source': 'bachelor_data-science_teaching-staff_text.txt', 'degree': 'bachelor', 'doc_type': 'teaching_staff', 'course_name': 'Data Science'}, page_content='Ivo Bernardo\nAdjunct Lecturer\nibernardo@novaims.unl.pt\n\nJoana Neves\nAssistant Professor\njneves@novaims.unl.pt\n\nJorge Neves\nInvited Assistant Professor\njnneves@novaims.unl.pt\n\nJosé Carvalho\nInvited Teaching Assistant\njcarvalho@novaims.unl.pt\n\nLeon Debatin\nAdjunct Lecturer\n20230549@novaims.unl.pt\n\nLeonardo Vanneschi\nFull Professor\nlvanneschi@novaims.unl.pt\n\nLiah Rosenfeld\nInvited Teaching Assistant\nlrosenfeld@novaims.unl.pt\n\nManuel Rodrigues\nAdjunct Lecturer\nmerodrigues@novaims.unl.pt\n\nManuela Aparício\nAssistant Professor\nmanuela.aparicio@novaims.unl.pt\n\nMaria Fernandes\nInvited Assistant Professor\nmfernandes@novaims.unl.pt\n\nMariana Dias\nInvited Teaching Assistant\nmidias@novaims.unl.pt\n\nMarisa Fernandes\nAdjunct Lecturer\nmmferna

In [77]:
# Then manually slice the list
top_4_ensemble_results = ensemble_results[:4]
top_4_ensemble_results

[Document(metadata={'source': 'bachelor_data-science_teaching-staff_text.txt', 'degree': 'bachelor', 'doc_type': 'teaching_staff', 'course_name': 'Data Science'}, page_content='Ivo Bernardo\nAdjunct Lecturer\nibernardo@novaims.unl.pt\n\nJoana Neves\nAssistant Professor\njneves@novaims.unl.pt\n\nJorge Neves\nInvited Assistant Professor\njnneves@novaims.unl.pt\n\nJosé Carvalho\nInvited Teaching Assistant\njcarvalho@novaims.unl.pt\n\nLeon Debatin\nAdjunct Lecturer\n20230549@novaims.unl.pt\n\nLeonardo Vanneschi\nFull Professor\nlvanneschi@novaims.unl.pt\n\nLiah Rosenfeld\nInvited Teaching Assistant\nlrosenfeld@novaims.unl.pt\n\nManuel Rodrigues\nAdjunct Lecturer\nmerodrigues@novaims.unl.pt\n\nManuela Aparício\nAssistant Professor\nmanuela.aparicio@novaims.unl.pt\n\nMaria Fernandes\nInvited Assistant Professor\nmfernandes@novaims.unl.pt\n\nMariana Dias\nInvited Teaching Assistant\nmidias@novaims.unl.pt\n\nMarisa Fernandes\nAdjunct Lecturer\nmmfernandes@novaims.unl.pt\n\nMarisa Nunes\nAdjunc

## Hybrid Search + Reranking

In [32]:
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers import ContextualCompressionRetriever

# 1. Load the HuggingFace model
model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")


In [33]:
# 2. Wrap it into a LangChain-compatible reranker
compressor = CrossEncoderReranker(model=model, top_n=4)  # you want top 4

# 3. Wrap retriever + compressor
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=ensemble_retriever,  # your retriever
)



In [None]:
# 4. Retrieve + rerank in one step
final_docs = compression_retriever.invoke("Ivo Bernardo")

# optional: view results
for i, doc in enumerate(final_docs):
    print(f"\nRank {i+1}: {doc.metadata.get('source', '')}\n{doc.page_content}")


Rank 1: postgraduate-program-in-business-intelligence_Faculty.txt
Teaching Staff
Teaching Staff Teaching Staff Bruno Jardim Assistant Professor
bjardim@novaims.unl.pt

Bruno Rodrigues
Adjunct Lecturer
brodrigues@novaims.unl.pt

Dhruv Akshay
Pandit Invited Teaching Assistant
dpandit@novaims.unl.pt

Duarte Rodrigues
Adjunct Lecturer
duarte.rodrigues@novaims.unl.pt

Farina Pontejos
Invited Teaching Assistant
fpontejos@novaims.unl.pt

Fernando Bação
Full Professor
bacao@novaims.unl.pt

Frederico Cruz
Jesus Associate Professor
fjesus@novaims.unl.pt

Guilherme Victorino
Assistant Professor
gmvictorino@novaims.unl.pt

Ivo Bernardo
Adjunct Lecturer
ibernardo@novaims.unl.pt

Joana Neves
Assistant Professor
jneves@novaims.unl.pt

João Gomes
Adjunct Lecturer
jgomes@novaims.unl.pt

João Martins
Professor of the Practice
jmartins@novaims.unl.pt

Leon Debatin
Adjunct Lecturer
20230549@novaims.unl.pt

Luís Batista
Adjunct Lecturer
lbatista@novaims.unl.pt

Lourenço Baptista
Assistant Professor
m.bapt

# Testing data_retrievers.py

In [6]:
tfidf_retriever = dr.load_sparse_retriever(retriever_type="TF-IDF", documents_chunked=documents_chunked, top_k=4)
bm25_retriever = dr.load_sparse_retriever(retriever_type="BM25", documents_chunked=documents_chunked, top_k=4)

In [4]:
chroma_retriever = dr.load_vector_retriever(collection_name="documents_without_metadata_embedded", top_k=4)

  embedding_model = HuggingFaceEmbeddings(
  vector_store = Chroma(


In [7]:
hybrid_retriever = dr.load_hybrid_retriever(bm25_retriever, chroma_retriever, weight_sparse=0.5, weight_vector=0.5)

In [8]:
hybrid_retriever_reranking = dr.get_reranking(hybrid_retriever, top_n=4)

In [9]:
query = "data science bachelor"

In [10]:
print(tfidf_retriever.invoke(query))
print("\n")
print(bm25_retriever.invoke(query))
print("\n")
print(chroma_retriever.invoke(query))
print("\n")
print(hybrid_retriever.invoke(query))
print("\n")
print(hybrid_retriever_reranking.invoke(query))


[Document(metadata={'source': 'bachelor_data-science_main_course_text.txt', 'degree': 'bachelor', 'doc_type': 'main_info', 'course_name': 'Data Science'}, page_content="Text from https://www.novaims.unl.pt/en///bachelor-s-degrees/data-science/: Data Science Degree in Data Science en Bachelor's Degrees Data Science In the Bachelor´s Degree in Data Science, students learn the most modern techniques of artificial intelligence and machine learning to analyze large volumes of data (Big Data). They will become true data scientists - considered the sexiest profession of the 21 st century by the Harvard Business Review"), Document(metadata={'source': 'master-degree-program-in-data-science-and-advanced-analytics-with-a-specialization-in-data-science_main_course.txt', 'degree': 'masters', 'doc_type': 'main_info', 'course_name': 'Master Degree Program In Data Science And Advanced Analytics With A Specialization In Data Science'}, page_content="Master's Degree in Data Science and Advanced Analytic