In [1]:
import os
# Resolve the directory two levels above the current one (the Thesis root) and enter it.
thesis_root = os.path.abspath(os.path.join("..", ".."))
os.chdir(thesis_root)

# Continue into <root>/model/src, but only when that path exists.
model_dir = os.path.join(os.getcwd(), "model", "src")
if os.path.exists(model_dir):
    os.chdir(model_dir)

print("Current Directory:", os.getcwd())

Current Directory: c:\Users\1176153\Downloads\github\Thesis\model\src


In [2]:
import os
from langchain_core.documents import Document
import pickle
import re
import torch

In [5]:
# Folder where the preprocessed pickle files are stored (relative to the working directory).
pickle_folder = r"../../data/Preprocessing_text/bachelors_data"

# Accumulates one LangChain Document per (key, text) entry across all pickles.
all_documents = []

# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and the sample printed below) reproducible between runs.
for filename in sorted(os.listdir(pickle_folder)):
    if not filename.endswith(".pkl"):
        continue

    file_path = os.path.join(pickle_folder, filename)
    # SECURITY: pickle.load can execute arbitrary code - only load trusted project files.
    with open(file_path, "rb") as f:
        docs_dict = pickle.load(f)

    # Convert each key-value pair into a Document object
    # (assumes every pickle holds a {source_name: text} mapping - TODO confirm).
    documents = [
        Document(page_content=text, metadata={"source": key})
        for key, text in docs_dict.items()
    ]

    all_documents.extend(documents)

print(f"Total documents loaded: {len(all_documents)}")
# Guard the sample print: indexing an empty list would raise IndexError.
if all_documents:
    print("Sample document metadata:", all_documents[0].metadata)
else:
    print(f"No documents found in .pkl files under: {pickle_folder}")

Total documents loaded: 9
Sample document metadata: {'source': 'data-science_main_course_extracted_text.txt'}


# TOKENIZATION MATTERS:
from transformers import AutoTokenizer

- For Mistral:

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", use_fast=True)

- For Llama:

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", use_fast=True)




In [3]:
import re
from typing import List, Dict
from langchain_core.documents import Document
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# Load tokenizer globally
def load_tokenizer(model_name: str = "mistralai/Mistral-7B-Instruct-v0.2") -> PreTrainedTokenizerFast:
    """
    Load the fast tokenizer for ``model_name`` through ``AutoTokenizer.from_pretrained``.

    Raises:
        RuntimeError: wrapping whatever exception the loading call raised.
    """
    try:
        loaded_tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    except Exception as e:
        raise RuntimeError(f"❌ Failed to load tokenizer: {e}")
    return loaded_tokenizer

# Shared module-level tokenizer (default model); count_tokens() below reads this global.
tokenizer = load_tokenizer()

def count_tokens(text: str) -> int:
    """
    Count the token ids the module-level ``tokenizer`` produces for ``text``.

    Encoding runs with ``truncation=False``. When anything goes wrong the error is
    printed and ``float("inf")`` is returned (despite the ``int`` annotation), so
    callers comparing against a token budget treat the text as too large.
    """
    try:
        token_ids = tokenizer.encode(text, truncation=False)
        return len(token_ids)
    except Exception as e:
        print(f"⚠️ Token counting failed for text: {text[:50]}... \nError: {e}")
        return float("inf")

def extract_teaching_staff_blocks(text: str) -> List[str]:
    """
    Find every teaching-staff entry in ``text`` and format it as a three-line block.

    An entry is a name (at least two capitalised words), a title, and an
    ``@novaims.unl.pt`` e-mail address. Each returned block has the name, title and
    e-mail on separate lines, followed by a blank line.
    """
    staff_regex = r"([A-Z][a-zA-Zçéàèíóõâêã]+\s+[A-Z][a-zA-Zçéàèíóõâêã]+.*?)\s([\w\.\s]+?)\s([\w\.-]+@novaims\.unl\.pt)"
    staff_blocks = []
    for found in re.finditer(staff_regex, text):
        name, title, email = found.groups()
        staff_blocks.append(f"{name}\n{title}\n{email}\n\n")
    return staff_blocks

def group_blocks_by_token_limit(
    blocks: List[str],
    max_token_size: int,
    chunk_overlap: bool = False,
    overlap_size: int = 0
) -> List[str]:
    """
    Pack consecutive blocks into text chunks of at most ``max_token_size`` tokens.

    Blocks are never split. A chunk's size is the sum of ``count_tokens`` over its
    blocks; blocks are concatenated without a separator and the result is stripped.

    Args:
        blocks: Text blocks, in the order they should appear.
        max_token_size: Token budget of a single chunk.
        chunk_overlap: When True (and ``overlap_size > 0``), each new chunk starts
            with the trailing blocks of the previous chunk.
        overlap_size: Maximum number of *blocks* (not tokens) carried over. Fewer
            are carried when they would not fit together with the next block.

    Returns:
        The list of chunk strings (empty when ``blocks`` is empty).

    Raises:
        ValueError: If a single block exceeds ``max_token_size``. This also covers a
            failed token count, which ``count_tokens`` reports as infinity.
    """
    use_overlap = chunk_overlap and overlap_size > 0

    grouped_chunks = []
    current_chunk = []   # blocks of the chunk being assembled
    current_sizes = []   # token count of each block in current_chunk
    current_token_count = 0

    for block in blocks:
        token_count = count_tokens(block)

        if token_count > max_token_size:
            raise ValueError(
                f"❌ Block too large to fit in a chunk (size={token_count} tokens, max={max_token_size}):\n{block}"
            )

        if current_chunk and current_token_count + token_count > max_token_size:
            # The block does not fit: close the current chunk first.
            grouped_chunks.append("".join(current_chunk).strip())

            if use_overlap:
                # Keep last N blocks as overlap
                current_chunk = current_chunk[-overlap_size:]
                current_sizes = current_sizes[-overlap_size:]
                # Drop the oldest overlap blocks until the new block fits. Without
                # this, an overlap that cannot share a chunk with the next block
                # would be flushed and re-selected forever (no forward progress).
                while current_chunk and sum(current_sizes) + token_count > max_token_size:
                    current_chunk.pop(0)
                    current_sizes.pop(0)
                current_token_count = sum(current_sizes)
            else:
                current_chunk = []
                current_sizes = []
                current_token_count = 0

        # Here the block is guaranteed to fit, so every chunk gains new content.
        current_chunk.append(block)
        current_sizes.append(token_count)
        current_token_count += token_count

    if current_chunk:
        grouped_chunks.append("".join(current_chunk).strip())

    return grouped_chunks

def chunk_teaching_staff_documents(
    doc_dict: Dict[str, str],
    max_tokens: int = 512,
    chunk_overlap: bool = False,
    overlap_size: int = 0
) -> List[Document]:
    """
    Turn a ``{doc_name: text}`` mapping of teaching-staff pages into chunked Documents.

    For every usable text the staff blocks are extracted, grouped under the
    ``max_tokens`` budget, and wrapped in Documents whose ``metadata["source"]`` is
    the document name. Blank or non-string texts are skipped with a printed notice;
    an exception raised while processing one document is printed and does not stop
    the remaining documents.
    """
    staff_documents = []

    for name, text in doc_dict.items():
        has_text = isinstance(text, str) and bool(text.strip())
        if not has_text:
            print(f"⚠️ Skipping empty or invalid text for: {name}")
            continue

        try:
            staff_blocks = extract_teaching_staff_blocks(text)
            chunk_texts = group_blocks_by_token_limit(
                staff_blocks,
                max_token_size=max_tokens,
                chunk_overlap=chunk_overlap,
                overlap_size=overlap_size
            )
            for chunk_text in chunk_texts:
                staff_documents.append(
                    Document(page_content=chunk_text, metadata={"source": name})
                )
        except Exception as e:
            print(f"❌ Error processing teaching staff in '{name}': {e}")

    return staff_documents




In [22]:
# Load the cleaned teaching-staff texts and chunk them with the chunker's default settings.
# NOTE(review): pickle.load can execute arbitrary code - only use it on trusted project files.
with open("../../data/Preprocessing_text/bachelors_data/dict_teachingstaff_cleaned.pkl", "rb") as f:
    teachingstaff_dict = pickle.load(f)
staff_chunks = chunk_teaching_staff_documents(teachingstaff_dict)
# Bare last expression so the notebook displays the resulting list.
staff_chunks

[Document(metadata={'source': 'data-science_teaching-staff_extracted_text.txt'}, page_content="Teaching Staff en Programs Bachelor's\nDegrees Data Science Teaching Staff Américo Rio Invited Assistant Professor\namerico.rio@novaims.unl.pt\n\nAna Cristina\nCosta Associate Professor\ncristina@novaims.unl.pt\n\nArtur Varanda\nAdjunct Lecturer\navaranda@novaims.unl.pt\n\nAugusto Santos\nAssistant Professor\najrsantos@novaims.unl.pt\n\nBruno Damásio\nAssistant Professor\nbdamasio@novaims.unl.pt\n\nCarina Albuquerque\nAssistant Professor\ncalbuquerque@novaims.unl.pt\n\nCarolina Maria\nShaul Adjunct Lecturer\ncshaul@novaims.unl.pt\n\nCarolina Santos\nMaximiano Adjunct Lecturer\ncmaximiano@novaims.unl.pt\n\nCarolina Vasconcelos\nInvited Teaching Assistant\ncvasconcelos@novaims.unl.pt\n\nCatarina Neves\nAssistant Professor\ncneves@novaims.unl.pt\n\nCatarina Palha\nInvited Teaching Assistant\ncpalha@novaims.unl.pt\n\nDhruv Akshay\nPandit Invited Teaching Assistant\ndpandit@novaims.unl.pt\n\nDiogo

In [23]:
print(staff_chunks[0].page_content)

Teaching Staff en Programs Bachelor's
Degrees Data Science Teaching Staff Américo Rio Invited Assistant Professor
americo.rio@novaims.unl.pt

Ana Cristina
Costa Associate Professor
cristina@novaims.unl.pt

Artur Varanda
Adjunct Lecturer
avaranda@novaims.unl.pt

Augusto Santos
Assistant Professor
ajrsantos@novaims.unl.pt

Bruno Damásio
Assistant Professor
bdamasio@novaims.unl.pt

Carina Albuquerque
Assistant Professor
calbuquerque@novaims.unl.pt

Carolina Maria
Shaul Adjunct Lecturer
cshaul@novaims.unl.pt

Carolina Santos
Maximiano Adjunct Lecturer
cmaximiano@novaims.unl.pt

Carolina Vasconcelos
Invited Teaching Assistant
cvasconcelos@novaims.unl.pt

Catarina Neves
Assistant Professor
cneves@novaims.unl.pt

Catarina Palha
Invited Teaching Assistant
cpalha@novaims.unl.pt

Dhruv Akshay
Pandit Invited Teaching Assistant
dpandit@novaims.unl.pt

Diogo Rasteiro
Research Assistant
drasteiro@novaims.unl.pt

Fernando Bação
Full Professor
bacao@novaims.unl.pt

Filipe Marques
Professor of the Prac

In [24]:
# Sanity check: print each staff chunk's token count and warn when it exceeds 512.
for i, doc in enumerate(staff_chunks):
    num_tokens = count_tokens(doc.page_content)
    print(f"Chunk {i+1}: {num_tokens} tokens")
    if num_tokens > 512:
        print("❗️WARNING: Token limit exceeded")

Chunk 1: 487 tokens
Chunk 2: 482 tokens
Chunk 3: 331 tokens
Chunk 4: 487 tokens
Chunk 5: 501 tokens
Chunk 6: 495 tokens
Chunk 7: 81 tokens
Chunk 8: 488 tokens
Chunk 9: 484 tokens
Chunk 10: 463 tokens


# Custom chunking for Study plan

- Chunk by year and semester, with an argument that sets the maximum number of tokens in each document. Optional `chunk_overlap`.

In [4]:
import re
from typing import List, Dict
from langchain_core.documents import Document
from transformers import AutoTokenizer, PreTrainedTokenizerFast

'''
# Without overlap (default)
docs = chunk_study_plan_documents(my_texts_dict, max_tokens=512)

# With 20-token overlap between chunks
docs = chunk_study_plan_documents(my_texts_dict, max_tokens=512, chunk_overlap=True, overlap_size=20)
'''

def load_tokenizer(model_name: str = "mistralai/Mistral-7B-Instruct-v0.2") -> PreTrainedTokenizerFast:
    """
    Load the fast tokenizer for ``model_name`` through ``AutoTokenizer.from_pretrained``.

    NOTE(review): ``load_tokenizer`` is defined with this same body in several cells
    of this notebook; whichever cell ran last wins. Consider keeping a single copy.

    Raises:
        RuntimeError: wrapping whatever exception the loading call raised.
    """
    try:
        return AutoTokenizer.from_pretrained(model_name, use_fast=True)
    except Exception as e:
        raise RuntimeError(f"❌ Failed to load tokenizer: {e}")

def count_tokens(text: str) -> int:
    """
    Count the token ids the module-level ``tokenizer`` produces for ``text``.

    Encoding runs with ``truncation=False``. If it raises, the error is printed and
    ``float("inf")`` is returned instead (despite the ``int`` annotation).

    NOTE(review): this cell does not assign ``tokenizer`` itself; it relies on the
    global created by another cell having been run first.
    """
    try:
        return len(tokenizer.encode(text, truncation=False))
    except Exception as e:
        print(f"⚠️ Token counting failed for text: {text[:50]}... \nError: {e}")
        return float("inf")

def split_to_token_limit(text: str, max_tokens: int, overlap: bool = False, overlap_size: int = 50) -> List[str]:
    """
    Split ``text`` into windows of at most ``max_tokens`` token ids and decode each window.

    Args:
        text: Text to split; encoded with the module-level ``tokenizer``.
        max_tokens: Window length in tokens; must be positive.
        overlap: When True, consecutive windows share ``overlap_size`` tokens.
        overlap_size: Number of shared tokens; only used when ``overlap`` is True and
            must then satisfy ``0 <= overlap_size < max_tokens``.

    Returns:
        Decoded (special tokens skipped, stripped) window texts, in order.

    Raises:
        ValueError: If the settings would prevent the window from moving forward.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if overlap and not 0 <= overlap_size < max_tokens:
        raise ValueError(
            f"overlap_size must satisfy 0 <= overlap_size < max_tokens, "
            f"got overlap_size={overlap_size}, max_tokens={max_tokens}"
        )

    tokens = tokenizer.encode(text, truncation=False)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_tokens, len(tokens))
        chunk_text = tokenizer.decode(tokens[start:end], skip_special_tokens=True).strip()
        chunks.append(chunk_text)
        # Stop once the final token has been emitted. Stepping back by the overlap
        # here would re-emit the tail of the text forever.
        if end == len(tokens):
            break
        start = end - overlap_size if overlap else end
    return chunks

def smart_chunk_by_semester(text: str, max_tokens: int = 512, chunk_overlap: bool = False, overlap_size: int = 50) -> List[str]:
    """
    Cut a study-plan text at its "<n>th year - Fall/Spring Semester" headers.

    Each section runs from one header to the next (or to the end of the text). A
    section within ``max_tokens`` becomes one chunk; a larger one is passed to
    ``split_to_token_limit``. With no header at all, the whole text is split by
    token windows. Blank or non-string input yields an empty list.

    NOTE(review): text located before the first header is not part of any chunk.
    """
    if not (isinstance(text, str) and text.strip()):
        return []

    header_regex = re.compile(r"(\d+\s?(?:st|nd|rd|th) year - (?:Fall|Spring) Semester)", re.IGNORECASE)
    header_starts = [found.start() for found in header_regex.finditer(text)]

    if not header_starts:
        print("⚠️ No semester headers found — chunking whole text instead.")
        return split_to_token_limit(text, max_tokens, overlap=chunk_overlap, overlap_size=overlap_size)

    # A section ends where the next header begins; the last one ends with the text.
    section_ends = header_starts[1:] + [len(text)]

    semester_chunks = []
    for begin, finish in zip(header_starts, section_ends):
        section_text = text[begin:finish].strip()
        if count_tokens(section_text) > max_tokens:
            semester_chunks.extend(
                split_to_token_limit(section_text, max_tokens, overlap=chunk_overlap, overlap_size=overlap_size)
            )
        else:
            semester_chunks.append(section_text)

    return semester_chunks

def chunk_study_plan_documents(
    doc_dict: Dict[str, str],
    max_tokens: int = 512,
    chunk_overlap: bool = False,
    overlap_size: int = 50
) -> List[Document]:
    """
    Turn a ``{doc_name: text}`` mapping of study plans into semester-based Documents.

    Each usable text is chunked with ``smart_chunk_by_semester`` and every chunk is
    wrapped in a Document whose ``metadata["source"]`` is the document name. Blank
    or non-string texts are skipped with a printed notice; an exception raised while
    chunking one document is printed and does not stop the remaining documents.

    Raises:
        ValueError: If ``doc_dict`` is not a dictionary.
    """
    if not isinstance(doc_dict, dict):
        raise ValueError("❌ Input must be a dictionary with document names and text.")

    study_plan_documents = []

    for name, text in doc_dict.items():
        has_text = isinstance(text, str) and bool(text.strip())
        if not has_text:
            print(f"⚠️ Skipping empty or invalid text for: {name}")
            continue

        try:
            semester_chunks = smart_chunk_by_semester(
                text,
                max_tokens=max_tokens,
                chunk_overlap=chunk_overlap,
                overlap_size=overlap_size
            )
            for semester_chunk in semester_chunks:
                study_plan_documents.append(
                    Document(page_content=semester_chunk, metadata={"source": name})
                )
        except Exception as e:
            print(f"❌ Error chunking document '{name}': {e}")

    return study_plan_documents




In [10]:
from pathlib import Path
import pickle

# Load the cleaned study-plan texts.
# NOTE(review): pickle.load can execute arbitrary code - only use it on trusted project files.
with open("../../data/Preprocessing_text/bachelors_data/dict_studyplan_cleaned.pkl", "rb") as f:
    studyplan_dict = pickle.load(f)

# `chunk_all_documents` is not defined anywhere in this notebook, so the call failed
# with NameError on a fresh kernel; the study-plan chunker is `chunk_study_plan_documents`.
docs = chunk_study_plan_documents(studyplan_dict, max_tokens=800)

print(f"Total docs: {len(docs)}")
# Guard the sample print: indexing an empty result would raise IndexError.
if docs:
    print(docs[0].metadata)

Total docs: 18
{'source': 'data-science_program_extracted_text.txt'}


In [7]:
docs

[Document(metadata={'source': 'data-science_program_extracted_text.txt'}, page_content="1 st year - Fall Semester Course Units Duration Type Contact Hours Total Workload ECTS Computers' Architecture Semester Mandatory TP - 45 | OT - 5 112 4 Foundational aspects of data science Semester Mandatory TP - 45 | OT - 5 112 4 Information Systems Semester Mandatory TP - 64,5 | S - 3 | OT - 7,5 168 6 Introduction to Programming Semester Mandatory TP - 67,5 | OT - 7,5 196 7 Linear Algebra Semester Mandatory T - 22,5 | PL - 22,5 | OT - 7,5 112 4 Show subtitles T – Theoretical Teaching; TP – Theoretical/ Practical Teaching; TC – Field Work; PL – Theoretical/ Practical Teaching; S – Seminar; OT – Tutor Group; ** – Course Unit available in both semesters;"),
 Document(metadata={'source': 'data-science_program_extracted_text.txt'}, page_content='1 st year - Spring Semester Course Units Duration Type Contact Hours Total Workload ECTS Algorithms and Data Structures Semester Mandatory TP - 67,5 | OT - 7,

In [17]:
# Sanity check: print each study-plan chunk's token count and warn when it exceeds 512.
# NOTE(review): `docs` was built with max_tokens=800 while this check uses 512 -
# confirm which budget is intended.
for i, doc in enumerate(docs):
    num_tokens = count_tokens(doc.page_content)
    print(f"Chunk {i+1}: {num_tokens} tokens")
    if num_tokens > 512:
        print("❗️WARNING: Token limit exceeded")

Chunk 1: 164 tokens
Chunk 2: 218 tokens
Chunk 3: 177 tokens
Chunk 4: 159 tokens
Chunk 5: 177 tokens
Chunk 6: 323 tokens
Chunk 7: 182 tokens
Chunk 8: 186 tokens
Chunk 9: 177 tokens
Chunk 10: 173 tokens
Chunk 11: 194 tokens
Chunk 12: 485 tokens
Chunk 13: 182 tokens
Chunk 14: 184 tokens
Chunk 15: 165 tokens
Chunk 16: 207 tokens
Chunk 17: 186 tokens
Chunk 18: 460 tokens


# Custom chunking for Main info

In [14]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Utilizador\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [5]:
import pickle
import re
from typing import List, Dict
from nltk import sent_tokenize
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# === Tokenizer Setup ===
def load_tokenizer(model_name: str = "mistralai/Mistral-7B-Instruct-v0.2") -> PreTrainedTokenizerFast:
    """
    Load the fast tokenizer for ``model_name`` through ``AutoTokenizer.from_pretrained``.

    NOTE(review): ``load_tokenizer`` is defined with this same body in several cells
    of this notebook; whichever cell ran last wins. Consider keeping a single copy.

    Raises:
        RuntimeError: wrapping whatever exception the loading call raised.
    """
    try:
        return AutoTokenizer.from_pretrained(model_name, use_fast=True)
    except Exception as e:
        raise RuntimeError(f"❌ Failed to load tokenizer: {e}")

tokenizer = load_tokenizer()

def count_tokens(text: str) -> int:
    """
    Count the token ids the module-level ``tokenizer`` produces for ``text``.

    Encoding runs with ``truncation=False``. If it raises, the error is printed and
    ``float("inf")`` is returned instead (despite the ``int`` annotation), so a
    failed count compares as larger than any token budget.
    """
    try:
        return len(tokenizer.encode(text, truncation=False))
    except Exception as e:
        print(f"⚠️ Token counting failed for text: {text[:50]}... \nError: {e}")
        return float("inf")

# === Helpers ===
def is_potential_section_header(line: str) -> bool:
    """
    Heuristically decide whether ``line`` looks like a section header.

    After stripping, a line qualifies when it is non-empty, shorter than 80
    characters, and either all upper-case, title-cased, or starts with digits
    followed by ``)``, ``.`` or whitespace.
    """
    candidate = line.strip()
    if not candidate or len(candidate) >= 80:
        return False
    if candidate.isupper() or candidate.istitle():
        return True
    return re.match(r"^\d+[\).\s]", candidate) is not None

# === Main Chunking Logic ===
def chunk_main_info_documents(
    doc_dict: Dict[str, str],
    max_tokens: int = 512,
    chunk_overlap: bool = False,
    overlap_size: int = 50
) -> List[Document]:
    """
    Split "main info" texts into sentence-aligned chunks of at most ``max_tokens`` tokens.

    Each text is first cut into sections at lines accepted by
    ``is_potential_section_header``. Every section (header plus content) is then
    sentence-tokenized and its sentences are packed greedily into chunks, so a chunk
    never spans two sections. A chunk's size is the sum of ``count_tokens`` over its
    sentences.

    Args:
        doc_dict: Mapping of source name -> raw text. Blank or non-string values are
            skipped with a printed notice, as in the other chunkers.
        max_tokens: Token budget of a single chunk.
        chunk_overlap: When True, each follow-up chunk of a section starts with
            trailing sentences of the previous chunk.
        overlap_size: Token budget for those carried-over sentences. Fewer are
            carried when they would not fit together with the next sentence.

    Returns:
        Documents whose ``metadata["source"]`` is the key from ``doc_dict``.

    Raises:
        ValueError: If a single sentence exceeds ``max_tokens``. This also covers a
            failed token count, which ``count_tokens`` reports as infinity.
    """
    chunked_documents = []

    for course_name, text in doc_dict.items():
        if not isinstance(text, str) or not text.strip():
            print(f"⚠️ Skipping empty or invalid text for: {course_name}")
            continue

        # --- Pass 1: group lines into sections, starting a new one at each header. ---
        sections = []
        current_section = {"header": None, "content": ""}

        for line in text.split("\n"):
            if is_potential_section_header(line):
                if current_section["header"] or current_section["content"].strip():
                    sections.append(current_section)
                current_section = {"header": line.strip(), "content": ""}
            else:
                current_section["content"] += line.strip() + " "

        if current_section["header"] or current_section["content"].strip():
            sections.append(current_section)

        # --- Pass 2: pack each section's sentences into token-bounded chunks. ---
        for section in sections:
            full_text = (section["header"] + "\n" if section["header"] else "") + section["content"]

            buffer = []       # (sentence, token_count) pairs of the chunk being built
            token_count = 0   # running total of the token counts in buffer

            for sentence in sent_tokenize(full_text):
                sentence_tokens = count_tokens(sentence)

                if sentence_tokens > max_tokens:
                    raise ValueError(
                        f"❌ Sentence too large to fit in a chunk (size={sentence_tokens} tokens, max={max_tokens}):\n{sentence}"
                    )

                if token_count + sentence_tokens > max_tokens:
                    # The sentence does not fit: emit the current chunk first.
                    chunk_text = " ".join(s for s, _ in buffer).strip()
                    chunked_documents.append(
                        Document(page_content=chunk_text, metadata={"source": course_name})
                    )

                    # Handle overlap: walk backwards over the emitted chunk, carrying
                    # sentences while they stay within overlap_size AND still leave
                    # room for the pending sentence. The second condition guarantees
                    # forward progress; without it an overlap too large to share a
                    # chunk with the next sentence would be re-emitted forever.
                    carried = []
                    carried_tokens = 0
                    if chunk_overlap:
                        for prev_sentence, prev_tokens in reversed(buffer):
                            grown = carried_tokens + prev_tokens
                            if grown > overlap_size or grown + sentence_tokens > max_tokens:
                                break
                            carried.insert(0, (prev_sentence, prev_tokens))
                            carried_tokens = grown
                    buffer = carried
                    token_count = carried_tokens

                # Here the sentence is guaranteed to fit.
                buffer.append((sentence, sentence_tokens))
                token_count += sentence_tokens

            if buffer:
                chunk_text = " ".join(s for s, _ in buffer).strip()
                chunked_documents.append(
                    Document(page_content=chunk_text, metadata={"source": course_name})
                )

    return chunked_documents


In [None]:
# Load the cleaned main-info texts.
# NOTE(review): pickle.load can execute arbitrary code - only use it on trusted project files.
with open("../../data/Preprocessing_text/bachelors_data/dict_maininfo_cleaned.pkl", "rb") as f:
    maininfo_dict = pickle.load(f)

# `chunk_dict_maininfo` is not defined anywhere in this notebook, so the call failed
# with NameError on a fresh kernel; the main-info chunker is `chunk_main_info_documents`.
maininfo_chunks = chunk_main_info_documents(maininfo_dict, max_tokens=800, chunk_overlap=True, overlap_size=200)

print(f"Total docs: {len(maininfo_chunks)}")
# Guard the sample print: indexing an empty result would raise IndexError.
if maininfo_chunks:
    print(maininfo_chunks[0].metadata)

Total docs: 19
{'source': 'data-science_main_course_extracted_text.txt'}


In [40]:
print(maininfo_chunks[0].page_content)

Text from https://www.novaims.unl.pt/en///bachelor-s-degrees/data-science/: Data Science Degree in Data Science en Bachelor's Degrees Data Science In the Bachelor´s Degree in Data Science, students learn the most modern techniques of artificial intelligence and machine learning to analyze large volumes of data (Big Data). They will become true data scientists - considered the sexiest profession of the 21 st century by the Harvard Business Review. The main objective of this course is to train future professionals capable of understanding, developing and using models, algorithms and the most advanced techniques in data science, to analyze and extract knowledge from Big Data. The 3 rd phase of applications under the International Student Statute for the 2025/26 academic year are open from February 26 th to March 27 th , 2025. Duration 3 years (6 semesters) Timetable Daytime Start September 2025 Career Opportunities The Bachelor´s Degree in Data Science allows a quick integration in the mo

In [41]:
# Sanity check: print each main-info chunk's token count and warn when it exceeds
# 800, the max_tokens value used to build maininfo_chunks.
for i, doc in enumerate(maininfo_chunks):
    num_tokens = count_tokens(doc.page_content)
    print(f"Chunk {i+1}: {num_tokens} tokens")
    if num_tokens > 800:
        print("❗️WARNING: Token limit exceeded")

Chunk 1: 749 tokens
Chunk 2: 756 tokens
Chunk 3: 779 tokens
Chunk 4: 775 tokens
Chunk 5: 775 tokens
Chunk 6: 294 tokens
Chunk 7: 772 tokens
Chunk 8: 772 tokens
Chunk 9: 763 tokens
Chunk 10: 755 tokens
Chunk 11: 776 tokens
Chunk 12: 355 tokens
Chunk 13: 771 tokens
Chunk 14: 751 tokens
Chunk 15: 651 tokens
Chunk 16: 764 tokens
Chunk 17: 776 tokens
Chunk 18: 775 tokens
Chunk 19: 319 tokens


# Testing data_chunking.py

In [6]:
from libs.settings import data_catalog as dc
import libs.data_extraction as de

In [7]:
cleaned_data = de.load_all_programs_dict_textfiles(dc.HP_PATH_CLEANED_DOCS_DICTS)
cleaned_data

{'dict_bs_maininfo_raw_cleaned': {'data-science_main_course_extracted_text.txt': "Text from https://www.novaims.unl.pt/en///bachelor-s-degrees/data-science/: Data Science Degree in Data Science en Bachelor's Degrees Data Science In the Bachelor´s Degree in Data Science, students learn the most modern techniques of artificial intelligence and machine learning to analyze large volumes of data (Big Data). They will become true data scientists - considered the sexiest profession of the 21 st century by the Harvard Business Review. The main objective of this course is to train future professionals capable of understanding, developing and using models, algorithms and the most advanced techniques in data science, to analyze and extract knowledge from Big Data. The 3 rd phase of applications under the International Student Statute for the 2025/26 academic year are open from February 26 th to March 27 th , 2025. Duration 3 years (6 semesters) Timetable Daytime Start September 2025 Career Opport

In [8]:
# Keep only the first (name, dict) entry of cleaned_data, as a one-item dict, so the
# routing pipeline can be tried on a small input. next() raises StopIteration if
# cleaned_data is empty.
first_dict = dict([next(iter(cleaned_data.items()))])
first_dict

{'dict_bs_maininfo_raw_cleaned': {'data-science_main_course_extracted_text.txt': "Text from https://www.novaims.unl.pt/en///bachelor-s-degrees/data-science/: Data Science Degree in Data Science en Bachelor's Degrees Data Science In the Bachelor´s Degree in Data Science, students learn the most modern techniques of artificial intelligence and machine learning to analyze large volumes of data (Big Data). They will become true data scientists - considered the sexiest profession of the 21 st century by the Harvard Business Review. The main objective of this course is to train future professionals capable of understanding, developing and using models, algorithms and the most advanced techniques in data science, to analyze and extract knowledge from Big Data. The 3 rd phase of applications under the International Student Statute for the 2025/26 academic year are open from February 26 th to March 27 th , 2025. Duration 3 years (6 semesters) Timetable Daytime Start September 2025 Career Opport

In [9]:
# Maps a (degree, doc_type) routing key to the chunker used for that kind of document.
# Both degree levels currently share the same three chunkers.
CHUNKER_REGISTRY = {
    ("bachelor", "maininfo"): chunk_main_info_documents,
    ("bachelor", "studyplan"): chunk_study_plan_documents,
    ("bachelor", "teachingstaff"): chunk_teaching_staff_documents,
    ("postgrad_master", "maininfo"): chunk_main_info_documents,
    ("postgrad_master", "studyplan"): chunk_study_plan_documents,
    ("postgrad_master", "teachingstaff"): chunk_teaching_staff_documents,
}

def extract_routing_info(dict_name: str) -> tuple[str, str]:
    """
    Derive the ``(degree, doc_type)`` routing key from a cleaned-dict name.

    The name must start with ``dict_<bs|pm>_<doc_type>_raw_cleaned``; ``bs`` maps
    to ``"bachelor"`` and ``pm`` to ``"postgrad_master"``.

    Raises:
        ValueError: If ``dict_name`` does not follow that format.
    """
    parsed = re.match(r"dict_(bs|pm)_(\w+?)_raw_cleaned", dict_name)
    if parsed is None:
        raise ValueError(f"Unexpected dict name format: {dict_name}")

    degree_code = parsed.group(1)
    doc_type = parsed.group(2)

    degree_names = {"bs": "bachelor", "pm": "postgrad_master"}
    degree = degree_names[degree_code] if degree_code in degree_names else "unknown"

    return degree, doc_type

In [20]:
for dict_name, doc_dict in cleaned_data.items():
    print(dict_name)
    print(doc_dict)

dict_bs_maininfo_raw_cleaned
{'data-science_main_course_extracted_text.txt': "Text from https://www.novaims.unl.pt/en///bachelor-s-degrees/data-science/: Data Science Degree in Data Science en Bachelor's Degrees Data Science In the Bachelor´s Degree in Data Science, students learn the most modern techniques of artificial intelligence and machine learning to analyze large volumes of data (Big Data). They will become true data scientists - considered the sexiest profession of the 21 st century by the Harvard Business Review. The main objective of this course is to train future professionals capable of understanding, developing and using models, algorithms and the most advanced techniques in data science, to analyze and extract knowledge from Big Data. The 3 rd phase of applications under the International Student Statute for the 2025/26 academic year are open from February 26 th to March 27 th , 2025. Duration 3 years (6 semesters) Timetable Daytime Start September 2025 Career Opportunit

In [10]:
from typing import Callable, Dict, Tuple
from langchain.schema import Document  # or wherever you're importing from
import re


# --- Dynamic chunking entrypoint ---
def route_and_chunk_documents(
    docs_by_type: dict,
    max_tokens: int = 512,
    chunk_overlap: bool = True,
    overlap_size: int = 1,
) -> list[Document]:
    """
    Route every cleaned dict to its chunker and collect the resulting Documents.

    Args:
        docs_by_type: Mapping of dict name -> ``{file_name: raw_text}``. The dict
            name is parsed by ``extract_routing_info`` and looked up in
            ``CHUNKER_REGISTRY``.
        max_tokens: Token budget forwarded to every chunker.
        chunk_overlap: Overlap switch forwarded to every chunker.
        overlap_size: Overlap amount forwarded to every chunker.
            NOTE(review): its unit depends on the chunker - the teaching-staff
            chunker treats it as a number of blocks, while the study-plan and
            main-info chunkers compare it against token counts.

    Returns:
        All chunks, in the iteration order of ``docs_by_type`` and of each inner dict.
        Dicts that cannot be routed are skipped with a printed notice; blank or
        non-string texts are skipped silently.
    """
    all_docs = []

    for dict_name, doc_dict in docs_by_type.items():
        try:
            routing_key = extract_routing_info(dict_name)
        except Exception as e:
            print(f"⚠️ Skipping {dict_name}: {e}")
            continue

        # Explicit lookup so a missing entry produces a readable message instead of
        # the bare key that a KeyError would print.
        chunk_fn = CHUNKER_REGISTRY.get(routing_key)
        if chunk_fn is None:
            print(f"⚠️ Skipping {dict_name}: no chunker registered for {routing_key}")
            continue

        for file_name, raw_text in doc_dict.items():
            if not isinstance(raw_text, str) or not raw_text.strip():
                continue

            # TODO: attach inferred metadata once the chunkers accept a metadata argument.
            chunks = chunk_fn(
                doc_dict={file_name: raw_text},
                max_tokens=max_tokens,
                chunk_overlap=chunk_overlap,
                overlap_size=overlap_size
            )
            all_docs.extend(chunks)

    return all_docs


In [11]:
# Step 2: Call the routing + chunking pipeline
langchain_docs = route_and_chunk_documents(first_dict)


In [13]:
# Preview every routed chunk: the first 300 characters of its content and its metadata.
for doc in langchain_docs:
    print("-----")
    print("Content:\n", doc.page_content[:300], "...")
    print("Metadata:", doc.metadata)

-----
Content:
 Text from https://www.novaims.unl.pt/en///bachelor-s-degrees/data-science/: Data Science Degree in Data Science en Bachelor's Degrees Data Science In the Bachelor´s Degree in Data Science, students learn the most modern techniques of artificial intelligence and machine learning to analyze large volu ...
Metadata: {'source': 'data-science_main_course_extracted_text.txt'}
-----
Content:
 Program Goals Program Structure Admissions and Fees Achievement Awards Teaching Staff Tuition Fee Scholarships Calendar, Timetable and Exams FAQ Regulations and Accreditations Who is it for? This degree is intended for all final-year students who aspire to become data scientists. Thus, the study cyc ...
Metadata: {'source': 'data-science_main_course_extracted_text.txt'}
-----
Content:
 See study plan Admissions and Fees Establishment Code: 0906 - Universidade Nova de Lisboa - Instituto Superior de Estatística e Gestão de Informação Course Code: L188 Entry Exam Subjects (One of the followi