In [1]:
import os
# Move to Thesis directory (two levels up)
os.chdir(os.path.abspath(os.path.join("..", "..")))

# Move to model/src if it exists
model_dir = os.path.join(os.getcwd(), "model", "src")
if os.path.exists(model_dir):
    os.chdir(model_dir)

print("Current Directory:", os.getcwd())

Current Directory: c:\Users\1176153\Downloads\github\Thesis\model\src


In [2]:
from huggingface_hub import login

# Replace with your actual token
login(token="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")

In [3]:
from langchain_core.documents import Document
from libs import data_handeling as dh
from libs.settings import data_catalog as dc

In [None]:
bachelors_data_clenaed = dh.load_pickle_to_dict(dc.BACHELORS_DATA_CLEANED)
bachelors_data_clenaed

{'bachelor_data-science_teaching-staff_text.txt': {'text': "Text from https://www.novaims.unl.pt/en//programs/bachelor-s-degrees/data-science/teaching-staff/#: Teaching Staff en Programs Bachelor's Degrees Data Science Teaching Staff Américo Rio Invited Assistant Professor americo.rio@novaims.unl.pt Ana Cristina Costa Associate Professor cristina@novaims.unl.pt Artur Varanda Adjunct Lecturer avaranda@novaims.unl.pt Augusto Santos Assistant Professor ajrsantos@novaims.unl.pt Bruno Damásio Assistant Professor bdamasio@novaims.unl.pt Carina Albuquerque Assistant Professor calbuquerque@novaims.unl.pt Carolina Maria Shaul Adjunct Lecturer cshaul@novaims.unl.pt Carolina Santos Maximiano Adjunct Lecturer cmaximiano@novaims.unl.pt Carolina Vasconcelos Invited Teaching Assistant cvasconcelos@novaims.unl.pt Catarina Neves Assistant Professor cneves@novaims.unl.pt Catarina Palha Invited Teaching Assistant cpalha@novaims.unl.pt Dhruv Akshay Pandit Invited Teaching Assistant dpandit@novaims.unl.pt 

# TOKENIZATION MATTERS:
from transformers import AutoTokenizer

- For mistral: 

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", use_fast=True)

- for Llama:

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", use_fast=True)




In [14]:
import re
from typing import List, Dict
from langchain_core.documents import Document
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# Load tokenizer globally
def load_tokenizer(model_name: str = "mistralai/Mistral-7B-Instruct-v0.2") -> PreTrainedTokenizerFast:
    try:
        return AutoTokenizer.from_pretrained(model_name, use_fast=True)
    except Exception as e:
        raise RuntimeError(f"❌ Failed to load tokenizer: {e}")

tokenizer = load_tokenizer()

def count_tokens(text: str) -> int:
    try:
        return len(tokenizer.encode(text, truncation=False))
    except Exception as e:
        print(f"⚠️ Token counting failed for text: {text[:50]}... \nError: {e}")
        return float("inf")

def extract_teaching_staff_blocks(text: str) -> List[str]:
    """
    Extracts blocks of teaching staff information in the format:
    Name\nTitle\nEmail
    """
    pattern = r"([A-Z][a-zA-Zçéàèíóõâêã]+\s+[A-Z][a-zA-Zçéàèíóõâêã]+.*?)\s([\w\.\s]+?)\s([\w\.-]+@novaims\.unl\.pt)"
    matches = re.findall(pattern, text)
    return [f"{name}\n{title}\n{email}\n\n" for name, title, email in matches]

def group_blocks_by_token_limit(
    blocks: List[str],
    max_token_size: int,
    chunk_overlap: bool = False,
    overlap_size: int = 0
) -> List[str]:
    """
    Groups extracted blocks into text chunks not exceeding max_token_size tokens.
    Optionally includes overlapping blocks between consecutive chunks.
    """
    grouped_chunks = []
    current_chunk = []
    current_token_count = 0
    i = 0

    while i < len(blocks):
        block = blocks[i]
        token_count = count_tokens(block)

        if token_count > max_token_size:
            raise ValueError(
                f"❌ Block too large to fit in a chunk (size={token_count} tokens, max={max_token_size}):\n{block}"
            )

        if current_token_count + token_count > max_token_size:
            if current_chunk:
                grouped_chunks.append("".join(current_chunk).strip())

                if chunk_overlap and overlap_size > 0:
                    # Keep last N blocks as overlap
                    current_chunk = current_chunk[-overlap_size:]
                    current_token_count = sum(count_tokens(b) for b in current_chunk)
                else:
                    current_chunk = []
                    current_token_count = 0
        else:
            current_chunk.append(block)
            current_token_count += token_count
            i += 1

    if current_chunk:
        grouped_chunks.append("".join(current_chunk).strip())

    return grouped_chunks

from typing import List, Dict, Optional, Any
from langchain.schema import Document

def chunk_teaching_staff_documents(
    data: Dict[str, Dict],
    max_tokens: int = 512,
    chunk_overlap: bool = False,
    overlap_size: int = 0,
    course_names_to_include: Optional[List[str]] = None,
    doc_types_to_include: Optional[List[str]] = None,
    include_metadata: bool = True,
    extra_metadata: Optional[Dict[str, Any]] = None,
) -> List[Document]:
    """
    Chunk teaching staff documents into LangChain Documents with optional filters and metadata injection.

    Parameters:
        data: Dict of {filename: {'text': ..., 'metadata': {...}}}
        max_tokens: Max token length for chunks
        chunk_overlap: Whether to overlap chunks
        overlap_size: Overlap size (in blocks)
        course_names_to_include: Optional filter by course name
        doc_types_to_include: Optional filter by doc_type
        include_metadata: If True, include existing metadata
        extra_metadata: Optional extra metadata to attach to each chunk

    Returns:
        List of LangChain Document objects

    Example: 
        chunked_docs = chunk_teaching_staff_documents(
        data=my_data,
        max_tokens=512,
        chunk_overlap=False,
        overlap_size=0,
        doc_types_to_include=["teaching_staff"],
        course_names_to_include=["Data Science"],
        extra_metadata={"chunked_by": "teaching_staff_function", "version": "v1.2"},
        include_metadata=True
)
    """
    all_docs = []

    for file_name, file_data in data.items():
        text = file_data.get("text", "")
        metadata = file_data.get("metadata", {})

        if not isinstance(text, str) or not text.strip():
            print(f"⚠️ Skipping empty or invalid text for: {file_name}")
            continue

        course_name = metadata.get("course_name", "")
        doc_type = metadata.get("doc_type", "")

        if course_names_to_include and course_name not in course_names_to_include:
            continue
        if doc_types_to_include and doc_type not in doc_types_to_include:
            continue

        try:
            blocks = extract_teaching_staff_blocks(text)
            grouped_chunks = group_blocks_by_token_limit(
                blocks,
                max_token_size=max_tokens,
                chunk_overlap=chunk_overlap,
                overlap_size=overlap_size
            )

            for chunk in grouped_chunks:
                doc_metadata = {"source": file_name}

                if include_metadata:
                    doc_metadata.update(metadata)

                if extra_metadata:
                    doc_metadata.update(extra_metadata)

                all_docs.append(Document(page_content=chunk, metadata=doc_metadata))

        except Exception as e:
            print(f"❌ Error processing teaching staff in '{file_name}': {e}")

    return all_docs






In [None]:
teachingstaff_chunks = chunk_teaching_staff_documents(data = bachelors_data_clenaed,
                                                        max_tokens=512,
                                                        chunk_overlap = False,
                                                        overlap_size= 0,
                                                        course_names_to_include = None,
                                                        doc_types_to_include = ['teaching_staff'],
                                                        include_metadata= False,
                                                        extra_metadata = None
                                                        )
                                                    

In [16]:
teachingstaff_chunks

[Document(metadata={'source': 'bachelor_data-science_teaching-staff_text.txt', 'degree': 'bachelor', 'doc_type': 'teaching_staff', 'course_name': 'Data Science'}, page_content="Teaching Staff en Programs Bachelor's\nDegrees Data Science Teaching Staff Américo Rio Invited Assistant Professor\namerico.rio@novaims.unl.pt\n\nAna Cristina\nCosta Associate Professor\ncristina@novaims.unl.pt\n\nArtur Varanda\nAdjunct Lecturer\navaranda@novaims.unl.pt\n\nAugusto Santos\nAssistant Professor\najrsantos@novaims.unl.pt\n\nBruno Damásio\nAssistant Professor\nbdamasio@novaims.unl.pt\n\nCarina Albuquerque\nAssistant Professor\ncalbuquerque@novaims.unl.pt\n\nCarolina Maria\nShaul Adjunct Lecturer\ncshaul@novaims.unl.pt\n\nCarolina Santos\nMaximiano Adjunct Lecturer\ncmaximiano@novaims.unl.pt\n\nCarolina Vasconcelos\nInvited Teaching Assistant\ncvasconcelos@novaims.unl.pt\n\nCatarina Neves\nAssistant Professor\ncneves@novaims.unl.pt\n\nCatarina Palha\nInvited Teaching Assistant\ncpalha@novaims.unl.pt\n

In [11]:
print(teachingstaff_chunks[0].page_content)

Teaching Staff en Programs Bachelor's
Degrees Data Science Teaching Staff Américo Rio Invited Assistant Professor
americo.rio@novaims.unl.pt

Ana Cristina
Costa Associate Professor
cristina@novaims.unl.pt

Artur Varanda
Adjunct Lecturer
avaranda@novaims.unl.pt

Augusto Santos
Assistant Professor
ajrsantos@novaims.unl.pt

Bruno Damásio
Assistant Professor
bdamasio@novaims.unl.pt

Carina Albuquerque
Assistant Professor
calbuquerque@novaims.unl.pt

Carolina Maria
Shaul Adjunct Lecturer
cshaul@novaims.unl.pt

Carolina Santos
Maximiano Adjunct Lecturer
cmaximiano@novaims.unl.pt

Carolina Vasconcelos
Invited Teaching Assistant
cvasconcelos@novaims.unl.pt

Catarina Neves
Assistant Professor
cneves@novaims.unl.pt

Catarina Palha
Invited Teaching Assistant
cpalha@novaims.unl.pt

Dhruv Akshay
Pandit Invited Teaching Assistant
dpandit@novaims.unl.pt

Diogo Rasteiro
Research Assistant
drasteiro@novaims.unl.pt

Fernando Bação
Full Professor
bacao@novaims.unl.pt

Filipe Marques
Professor of the Prac

In [12]:
for i, doc in enumerate(teachingstaff_chunks):
    num_tokens = count_tokens(doc.page_content)
    print(f"Chunk {i+1}: {num_tokens} tokens")
    if num_tokens > 512:
        print("❗️WARNING: Token limit exceeded")

Chunk 1: 487 tokens
Chunk 2: 482 tokens
Chunk 3: 331 tokens
Chunk 4: 487 tokens
Chunk 5: 501 tokens
Chunk 6: 495 tokens
Chunk 7: 81 tokens
Chunk 8: 488 tokens
Chunk 9: 484 tokens
Chunk 10: 463 tokens


# Costum chunking for Study plan

- Chunking by year and semestres and having a argument to set the total number of tokens to have in each document. Optional Chunk_overlap 

In [17]:
import re
from typing import List, Dict
from langchain_core.documents import Document
from transformers import AutoTokenizer, PreTrainedTokenizerFast

'''
# Without overlap (default)
docs = chunk_all_documents(my_texts_dict, max_tokens=512)

# With 20-token overlap between chunks
docs = chunk_all_documents(my_texts_dict, max_tokens=512, chunk_overlap=True, overlap_size=20)
'''

def load_tokenizer(model_name: str = "mistralai/Mistral-7B-Instruct-v0.2") -> PreTrainedTokenizerFast:
    try:
        return AutoTokenizer.from_pretrained(model_name, use_fast=True)
    except Exception as e:
        raise RuntimeError(f"❌ Failed to load tokenizer: {e}")

def count_tokens(text: str) -> int:
    try:
        return len(tokenizer.encode(text, truncation=False))
    except Exception as e:
        print(f"⚠️ Token counting failed for text: {text[:50]}... \nError: {e}")
        return float("inf")

def split_to_token_limit(text: str, max_tokens: int, overlap: bool = False, overlap_size: int = 50) -> List[str]:
    tokens = tokenizer.encode(text, truncation=False)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_tokens, len(tokens))
        chunk_tokens = tokens[start:end]
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True).strip()
        chunks.append(chunk_text)
        start = end - overlap_size if overlap else end
    return chunks

def smart_chunk_by_semester(text: str, max_tokens: int = 512, chunk_overlap: bool = False, overlap_size: int = 50) -> List[str]:
    if not isinstance(text, str) or not text.strip():
        return []

    pattern = re.compile(r"(\d+\s?(?:st|nd|rd|th) year - (?:Fall|Spring) Semester)", re.IGNORECASE)
    matches = list(pattern.finditer(text))

    if not matches:
        print("⚠️ No semester headers found — chunking whole text instead.")
        return split_to_token_limit(text, max_tokens, overlap=chunk_overlap, overlap_size=overlap_size)

    chunks = []
    for i, match in enumerate(matches):
        start = match.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        chunk_text = text[start:end].strip()

        if count_tokens(chunk_text) <= max_tokens:
            chunks.append(chunk_text)
        else:
            chunks.extend(split_to_token_limit(chunk_text, max_tokens, overlap=chunk_overlap, overlap_size=overlap_size))

    return chunks

def chunk_study_plan_documents(
    data: Dict[str, Dict],
    max_tokens: int = 512,
    chunk_overlap: bool = False,
    overlap_size: int = 0,
    course_names_to_include: Optional[List[str]] = None,
    doc_types_to_include: Optional[List[str]] = None,
    include_metadata: bool = True,
    extra_metadata: Optional[Dict[str, Any]] = None,
) -> List[Document]:
    all_docs = []

    for file_name, file_data in data.items():
        text = file_data.get("text", "")
        metadata = file_data.get("metadata", {})

        if not isinstance(text, str) or not text.strip():
            print(f"⚠️ Skipping empty or invalid text for: {file_name}")
            continue

        course_name = metadata.get("course_name", "")
        doc_type = metadata.get("doc_type", "")

        if course_names_to_include and course_name not in course_names_to_include:
            continue
        if doc_types_to_include and doc_type not in doc_types_to_include:
            continue

        try:
            chunks = smart_chunk_by_semester(
                text,
                max_tokens=max_tokens,
                chunk_overlap=chunk_overlap,
                overlap_size=overlap_size
            )
            for chunk in chunks:
                doc_metadata = {"source": file_name}

                if include_metadata:
                    doc_metadata.update(metadata)

                if extra_metadata:
                    doc_metadata.update(extra_metadata)

                all_docs.append(Document(page_content=chunk, metadata=doc_metadata))

        except Exception as e:
            print(f"❌ Error chunking study plan in '{file_name}': {e}")

    return all_docs




In [19]:
chunked_study_docs = chunk_study_plan_documents(
    data=bachelors_data_clenaed,
    max_tokens=800,
    chunk_overlap=False,
    overlap_size=0,
    course_names_to_include=None,
    doc_types_to_include=["study_plan"],
    extra_metadata=None,
    include_metadata=True
)
chunked_study_docs

[Document(metadata={'source': 'bachelor_data-science_studyplan_text.txt', 'degree': 'bachelor', 'doc_type': 'study_plan', 'course_name': 'Data Science'}, page_content="1 st year - Fall Semester Course Units Duration Type Contact Hours Total Workload ECTS Computers' Architecture Semester Mandatory TP - 45 | OT - 5 112 4 Foundational aspects of data science Semester Mandatory TP - 45 | OT - 5 112 4 Information Systems Semester Mandatory TP - 64,5 | S - 3 | OT - 7,5 168 6 Introduction to Programming Semester Mandatory TP - 67,5 | OT - 7,5 196 7 Linear Algebra Semester Mandatory T - 22,5 | PL - 22,5 | OT - 7,5 112 4 Show subtitles T – Theoretical Teaching; TP – Theoretical/ Practical Teaching; TC – Field Work; PL – Theoretical/ Practical Teaching; S – Seminar; OT – Tutor Group; ** – Course Unit available in both semesters;"),
 Document(metadata={'source': 'bachelor_data-science_studyplan_text.txt', 'degree': 'bachelor', 'doc_type': 'study_plan', 'course_name': 'Data Science'}, page_content

In [20]:
print(f"Total docs: {len(chunked_study_docs)}")

Total docs: 18


In [22]:
for i, doc in enumerate(chunked_study_docs):
    num_tokens = count_tokens(doc.page_content)
    print(f"Chunk {i+1}: {num_tokens} tokens")
    if num_tokens > 800:
        print("❗️WARNING: Token limit exceeded")

Chunk 1: 243 tokens
Chunk 2: 323 tokens
Chunk 3: 253 tokens
Chunk 4: 235 tokens
Chunk 5: 266 tokens
Chunk 6: 493 tokens
Chunk 7: 271 tokens
Chunk 8: 276 tokens
Chunk 9: 260 tokens
Chunk 10: 257 tokens
Chunk 11: 283 tokens
Chunk 12: 748 tokens
Chunk 13: 273 tokens
Chunk 14: 272 tokens
Chunk 15: 246 tokens
Chunk 16: 300 tokens
Chunk 17: 274 tokens
Chunk 18: 713 tokens


# Costum chunking for Main info

In [14]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Utilizador\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

**OLD**

In [59]:
import pickle
import re
from typing import List, Dict
from nltk import sent_tokenize
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# === Tokenizer Setup ===
def load_tokenizer(model_name: str = "mistralai/Mistral-7B-Instruct-v0.2") -> PreTrainedTokenizerFast:
    try:
        return AutoTokenizer.from_pretrained(model_name, use_fast=True)
    except Exception as e:
        raise RuntimeError(f"❌ Failed to load tokenizer: {e}")

tokenizer = load_tokenizer()

def count_tokens(text: str) -> int:
    try:
        return len(tokenizer.encode(text, truncation=False))
    except Exception as e:
        print(f"⚠️ Token counting failed for text: {text[:50]}... \nError: {e}")
        return float("inf")

# === Helpers ===
def is_potential_section_header(line: str) -> bool:
    line = line.strip()
    if not line:
        return False
    return (
        len(line) < 80 and (
            line.isupper() or
            line.istitle() or
            bool(re.match(r"^\d+[\).\s]", line))
        )
    )

def chunk_main_info_documents(
    data: Dict[str, Dict],
    max_tokens: int = 512,
    chunk_overlap: bool = False,
    overlap_size: int = 50,
    course_names_to_include: Optional[List[str]] = None,
    doc_types_to_include: Optional[List[str]] = None,
    include_metadata: bool = True,
    extra_metadata: Optional[Dict[str, Any]] = None,
) -> List[Document]:
    documents = []

    for filename, content_dict in data.items():
        text = content_dict.get("text", "")
        metadata = content_dict.get("metadata", {})

        course_name = metadata.get("course_name", "")
        doc_type = metadata.get("doc_type", "")

        if course_names_to_include and course_name not in course_names_to_include:
            continue
        if doc_types_to_include and doc_type not in doc_types_to_include:
            continue
        if not isinstance(text, str) or not text.strip():
            print(f"⚠️ Skipping empty or invalid text in: {filename}")
            continue

        lines = text.split("\n")
        sections = []
        current_section = {"header": None, "content": ""}

        for line in lines:
            if is_potential_section_header(line):
                if current_section["header"] or current_section["content"].strip():
                    sections.append(current_section)
                current_section = {"header": line.strip(), "content": ""}
            else:
                current_section["content"] += line.strip() + " "

        if current_section["header"] or current_section["content"].strip():
            sections.append(current_section)

        for section in sections:
            full_text = (section["header"] + "\n" if section["header"] else "") + section["content"]
            sentences = sent_tokenize(full_text)

            buffer = []
            token_count = 0
            i = 0

            while i < len(sentences):
                sentence = sentences[i]
                sentence_tokens = count_tokens(sentence)

                if sentence_tokens > max_tokens:
                    raise ValueError(
                        f"❌ Sentence too large to fit in a chunk (size={sentence_tokens} tokens, max={max_tokens}):\n{sentence}"
                    )

                if token_count + sentence_tokens <= max_tokens:
                    buffer.append(sentence)
                    token_count += sentence_tokens
                    i += 1
                else:
                    chunk_text = " ".join(buffer).strip()
                    doc_metadata = {"source": filename}
                    if include_metadata:
                        doc_metadata.update(metadata)
                        if extra_metadata:
                            doc_metadata.update(extra_metadata)
                    documents.append(Document(page_content=chunk_text, metadata=doc_metadata))

                    # Handle overlap
                    if chunk_overlap:
                        overlap_buffer = []
                        overlap_token_count = 0
                        for s in reversed(buffer):
                            s_tokens = count_tokens(s)
                            if overlap_token_count + s_tokens <= overlap_size:
                                overlap_buffer.insert(0, s)
                                overlap_token_count += s_tokens
                            else:
                                break
                        buffer = overlap_buffer
                        token_count = overlap_token_count
                    else:
                        buffer = []
                        token_count = 0

            if buffer:
                chunk_text = " ".join(buffer).strip()
                doc_metadata = {"source": filename}
                if include_metadata:
                    doc_metadata.update(metadata)
                    if extra_metadata:
                        doc_metadata.update(extra_metadata)
                documents.append(Document(page_content=chunk_text, metadata=doc_metadata))

    return documents

In [88]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from typing import List, Dict, Optional, Any
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# === Tokenizer Setup ===
def load_tokenizer(model_name: str = "mistralai/Mistral-7B-Instruct-v0.2") -> PreTrainedTokenizerFast:
    try:
        return AutoTokenizer.from_pretrained(model_name, use_fast=True)
    except Exception as e:
        raise RuntimeError(f"❌ Failed to load tokenizer: {e}")

tokenizer = load_tokenizer()

def count_tokens(text: str) -> int:
    try:
        return len(tokenizer.encode(text, truncation=False))
    except Exception as e:
        print(f"⚠️ Token counting failed for text: {text[:50]}... \nError: {e}")
        return float("inf")

# === RecursiveCharacterTextSplitter Setup ===
def get_text_splitter(max_tokens: int, chunk_overlap: int) -> RecursiveCharacterTextSplitter:
    return RecursiveCharacterTextSplitter(
        chunk_size=max_tokens,  # Max number of characters per chunk
        chunk_overlap=chunk_overlap,  # Number of overlapping characters between chunks
        length_function=len,  # Function to compute the length of each chunk (by character count)
        separators=["\n", "\n\n", ".", "!", "?"]  # Prefer breaking on these separators for better semantic breaks
    )

# === Main Chunking Function ===
def chunk_main_info_documents(
    data: Dict[str, Dict],
    max_tokens: int = 512,
    chunk_overlap: bool = False,
    overlap_size: int = 50,
    course_names_to_include: Optional[List[str]] = None,
    doc_types_to_include: Optional[List[str]] = None,
    include_metadata: bool = True,
    extra_metadata: Optional[Dict[str, Any]] = None,
) -> List[Document]:
    documents = []
    text_splitter = get_text_splitter(max_tokens, overlap_size if chunk_overlap else 0)

    for filename, content_dict in data.items():
        text = content_dict.get("text", "")
        metadata = content_dict.get("metadata", {})

        course_name = metadata.get("course_name", "")
        doc_type = metadata.get("doc_type", "")

        if course_names_to_include and course_name not in course_names_to_include:
            continue
        if doc_types_to_include and doc_type not in doc_types_to_include:
            continue
        if not isinstance(text, str) or not text.strip():
            print(f"⚠️ Skipping empty or invalid text in: {filename}")
            continue

        # Split the full text using RecursiveCharacterTextSplitter
        chunks = text_splitter.split_text(text)

        # Create Document objects for each chunk
        for chunk in chunks:
            doc_metadata = {"source": filename}
            if include_metadata:
                doc_metadata.update(metadata)
                if extra_metadata:
                    doc_metadata.update(extra_metadata)
            
            documents.append(Document(page_content=chunk, metadata=doc_metadata))

    return documents




In [89]:
chunked_maininfo_docs = chunk_main_info_documents(
    data=bachelors_data_clenaed,
    max_tokens=1000,
    chunk_overlap=True,
    overlap_size=100,
    course_names_to_include=None,
    doc_types_to_include=["main_info"],
    include_metadata=False,
    extra_metadata=None
)
chunked_maininfo_docs

[Document(metadata={'source': 'bachelor_data-science_main_course_text.txt'}, page_content="Text from https://www.novaims.unl.pt/en///bachelor-s-degrees/data-science/: Data Science Degree in Data Science en Bachelor's Degrees Data Science In the Bachelor´s Degree in Data Science, students learn the most modern techniques of artificial intelligence and machine learning to analyze large volumes of data (Big Data). They will become true data scientists - considered the sexiest profession of the 21 st century by the Harvard Business Review. The main objective of this course is to train future professionals capable of understanding, developing and using models, algorithms and the most advanced techniques in data science, to analyze and extract knowledge from Big Data. The 3 rd phase of applications under the International Student Statute for the 2025/26 academic year are open from February 26 th to March 27 th , 2025"),
 Document(metadata={'source': 'bachelor_data-science_main_course_text.tx

In [90]:

print(f"Total docs: {len(chunked_maininfo_docs)}")


Total docs: 53


In [91]:
for i, doc in enumerate(chunked_maininfo_docs):
    num_tokens = count_tokens(doc.page_content)
    print(f"Chunk {i+1}: {num_tokens} tokens")
    if num_tokens > 800:
        print("❗️WARNING: Token limit exceeded")

Chunk 1: 188 tokens
Chunk 2: 203 tokens
Chunk 3: 167 tokens
Chunk 4: 149 tokens
Chunk 5: 60 tokens
Chunk 6: 157 tokens
Chunk 7: 108 tokens
Chunk 8: 310 tokens
Chunk 9: 243 tokens
Chunk 10: 242 tokens
Chunk 11: 213 tokens
Chunk 12: 150 tokens
Chunk 13: 383 tokens
Chunk 14: 221 tokens
Chunk 15: 307 tokens
Chunk 16: 208 tokens
Chunk 17: 120 tokens
Chunk 18: 198 tokens
Chunk 19: 172 tokens
Chunk 20: 182 tokens
Chunk 21: 38 tokens
Chunk 22: 170 tokens
Chunk 23: 108 tokens
Chunk 24: 316 tokens
Chunk 25: 243 tokens
Chunk 26: 263 tokens
Chunk 27: 213 tokens
Chunk 28: 150 tokens
Chunk 29: 383 tokens
Chunk 30: 234 tokens
Chunk 31: 308 tokens
Chunk 32: 196 tokens
Chunk 33: 196 tokens
Chunk 34: 184 tokens
Chunk 35: 209 tokens
Chunk 36: 195 tokens
Chunk 37: 170 tokens
Chunk 38: 53 tokens
Chunk 39: 114 tokens
Chunk 40: 163 tokens
Chunk 41: 174 tokens
Chunk 42: 239 tokens
Chunk 43: 292 tokens
Chunk 44: 311 tokens
Chunk 45: 240 tokens
Chunk 46: 218 tokens
Chunk 47: 202 tokens
Chunk 48: 376 tokens
Chun

In [29]:
dh.print_text_context_from_program_dicts(bachelors_data_clenaed, course_names_to_include= ['Data Science'], doc_types_to_include= ['main_info'] )


--- Document: bachelor_data-science_main_course_text.txt ---
Course Name: Data Science
Document Type: main_info

Text Content:
Text from https://www.novaims.unl.pt/en///bachelor-s-degrees/data-science/: Data Science Degree in Data Science en Bachelor's Degrees Data Science In the Bachelor´s Degree in Data Science, students learn the most modern techniques of artificial intelligence and machine learning to analyze large volumes of data (Big Data). They will become true data scientists - considered the sexiest profession of the 21 st century by the Harvard Business Review. The main objective of this course is to train future professionals capable of understanding, developing and using models, algorithms and the most advanced techniques in data science, to analyze and extract knowledge from Big Data. The 3 rd phase of applications under the International Student Statute for the 2025/26 academic year are open from February 26 th to March 27 th , 2025. Duration 3 years (6 semesters) Timeta

# Making a function to run all chunking methdos

In [92]:
import os
import pickle
from typing import List, Optional
from langchain.schema import Document

def save_documents_to_pickle(
    documents: List[Document],
    output_file_name: str,
    output_folder: Optional[str] = None
) -> None:
    """
    Saves a list of LangChain Document objects to a pickle file.

    Args:
        documents (List[Document]): List of chunked Document objects.
        filename (str): The name of the file to save (should end in .pkl).
        directory (Optional[str]): Optional directory path to save into. Defaults to current directory.
    """
    if not output_file_name.endswith(".pkl"):
        output_file_name += ".pkl"

    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
        file_path = os.path.join(output_folder, output_file_name)
    else:
        file_path = output_file_name

    try:
        with open(file_path, "wb") as f:
            pickle.dump(documents, f)
        print(f"✅ Saved {len(documents)} documents to {file_path}")
    except Exception as e:
        print(f"❌ Failed to save documents to pickle: {e}")


In [None]:
def create_docs_programs_cleaned_chunked():
    # Load raw data
    datasets = {
        "bachelors": load_pickle_to_dict(dc.BACHELORS_DATA_CLEANED),
        "postgradmasters": load_pickle_to_dict(dc.POSTGRAD_AND_MASTERS_DATA_CLEANED),
    }

    # Configuration for each chunking job
    chunk_jobs = [
        {
            "name": "teachingstaff",
            "func": dch.chunk_teaching_staff_documents,
            "max_tokens": 512,
            "chunk_overlap": False,
            "overlap_size": 0,
            "doc_types": ["teaching_staff"],
        },
        {
            "name": "studyplan",
            "func": dch.chunk_study_plan_documents,
            "max_tokens": 800,
            "chunk_overlap": False,
            "overlap_size": 0,
            "doc_types": ["study_plan"],
        },
        {
            "name": "maininfo",
            "func": dch.chunk_main_info_documents,
            "max_tokens": 512,
            "chunk_overlap": True,
            "overlap_size": 100,
            "doc_types": ["main_info"],
        },
    ]

    all_chunks: List[Document] = []

    for program_name, data in datasets.items():
        for job in chunk_jobs:
            chunks = job["func"](
                data=data,
                max_tokens=job["max_tokens"],
                chunk_overlap=job["chunk_overlap"],
                overlap_size=job["overlap_size"],
                course_names_to_include=None,
                doc_types_to_include=job["doc_types"],
                include_metadata=False,
                extra_metadata=None
            )
            print(f"✅ {program_name} - {job['name']} chunks: {len(chunks)}")
            all_chunks.extend(chunks)

    print(f"📦 Total documents chunked: {len(all_chunks)}")

    # Save
    save_documents_to_pickle(
        documents=all_chunks,
        filename="docs_all_programs_chunked_without_metadata",
        directory=dc.PATH_DOCS_CHUNKED
    )



    

# Testing data_chunking.py

In [4]:
dh.create_docs_programs_cleaned_chunked()

✅ bachelors - teachingstaff chunks: 10
✅ bachelors - studyplan chunks: 18
✅ bachelors - maininfo chunks: 19
✅ postgradmasters - teachingstaff chunks: 72
⚠️ No semester headers found — chunking whole text instead.
⚠️ No semester headers found — chunking whole text instead.
⚠️ No semester headers found — chunking whole text instead.
⚠️ No semester headers found — chunking whole text instead.
⚠️ No semester headers found — chunking whole text instead.
⚠️ No semester headers found — chunking whole text instead.
⚠️ No semester headers found — chunking whole text instead.
⚠️ No semester headers found — chunking whole text instead.
⚠️ No semester headers found — chunking whole text instead.
⚠️ No semester headers found — chunking whole text instead.
⚠️ No semester headers found — chunking whole text instead.
✅ postgradmasters - studyplan chunks: 79
✅ postgradmasters - maininfo chunks: 116
Total documents chunked: 314
✅ Saved 314 documents to ..\..\data\Preprocessing_text\all_programs_chunked\

In [5]:
documents_chunked = dh.load_documents_from_pickle(dc.DOCUMENTS_CHUNKED)
documents_chunked

✅ Loaded 314 documents from ..\..\data\Preprocessing_text\all_programs_chunked\docs_all_programs_chunked_without_metadata.pkl


[Document(metadata={'source': 'bachelor_data-science_teaching-staff_text.txt', 'degree': 'bachelor', 'doc_type': 'teaching_staff', 'course_name': 'Data Science'}, page_content="Teaching Staff en Programs Bachelor's\nDegrees Data Science Teaching Staff Américo Rio Invited Assistant Professor\namerico.rio@novaims.unl.pt\n\nAna Cristina\nCosta Associate Professor\ncristina@novaims.unl.pt\n\nArtur Varanda\nAdjunct Lecturer\navaranda@novaims.unl.pt\n\nAugusto Santos\nAssistant Professor\najrsantos@novaims.unl.pt\n\nBruno Damásio\nAssistant Professor\nbdamasio@novaims.unl.pt\n\nCarina Albuquerque\nAssistant Professor\ncalbuquerque@novaims.unl.pt\n\nCarolina Maria\nShaul Adjunct Lecturer\ncshaul@novaims.unl.pt\n\nCarolina Santos\nMaximiano Adjunct Lecturer\ncmaximiano@novaims.unl.pt\n\nCarolina Vasconcelos\nInvited Teaching Assistant\ncvasconcelos@novaims.unl.pt\n\nCatarina Neves\nAssistant Professor\ncneves@novaims.unl.pt\n\nCatarina Palha\nInvited Teaching Assistant\ncpalha@novaims.unl.pt\n