## Setup

### Imports

In [None]:
import pandas as pd
from langchain_core.documents import Document
from langchain_text_splitters import (MarkdownHeaderTextSplitter,
                                      RecursiveCharacterTextSplitter)

from src.config import FilePaths
from src.enums import Party

### Config

In [None]:
# Party whose programme is chunked in this notebook.
party = Party.VIJFTIG_PLUS

# The "short" markdown files have already been formatted and trimmed:
# introduction, summary and similar parts are removed.
short_markdown_file = FilePaths.short_markdown_dir / f"{party}_short.md"

# Second-stage splitter: breaks text into pieces of size 512 with an overlap
# of 50, trying the separators in the listed order (paragraph break first,
# single characters last). keep_separator="end" asks the splitter to keep each
# separator attached to the end of the preceding piece.
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", ".\n", "\n", ".", " ", ""],
    keep_separator="end",
    chunk_size=512,
    chunk_overlap=50,
)

# First-stage splitter: splits on the three markdown header levels and maps
# each level to the metadata key under which its title is recorded.
# strip_headers=True requests that the header lines are left out of the
# chunk text itself.
markdown_splitter = MarkdownHeaderTextSplitter(
    strip_headers=True,
    headers_to_split_on=[
        ("#", "Hoofdstuk"),
        ("##", "Sectie"),
        ("###", "Subsectie"),
    ],
)


## Chunking

In [None]:
# Read the whole trimmed markdown programme of the configured party into memory.
with open(short_markdown_file, 'r', encoding='utf-8') as file:
    markdown_string = file.read()

# Step 1: Split the markdown text by headers
md_header_splits = markdown_splitter.split_text(markdown_string)

# Step 2: Recursively split the header chunks into smaller chunks
chunks = recursive_splitter.split_documents(md_header_splits)

# Pick one chunk to use as a running example in the cells below.
# NOTE(review): the index 20 is hard-coded and raises IndexError when the
# programme yields fewer than 21 chunks; confirm this holds for every party.
example_chunk = chunks[20]
# Bare expression; presumably there so the notebook displays the chunk as cell output.
example_chunk

### Chunks visualization

In [None]:
def chunks_to_dataframe(chunks: list[Document]) -> pd.DataFrame:
    """
    Convert a list of LangChain Document chunks into a pandas DataFrame.

    Args:
        chunks: Documents whose ``metadata`` may contain the header keys
            'Hoofdstuk', 'Sectie' and 'Subsectie'.

    Returns:
        A DataFrame with one row per chunk and the columns 'Hoofdstuk',
        'Sectie', 'Subsectie' and 'Text' (the chunk's ``page_content``).
        A header key missing from a chunk's metadata becomes an empty
        string. The four columns are present even when ``chunks`` is empty.
    """
    header_keys = ("Hoofdstuk", "Sectie", "Subsectie")

    rows = [
        {
            # Missing header levels default to an empty string.
            **{key: chunk.metadata.get(key, "") for key in header_keys},
            "Text": chunk.page_content,
        }
        for chunk in chunks
    ]

    # Passing the columns explicitly keeps the schema stable: without it an
    # empty list of chunks would produce a DataFrame with no columns at all.
    return pd.DataFrame(rows, columns=[*header_keys, "Text"])

In [None]:
# Preview the first 50 chunks as a table.
chunks_to_dataframe(chunks).head(50)

## Embedding

### Process document chunks for embedding

In [None]:
def format_embedding_content(chunk: Document) -> str:
    """
    Build the text to embed for a chunk: its header titles followed by its content.

    The titles are read from the chunk's metadata keys 'Hoofdstuk', 'Sectie'
    and 'Subsectie'. Each title that is present goes on its own line, in that
    order, and the chunk's ``page_content`` follows on the last line(s).
    Header levels that are missing or empty are skipped, so they do not leave
    blank lines in the text.

    Args:
        chunk: Document with ``metadata`` and ``page_content``.

    Returns:
        The newline-joined titles and page content.

    Raises:
        ValueError: If the metadata has no (non-empty) 'Hoofdstuk' entry.
    """
    metadata = chunk.metadata

    chapter_title = metadata.get("Hoofdstuk")
    if not chapter_title:
        raise ValueError("No chapter title found")

    titles = (chapter_title, metadata.get("Sectie"), metadata.get("Subsectie"))

    # Only keep header levels that actually exist for this chunk; an absent
    # section/subsection would otherwise show up as an empty line.
    header = "\n".join(str(title) for title in titles if title)

    return f"{header}\n{chunk.page_content}"

In [None]:
# Format the example chunk picked earlier and print the resulting text,
# i.e. its header titles followed by its page content.
embedding_content = format_embedding_content(example_chunk)

print(embedding_content)