# Notebook for splitting markdown files

In [None]:
import spacy
# Load the spaCy pipeline once at module level; split_document_text uses it
# for sentence segmentation when split_by_sentence=True.
nlp = spacy.load("en_core_web_sm")

In [None]:
def set_headers(par: str, header_levels: dict) -> dict:
    """Update the running header state and return the active headers.

    Args:
        par: A paragraph of markdown text. It counts as a header when it
            starts with one to six '#' characters followed by a space.
        header_levels: Mapping of level (1-6) to the current header text,
            or None when that level is unset. It is updated in place.

    Returns:
        A new dict of the form {'header_<level>': text} holding only the
        levels that currently have a header, in ascending level order.
    """
    # Find the header depth of this paragraph, if it is a header at all.
    depth = next(
        (n for n in range(1, 7) if par.startswith('#' * n + ' ')),
        None,
    )

    if depth is not None:
        header_levels[depth] = par
        # A new header invalidates every deeper header that was active.
        for deeper in range(depth + 1, 7):
            header_levels[deeper] = None

    # Collect the levels that are currently set.
    headers = {}
    for n in range(1, 7):
        text = header_levels[n]
        if text:
            headers[f'header_{n}'] = text
    return headers

In [None]:
def set_complet_context(par: str, header_levels: dict) -> str:
    """Build the complete context string for a paragraph.

    The context is every active header, each followed by a blank line, and
    then the paragraph itself unless the paragraph is one of those headers.

    Args:
        par: The paragraph text.
        header_levels: Mapping whose values are header texts, in the order
            they should appear. Values that are None (unset levels) are
            skipped.

    Returns:
        The headers and paragraph joined into a single string.
    """
    # None marks an unset header level; skip it rather than fail on str + None.
    active_headers = [text for text in header_levels.values() if text is not None]

    complet_context = ''.join(f'{text}\n\n' for text in active_headers)

    # A header paragraph is already present in the context; do not repeat it.
    if par not in active_headers:
        complet_context += par

    return complet_context

In [None]:
from llama_index.core.schema import TextNode

def split_document_text(paragraphs: list[str], split_by_sentence: bool = False) -> list[TextNode]:
    """Turn paragraphs into TextNodes annotated with their header context.

    Args:
        paragraphs: Paragraph strings in document order.
        split_by_sentence: When True, each paragraph is segmented with the
            module-level spaCy pipeline and one node is created per
            sentence; otherwise one node is created per paragraph.

    Returns:
        The nodes in document order. Each node's metadata holds the active
        headers ('headers'), the source paragraph ('paragraph') and the
        headers joined with the paragraph ('complet_context').
    """
    nodes = []
    # Running header state shared across paragraphs; set_headers updates it.
    levels = dict.fromkeys(range(1, 7))

    for par in paragraphs:
        headers = set_headers(par, levels)
        metadata = {
            'headers': headers,
            'paragraph': par,
            'complet_context': set_complet_context(par, headers),
        }

        # Decide which text pieces of this paragraph become nodes.
        if split_by_sentence:
            pieces = [sent.text for sent in nlp(par).sents]
        else:
            pieces = [par]

        nodes.extend(TextNode(metadata=metadata, text=piece) for piece in pieces)

    return nodes
    

In [None]:
# Read the markdown source; each line of the file is treated as one paragraph.
# The explicit UTF-8 encoding keeps decoding independent of the platform default.
with open('../data/ant_man.md', 'r', encoding='utf-8') as f:
    paragraphs = f.read().split('\n')

# The file is closed before the (potentially slow) splitting step runs.
# use spacy to split paragraph into sentences
result = split_document_text(paragraphs, split_by_sentence=True)

# Or just split into paragraphs
# result = split_document_text(paragraphs)

In [None]:
# Notebook display: echo the list of nodes returned by split_document_text.
result

In [None]:
# Print the headers-plus-paragraph context stored on one sample node.
# NOTE(review): index 7 is hard-coded; this raises IndexError if fewer than
# 8 nodes were produced.
print(result[7].metadata['complet_context'])