# Notebook for splitting markdown files

In [106]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [107]:
def set_headers(par: str, header_levels: dict) -> dict:
    """ Set headers for a paragraph """
    
    # Check if paragraph starts with a header (e.g. '# ', '## ', etc.)
    for level in range(1, 7):
        if par.startswith('#' * level + ' '):
            header_levels[level] = par
            # Reset lower-level headers
            for lower_level in range(level + 1, 7):
                header_levels[lower_level] = None
            # Exit loop after finding the correct header level
            break  
    
    # Build headers dictionary dynamically
    headers = {f'header_{i}': header_levels[i] for i in range(1, 7) if header_levels[i]}
    return headers

In [108]:
def set_complet_context(par: str, header_levels: dict) -> dict:
    """ Set the complete context for a paragraph """

    complet_context = ''
    for header in header_levels:
        complet_context += header_levels[header]
        complet_context += '\n\n'

    if par not in header_levels.values():
        complet_context += par
    
    return complet_context

In [109]:
from llama_index.core.schema import TextNode

def split_document_text(
        paragraphs: list[str], 
        md_metadata: dict,
        add_metadata_to_text: bool = False,
        split_by_sentence: bool = False
    ) -> list[TextNode]:
    """ Split text into paragraphs """
    result = []
    headers = {}
    header_levels = {1: None, 2: None, 3: None, 4: None, 5: None, 6: None}
    
    for par in paragraphs:
        headers = set_headers(par, header_levels)
        # complet_context = set_complet_context(par, headers)
        
        metadata = {
            **md_metadata,
            **headers,
            # 'paragraph': par,
            # 'complet_context': complet_context
        }        

        # use spacy to split paragraph into sentences
        if split_by_sentence:
            doc = nlp(par)
            for sent in doc.sents:
                
                if add_metadata_to_text:
                    text = ''
                    for value in metadata.values():
                        text += str(value) + '\n'
                    text += sent.text
                else:
                    text = sent.text
                        
                metadata['paragraph'] = par
                node = TextNode(metadata=metadata, text=text)
                result.append(node)

        else:
            if add_metadata_to_text:
                text = ''
                for value in metadata.values():
                    text += str(value) + '\n'
                text += par
            else:
                text = par
            # Create a TextNode and add to result
            metadata['paragraph'] = par
            node = TextNode(metadata=metadata, text=text)
            result.append(node)

    return result
    

In [110]:
import frontmatter

with open('../data/ant_man.md', 'r') as f:
        post = frontmatter.load(f)

        metadata = post.metadata
        paragraphs = post.content.split('\n\n')
        
        if metadata.get('url'):
                metadata.pop('url')
        if metadata.get('title'):
                metadata.pop('title')
                
        # by default, split by paragraph
        result1 = split_document_text(
                paragraphs=paragraphs, 
                md_metadata=metadata, 
        ) 
                
        # use spacy to split paragraph into sentences
        result2 = split_document_text(
                paragraphs=paragraphs, 
                md_metadata=metadata, 
                split_by_sentence=True, 
        )
        
        # Add metadata to text (will be only divided by paragraph)
        result3 = split_document_text(
                paragraphs=paragraphs, 
                md_metadata=metadata, 
                add_metadata_to_text=True
        )
        
        # Add metadata to text (will be divided by sentence)
        result4 = split_document_text(
                paragraphs=paragraphs, 
                md_metadata=metadata, 
                add_metadata_to_text=True, 
                split_by_sentence=True
        )


### By default, split by paragraph

In [111]:
import json

meta = json.dumps(result1[7].metadata, indent=2)
print(meta)

txt = json.dumps(result1[7].text, indent=2)
print('Text:', txt)

{
  "heading": "Heading",
  "subheading": "SubHeading",
  "header_1": "# Ant Man",
  "header_2": "## Main Movies",
  "header_3": "### Ant-Man (2015)",
  "paragraph": "Pym, who manipulated Lang through an unknowing Luis into stealing the suit as a test, wants Lang to become the new Ant-Man to steal the Yellowjacket from Cross. Having been spying on Cross after discovering his intentions, Hope and Pym train Lang to fight and to control ants. While Hope harbors resentment towards Pym about her mother Janet's death, he reveals that Janet, known as the Wasp, disappeared into a subatomic Quantum Realm while disabling a Soviet nuclear missile in 1987. Pym warns Lang that he could suffer a similar fate if he overrides his suit's regulator. They send him to steal a device from the Avengers' headquarters that will aid their heist, where he briefly fights Sam Wilson."
}
Text: "Pym, who manipulated Lang through an unknowing Luis into stealing the suit as a test, wants Lang to become the new Ant-Ma

## Use spacy to split paragraph into sentences

In [112]:
import json

meta = json.dumps(result2[7].metadata, indent=2)
print(meta)

txt = json.dumps(result2[7].text, indent=2)
print('Text:', txt)

{
  "heading": "Heading",
  "subheading": "SubHeading",
  "header_1": "# Ant Man",
  "header_2": "## Main Movies",
  "header_3": "### Ant-Man (2015)",
  "paragraph": "In 1989, scientist Hank Pym resigns from S.H.I.E.L.D. after discovering their attempt to replicate his Ant-Man shrinking technology. Believing the technology would be dangerous if replicated, Pym vows to hide it for as long as he lives. In the present day, Pym's estranged daughter, Hope van Dyne, and former prot\u00e9g\u00e9, Darren Cross, have forced him out of his company, Pym Technologies. Cross is close to perfecting a shrinking suit of his own, the Yellowjacket, which horrifies Pym."
}
Text: "In the present day, Pym's estranged daughter, Hope van Dyne, and former prot\u00e9g\u00e9, Darren Cross, have forced him out of his company, Pym Technologies."


## Add metadata to text (will be only divided by paragraph)

In [113]:
import json

meta = json.dumps(result3[7].metadata, indent=2)
print(meta)

txt = json.dumps(result3[7].text, indent=2)
print('Text:', txt)

{
  "heading": "Heading",
  "subheading": "SubHeading",
  "header_1": "# Ant Man",
  "header_2": "## Main Movies",
  "header_3": "### Ant-Man (2015)",
  "paragraph": "Pym, who manipulated Lang through an unknowing Luis into stealing the suit as a test, wants Lang to become the new Ant-Man to steal the Yellowjacket from Cross. Having been spying on Cross after discovering his intentions, Hope and Pym train Lang to fight and to control ants. While Hope harbors resentment towards Pym about her mother Janet's death, he reveals that Janet, known as the Wasp, disappeared into a subatomic Quantum Realm while disabling a Soviet nuclear missile in 1987. Pym warns Lang that he could suffer a similar fate if he overrides his suit's regulator. They send him to steal a device from the Avengers' headquarters that will aid their heist, where he briefly fights Sam Wilson."
}
Text: "Heading\nSubHeading\n# Ant Man\n## Main Movies\n### Ant-Man (2015)\nPym, who manipulated Lang through an unknowing Luis i

## Add metadata to text (will be divided by sentence)

In [114]:
import json

meta = json.dumps(result4[7].metadata, indent=2)
print(meta)

txt = json.dumps(result4[7].text, indent=2)
print('Text:', txt)

{
  "heading": "Heading",
  "subheading": "SubHeading",
  "header_1": "# Ant Man",
  "header_2": "## Main Movies",
  "header_3": "### Ant-Man (2015)",
  "paragraph": "In 1989, scientist Hank Pym resigns from S.H.I.E.L.D. after discovering their attempt to replicate his Ant-Man shrinking technology. Believing the technology would be dangerous if replicated, Pym vows to hide it for as long as he lives. In the present day, Pym's estranged daughter, Hope van Dyne, and former prot\u00e9g\u00e9, Darren Cross, have forced him out of his company, Pym Technologies. Cross is close to perfecting a shrinking suit of his own, the Yellowjacket, which horrifies Pym."
}
Text: "Heading\nSubHeading\n# Ant Man\n## Main Movies\n### Ant-Man (2015)\nIn 1989, scientist Hank Pym resigns from S.H.I.E.L.D. after discovering their attempt to replicate his Ant-Man shrinking technology. Believing the technology would be dangerous if replicated, Pym vows to hide it for as long as he lives. In the present day, Pym's