# Notebook for splitting markdown files

In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [2]:
def set_headers(par: str, header_levels: dict) -> dict:
    """Update the running header state with `par` and return the active headers.

    Args:
        par: One paragraph (line) of markdown text.
        header_levels: Mutable mapping from level (1-6) to the most recent
            header text at that level, or None. Updated in place when `par`
            is a markdown header ('#' repeated 1-6 times followed by a space).

    Returns:
        A new dict {'header_<level>': text} containing only the levels that
        currently hold a truthy header, in ascending level order.
    """
    # Depth of the header `par` represents, or None when it is not a header.
    depth = next(
        (n for n in range(1, 7) if par.startswith('#' * n + ' ')),
        None,
    )

    if depth is not None:
        header_levels[depth] = par
        # A new header invalidates every deeper header recorded so far.
        for deeper in range(depth + 1, 7):
            header_levels[deeper] = None

    active = {}
    for n in range(1, 7):
        if header_levels[n]:
            active[f'header_{n}'] = header_levels[n]
    return active

In [3]:
def set_complet_context(par: str, header_levels: dict) -> str:
    """Build the complete context string for a paragraph.

    The context is every header in `header_levels` (in the dict's iteration
    order), each followed by a blank line, and then the paragraph itself.
    When `par` is one of the headers it is not appended a second time.

    Args:
        par: The paragraph text.
        header_levels: Mapping whose values are header strings. Values that
            are None (levels with no header set) are skipped, so both the
            filtered dict returned by `set_headers` and a raw level-keyed
            dict with None placeholders are accepted.

    Returns:
        The concatenated context as a single string.
    """
    # Skip unset levels instead of failing on `str + None`.
    active_headers = [text for text in header_levels.values() if text is not None]

    complet_context = ''.join(text + '\n\n' for text in active_headers)

    # A header paragraph is already part of the context built above.
    if par not in active_headers:
        complet_context += par

    return complet_context

In [4]:
from llama_index.core.schema import TextNode

def split_document_text(paragraphs: list[str], split_by_sentence: bool = False) -> list[TextNode]:
    """Convert markdown paragraphs into TextNodes tagged with their header context.

    Headers are tracked across the whole list, so each node's metadata holds
    the headers in effect at its paragraph ('headers'), the full paragraph
    text ('paragraph') and the headers plus paragraph joined together
    ('complet_context').

    Args:
        paragraphs: Paragraph strings in document order.
        split_by_sentence: When True, emit one node per spaCy sentence of the
            paragraph (every sentence node carries the paragraph's metadata);
            when False, emit one node per paragraph.

    Returns:
        The TextNodes in document order.
    """
    nodes = []
    # Most recent header seen at each markdown level; mutated by set_headers.
    header_levels = dict.fromkeys(range(1, 7))

    for par in paragraphs:
        headers = set_headers(par, header_levels)
        metadata = {
            'headers': headers,
            'paragraph': par,
            'complet_context': set_complet_context(par, headers),
        }

        # Node texts: the spaCy sentences of the paragraph, or the paragraph itself.
        if split_by_sentence:
            chunks = [sent.text for sent in nlp(par).sents]
        else:
            chunks = [par]

        nodes.extend(TextNode(metadata=metadata, text=chunk) for chunk in chunks)

    return nodes
    

In [5]:
import os 
directory = 'data1/'

# List to store all the results
all_results = []

# Iterate over the markdown files in sorted order: os.listdir() returns entries
# in arbitrary order, which would make positional lookups such as
# all_results[7] differ between runs and machines.
for filename in sorted(os.listdir(directory)):
    if filename.endswith('.md'):  # Only process markdown files
        filepath = os.path.join(directory, filename)

        # Decode explicitly instead of relying on the platform's locale encoding.
        # NOTE(review): assumes the .md files are UTF-8 - confirm.
        with open(filepath, 'r', encoding='utf-8') as f:
            # Every line of the file is treated as one paragraph.
            paragraphs = f.read().split('\n')

        # Use spacy to split paragraphs into sentences.
        # (Or just split into paragraphs: split_document_text(paragraphs))
        result = split_document_text(paragraphs, split_by_sentence=True)

        # Append results to the overall list
        all_results.extend(result)

In [11]:
# Spot-check a single node (its text and metadata) produced by the splitting cell above.
all_results[7]

TextNode(id_='0241f58c-1379-4deb-8cfe-36cd4cfe9c5f', embedding=None, metadata={'paragraph': '* Be a member of The Church of Jesus Christ of Latter-day Saints. For applicants who are not Church members, see [2.2.2 Non-Member Applicants](https://pathwaysupport.org/handbook/2-admission-and-tuition/admission/#non-member).', 'complet_context': '# 2.2 Admission Requirements\n\n* Be a member of The Church of Jesus Christ of Latter-day Saints. For applicants who are not Church members, see [2.2.2 Non-Member Applicants](https://pathwaysupport.org/handbook/2-admission-and-tuition/admission/#non-member).'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='For applicants who are not Church members, see [2.2.2 Non-Member Applicants](https://pathwaysupport.org/handbook/2-admission-and-tuition/admission/#non-member).', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', m

In [7]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone
from llama_index.embeddings.openai import OpenAIEmbedding
from dotenv import load_dotenv, find_dotenv
# Load variables from the nearest .env file *before* constructing any client:
# OpenAIEmbedding() and Pinecone() are created without explicit keys, so whatever
# credentials they pick up from the environment must already be loaded.
_ = load_dotenv(find_dotenv())

embed_model = OpenAIEmbedding()

pc = Pinecone()
# Fail fast with a KeyError naming the variable when the index name is not
# configured, rather than handing None to the Pinecone client.
pc_index = pc.Index(os.environ["PINECONE_INDEX_NAME"])

In [8]:
# Wrap the Pinecone index handle in a vector store and hand it to the storage context.
storage_context = StorageContext.from_defaults(
    vector_store=PineconeVectorStore(pinecone_index=pc_index),
)

In [9]:
# Build the index over all collected nodes, using the Pinecone-backed storage context.
vector_index = VectorStoreIndex(
    nodes=all_results,
    storage_context=storage_context,
)

Upserted vectors:   0%|          | 0/48 [00:00<?, ?it/s]