In [None]:
import json
import os
import pandas as pd

In [None]:
# Folder this notebook reads chapters.json from and writes its CSV chunks to.
# The trailing slash matters: other cells build paths by plain string concatenation.
output_folder = "./output_chapters/"
# Name of the JSON file, inside output_folder, that holds the chapters.
chapters_file_name = "chapters.json"

In [None]:
# os.path.join works whether or not output_folder ends with a path separator.
chapters_file_path = os.path.join(output_folder, chapters_file_name)

# Confirm that the file exists before any later cell tries to read it.
# Raise instead of calling exit(): inside a notebook, exit() asks the kernel
# session to exit rather than failing this cell with a readable error, so a
# "Run All" would not stop with a clear traceback pointing at the missing file.
if not os.path.exists(chapters_file_path):
    raise FileNotFoundError(f"Error: The file {chapters_file_path} does not exist.")

print(f"File {chapters_file_path} exists.")


In [None]:
chapters = {}
# Read the JSON file; the result is expected to be a dictionary because the
# lookup cell below iterates over chapters.values().
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
with open(chapters_file_path, 'r', encoding='utf-8') as file:
    try:
        chapters = json.load(file)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        # Re-raise so the run stops here with a traceback; exit() would ask the
        # notebook kernel session to exit instead of reporting the failure.
        raise

print("JSON file loaded successfully.")

In [None]:
print(f"Number of chapters: {len(chapters)}")

# Get the chapter called "Introduction" (first match wins; None when absent).
introduction_chapter = next(
    (chapter for chapter in chapters.values() if chapter.get("title") == "Introduction"),
    None,
)

# Fail with an explicit message instead of an AttributeError on None.get(...)
# when no chapter carries that title.
if introduction_chapter is None:
    available_titles = [chapter.get("title") for chapter in chapters.values()]
    raise ValueError(
        f'No chapter titled "Introduction" was found. Available titles: {available_titles}'
    )

# Preview only the first 500 characters of the chapter body.
print(f"Introduction chapter:\n{introduction_chapter.get('content', '')[:500]}")

### That is ugly
As you can see, we have data... but it is raw. We now need to split it into smaller pieces, but how do we do so?
### Things to consider:
- What is a line?  
- How do we define where it starts and ends?  
- What happens with sentences that -  
finish in different lines?
- Or sentences that start in one

page and then finish in another one?
- What punctuation do we use?  
- _Why is a sentence?_  

In [None]:
# Body text of the Introduction chapter; a missing "content" key yields "" instead of raising.
raw_chapter_text = introduction_chapter.get("content", "")
# Report the size of the text that the splitting cells will work on.
print(f"Raw chapter text length: {len(raw_chapter_text)} characters")

In [None]:
# Function to split text into chunks of n lines using line breaks
def split_text_by_lines(text, lines_per_chunk=5):
    """Group the lines of ``text`` into consecutive chunks.

    Args:
        text: String to split; lines are found with ``str.splitlines()``.
        lines_per_chunk: How many lines go into each chunk. The last chunk
            holds whatever lines are left over.

    Returns:
        A list of dicts, each with a 1-based ``"chunk_id"`` and a ``"text"``
        value made of the chunk's lines joined by ``"\\n"``.
    """
    all_lines = text.splitlines()
    chunk_starts = range(0, len(all_lines), lines_per_chunk)
    return [
        {
            "chunk_id": chunk_number,
            "text": "\n".join(all_lines[start:start + lines_per_chunk]),
        }
        for chunk_number, start in enumerate(chunk_starts, start=1)
    ]

In [None]:
# One line per chunk keeps the preview below easy to read.
chunks_by_line_jump = split_text_by_lines(raw_chapter_text, lines_per_chunk=1)
df_line_jump = pd.DataFrame(data=chunks_by_line_jump)
# Persist every chunk, without the DataFrame index column.
df_line_jump.to_csv(path_or_buf=output_folder + "lines_chunk.csv", index=False)
# Preview only the first 10 chunks.
print(df_line_jump.head(n=10))

In [None]:
# Function to split text into chunks of n sentences (using periods), handling line breaks
def split_text_by_sentences(text, sentences_per_chunk=5):
    """Group the sentences of ``text`` into consecutive chunks.

    Line breaks (with any whitespace around them) are first collapsed to a
    single space so a sentence wrapped over several lines stays whole. The text
    is then cut at every run of whitespace that follows ".", "!" or "?".

    Args:
        text: String to split.
        sentences_per_chunk: How many sentences go into each chunk. The last
            chunk holds whatever sentences are left over.

    Returns:
        A list of dicts, each with a 1-based ``"chunk_id"`` and a ``"text"``
        value made of the chunk's sentences joined by a single space.
    """
    import re

    # Undo hard line wraps before looking for sentence boundaries.
    flattened = re.sub(r'\s*\n\s*', ' ', text)
    # Cut after sentence-ending punctuation, then drop blank fragments.
    stripped_pieces = (piece.strip() for piece in re.split(r'(?<=[.!?])\s+', flattened))
    sentences = [piece for piece in stripped_pieces if piece]

    chunk_starts = range(0, len(sentences), sentences_per_chunk)
    return [
        {
            "chunk_id": chunk_number,
            "text": " ".join(sentences[start:start + sentences_per_chunk]),
        }
        for chunk_number, start in enumerate(chunk_starts, start=1)
    ]

In [None]:
# Using 1 sentence per chunk for clarity
chunks_by_sentence = split_text_by_sentences(text=raw_chapter_text, sentences_per_chunk=1)
df_sentences = pd.DataFrame(chunks_by_sentence)

# Show all rows and prevent text truncation only for this preview.
# option_context restores whatever values were set before (not just the pandas
# defaults, as reset_option would) and does so even if printing raises.
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    print(df_sentences.head(10))  # Display the first 10 chunks by sentences

# Persist every chunk, without the DataFrame index column.
df_sentences.to_csv(output_folder + "sentences_chunks.csv", index=False)

It is still not perfect. For example, a name with an initial such as "A. Bonavides" may be incorrectly split into two sentences, because the period after the initial looks like the end of a sentence. You may also want to keep more information, such as grouping text by chapter (e.g., "Ability Score Increase"), which requires a more advanced, semantic division of the PDF.  
But to get started, this approach is more than enough!

In [None]:
# Function to split text into chunks of n sentences, avoiding splits at abbreviations/initials
def split_text_by_sentences_reg(text, sentences_per_chunk=5):
    """Group sentences into chunks without splitting at abbreviations or initials.

    Works like a plain "cut after . ! ?" splitter, but first hides the periods
    of known abbreviations and of initials behind a placeholder so they are not
    mistaken for sentence endings. The placeholder is turned back into a period
    once the sentences have been separated.

    This is a heuristic: a sentence that really ends with a protected token
    (for example "... apples, pears, etc. Then ..." or "... plan B. Then ...")
    is merged with the sentence that follows it.

    Args:
        text: String to split.
        sentences_per_chunk: How many sentences go into each chunk. The last
            chunk holds whatever sentences are left over.

    Returns:
        A list of dicts, each with a 1-based ``"chunk_id"`` and a ``"text"``
        value made of the chunk's sentences joined by a single space.
    """
    import re

    # Stand-in for a period that must not be treated as a sentence end.
    placeholder = '<DOT>'

    # Patterns whose periods are protected, applied in order.
    abbreviations = [
        r'(?:[A-Z]\.){2,}',         # runs of initials, e.g., "U.S.", "A.B."
        # Common abbreviations. The leading \b stops ordinary words that merely
        # end in the same letters ("envs.", "ATMs.") from being protected.
        r'\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|St|vs|etc|e\.g|i\.e)\.',
        # A single initial directly before a capitalized word, e.g. "A. Bonavides".
        # \b keeps the last letter of an acronym ("NASA. Then") from matching.
        r'\b[A-Z]\.(?=\s+[A-Z])',
    ]

    # Protect abbreviations by replacing their periods with the placeholder.
    protected_text = text
    for abbr in abbreviations:
        protected_text = re.sub(
            abbr,
            lambda m: m.group(0).replace('.', placeholder),
            protected_text,
        )

    # Replace line breaks with spaces to avoid breaking wrapped sentences.
    clean_text = re.sub(r'\s*\n\s*', ' ', protected_text)
    # Cut at whitespace that follows a period, question mark, or exclamation mark.
    sentences = re.split(r'(?<=[.!?])\s+', clean_text)
    # Restore protected periods.
    sentences = [s.replace(placeholder, '.') for s in sentences]
    # Remove empty sentences.
    sentences = [s.strip() for s in sentences if s.strip()]

    chunks = []
    for i in range(0, len(sentences), sentences_per_chunk):
        chunk = " ".join(sentences[i:i + sentences_per_chunk])
        chunks.append({"chunk_id": i // sentences_per_chunk + 1, "text": chunk})
    return chunks

In [None]:
# Using 1 sentence per chunk for clarity
chunks_by_sentence_reg = split_text_by_sentences_reg(text=raw_chapter_text, sentences_per_chunk=1)
df_sentences_reg = pd.DataFrame(chunks_by_sentence_reg)

# Show all rows and prevent text truncation only for this preview.
# option_context restores whatever values were set before (not just the pandas
# defaults, as reset_option would) and does so even if printing raises.
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    print(df_sentences_reg.head(10))  # Display the first 10 chunks by sentences

# NOTE(review): this is the same path the plain sentence-splitting cell writes
# to, so that earlier CSV is overwritten — confirm this is intended.
df_sentences_reg.to_csv(output_folder + "sentences_chunks.csv", index=False)