# Demistifying RAG
## And how I prove that it is way more difficult to write "Demistifying" than to write RAG

In [None]:
def split_text_by_chapters(text):
    """Split ``text`` into chapters at lines that begin with "Chapter <number>".

    A heading is any line (case-insensitive) starting with the word "Chapter",
    whitespace and a number, e.g. "Chapter 1: Title" or "Chapter 2 Title".
    Text before the first heading is not returned.

    Returns a list of dicts, in document order, with the keys:
      - "chapter_number": the digits from the heading, as a string
      - "chapter_title": the stripped heading text
      - "content": the stripped text between this heading and the next one
        (or the end of ``text`` for the last chapter)
    """
    heading_regex = r'(?im)^(Chapter\s+(\d+)[^\n]*)$'
    headings = list(re.finditer(heading_regex, text))

    # A chapter body stops where the next heading starts; the last one runs
    # to the end of the text.
    stops = [following.start() for following in headings[1:]]
    stops.append(len(text))

    return [
        {
            "chapter_number": heading.group(2),
            "chapter_title": heading.group(1).strip(),
            "content": text[heading.end():stop].strip(),
        }
        for heading, stop in zip(headings, stops)
    ]

In [None]:
def split_text_by_chapters(text):
    """Split raw book text into an optional introduction plus numbered chapters.

    A chapter heading is a line containing the word "chapter" (any case,
    optionally with whitespace between its letters, as produced by lossy PDF
    extraction), followed by a number and a colon, e.g.
    "20 10 1Ch apter  5: E q u ipm en t". Anything may precede the word on
    that line.

    Args:
        text: The full text to split.

    Returns:
        A list of dicts in document order, each with the keys
        "chapter_number", "chapter_title" and "content". Non-empty text before
        the first heading is returned first, with chapter_number
        "introduction" and chapter_title "Introduction". For real chapters,
        chapter_number is the number as a string, chapter_title is the
        stripped first line of the heading, and content is the stripped text
        up to the next heading (or the end of ``text``).

    Raises:
        ValueError: If the headings found are not numbered 1, 2, 3, ... in
            order.
    """
    import re

    # After the colon only characters on the same line are consumed. Allowing
    # arbitrary whitespace there would let the match run over the line break
    # of a bare "Chapter 1:" heading and swallow the first line of the body,
    # which would then be missing from both the title and the content.
    pattern = r'(?im)^.*?(?:[Cc]\s*[Hh]\s*[Aa]\s*[Pp]\s*[Tt]\s*[Ee]\s*[Rr])\s*(\d+)\s*:[^\n]*$'
    matches = list(re.finditer(pattern, text))
    chapters = []

    # Handle introduction (everything before the first chapter heading).
    if matches:
        intro_content = text[:matches[0].start()].strip()
        if intro_content:
            chapters.append({
                "chapter_number": "introduction",
                "chapter_title": "Introduction",
                "content": intro_content
            })

    for index, match in enumerate(matches):
        # The pattern is anchored with ^, so the match always begins at the
        # start of a line; the title is that first line only.
        line_end = text.find('\n', match.start())
        if line_end == -1:
            line_end = len(text)
        chapter_line = text[match.start():line_end].strip()
        chapter_num = int(match.group(1))

        # Chapters must be numbered sequentially starting at 1. Because any
        # mismatch raises, the expected number is always index + 1.
        expected_chapter = index + 1
        if chapter_num != expected_chapter:
            if index == 0:
                raise ValueError(
                    f"Expected Chapter 1 as the first chapter heading, but found Chapter {chapter_num}"
                )
            raise ValueError(
                f"Expected Chapter {expected_chapter} after Chapter {index}, but found Chapter {chapter_num}"
            )

        start = match.end()
        end = matches[index + 1].start() if index + 1 < len(matches) else len(text)
        chapters.append({
            "chapter_number": str(chapter_num),
            "chapter_title": chapter_line,
            "content": text[start:end].strip()
        })
    return chapters
# filepath: /Users/annie/dev/roll_20/roll_20.ipynb

In [None]:
import re

# Scratch check of a looser heading regex against a small sample of extracted
# text. Every letter of "chapter" except the final "r" is optional here (each
# may be followed by whitespace), and the number must be followed by
# whitespace or a colon.
pattern = r'(?im)^((?:[Cc]\s*)?(?:[Hh]\s*)?(?:[Aa]\s*)?(?:[Pp]\s*)?(?:[Tt]\s*)?(?:[Ee]\s*)?(?:[Rr]\s*)\s*(\d+)(?:\s|:)[^\n]*)$'
test_text = """
355,000 20 +6

Ch apter  2: R aces
A  VISIT TO ONE OF TH
"""

# Collect every heading match, then print the captured heading line of each.
matches = list(re.compile(pattern, re.MULTILINE).finditer(test_text))
for m in matches:
    print("MATCH:", m[1])

In [None]:
# Extract the handbook PDF to plain text with pdf_to_text (defined elsewhere).
# NOTE(review): start_page=2 presumably skips the leading page(s); confirm
# whether pdf_to_text counts pages from 0 or 1.
all_book_pdf_path = "./resources/DNDPlayersHandbook.pdf"
all_book_raw_text = pdf_to_text(
    start_page=2,
    pdf_path=all_book_pdf_path,
)

In [None]:
# Save the raw text to a file for reference.
# The encoding is given explicitly (UTF-8, same as chapters.json) so the dump
# does not depend on the platform's default encoding, which can fail on
# characters it cannot represent.
with open(output_folder + "all_book_raw_text.txt", "w", encoding="utf-8") as f:
    f.write(all_book_raw_text)

In [None]:
# Split the raw text into chapter records, write them to chapters.csv
# (without the index column) and print the first rows as a quick check.
chapters = split_text_by_chapters(all_book_raw_text)
df_chapters = pd.DataFrame(data=chapters)
df_chapters.to_csv(
    path_or_buf=output_folder + "chapters.csv",
    index=False,
)
print(df_chapters.head())

In [None]:
# Re-key the chapter records by chapter number: each value keeps the chapter's
# title and content.
chapters_dict = dict(
    (
        record["chapter_number"],
        {"title": record["chapter_title"], "content": record["content"]},
    )
    for record in chapters
)

# Persist the mapping as pretty-printed JSON, keeping non-ASCII characters
# as-is instead of escaping them.
with open(output_folder + "chapters.json", "w", encoding="utf-8") as f:
    json.dump(
        chapters_dict,
        f,
        indent=2,
        ensure_ascii=False,
    )

In [None]:
# Uncomment to print the first two chapters for verification
# print(json.dumps({k: chapters_dict[k] for k in list(chapters_dict)[:2]}, indent=2))

## The Two Main Parts of RAG

When working with Retrieval-Augmented Generation (RAG), there are two main components to consider:

1. **Structuring Your Information:**  
    The first step is to ensure your data is organized in a way that makes it easy to retrieve and use. This involves cleaning, chunking, and formatting your information so that it can be efficiently searched and referenced.

2. **Choosing What to Send:**  
    Once your data is well-structured, the next challenge is deciding which pieces of information to send to your model or downstream process. This selection step is crucial for maximizing relevance and performance.

---

In the next section, I'll focus on strategies for choosing what information to send.