In [6]:
import story_sage
from story_sage import StorySageConfig
from story_sage.utils import Raptor, Chunk, Embedder
from openai import OpenAI
import yaml
import httpx
import os
import chromadb
from pprint import pprint
import copy
from typing import OrderedDict
import importlib

# Set to True to rebuild the RAPTOR results from the source texts.
# When False, the cell that reads 'hp_1_2.pkl' supplies `results` instead.
RERUN = False

# NOTE(review): presumably silences the tokenizers fork/parallelism warning — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = "false"

config_path = './config.yml'
raptor = Raptor(config_path=config_path,
                skip_summarization=False,
                chunk_size=400,
                max_tokens=100,
                target_dim=5)

ssconfig = StorySageConfig.from_file(config_path)
# First configured series whose metadata name is 'harry_potter'.
# next() raises StopIteration when no series matches.
SERIES = next(series for series in ssconfig.series if series.series_metadata_name == 'harry_potter')

# Alternative single-file input used for quick tests: './books/harry_potter/01test_hp.txt'
# (previously assigned here and immediately overwritten, i.e. a dead store).
# NOTE(review): if this pattern is expanded as a glob, `[1|2]` is a character class that
# also matches a literal '|'; `[12]` would be the tighter spelling — confirm how
# Raptor.process_texts interprets it before changing.
source_file = './books/harry_potter/0[1|2]_*.txt'

# SECURITY NOTE(review): verify=False turns off TLS certificate verification for every
# request that carries the OpenAI API key. Keep only if the network environment
# requires it, and prefer pointing `verify` at the proper CA bundle.
client = OpenAI(api_key=ssconfig.openai_api_key, http_client=httpx.Client(verify=False))

embedder = Embedder()

# NOTE(review): no cell visible in this notebook reads `chroma_client`;
# get_chroma_collection opens its own PersistentClient — confirm whether this is still needed.
chroma_client = chromadb.EphemeralClient()

In [2]:
# Rebuild the chunk/summary hierarchy only when RERUN is set. Otherwise this cell
# does nothing and `results` must be provided by another cell (the pickle cache).
if RERUN:
    results = raptor.process_texts(source_file, number_of_levels=3)

In [3]:
import pickle  # one import serves both the save and the load branch

# Cache handling for `results`:
#   * RERUN is False -> read the previously saved results from 'hp_1_2.pkl'
#   * RERUN is True  -> write the freshly computed results to 'hp_1_2.pkl'
# NOTE: pickle.load executes arbitrary code from the file; only load caches you created yourself.
if not RERUN:
    with open('hp_1_2.pkl', 'rb') as f:
        results = pickle.load(f)
else:
    with open('hp_1_2.pkl', 'wb') as f:
        pickle.dump(results, f)

In [4]:
# Walk the nested results mapping (book -> chapter -> level -> list of chunks)
# and print every level together with its chunk count and chunk reprs.
for book_name, chapters in results.items():
    print(f'Book: {book_name}')
    for chapter_name, levels in chapters.items():
        print(f'  Chapter: {chapter_name}')
        for level_name, chunks in levels.items():
            # Level header, chunk count and one blank line in a single print call.
            print(f'    Level: {level_name}', f'    {len(chunks)} Chunks', '', sep='\n')
            pprint(chunks)
            print()

Book: 01_the_sourcerers_stone.txt
  Chapter: chapter_0
    Level: level_1
    2 Chunks

[Chunk: book_1|chapter_0|level_1|chunk_0 * Parents: ['book_1|chapter_0|level_2|chunk_0'] * Children: [],
 Chunk: book_1|chapter_0|level_1|chunk_1 * Parents: ['book_1|chapter_0|level_2|chunk_0'] * Children: []]

    Level: level_2
    1 Chunks

[Chunk: book_1|chapter_0|level_2|chunk_0 * Parents: [] * Children: ['book_1|chapter_0|level_1|chunk_0', 'book_1|chapter_0|level_1|chunk_1']]

  Chapter: chapter_1
    Level: level_1
    19 Chunks

[Chunk: book_1|chapter_1|level_1|chunk_0 * Parents: ['book_1|chapter_1|level_2|chunk_1'] * Children: [],
 Chunk: book_1|chapter_1|level_1|chunk_1 * Parents: ['book_1|chapter_1|level_2|chunk_3'] * Children: [],
 Chunk: book_1|chapter_1|level_1|chunk_2 * Parents: ['book_1|chapter_1|level_2|chunk_6'] * Children: [],
 Chunk: book_1|chapter_1|level_1|chunk_3 * Parents: ['book_1|chapter_1|level_2|chunk_6'] * Children: [],
 Chunk: book_1|chapter_1|level_1|chunk_4 * Parents:

In [5]:
# Re-execute the story_sage package module so on-disk edits are picked up without a kernel restart.
# NOTE(review): importlib.reload only re-runs the named module; names bound earlier via
# `from ... import` (e.g. Chunk) keep referring to the old objects — confirm this is enough.
importlib.reload(story_sage)
# Pick the chunk at index 1 of level_2 in chapter_8 of the first book and dump its JSON form.
l2_sum: Chunk = results['01_the_sourcerers_stone.txt']['chapter_8']['level_2'][1]
pprint(l2_sum.__json__())

{'children': ['book_1|chapter_8|level_1|chunk_0',
              'book_1|chapter_8|level_1|chunk_1',
              'book_1|chapter_8|level_1|chunk_2'],
 'chunk_key': 'book_1|chapter_8|level_2|chunk_1',
 'metadata': {'book_number': 1,
              'chapter_number': 8,
              'chunk_index': 1,
              'level': 2},
 'parents': [],
 'text': 'In the context, Harry experiences a lot of attention from other '
         'students at Hogwarts due to his fame. He struggles to navigate the '
         "school's complicated layout, which includes a variety of staircases "
         'and doors that behave unexpectedly. Harry and Ron have a run-in with '
         'the caretaker, Argus Filch, when they accidentally try to enter a '
         'forbidden corridor. Filch is known for his cat, Mrs. Norris, who '
         'helps him catch rule-breakers. The classes at Hogwarts are rigorous, '
         'covering various magical topics,'}


In [7]:
from story_sage.utils.raptor import _LevelsDict

def _first_int(label) -> int:
    """Return the number encoded in ``label``.

    Plain numbers (``1``, ``'3'``) are converted directly. Labelled keys such as
    ``'chapter_8'`` or ``'01_the_sourcerers_stone.txt'`` fall back to the first
    run of digits they contain.

    Raises:
        ValueError: if ``label`` contains no digits at all.
    """
    import re  # local import keeps this cell self-contained

    try:
        return int(label)
    except (TypeError, ValueError):
        pass

    match = re.search(r'\d+', str(label))
    if match is None:
        raise ValueError(f'no number found in {label!r}')
    return int(match.group())


def get_chroma_collection(results: dict[str, OrderedDict[str, _LevelsDict]], embedder, collection_name="raptor_collection"):
    """Store every chunk in ``results`` in a persistent Chroma collection.

    Args:
        results: Nested mapping book -> chapter -> level -> list of chunks. Each
            chunk must expose ``text`` and ``chunk_key``.
        embedder: Embedding function handed to Chroma for the collection.
        collection_name: Name of the collection inside ``./chroma_data``.

    Returns:
        The Chroma collection containing one document per chunk, with
        ``series_id``, ``book``, ``chapter`` and ``level`` metadata. ``series_id``
        is read from the module-level ``SERIES``.

    Raises:
        ValueError: if a book or chapter key contains no number.
    """
    # 1. Open the on-disk store. get_or_create makes the cell re-runnable:
    #    create_collection raises when the collection already exists.
    client = chromadb.PersistentClient(path='./chroma_data')
    collection = client.get_or_create_collection(collection_name, embedding_function=embedder)

    # 2. Flatten the nested results into parallel lists of documents, ids and metadata.
    documents = []
    ids = []
    metadatas = []

    for book, book_data in results.items():
        # Keys such as '01_the_sourcerers_stone.txt' are not valid int() input,
        # so the numeric part is extracted instead.
        book_number = _first_int(book)
        for chapter, chapter_data in book_data.items():
            chapter_number = _first_int(chapter)
            for level, level_data in chapter_data.items():
                level_number = int(level.split('_')[1])
                for chunk in level_data:
                    documents.append(chunk.text)
                    ids.append(f'{book}_{chapter}_{level}_{chunk.chunk_key}')
                    metadatas.append({
                        'series_id': SERIES.series_id,
                        'book': book_number,
                        'chapter': chapter_number,
                        'level': level_number
                    })

    # 3. Upsert so that re-running against an existing collection refreshes entries
    #    instead of colliding on ids. Skipped entirely when there is nothing to write.
    if ids:
        collection.upsert(
            documents=documents,
            ids=ids,
            metadatas=metadatas
        )
    return collection

In [10]:
# Build the Chroma collection from the RAPTOR results; get_answer queries this module-level object.
collection = get_chroma_collection(results, embedder)

In [11]:
def generate_results(
    query : str,
    context_text : str
) -> str:
    """Answer a reader's question from the supplied book context via the chat API.

    Sends a fixed developer prompt plus a user prompt containing ``query`` and
    ``context_text`` to ``gpt-4o-mini`` through the module-level ``client``.
    The developer prompt hard-codes that the reader is in Book 1.

    Args:
        query: The reader's question.
        context_text: Retrieved passages the model may draw on.

    Returns:
        The model's answer with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    # No placeholders in this prompt, so it is a plain (non-f) string.
    developer_prompt = """
    You are a helpful GPT named Story Sage. Although you are not the author, you should use the voice of an author talking about their own work.
    
    # Your Goal
    You need to help a reader keep track of people, places, and plot points in books.
    The reader is currently reading a book series and has provided you with a question about the context of the book.
    You will be provided with text delimited by triple quotes that contain context from the book.

    # Your Task
    Based on the context and question provided:

    1 - Think about the what information you would need to answer their question.
    2 - Review the context provided to find supporting information for your answer.
    3 - Be skeptical--don't just find supporting information, but also consider that the user may be misleading you.

    If the answer isn't clearly stated in the context, don't guess. DO NOT INCLUDE INFORMATION THAT DOESN'T APPEAR IN THE CONTEXT.
    If the question mentions specific people, places, or things, but they do not appear in the context, don't assume anything about them.

    # Your Output
    1 - Provide a sentence or two that answers the reader's question based on the context provided.
    2 - Provide bullet points with excerpts from the context that support your answer.

    The reader is in Book 1 and you MAY NOT GIVE SPOILERS, so only use information from the context provided. Do not include information from the broader universe.

    """

    user_prompt = f"""
    # Question
    {query}

    # Context
    {context_text}
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "developer", "content": developer_prompt},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=500,
        n=1,
        stop=None,
        temperature=0.7
    )
    # `content` may be None (for example when the model returns no text),
    # in which case calling .strip() directly would raise AttributeError.
    content = response.choices[0].message.content
    answer = (content or "").strip()
    return answer

In [12]:
# Load the evaluation questions. The file is decoded explicitly as UTF-8 because the
# questions contain non-ASCII punctuation (e.g. curly apostrophes) that a
# platform-default codec can mis-decode.
with open('tests/test_config.yml', 'r', encoding='utf-8') as f:
    test_config = yaml.safe_load(f)

In [13]:
def get_answer(question, chapter):
    """Retrieve context for ``question`` and print the generated answer.

    Queries the module-level ``collection`` for the 20 nearest chunks whose
    ``chapter`` metadata is at most ``chapter``, joins the distinct passages and
    passes them to ``generate_results``. The question and answer are printed;
    nothing is returned.

    Args:
        question: The reader's question.
        chapter: Highest chapter number (inclusive) the retrieval may draw from.
    """
    # NOTE(review): the filter restricts only `chapter`. The stored metadata also has a
    # `book` field, so low-numbered chapters from other books can be returned —
    # confirm whether a book constraint is needed to avoid spoilers.
    search_results = collection.query(query_texts=question, n_results=20, where={"chapter": { '$lt': chapter + 1 }})
    # dict.fromkeys removes duplicate passages while keeping the retriever's ranking;
    # a set would discard that order and vary between runs.
    unique_passages = dict.fromkeys(search_results['documents'][0])
    context_text = "------\n\n".join(unique_passages)
    raptor_answer = generate_results(question, context_text)
    print(f"Question: {question}")
    print()
    print(f"Answer: {raptor_answer}")
    print()
    print('💡' * 20)
    print()


## Results for the Whole Book

In [14]:
# Ask every question from the first test-config entry with retrieval allowed up to chapter 18
# (presumably at or past the final chapter, matching the "whole book" heading — TODO confirm).
for question in test_config[0]['question_list']:
    get_answer(question, 18)

Question: Why does Dumbledore decide to have Harry grow up with the Dursleys rather than with one of the wizard families? How does Harry’s experience with his relatives build his character?

Answer: Dumbledore decides to have Harry grow up with the Dursleys to protect him from the wizarding world until he is ready, believing it is best for him to remain away from the fame and expectations associated with his identity as "the boy who lived." This experience with the Dursleys, who treat him poorly, builds his character by instilling resilience and a sense of empathy towards others who are marginalized.

- "Dumbledore insists it's best for him to grow up away from that world until he's ready."
- "Hagrid expresses anger towards the Dursleys for hiding Harry's magical heritage from him."
- Harry faces a harsh summer with his cousin Dudley and his gang tormenting him daily, which highlights his resilience.
- "The Dursleys go to extreme lengths to prevent Harry from receiving his letters," sh

## Results at Chapter 2

In [15]:
# Same questions, but retrieval is limited to chunks whose chapter number is at most 2.
for question in test_config[0]['question_list']:
    get_answer(question, 2)

Question: Why does Dumbledore decide to have Harry grow up with the Dursleys rather than with one of the wizard families? How does Harry’s experience with his relatives build his character?

Answer: Dumbledore decides to leave Harry with the Dursleys because he believes it's the best place for him to grow up away from the wizarding world until he is ready to understand his legacy. Harry’s experience with the Dursleys, who treat him poorly, helps to build his character by fostering resilience and empathy, despite the neglect and bullying he endures.

- Dumbledore states, "It’s the best place for him... His aunt and uncle will be able to explain everything to him when he’s older."
- Dumbledore emphasizes that growing up away from fame and expectations will be beneficial for Harry: "Famous before he can walk and talk! Famous for something he won’t even remember! Can’t you see how much better off he’ll be, growing up away from all that until he’s ready to take it?"
- Harry's life with the 