In [1]:
import argparse
import json
import os
import pickle
from glob import glob

import chromadb
import httpx
import yaml
from openai import OpenAI
from pydantic import BaseModel
from tqdm import tqdm

from story_sage.data_classes.story_sage_config import StorySageConfig
from story_sage.utils.chunker import StorySageChunker
from story_sage.utils.embedding import Embedder

In [47]:

# Series directory name; interpolated into the ./books/... and chunks/... paths below.
SERIES_NAME = 'harry_potter'
# Book numbers to process; an empty list selects every book in the series.
BOOK_NUMBERS = [1, 2]

def load_config(config_path):
    """Read a YAML configuration file and return its parsed contents.

    Args:
        config_path: Path to the YAML file to read.

    Returns:
        The parsed YAML document.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    # safe_load only builds plain containers and scalars, so the config file
    # cannot trigger construction of arbitrary Python objects.
    with open(config_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

# Parse config.yml and build the StorySageConfig used below (e.g. config.openai_api_key).
config = StorySageConfig.from_config(load_config('config.yml'))

In [48]:
# One glob pattern per requested book (the pattern expects a two-digit,
# zero-padded book-number prefix); with no explicit selection, a single
# pattern matches every book file of the series.
file_patterns = (
    [f'./books/{SERIES_NAME}/{str(num).zfill(2)}_*.txt' for num in BOOK_NUMBERS]
    if BOOK_NUMBERS
    else [f'./books/{SERIES_NAME}/*_*.txt']
)

In [4]:
# Shared chunker instance used by create_chunks; 'all-MiniLM-L6-v2' is passed as its model_name.
chunker = StorySageChunker(model_name='all-MiniLM-L6-v2')

def create_chunks(pattern):
    """Chunk every chapter of the matching book files and save the chunks as JSON.

    The files are read through the module-level ``chunker``. Each chapter with
    at least 100 non-space characters is chunked and written to
    ``chunks/{SERIES_NAME}/bigger_chunks/{book_number}_{chapter_number}.json``.

    Args:
        pattern: File pattern passed to ``chunker.read_text_files``.
    """
    output_dir = f'chunks/{SERIES_NAME}/bigger_chunks'
    text_dict = chunker.read_text_files(pattern)

    for book_info in text_dict.values():
        book_number = book_info['book_number']
        for chapter_number, chapter_text in tqdm(book_info['chapters'].items(), desc=f'Processing book {book_number}'):
            # Skip chapters that are effectively empty (fewer than 100
            # characters once spaces are removed).
            chapter_text_length = len(''.join(chapter_text).replace(' ', ''))
            if chapter_text_length < 100:
                continue

            # Concatenate the elements in chapter_text
            full_text = ' '.join(chapter_text)
            chunks = chunker.process_file(
                text=full_text,
                context_window=4,
                percentile_threshold=70,
                min_chunk_size=100
            )

            # exist_ok creates any missing parent directories in one call and
            # avoids the check-then-create race of separate exists() tests.
            os.makedirs(output_dir, exist_ok=True)
            output_path = f'{output_dir}/{book_number}_{chapter_number}.json'
            # The context manager guarantees the file is flushed and closed
            # even if serialization fails part-way through.
            with open(output_path, 'w') as chunk_file:
                json.dump(chunks, chunk_file, indent=4)

In [58]:
# TLS certificate verification stays enabled so the API key is never sent over
# an unauthenticated connection. If a TLS-intercepting proxy is in the way,
# pass an ssl.SSLContext that trusts its CA as `verify` instead of turning
# verification off.
client = OpenAI(
    api_key=config.openai_api_key, http_client=httpx.Client(verify=True)
)

# One character together with a description of their actions; this is the
# element type of ChunkSummary.characters.
class CharacterActions(BaseModel):
    # Character name.
    character: str
    # Description of what the character does.
    actions: str

# Structured response schema; get_summary passes it as response_format so the
# completion is parsed into these fields.
class ChunkSummary(BaseModel):
    # Summary text for the chunk.
    summary: str
    # One entry per character, each with a description of their actions.
    characters: list[CharacterActions]
    # Extracted locations.
    locations: list[str]
    # Extracted creatures.
    creatures: list[str]
    # Extracted objects.
    objects: list[str]

def get_summary(text: str):
    """Summarize a chunk and extract its entities with the gpt-4o-mini model.

    The summary word budget is one fifth of the chunk's space-separated word
    count, with a floor of 50 words.

    Args:
        text: The chunk text, sent as the user message.

    Returns:
        A tuple ``(parsed, usage)``: the first choice's ``message.parsed``
        (requested in ``ChunkSummary`` format) and the completion's ``usage``.
    """
    word_count = len(text.split(' '))
    summary_len = max(round(word_count / 5), 50)

    # The whitespace inside this literal is part of the prompt that is sent,
    # so its layout is kept exactly as-is.
    system_prompt = f"""
                    You are an advanced litarary assistant who specializes in 
                        summarizing chunks from novels to optimize the quality of 
                        embeddings for a RAG application.
                    Please create a concise SUMMARY of the following text,
                        paying particular attention to the characters, locations, 
                        and actions.
                    Use simple language and focus on capturing as much meaning 
                        in as few words as possible to help with the similarity search.
                    
                    Please extract SUMMARY that should be no longer than {summary_len} words. 
                    Please extract CHARACTERS along with a brief description of their ACTIONS.
                    Please extract LOCATIONS.
                    Please extract CREATURES.
                    Please extract OBJECTS.
                """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    chat_completion = client.beta.chat.completions.parse(
        messages=messages,
        model="gpt-4o-mini",
        response_format=ChunkSummary,
    )
    parsed = chat_completion.choices[0].message.parsed
    return parsed, chat_completion.usage


In [54]:
# Set to True to summarize every chunk through get_summary (one API call per
# chunk). When False, blank placeholder summaries are stored instead.
GENERATE_SUMMARIES = False

# Chunk files are named '{book_number}_{chapter_number}.json'. Match all of
# them and filter on the parsed book number: a glob character class such as
# '[1,2]' only ever matches a single character, so it cannot select book
# numbers of 10 or more (and it would also match a literal ',').
chunks_filepath = f'./chunks/{SERIES_NAME}/bigger_chunks/*_*.json'
selected_books = {str(num) for num in BOOK_NUMBERS} if BOOK_NUMBERS else None

chunks = []
# sorted() makes the chunk order, and therefore every list index used later,
# reproducible between runs; glob() itself returns files in arbitrary order.
for chunk_filepath in sorted(glob(chunks_filepath)):
    stem = os.path.splitext(os.path.basename(chunk_filepath))[0]
    book_number, chapter_number = stem.split('_', 1)
    if selected_books is not None and book_number not in selected_books:
        continue
    with open(chunk_filepath, 'r') as f:
        for chunk_idx, chunk in enumerate(json.load(f)):
            chunks.append((book_number, chapter_number, chunk_idx, chunk))

summaries = []
usage = []
blank_summary = ChunkSummary(summary='', characters=[], locations=[], creatures=[], objects=[])
for book_number, chapter_number, chunk_idx, chunk_text in tqdm(chunks):
    if GENERATE_SUMMARIES:
        response, usage_ = get_summary(chunk_text)
    else:
        response, usage_ = blank_summary, 0
    summaries.append((book_number, chapter_number, chunk_idx, {
        'full_chunk': chunk_text,
        'summary': response.summary,
        'characters': [{character.character: character.actions} for character in response.characters],
        'locations': response.locations,
        'creatures': response.creatures,
        'objects': response.objects
    }))
    usage.append(usage_)


100%|██████████| 69/69 [00:00<00:00, 574220.19it/s]


In [64]:
def show_summary(summary):
    """Request a fresh summary for one stored chunk and print a readable report.

    Args:
        summary: A ``(book, chapter, chunk_index, data)`` entry whose ``data``
            dict holds the chunk text under ``'full_chunk'``.
    """
    divider = '-----------------'

    def _indented(items):
        # One item per line; together with print()'s separator every line
        # ends up indented by four spaces.
        return '   ' + '\n    '.join(items)

    full_chunk = summary[3]['full_chunk']
    response, _ = get_summary(full_chunk)

    chunk_words = len(full_chunk.split(' '))
    summary_words = len(response.summary.split(' '))

    print(f"Book: {summary[0]}, Chapter: {summary[1]}, Chunk: {summary[2]}")
    print(f"Chunk word count: {chunk_words}, Summary word count: {summary_words}")
    print(divider)
    print('Summary:')
    print('   ', response.summary)
    print(divider)
    print('Characters:')
    for entry in response.characters:
        print(f"    {entry.character}: {entry.actions}")
    print(divider)
    sections = (
        ('Locations', response.locations),
        ('Creatures', response.creatures),
        ('Objects', response.objects),
    )
    for heading, items in sections:
        print(f'{heading}:\n', _indented(items))
        print(divider)
    print('Chunk:\n   ', full_chunk)

In [72]:
# Spot-check a single chunk; show_summary requests a fresh summary through get_summary.
show_summary(summaries[47])

Book: 1, Chapter: 5, Chunk: 1
Chunk word count: 2718, Summary word count: 108
-----------------
Summary:
    Harry Potter travels to Diagon Alley with Hagrid after receiving a letter about his acceptance to Hogwarts. They arrive by boat and discuss magical creatures like dragons and the Ministry of Magic. Hagrid helps Harry navigate the Muggle world, struggling with public transport. They enter the Leaky Cauldron, where Harry is greeted as a celebrity. Hagrid introduces Harry to several magical characters, including Professor Quirrell. They head to Gringotts to access Harry's vault, encountering goblins and learning about vault security. Hagrid retrieves a letter from Dumbledore regarding a mysterious object in vault 713. They take a wild cart ride deeper into the bank, passing beautiful but dangerous sights.
-----------------
Characters:
    Harry Potter: Curious about magic, follows Hagrid and engages in discussions about dragons and the Ministry of Magic.
    Hagrid: Guides Harry to

In [69]:
# Number of chunk entries collected in summaries.
len(summaries)

69

In [75]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load summary pickles that were produced by this project.
with open('chunks/wheel_of_time/summaries/summaries.pkl', 'rb') as file:
    wheel_of_time_summaries = pickle.load(file)

# Each entry is printed as stored; index 3 is just a sample entry.
print(wheel_of_time_summaries[3])

(1, 1, 0, {'full_chunk': ' An Empty Road The Wheel of Time turns, and Ages come and pass, leaving memories that become legend. Legend fades to myth, and even myth is long forgotten when the Age that gave it birth comes again. In one Age, called the Third Age by some, an Age yet to come, an Age long past, a wind rose in the Mountains of Mist. The wind was not the beginning. There are neither beginnings nor endings to the turning of the Wheel of Time. But it was a beginning. Born below the ever cloud-capped peaks that gave the mountains their name, the wind blew east, out across the Sand Hills, once the shore of a great ocean, before the Breaking of the World. Down it flailed into the Two Rivers, into the tangled forest called the Westwood, and beat at two men walking with a cart and horse down the rock-strewn track called the Quarry Road. For all that spring should have come a good month since, the wind carried an icy chill as if it would rather bear snow. Gusts plastered Rand al’Thor’s

In [78]:
# Distinct character names: every key of every per-character dict stored
# under 'characters' across the loaded summaries.
character_keys = {
    name
    for entry in wheel_of_time_summaries
    for character_actions in entry[3]['characters']
    for name in character_actions.keys()
}

print(len(character_keys))

167


In [85]:
# Define the path pattern for the summary files
summary_files_pattern = './chunks/harry_potter/summaries/*.json'

# Initialize an empty list to store the summaries
harry_potter_summaries = []

# `glob` is the function brought in by `from glob import glob` at the top of
# the notebook. Re-importing the module here would rebind the name and break
# the earlier cell that calls glob(...) directly.
# sorted() keeps the load order reproducible between runs.
for summary_file in sorted(glob(summary_files_pattern)):
    # File names follow '{book_number}_{chapter_name}.json'.
    stem = os.path.splitext(os.path.basename(summary_file))[0]
    book_number, chapter_name = stem.split('_', 1)
    with open(summary_file, 'r') as file:
        summary_data = json.load(file)
    for chunk_idx, summary in enumerate(summary_data):
        harry_potter_summaries.append((book_number, chapter_name, chunk_idx, summary))

print(f"Loaded {len(harry_potter_summaries)} summaries.")

Loaded 823 summaries.


In [87]:

# Persist the combined Harry Potter summaries list as a single pickle file.
with open('./chunks/harry_potter/summaries/summaries_1-2-3-4-5-6-7.pkl', 'wb') as file:
    pickle.dump(harry_potter_summaries, file)