In [4]:
#pip install --no-cache-dir jupyter langchain_openai langchain_community langchain langgraph faiss-cpu sentence-transformers ipywidgets transformers nltk scikit-learn matplotlib markdown langchain_chroma

import yaml
import ipywidgets as widgets
from IPython.display import display
import pickle
import markdown
import glob
import re
import os

In [5]:
import logging
from story_sage.story_sage import StorySage

# Configure the logger

logger = logging.getLogger('story_sage')
logger.setLevel(logging.DEBUG)
# Create a console handler
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)

# Create a formatter and set it for the handler
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)

# Add the handler to the logger
logger.addHandler(console_handler)

# Filter out logs from other modules
class StorySageFilter(logging.Filter):
    def filter(self, record):
        return record.name.startswith('story_sage')

logger.addFilter(StorySageFilter())



with open('config.yml', 'r') as file:
    config = yaml.safe_load(file)

api_key = config['OPENAI_API_KEY']
chroma_path = config['CHROMA_PATH']
chroma_collection = config['CHROMA_COLLECTION']

# Load series.yml to create a mapping from series_metadata_name to series_id
with open('series.yml', 'r') as file:
    series_list = yaml.safe_load(file)
metadata_to_id = {series['series_metadata_name']: series['series_id'] for series in series_list}

# Load all character dictionaries and merge them using the metadata_to_id mapping
# Load entities.json
with open('entities.json', 'r') as file:
    entities = yaml.safe_load(file)

story_sage = StorySage(
    api_key=api_key,
    chroma_path=chroma_path,
    chroma_collection_name=chroma_collection,
    entities=entities,
    series_yml_path='series.yml',
    n_chunks=10
)


# Add a handler to the StorySage logger
story_sage.logger = logger

def invoke_story_sage(data: dict):
    required_keys = ['question', 'book_number', 'chapter_number', 'series_id']
    if not all(key in data for key in required_keys):
        return {'error': f'Missing parameter! Request must include {", ".join(required_keys)}'}, 400

    try:
        result, context = story_sage.invoke(**data)
        return result, context
    except Exception as e:
        raise e
        return {'error': 'Internal server error.'}, 500
    
data = {
    'question': 'Explain the interactions between Cenn and Rand',
    'book_number': 2,
    'chapter_number': 1,
    'series_id': 3
}

response, context = invoke_story_sage(data)
print(response)


2024-12-16 22:38:22,636 - story_sage.story_sage_retriever - INFO - {'$and': [{'$or': [{'book_number': {'$lt': 2}}, {'$and': [{'book_number': 2}, {'chapter_number': {'$lt': 1}}]}]}, {'$and': [{'p_3_6': True}, {'p_3_2': True}]}]}
2024-12-16 22:38:22,636 - story_sage.story_sage_retriever - INFO - {'$and': [{'$or': [{'book_number': {'$lt': 2}}, {'$and': [{'book_number': 2}, {'chapter_number': {'$lt': 1}}]}]}, {'$and': [{'p_3_6': True}, {'p_3_2': True}]}]}


- Cenn Buie interacts with Rand in a couple of key moments, particularly highlighting Cenn's role as a member of the Village Council and his opinions on village matters.
- In Chapter 2, Rand comments on Nynaeve's temper and recalls that Cenn Buie had been thumped on the head by her after he called her a child. This indicates a past interaction that illustrates the dynamic of respect and authority within the village:
  - "When Cenn Buie called her a child last year, she thumped him on the head with her stick, and he’s on the Village Council, and old enough to be her grandfather, besides." (Book 1, Chapter 2)
- In Chapter 3, during a village gathering, Cenn is present with the Village Council as they interact with the peddler, Padan Fain, showing his involvement in community affairs:
  - "Reluctantly the crowd parted to let them to the fore, everyone closing in quickly behind and never stopping their calling to the peddler." (Book 1, Chapter 3)
- Cenn's discomfort is noted during a heate

In [3]:
data = {
    'question': "what is the name of rand's horse?",
    'book_number': 2,
    'chapter_number': 1,
    'series_id': 3
}

response, context = invoke_story_sage(data)
print(response)

2024-12-16 22:14:32,618 - story_sage.story_sage_retriever - INFO - {'$and': [{'$or': [{'book_number': {'$lt': 2}}, {'$and': [{'book_number': 2}, {'chapter_number': {'$lt': 1}}]}]}, {'$and': [{'p_3_6': True}, {'a_3_19': True}]}]}


- Rand's horse is named Bela. 
  - "Rand moved his horse close to Bela and touched her shoulder." (Book 1, Chapter 45)


In [14]:
import chromadb
from story_sage.story_sage_embedder import StorySageEmbedder
from langchain.embeddings import SentenceTransformerEmbeddings
class EmbeddingAdapter(SentenceTransformerEmbeddings):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _embed_documents(self, texts):
        return super().embed_documents(texts)  

    def __call__(self, input):
        return self._embed_documents(input)  

embedder = EmbeddingAdapter
client = chromadb.PersistentClient(path=chroma_path)
vector_store = client.get_collection(name=chroma_collection)

In [15]:
filter_dict = {'$and': [
                {'$or': [
                    {'book_number': {'$lt': 1}},
                    {'$and': [
                        {'book_number': 1}, 
                        {'chapter_number': {'$lt': 25}}
                    ]}
                ]}, 
                {'a_3_12': True}
               ]}

filter_dict = {'$or': [
                    {'book_number': {'$lt': 1}},
                    {'$and': [
                        {'book_number': 1}, 
                        {'chapter_number': {'$lt': 25}}
                    ]}
                ]}
#client.delete_collection('wot_retriever_test')
vector_store.query(query_texts=['trolloc'],
                   n_results=5,
                   where=filter_dict,
                   include=['metadatas','documents'])

{'ids': [['3_1_7_29', '3_1_8_31', '3_1_14_8', '3_1_14_6', '3_1_5_40']],
 'embeddings': None,
 'documents': [['. . Trollocs? Then, before anyone knew what was happening, those . .',
   '. . the peddler. The Trollocs.',
   '. in Saldaea, wasn’t it?” Perrin said. Lan hurled his empty bucket to the floor with a crash. “You will talk about it, won’t you? There are always Trollocs in the Borderlands, blacksmith.',
   'Still, it’s been a strange winter. Strange things in the mountains. I heard the other day there were Trollocs up in Saldaea. But that’s the Borderlands then, isn’t it?” He finished with his mouth still open, then snapped it shut, appearing surprised that he had said so much. Rand had tensed at the word Trollocs, and tried to hide it by wringing his washcloth out over his head. As the fellow went on he relaxed, but not everyone kept his mouth shut. “Trollocs?” Mat chortled. Rand splashed water at him, but Mat just wiped it off of his face with a grin. “You just let me tell you a

In [3]:
path_to_chunks = './chunks/wheel_of_time/semantic_chunks'
chunks = {}
for filepath in glob.glob(f'{path_to_chunks}/*.pkl'):
    match = re.match(r'(\d+)_(\d+)\.pkl', os.path.basename(filepath))
    if match:
        book_number, chapter_number = map(int, match.groups())
        with open(filepath, 'rb') as f:
            if book_number not in chunks:
                chunks[book_number] = {}
            chunks[book_number][chapter_number] = pickle.load(f)


In [3]:
import openai

from pydantic import BaseModel
from openai import OpenAI
import httpx

# Create a custom HTTPX client with SSL verification disabled
req_client = httpx.Client(verify=False)

client = OpenAI(api_key=api_key, http_client=req_client)

full_response = None

class StorySageEntities(BaseModel):
  people: list[str]
  places: list[str]
  groups: list[str]
  animals: list[str]

def extract_named_entities(text):
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": """
                You are a highly advanced natural language processing agent that 
                is optimized to do named entity recognition (NER). Your goal is to
                extract entities and a summary from text provided to you.
                
                For example, if the text is:
                    Standing with the other Whitecloaks, Perrin saw the Lugard Road near the Manetherendrelle and the border of Murandy.
                    If dogs had been able to make footprints on stone, he would have said the tracks were the prints of a pack of large hounds.
             
                Extract:
                    People: Perrin
                    Places: Lugard Road, Manetherendrelle, Murandy
                    Groups: Whitecloaks, pack
                    Animals: dogs
                """},
            {"role": "user", "content": text},
        ],
        response_format=StorySageEntities
    )

    extracted_entity = completion.choices[0].message.parsed
    usage_information = completion.usage

    return extracted_entity, usage_information


#entities = extract_named_entities(chunk_to_extract)
#print(entities)

In [None]:
import time

counter = 0
len_cap = 400000
book_chunks = chunks[1]
for i in range(num_chapters):
    chapter_chunks = book_chunks[i]
    chapter_text = '\n'.join(chapter_chunks)
    chapter_len = len(chapter_text)
    if counter + chapter_len > len_cap:
        print(f'Waiting for 30 seconds to avoid exceeding the character limit. Current chapter: {i + 1}. Current length: {counter}')
        time.sleep(30)
        counter = 0
    result[i] = extract_named_entities(chapter_text)
    counter += chapter_len
    #result[chapter_number] = extract_named_entities(chapter_text)
    #entities, usage = extract_named_entities(chapter_text)
    #result[str(chapter_number)] = {'entities': entities, 'chunks': chapter_chunks}

print(f'Finished extracting from {len(result)} chapters')
print(result[-5])

Waiting for 30 seconds to avoid exceeding the character limit. Current chapter: 14. Current length: 368427
Waiting for 30 seconds to avoid exceeding the character limit. Current chapter: 26. Current length: 390861
Waiting for 30 seconds to avoid exceeding the character limit. Current chapter: 37. Current length: 378804
Waiting for 30 seconds to avoid exceeding the character limit. Current chapter: 50. Current length: 390436
Finished extracting from 54 chapters
(StorySageEntities(people=['Rand', 'Nynaeve', 'Lan', 'Egwene', 'Moiraine', 'Mat', 'Loial', 'Perrin', 'Green Man'], places=['Blight', 'Mountains of Dhoom', 'Eye of the World'], groups=['Aes Sedai', 'Warder', 'Worms', 'Halfmen', 'Ogier', 'The People of the Dragon'], animals=['horses', 'butterflies', 'bees']), CompletionUsage(completion_tokens=91, prompt_tokens=4911, total_tokens=5002, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), pr

In [None]:
import json

with open('01_the_eye_of_the_world.json', 'w') as json_file:
    json.dump(result, json_file, default=lambda o: o.__dict__, indent=4)

In [4]:
import json

with open('01_the_eye_of_the_world.json', 'r') as json_file:
    result = json.load(json_file)

In [5]:
result

[[{'people': ['Lews Therin Telamon', 'Elan Morin Tedronai', 'Ilyena'],
   'places': ['Dragonmount', 'palace', 'World Sea'],
   'groups': ['Servants', 'Hundred Companions'],
   'animals': ['none']},
  {'completion_tokens': 52,
   'prompt_tokens': 3707,
   'total_tokens': 3759,
   'completion_tokens_details': {'accepted_prediction_tokens': 0,
    'audio_tokens': 0,
    'reasoning_tokens': 0,
    'rejected_prediction_tokens': 0},
   'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}],
 [{'people': ["Rand al'Thor",
    "Tam al'Thor",
    "Brandelwyn al'Vere",
    'Wit Congar',
    'Cenn Buie',
    'Mat Cauthon',
    'Egwene'],
   'places': ["Emond's Field",
    'Winespring Inn',
    'Mountains of Mist',
    'Westwood',
    'Two Rivers',
    'Sand Hills',
    'Waterwood'],
   'groups': ['Two Rivers folk', 'Women’s Circle', 'Village Council'],
   'animals': ['mare',
    'dogs',
    'geese',
    'badger',
    'sheep',
    'cattle',
    'wolves',
    'horses',
    'ravens',
    

In [6]:
import pprint

num_chapters = len(result[0])

entities_dict = {
    'series': {
        'wheel_of_time': {
            'series_metadata_name': 'wheel_of_time', 
            'series_id': 3, 
            'series_name': 'The Wheel of Time',
            'series_entities': {
                'people_by_id': {}, 
                'people_by_name': {}, 
                'places_by_id': {}, 
                'places_by_name': {},
                'groups_by_id': {},
                'groups_by_name': {},
                'animals_by_id': {},
                'animals_by_name': {}
            },
            'books': [
                {
                    'book_name': 'The Eye of the World', 
                    'book_number': 1, 
                    'chapter_count': num_chapters,
                    'chapters': [],
                    'book_entities': {
                        'people_by_id': {}, 
                        'people_by_name': {}, 
                        'places_by_id': {},
                        'places_by_name': {},
                        'groups_by_id': {},
                        'groups_by_name': {},
                        'animals_by_id': {},
                        'animals_by_name': {}
                    }
                }
            ]
        }
    }
}

for i, chapter_entities in enumerate(result):
    entities_obj = {'chapter': i, 'people': chapter_entities[0]['people'], 'places': chapter_entities[0]['places'], 'groups': chapter_entities[0]['groups'], 'animals': chapter_entities[0]['animals']}
    entities_obj['people'] = list(set([str.lower(person.replace('’', "'").replace('‘', "'")) for person in entities_obj['people']]))
    entities_obj['places'] = list(set([str.lower(place.replace('’', "'").replace('‘', "'")) for place in entities_obj['places']]))
    entities_obj['groups'] = list(set([str.lower(group.replace('’', "'").replace('‘', "'")) for group in entities_obj['groups']]))
    entities_obj['animals'] = list(set([str.lower(animal.replace('’', "'").replace('‘', "'")) for animal in entities_obj['animals']]))
    entities_dict['series']['wheel_of_time']['books'][0]['chapters'].append(entities_obj)

with open('entities.json', 'w') as json_file:
    json.dump(entities_dict, json_file, indent=4)

In [7]:
import json

with open('entities.json', 'r') as json_file:
    entities_dict = json.load(json_file)

In [8]:
import openai

all_people = set()
all_places = set()
all_groups = set()
all_animals = set()

for book in entities_dict['series']['wheel_of_time']['books']:
    for chapter in book['chapters']:
        all_people.update(chapter['people'])
        all_places.update(chapter['places'])
        all_groups.update(chapter['groups'])
        all_animals.update(chapter['animals'])

class GroupedEntities(BaseModel):
    all_people: list[list[str]]

def group_similar_names(names_to_group):
    text = ', '.join(names_to_group)
    completion = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": """
                You are a highly advanced natural language processing agent that 
                is optimized to do named entity recognition (NER). Your goal is to
                group together names that represent the same person from the text provided to you.
             
                Names usually follow a standard pattern. Haral Luhhan and Alsbet Luhhan are likely to be different people, but Haral Luhhan and Master Luhhan are likely to be the same person.

                Make sure all names in the input are present in the output.   
             
                For example:
                    Input: Bran, Mat, Bran al'Vere, Haral Luhhan, Breyan, Matrim Cauthon, Alsbet Luhhan, Master al'Vere, Mat Cauthon
                    Output: [['Bran', "Bran al'Vere", "Master al'Vere"], ['Mat', 'Matrim Cauthon', 'Mat Cauthon'], ['Breyan'], ['Haral Luhhan'], ['Alsbet Luhhan']]
                """},
            {"role": "user", "content": text},
        ],
        response_format=GroupedEntities
    )

    return completion.choices[0].message.parsed



In [9]:
grouped_entities = group_similar_names(all_people)
all_people = grouped_entities.all_people

In [10]:
all_people

[['hyam kinch', 'master kinch'],
 ['bran', "bran al'vere", "master al'vere", 'mayor'],
 ['cenn', 'cenn buie'],
 ['elyas machera', 'elyas'],
 ['lews therin telamon'],
 ['tam', "tam al'thor"],
 ["rand al'thor", 'rand'],
 ['lain mandragoran'],
 ['master hightower'],
 ['logain'],
 ['ara'],
 ['kari', "kari al'thor"],
 ['eazil forney'],
 ['moiraine sedai', 'moiraine'],
 ["ba'alzamon"],
 ['darl coplin', 'hari coplin', 'dag coplin'],
 ['ila'],
 ['mother brune'],
 ['bili congar', 'wit congar'],
 ['isam'],
 ['mistress alys'],
 ['artur hawkwing'],
 ['yurian stonebow'],
 ["al'akir mandragoran"],
 ['gode', 'howal gode'],
 ['agelmar', 'lord agelmar'],
 ['elayne'],
 ['lan'],
 ['elder haman'],
 ['raimun holdwin'],
 ['floran gelb', 'gelb'],
 ['bayle domon', 'captain domon'],
 ['gareth bryne'],
 ["nynaeve al'meara", 'nynaeve'],
 ['morgase', 'queen morgase'],
 ['rowan hum'],
 ['balwen mayel'],
 ['paitr'],
 ['master gill', 'basel gill'],
 ['rogosh'],
 ['arin'],
 ['sheriam sedai'],
 ['bornhald', 'geofram b

In [11]:
# Create names->id and id->name maps
series_entities = entities_dict['series']['wheel_of_time']['series_entities']
series_id = 3

series_entities['people_by_name'] = {}
series_entities['people_by_id'] = {}
for idx, group in enumerate(all_people):
    id = f"p_{series_id}_{idx}"
    series_entities['people_by_id'][id] = group
    for name in group:
        series_entities['people_by_name'][name] = id

series_entities['places_by_name'] = {}
series_entities['places_by_id'] = {}
for idx, place in enumerate(all_places):
    id = f"pl_{series_id}_{idx}"
    series_entities['places_by_id'][id] = place
    series_entities['places_by_name'][place] = id

series_entities['groups_by_id'] = {}
series_entities['groups_by_name'] = {}
for idx, group in enumerate(all_groups):
    id = f"g_{series_id}_{idx}"
    series_entities['groups_by_id'][id] = group
    series_entities['groups_by_name'][group] = id

series_entities['animals_by_id'] = {}
series_entities['animals_by_name'] = {}
for idx, animal in enumerate(all_animals):
    id = f"a_{series_id}_{idx}"
    series_entities['animals_by_id'][id] = animal
    series_entities['animals_by_name'][animal] = id

In [14]:
with open('entities.json', 'w') as json_file:
    json.dump(entities_dict, json_file, indent=4)

In [13]:
for book in entities_dict['series']['wheel_of_time']['books']:
    for chapter in book['chapters']:
        for person in chapter['people']:
            try:
                person_id = series_entities['people_by_name'][person]
            except KeyError as e:
                all_people.append(person)
                person_id = f"p_{series_id}_{len(all_people)}"
                series_entities['people_by_id'][person_id] = [person]
                series_entities['people_by_name'][person] = person_id
        chapter['people_ids'] = [series_entities['people_by_name'][person] for person in chapter['people']]
        chapter['places_ids'] = [series_entities['places_by_name'][place] for place in chapter['places']]
        chapter['groups_ids'] = [series_entities['groups_by_name'][group] for group in chapter['groups']]
        chapter['animals_ids'] = [series_entities['animals_by_name'][animal] for animal in chapter['animals']]

In [88]:
with open('entities.json', 'w') as json_file:
    json.dump(entities_dict, json_file, indent=4)

In [44]:
with open('entities.json', 'w') as json_file:
    json.dump(entities_dict, json_file, indent=4)

In [5]:
data = {
    'question': 'Explain the interactions between Cenn and Rand',
    'book_number': 2,
    'chapter_number': 1,
    'series_id': 3
}

response, context = invoke_story_sage(data)
print(response)

2024-12-16 20:31:30,984 - story_sage - DEBUG - Invoking StorySage with question: Explain the interactions between Cenn and Rand, book_number: 2, chapter_number: 1, series_id: 3
2024-12-16 20:31:30,984 - story_sage - DEBUG - Invoking StorySage with question: Explain the interactions between Cenn and Rand, book_number: 2, chapter_number: 1, series_id: 3
2024-12-16 20:31:30,986 - story_sage.story_sage_chain - DEBUG - Extracting characters from question.
2024-12-16 20:31:30,986 - story_sage.story_sage_chain - DEBUG - Extracting characters from question.
2024-12-16 20:31:30,987 - story_sage.story_sage_chain - DEBUG - Series ID found in state.
2024-12-16 20:31:30,987 - story_sage.story_sage_chain - DEBUG - Series ID found in state.


{'error': 'Internal server error.'}


In [5]:
# Define the input and output widgets
input_box = widgets.Text(
    value='',
    placeholder='Type your question here...',
    description='Question:',
    continuous_update=False,
    disabled=False
)

submit_button = widgets.Button(
    description='Submit',
    disabled=False,
    button_style='',
    tooltip='Click to submit your question',
    icon='check'
)

book_number_box = widgets.IntText(
    value=10,
    description='Book Number:',
    disabled=False
)

chapter_number_box = widgets.IntText(
    value=None,
    description='Chapter Number:',
    disabled=False
)

status_box = widgets.Output(layout={'min_height': '50px'})
output_box = widgets.Output(layout={'min_height': '200px'})
context_box = widgets.Output(layout={'min_height': '200px'})

# Create a spinner widget
spinner = widgets.HTML(
    value="""<i class="fa fa-spinner fa-spin" style="font-size:24px; color:#2a9df4;"></i>""",
    placeholder='Loading...',
    description=''
)

# Make sure Font Awesome is available
display(widgets.HTML("<link rel='stylesheet' href='https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css'>"))

def wrap_answer(answer):
    html = markdown.markdown(answer)
    return f"<div style='background-color: #f9f9f9; padding: 10px; border-radius: 5px;'>{html}</div>"

def show_results(answer, context):
    with output_box:
        output_box_contents = []
        output_box_contents.append("<h3>Answer</h3>")
        output_box_contents.append(wrap_answer(answer))
        display(widgets.HTML(''.join(output_box_contents)))
        with context_box:
            context_box.clear_output()
            context_box_contents = []
            context_box_contents.append("<h3>Context</h3>")
            for idx in range(len(context['metadatas'])):
                meta = context['metadatas'][0][idx]
                content = context['documents'][0][idx]
                context_box_contents.append(f"<p><strong>Book Number:</strong> {meta['book_number']} <strong>Chapter Number:</strong> {meta['chapter_number']}</p>")
                context_box_contents.append(f"<p>{content}</p>")
            display(widgets.HTML(wrap_answer("".join(context_box_contents))))

    

# Define the function to handle the button click
def submit_question(b):
    with status_box:
        status_box.clear_output()
        display(widgets.HTML(f"<h3>Retrieving top relevant chunks...</h3>"))
        with output_box:
            output_box.clear_output()
            display(spinner)
            answer, context = story_sage.invoke(
                question = input_box.value,
                book_number = book_number_box.value,
                chapter_number = chapter_number_box.value
            )
            output_box.clear_output()
    show_results(answer, context)

# Attach the handler to the button
submit_button._click_handlers.callbacks.clear()
submit_button.on_click(submit_question)

# Attach the handler to the input box for the return key
#input_box.observe(submit_question)

# Display the widgets
display(status_box, book_number_box, chapter_number_box, input_box, submit_button, output_box, context_box)

HTML(value="<link rel='stylesheet' href='https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-aw…

Output(layout=Layout(min_height='50px'))

IntText(value=10, description='Book Number:')

IntText(value=0, description='Chapter Number:')

Text(value='', continuous_update=False, description='Question:', placeholder='Type your question here...')

Button(description='Submit', icon='check', style=ButtonStyle(), tooltip='Click to submit your question')

Output(layout=Layout(min_height='200px'))

Output(layout=Layout(min_height='200px'))