In [60]:
import json
from gliner import GLiNER
import torch
from tqdm.notebook import tqdm
from story_sage.data_classes.story_sage_config import StorySageConfig
import yaml
import httpx
from glob import glob

In [17]:
def load_config(config_path):
    """Parse the YAML file at ``config_path`` and return the loaded object.

    NOTE(review): ``yaml.FullLoader`` is kept as-is; if the config only ever
    contains plain scalars/mappings, ``yaml.safe_load`` would be the stricter choice.
    """
    with open(config_path, 'r') as config_file:
        return yaml.load(config_file, Loader=yaml.FullLoader)

# Read config.yml and wrap the parsed settings in the project's StorySageConfig object.
config = StorySageConfig.from_config(
    load_config(config_path='config.yml')
)

In [101]:
from openai import OpenAI
from openai.types import Completion, CompletionChoice, CompletionUsage
from pydantic import BaseModel

# OpenAI client shared by the summarisation and question-answering cells.
# TLS certificate verification stays ON: the API key travels on this connection, and
# `verify=False` would let any intermediary read or alter the traffic.
# If requests fail behind a TLS-intercepting proxy, pass an `ssl.SSLContext` that trusts
# the proxy's CA as `verify=...` instead of disabling verification.
client = OpenAI(
    api_key=config.openai_api_key, http_client=httpx.Client(verify=True)
)

class ChunkSummary(BaseModel):
    # Structured-output schema handed to the OpenAI parse call as `response_format`
    # in get_summary; each summarised chunk comes back as one instance of this model.
    # NOTE(review): documented with `#` comments rather than a class docstring —
    # pydantic appears to copy a model docstring into the generated JSON schema,
    # which would change the request payload; confirm before adding one.
    summary: str  # condensed text of the chunk (the prompt asks for at most 150 words)
    characters: list[str]  # character names reported for the chunk
    locations: list[str]  # locations mentioned in the chunk
    creatures: list[str]  # creatures mentioned in the chunk

def get_summary(text):
    """Summarise one chunk and extract its entities with ``gpt-4o-mini``.

    The chunk is sent verbatim as the user message; the reply is parsed into a
    ``ChunkSummary`` through the OpenAI structured-output helper.

    Args:
        text: The chunk text to summarise.

    Returns:
        A ``(parsed, usage)`` tuple: the parsed ``ChunkSummary`` of the first
        choice and the ``usage`` object reported for the completion.

    Raises:
        ValueError: If the first choice carries no parsed payload (for example
            when the model refuses), instead of handing ``None`` to the caller.
    """
    chat_completion = client.beta.chat.completions.parse(
        messages=[
            {
                "role": "system",
                "content": """
                    You are an advanced literary assistant who specializes in summarizing chunks from novels to optimize the quality of embeddings for a RAG application.
                    Please create a concise summary of the following text, paying particular attention to the characters, locations, and actions.
                    The summary should be no longer than 150 words. The summary should not include descriptive language or dialogue, it should 
                    focus on the key events and characters in the text. Make sure all of the characters mentioned in the excerpt are included in the summary.
                    Use simple language and focus on capturing as much meaning in as few words as possible to help with the similarity search.
                    Also, please extract locations and creatures mentioned in the passage.
                """
            },
            {
                "role": "user",
                "content": text
            }
        ],
        model="gpt-4o-mini",
        response_format=ChunkSummary
    )
    message = chat_completion.choices[0].message
    if message.parsed is None:
        # Fail here with context rather than later with an AttributeError on `.summary`.
        refusal = getattr(message, 'refusal', None)
        raise ValueError(f'Model returned no structured summary (refusal: {refusal!r})')
    return message.parsed, chat_completion.usage


In [146]:

# Glob pattern for the chunk files to summarise (file names look like "<book>_<chapter>.json").
chunks_filepath = './chunks/wheel_of_time/bigger_chunks/1_*.json'

chunks = []

# glob() yields matches in arbitrary, filesystem-dependent order; sorting makes the chunk
# order (and every index derived from it later) the same on each run.
for chunk_filepath in sorted(glob(chunks_filepath)):

    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(chunk_filepath, 'r', encoding='utf-8') as f:
        # Each file holds a JSON list; its items are appended to one flat list.
        chunks.extend(json.load(f))


# One API call per chunk: collect the parsed summary and the token usage side by side.
summaries = []
usage = []
for chunk in tqdm(chunks):
    response, usage_ = get_summary(chunk)
    summaries.append(response)
    usage.append(usage_)

  0%|          | 0/259 [00:00<?, ?it/s]

In [147]:
# Total tokens consumed across all summarisation calls.
sum(completion_usage.total_tokens for completion_usage in usage)

502823

In [149]:
import os

# Split a chunk file name of the form "<book>_<chapter>.json" into its two parts.
filename = os.path.basename('chunks/wheel_of_time/bigger_chunks/1_27.json')
stem = filename.partition('.')[0]  # text before the first dot
book_number, chapter_number = stem.split('_')
print(book_number, chapter_number)

1 27


In [148]:
# Preview only the first few summaries; displaying the whole list floods the notebook output.
summaries[:3]

[ChunkSummary(summary='Rand and Mat meet with Master Gill in a secluded corner table of the common room. Master Gill orders food for them, which consists of thin slices of beef, mustard greens, and potatoes. After the serving maid leaves, Master Gill asks Rand to explain his troubles to decide how to help. Rand is hesitant and chooses to keep the explanation simple, intentionally omitting details about Trollocs and Fades to avoid scaring Gill while acknowledging the danger involved.', characters=['Rand', 'Mat', 'Master Gill'], locations=['common room'], creatures=['Trollocs', 'Fades']),
 ChunkSummary(summary="Rand, Mat, and their friends are pursued by dangerous men, suspected to be Darkfriends. Thom, Rand's companion, died during an attack while trying to help them reach Whitebridge. They plan to continue to Caemlyn and then Tar Valon. Master Gill, the innkeeper, expresses reluctance to involve the Guards due to connections to Aes Sedai, particularly Elaida, who could threaten their s

In [63]:
# Print every chunk followed by its summary, with a dashed rule between pairs.
for chunk, summary in zip(chunks, summaries):
    print(chunk, summary, sep='\n\n', end='\n-------------------------\n\n')

Chapter 7 The Sorting Hat The door swung open at once. A tall, black-haired witch in emerald-green robes stood there. She had a very stern face and Harry’s first thought was that this was not someone to cross. “The firs’ years, Professor McGonagall,” said Hagrid. “Thank you, Hagrid. I will take them from here.” She pulled the door wide. The entrance hall was so big you could have fit the whole of the Dursleys’ house in it. The stone walls were lit with flaming torches like the ones at Gringotts, the ceiling was too high to make out, and a magnificent marble staircase facing them led to the upper floors. They followed Professor McGonagall across the flagged stone floor. Harry could hear the drone of hundreds of voices from a doorway to the right — the rest of the school must already be here — but Professor McGonagall showed the first years into a small, empty chamber off the hall. They crowded in, standing rather closer together than they would usually have done, peering about nervously

In [150]:
# One line per chunk: its length.
for chunk_length in map(len, chunks):
    print(chunk_length)

1382
8673
10211
7732
2528
879
6369
7886
8755
9009
8617
1491
14763
7931
5417
1279
1149
8776
8896
4151
2505
7692
11039
9039
1443
940
9204
7126
11640
18025
4912
1676
9313
7273
10137
8010
1786
12346
4441
1266
9549
8647
8230
2163
11616
3860
2060
7962
8140
1246
2801
9047
8780
1518
1107
10060
8250
8593
850
6567
8223
10341
8109
805
2611
9417
10144
7525
8778
8912
1152
1766
11499
13008
7188
5050
1226
9046
4211
3821
7409
8423
7328
1094
7824
8917
6717
747
7806
9402
10137
7713
1003
1005
8168
8924
1456
3317
8426
9522
3350
1488
8644
8726
10061
1454
2940
11221
9223
5479
1498
8842
7703
70
968
8764
7454
2904
9697
6762
2537
10287
10568
5415
2441
16240
8192
1954
2755
10355
10825
9296
5932
3581
1290
7757
5748
9961
6246
8284
8542
7960
101
1921
8240
11413
8493
3214
1677
8465
7727
7417
1541
2384
9698
10106
2820
6053
9649
6758
10257
850
1116
7115
8626
4358
679
10554
8951
8292
9368
1862
2522
8067
10126
8516
1344
9604
8060
7141
8745
3113
1977
8294
10860
7371
2206
1753
8269
1866
2597
9694
8071
7239
4397
2442
6298

In [142]:
# Flatten each parsed summary into a plain dict, paired by position with its source chunk.
new_summaries = [
    {
        'full_chunk': chunks[position],
        'summary': parsed.summary,
        'characters': parsed.characters,
        'locations': parsed.locations,
        'creatures': parsed.creatures,
    }
    for position, parsed in enumerate(summaries)
]

In [145]:
# Persist the flattened summaries as indented JSON.
# NOTE(review): the file name says "harry_potter" while the chunk glob in this notebook
# points at wheel_of_time — confirm the target name before re-running.
with open('summarized_chunks_harry_potter_01.json', 'w') as out_file:
    out_file.write(json.dumps(new_summaries, indent=4))

In [105]:
# Pair every chunk with its parsed summary object, position by position.
summarized_chunks = [
    dict(chunk=chunk_text, summary=parsed)
    for chunk_text, parsed in zip(chunks, summaries)
]

In [138]:
import json

def _pydantic_to_jsonable(obj):
    """``json.dump`` fallback hook: turn pydantic models into plain dicts.

    Anything without ``model_dump`` raises ``TypeError`` naming its type. Returning
    such an object unchanged would make the encoder call this hook on it again and
    fail with a misleading "Circular reference detected" error.
    """
    # Test for the method that is actually called (the original probed for `dict`).
    if hasattr(obj, 'model_dump'):
        return obj.model_dump()
    raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')


# Write chunk/summary pairs as indented JSON; summary objects go through the hook above.
with open('summarized_chunks.json', 'w') as f:
    json.dump(summarized_chunks, f, default=_pydantic_to_jsonable, indent=4)

In [106]:
# Preview only the first few pairs; displaying the whole list floods the notebook output.
summarized_chunks[:3]

[{'chunk': ' The Sorting Hat The door swung open at once. A tall, black-haired witch in emerald-green robes stood there. She had a very stern face and Harry’s first thought was that this was not someone to cross. “The firs’ years, Professor McGonagall,” said Hagrid. “Thank you, Hagrid. I will take them from here.” She pulled the door wide. The entrance hall was so big you could have fit the whole of the Dursleys’ house in it. The stone walls were lit with flaming torches like the ones at Gringotts, the ceiling was too high to make out, and a magnificent marble staircase facing them led to the upper floors. They followed Professor McGonagall across the flagged stone floor. Harry could hear the drone of hundreds of voices from a doorway to the right — the rest of the school must already be here — but Professor McGonagall showed the first years into a small, empty chamber off the hall. They crowded in, standing rather closer together than they would usually have done, peering about nervou

In [151]:
from story_sage.utils.embedding import update_tagged_entities, Embedder
import chromadb
# Open the on-disk Chroma store at the configured path and fetch (or create) the
# collection that holds the summaries, embedded with the project's Embedder.
embedder = Embedder()
chroma_client = chromadb.PersistentClient(path=config.chroma_path)
collection = chroma_client.get_or_create_collection(
    name='summarized_chunks',
    embedding_function=embedder,
)


In [115]:
# Build the three parallel lists Chroma expects: string ids, the summary text to embed,
# and per-document metadata. The position in `summarized_chunks` serves as id and index.
# NOTE(review): only `chunk_idx` is stored as metadata here.
chunk_count = len(summarized_chunks)
ids = [str(position) for position in range(chunk_count)]
docs_to_embed = [entry['summary'].summary for entry in summarized_chunks]
metadatas = [{'chunk_idx': position} for position in range(chunk_count)]

collection.add(ids=ids, documents=docs_to_embed, metadatas=metadatas)

In [159]:
def answer_query(query_texts):
    """Answer a reader's question from the five most similar stored summaries.

    Args:
        query_texts: The question as a single string. It is used both as the
            Chroma query text and as the user message sent to the chat model.

    Returns:
        ``(answer, documents, distances)``: the model's reply text, plus the
        retrieved documents and their distances for the query.
    """
    results = collection.query(query_texts=query_texts, n_results=5, include=['metadatas', 'documents', 'distances'])
    documents = results['documents'][0]

    # Entries may lack book/chapter/full-text metadata (or have none at all); degrade to
    # placeholders and the retrieved document text instead of raising KeyError.
    full_chunks = []
    for meta, document in zip(results['metadatas'][0], documents):
        meta = meta or {}
        book = meta.get('book_number', 'unknown')
        chapter = meta.get('chapter_number', 'unknown')
        context_text = meta.get('full_chunk', document)
        full_chunks.append(f"Book {book}, Chapter {chapter}: {context_text}")

    prompt = """
You are an assistant to help a reader keep track of people, places, and plot points in books.
The attached pieces of retrieved context are excerpts from the books related to the reader's question. Use them to generate your response.

Guidelines for the response:
    * If you don't know the answer or aren't sure, just say that you don't know. 
    * Don't provide any irrelevant information. Most importantly: DO NOT PROVIDE INFORMATION FROM OUTSIDE THE CONTEXT.
    * Use bullet points to provide excerpts from the context that support your answer. Reference the book and chapter whenever you include an excerpt.
    * If there is no context, you can say that you don't have enough information to answer the question.
Context:
            """
    # The retrieved context is appended to the system prompt as a bulleted list.
    prompt += '\n'.join([f'- {chunk}' for chunk in full_chunks])
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": query_texts}
        ]
    )

    return completion.choices[0].message.content, documents, results['distances'][0]

from pprint import pprint
from markdown import markdown
from IPython.display import display, HTML
# Ask a sample question, render the Markdown answer as HTML, then list each retrieved
# summary next to its distance.
answer, docs, distances = answer_query("who is rand's mother?")
html_answer = markdown(answer)
display(HTML(html_answer))

for dist, doc in zip(distances, docs):
    print(dist, doc, sep=' - ')

0.80485999584198 - Rand regains consciousness in a garden after falling from a wall. He sees a girl who is beautifully dressed in ornate garments, unlike anything he's known, and she approaches him with confidence. The girl, Elayne, examines his injuries and begins to treat him with items she produces from her cloak. Her brother, Gawyn, is with her and expresses concern about their situation due to having disobeyed their mother. As Elayne tends to Rand's wounds, the siblings show a bit of playful banter and concern for each other, revealing their close bond and status. Elayne is revealed to be knowledgeable in treating wounds, and Gawyn reassures Rand about her skills. They discuss their family dynamics, hinting at their noble lineage, and make a connection with Rand while caring for him. Rand feels both anxious and mesmerized by Elayne's beauty and confidence. Overall, the scene sets up a blend of social class, sibling dynamics, and immediate danger of Rand's situation, hinting at dee