In [81]:
from langchain.text_splitter import MarkdownHeaderTextSplitter
import json
import pandas as pd

In [82]:
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[
    ("#", "H1"),
    ("##", "H2"),
    ("###", "H3"),
    ("####", "H4"),
])

In [83]:
with open("/Users/abhijit/Desktop/DS-RPC-01/data/engineering/engineering_master_doc.md", "r", encoding="utf-8") as file:
    md_content = file.read()


In [84]:
chunks = splitter.split_text(md_content)

In [85]:
chunks

[Document(metadata={'H1': 'FinSolve Technologies Engineering Document', 'H2': '1. Introduction', 'H3': '1.1 Company Overview'}, page_content='FinSolve Technologies is a leading FinTech company headquartered in Bangalore, India, with operations across North America, Europe, and Asia-Pacific. Founded in 2018, FinSolve provides innovative financial solutions, including digital banking, payment processing, wealth management, and enterprise financial analytics, serving over 2 million individual users and 10,000 businesses globally.'),
 Document(metadata={'H1': 'FinSolve Technologies Engineering Document', 'H2': '1. Introduction', 'H3': '1.2 Purpose'}, page_content='This engineering document outlines the technical architecture, development processes, and operational guidelines for FinSolve\'s product ecosystem. It serves as a comprehensive guide for engineering teams, stakeholders, and partners to ensure alignment with FinSolve\'s mission: "To empower financial freedom through secure, scalab

In [86]:
len(chunks)

81

In [87]:
from langchain.vectorstores import Chroma
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings

embedding = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en") # 748 


In [88]:
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding,
    persist_directory="/Users/abhijit/Desktop/DS-RPC-01/src/vector_data/chroma_db"  # change path if needed
)


In [89]:
vectorstore.persist()

In [90]:
db = Chroma(
    persist_directory="/Users/abhijit/Desktop/DS-RPC-01/src/vector_data/chroma_db",  # your folder path
    embedding_function=embedding  # or HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
)

In [91]:
query = "How does FinSolve ensure high availability in production?"

# Retrieve top 5 chunks
results = db.similarity_search(query, k=5)


In [92]:
results

[Document(metadata={'H1': 'FinSolve Technologies Engineering Document', 'H2': '4. Software Development Lifecycle (SDLC)', 'H3': '4.1 Agile Methodology'}, page_content='FinSolve follows a Scrum-based Agile process with 2-week sprints:'),
 Document(metadata={'H1': 'FinSolve Technologies Engineering Document', 'H2': '4. Software Development Lifecycle (SDLC)', 'H3': '4.1 Agile Methodology'}, page_content='FinSolve follows a Scrum-based Agile process with 2-week sprints:'),
 Document(metadata={'H1': 'FinSolve Technologies Engineering Document', 'H2': '2. System Architecture', 'H3': '2.1 Overview'}, page_content="FinSolve's architecture is a microservices-based, cloud-native system designed for scalability, resilience, and security. It leverages a modular design to support rapid feature development and seamless integration with third-party financial systems (e.g., payment gateways, credit bureaus, regulatory reporting systems)."),
 Document(metadata={'H1': 'FinSolve Technologies Engineering 

In [93]:
for i, doc in enumerate(results, 1):
    print(f"\n--- Result {i} ---")
    print(f"Content:\n{doc.page_content[:500]}")  # Show first 500 characters
    print(f"Metadata:\n{doc.metadata}")

In [94]:
from groq import Groq

client = Groq(api_key="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")


In [38]:
user_RAG=db.as_retriever(search_kwargs={"k": 5})

In [96]:
from langchain.tools import Tool

def retrieve_context_from_db(query: str, db) -> str:
    """
    Retrieve relevant documents from a ChromaDB retriever and return their contents
    as a single formatted string to be used in an LLM prompt.

    Args:
        query (str): The user query.
        db: A LangChain retriever object (e.g., ChromaDB.as_retriever()).

    Returns:
        str: Concatenated string of relevant document contents.
    """
    # Step 1: Retrieve documents from the retriever
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

    # Now you can use:
    documents = retriever.get_relevant_documents(query)
    
    
    # Step 2: Extract just the text content from each Document
    page_contents = [doc.page_content for doc in documents]

    # Step 3: Join them with double line breaks for clarity
    context = "\n\n".join(page_contents)
    # print(context)
    return context

rag_tool = Tool(
    name="RAGRetriever",
    func=lambda q: retrieve_context_from_db(q, db),
    description="Useful for answering questions by retrieving context from vector DB"
)

In [97]:
from langchain_groq import ChatGroq

In [None]:
llm = ChatGroq(
    temperature=0.2,
    model_name="llama-3.3-70b-versatile",  # or "llama3-70b-8192", "gemma-7b-it"
    api_key="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"      # or set as env variable GROQ_API_KEY
)

In [116]:
from langchain.agents import initialize_agent, AgentType

agent = initialize_agent(
    tools=[rag_tool],
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True
)

In [117]:
response = agent.run("How does FinSolve ensure high availability in production?")
response



[1m> Entering new AgentExecutor chain...[0m


APIConnectionError: Connection error.

In [None]:
response

'FinSolve ensures high availability in production through a combination of strategies, including a multi-region cloud architecture with automatic failover mechanisms, load balancing across availability zones, and continuous real-time monitoring. They utilize redundant infrastructure to prevent single points of failure and employ automated scaling to handle traffic fluctuations. Additionally, they conduct regular disaster recovery drills and maintain a 24/7 network operations center (NOC) to swiftly address any issues, ensuring minimal downtime and maximum uptime for critical financial services.'

In [None]:
import gradio as gr

def retrieve_context_from_db(query, db):
    retriever = db.as_retriever(search_kwargs={"k": 5})
    docs = retriever.get_relevant_documents(query)
    return "\n\n".join([doc.page_content for doc in docs])

def chat_fn(message, history):
    try:
        context = retrieve_context_from_db(message, db)
        response = agent.run(message)
    except Exception as e:
        response = f"❌ Error: {str(e)}"
        context = ""
    
    final_output = f"🧠 **Answer:**\n"+response
    return final_output  # ✅ Not a tuple

chatbot_ui = gr.ChatInterface(
    fn=chat_fn,
    title="RAG Chatbot",
    description="Ask questions about your FinSolve company.",
)

chatbot_ui.launch(inline=True)


  self.chatbot = Chatbot(


* Running on local URL:  http://127.0.0.1:7873
* To create a public link, set `share=True` in `launch()`.






[1m> Entering new AgentExecutor chain...[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
<think>
Okay, the user is asking me to explain what a Technology Stack is. Let me start by recalling what I know. A technology stack, often called a tech stack, refers to the set of software and tools used to build a product, usually a website or app. But I should make sure I'm accurate here.

Hmm, maybe I should break it down. The tech stack typically includes the front-end, back-end, database, and other supporting software. For example, a common stack is LAMP (Linux, Apache, MySQL, PHP). But I need to explain each part clearly. Wait, maybe I should also mention that it's not just software; sometimes it includes frameworks, programming languages, servers, and even deployment tools. 

I should check if there are different types of stacks. Like MEAN (MongoDB, Express.js, Angular, Node.js) is another example. Also, the front-end stack might involve HTML, CSS, JavaScript, and fram

In [77]:
from langchain_core.prompts import ChatPromptTemplate
import uuid
from langchain_groq import ChatGroq
import os
from typing import Optional
from langchain_core.pydantic_v1 import BaseModel
from langchain.chains import create_extraction_chain_pydantic
from langchain_core.output_parsers import PydanticOutputParser
# from dotenv import load_dotenv
from rich import print

# load_dotenv()

class AgenticChunker:
    def __init__(self, openai_api_key=None):
        self.chunks = {}
        self.id_truncate_limit = 5

        # Whether or not to update/refine summaries and titles as you get new information
        self.generate_new_metadata_ind = True
        self.print_logging = True

        # if openai_api_key is None:
        #     openai_api_key = os.getenv("OPENAI_API_KEY")

        # if openai_api_key is None:
        #     raise ValueError("API key is not provided and not found in environment variables")

        self.llm  = ChatGroq(
    temperature=0.2,
    model_name="llama-3.3-70b-versatile",  # or "llama3-70b-8192", "gemma-7b-it"
    api_key="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"      # or set as env variable GROQ_API_KEY
)

    def add_propositions(self, propositions):
        for proposition in propositions:
            self.add_proposition(proposition)
    
    def add_proposition(self, proposition):
        if self.print_logging:
            print (f"\nAdding: '{proposition}'")

        # If it's your first chunk, just make a new chunk and don't check for others
        if len(self.chunks) == 0:
            if self.print_logging:
                print ("No chunks, creating a new one")
            self._create_new_chunk(proposition)
            return

        chunk_id = self._find_relevant_chunk(proposition)

        # If a chunk was found then add the proposition to it
        if chunk_id:
            if self.print_logging:
                print (f"Chunk Found ({self.chunks[chunk_id]['chunk_id']}), adding to: {self.chunks[chunk_id]['title']}")
            self.add_proposition_to_chunk(chunk_id, proposition)
            return
        else:
            if self.print_logging:
                print ("No chunks found")
            # If a chunk wasn't found, then create a new one
            self._create_new_chunk(proposition)
        

    def add_proposition_to_chunk(self, chunk_id, proposition):
        # Add then
        self.chunks[chunk_id]['propositions'].append(proposition)

        # Then grab a new summary
        if self.generate_new_metadata_ind:
            self.chunks[chunk_id]['summary'] = self._update_chunk_summary(self.chunks[chunk_id])
            self.chunks[chunk_id]['title'] = self._update_chunk_title(self.chunks[chunk_id])

    def _update_chunk_summary(self, chunk):
        """
        If you add a new proposition to a chunk, you may want to update the summary or else they could get stale
        """
        PROMPT = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic
                    A new proposition was just added to one of your chunks, you should generate a very brief 1-sentence summary which will inform viewers what a chunk group is about.

                    A good summary will say what the chunk is about, and give any clarifying instructions on what to add to the chunk.

                    You will be given a group of propositions which are in the chunk and the chunks current summary.

                    Your summaries should anticipate generalization. If you get a proposition about apples, generalize it to food.
                    Or month, generalize it to "date and times".

                    Example:
                    Input: Proposition: Greg likes to eat pizza
                    Output: This chunk contains information about the types of food Greg likes to eat.

                    Only respond with the chunk new summary, nothing else.
                    """,
                ),
                ("user", "Chunk's propositions:\n{proposition}\n\nCurrent chunk summary:\n{current_summary}"),
            ]
        )

        runnable = PROMPT | self.llm

        new_chunk_summary = runnable.invoke({
            "proposition": "\n".join(chunk['propositions']),
            "current_summary" : chunk['summary']
        }).content

        return new_chunk_summary
    
    def _update_chunk_title(self, chunk):
        """
        If you add a new proposition to a chunk, you may want to update the title or else it can get stale
        """
        PROMPT = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic
                    A new proposition was just added to one of your chunks, you should generate a very brief updated chunk title which will inform viewers what a chunk group is about.

                    A good title will say what the chunk is about.

                    You will be given a group of propositions which are in the chunk, chunk summary and the chunk title.

                    Your title should anticipate generalization. If you get a proposition about apples, generalize it to food.
                    Or month, generalize it to "date and times".

                    Example:
                    Input: Summary: This chunk is about dates and times that the author talks about
                    Output: Date & Times

                    Only respond with the new chunk title, nothing else.
                    """,
                ),
                ("user", "Chunk's propositions:\n{proposition}\n\nChunk summary:\n{current_summary}\n\nCurrent chunk title:\n{current_title}"),
            ]
        )

        runnable = PROMPT | self.llm

        updated_chunk_title = runnable.invoke({
            "proposition": "\n".join(chunk['propositions']),
            "current_summary" : chunk['summary'],
            "current_title" : chunk['title']
        }).content

        return updated_chunk_title

    def _get_new_chunk_summary(self, proposition):
        PROMPT = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic
                    You should generate a very brief 1-sentence summary which will inform viewers what a chunk group is about.

                    A good summary will say what the chunk is about, and give any clarifying instructions on what to add to the chunk.

                    You will be given a proposition which will go into a new chunk. This new chunk needs a summary.

                    Your summaries should anticipate generalization. If you get a proposition about apples, generalize it to food.
                    Or month, generalize it to "date and times".

                    Example:
                    Input: Proposition: Greg likes to eat pizza
                    Output: This chunk contains information about the types of food Greg likes to eat.

                    Only respond with the new chunk summary, nothing else.
                    """,
                ),
                ("user", "Determine the summary of the new chunk that this proposition will go into:\n{proposition}"),
            ]
        )

        runnable = PROMPT | self.llm

        new_chunk_summary = runnable.invoke({
            "proposition": proposition
        }).content

        return new_chunk_summary
    
    def _get_new_chunk_title(self, summary):
        PROMPT = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic
                    You should generate a very brief few word chunk title which will inform viewers what a chunk group is about.

                    A good chunk title is brief but encompasses what the chunk is about

                    You will be given a summary of a chunk which needs a title

                    Your titles should anticipate generalization. If you get a proposition about apples, generalize it to food.
                    Or month, generalize it to "date and times".

                    Example:
                    Input: Summary: This chunk is about dates and times that the author talks about
                    Output: Date & Times

                    Only respond with the new chunk title, nothing else.
                    """,
                ),
                ("user", "Determine the title of the chunk that this summary belongs to:\n{summary}"),
            ]
        )

        runnable = PROMPT | self.llm

        new_chunk_title = runnable.invoke({
            "summary": summary
        }).content

        return new_chunk_title


    def _create_new_chunk(self, proposition):
        new_chunk_id = str(uuid.uuid4())[:self.id_truncate_limit] # I don't want long ids
        new_chunk_summary = self._get_new_chunk_summary(proposition)
        new_chunk_title = self._get_new_chunk_title(new_chunk_summary)

        self.chunks[new_chunk_id] = {
            'chunk_id' : new_chunk_id,
            'propositions': [proposition],
            'title' : new_chunk_title,
            'summary': new_chunk_summary,
            'chunk_index' : len(self.chunks)
        }
        if self.print_logging:
            print (f"Created new chunk ({new_chunk_id}): {new_chunk_title}")
    
    def get_chunk_outline(self):
        """
        Get a string which represents the chunks you currently have.
        This will be empty when you first start off
        """
        chunk_outline = ""

        for chunk_id, chunk in self.chunks.items():
            single_chunk_string = f"""Chunk ({chunk['chunk_id']}): {chunk['title']}\nSummary: {chunk['summary']}\n\n"""
        
            chunk_outline += single_chunk_string
        
        return chunk_outline

    def _find_relevant_chunk(self, proposition):
        current_chunk_outline = self.get_chunk_outline()

        PROMPT = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    Determine whether or not the "Proposition" should belong to any of the existing chunks.

                    A proposition should belong to a chunk of their meaning, direction, or intention are similar.
                    The goal is to group similar propositions and chunks.

                    If you think a proposition should be joined with a chunk, return the chunk id.
                    If you do not think an item should be joined with an existing chunk, just return "No chunks"

                    Example:
                    Input:
                        - Proposition: "Greg really likes hamburgers"
                        - Current Chunks:
                            - Chunk ID: 2n4l3d
                            - Chunk Name: Places in San Francisco
                            - Chunk Summary: Overview of the things to do with San Francisco Places

                            - Chunk ID: 93833k
                            - Chunk Name: Food Greg likes
                            - Chunk Summary: Lists of the food and dishes that Greg likes
                    Output: 93833k
                    """,
                ),
                ("user", "Current Chunks:\n--Start of current chunks--\n{current_chunk_outline}\n--End of current chunks--"),
                ("user", "Determine if the following statement should belong to one of the chunks outlined:\n{proposition}"),
            ]
        )

        runnable = PROMPT | self.llm

        chunk_found = runnable.invoke({
            "proposition": proposition,
            "current_chunk_outline": current_chunk_outline
        }).content

        # Pydantic data class
        class ChunkID(BaseModel):
            """Extracting the chunk id"""
            chunk_id: Optional[str]
            
        # Extraction to catch-all LLM responses. This is a bandaid
        extraction_chain = create_extraction_chain_pydantic(pydantic_schema=ChunkID, llm=self.llm)
        extraction_found = extraction_chain.invoke(chunk_found)["text"]
        if extraction_found:
            chunk_found = extraction_found[0].chunk_id

        # If you got a response that isn't the chunk id limit, chances are it's a bad response or it found nothing
        # So return nothing
        if len(chunk_found) != self.id_truncate_limit:
            return None

        return chunk_found
    
    def get_chunks(self, get_type='dict'):
        """
        This function returns the chunks in the format specified by the 'get_type' parameter.
        If 'get_type' is 'dict', it returns the chunks as a dictionary.
        If 'get_type' is 'list_of_strings', it returns the chunks as a list of strings, where each string is a proposition in the chunk.
        """
        if get_type == 'dict':
            return self.chunks
        if get_type == 'list_of_strings':
            chunks = []
            for chunk_id, chunk in self.chunks.items():
                chunks.append(" ".join([x for x in chunk['propositions']]))
            return chunks
    
    def pretty_print_chunks(self):
        print (f"\nYou have {len(self.chunks)} chunks\n")
        for chunk_id, chunk in self.chunks.items():
            print(f"Chunk #{chunk['chunk_index']}")
            print(f"Chunk ID: {chunk_id}")
            print(f"Summary: {chunk['summary']}")
            print(f"Propositions:")
            for prop in chunk['propositions']:
                print(f"    -{prop}")
            print("\n\n")

    def pretty_print_chunk_outline(self):
        print ("Chunk Outline\n")
        print(self.get_chunk_outline())

if __name__ == "__main__":
    ac = AgenticChunker()

    ## Comment and uncomment the propositions to your hearts content
    propositions = [
        'The month is October.',
        'The year is 2023.',
        "One of the most important things that I didn't understand about the world as a child was the degree to which the returns for performance are superlinear.",
        'Teachers and coaches implicitly told us that the returns were linear.',
        "I heard a thousand times that 'You get out what you put in.'",
        'Teachers and coaches meant well.',
        "The statement that 'You get out what you put in' is rarely true.",
        "If your product is only half as good as your competitor's product, you do not get half as many customers.",
        "You get no customers if your product is only half as good as your competitor's product.",
        'You go out of business if you get no customers.',
        'The returns for performance are superlinear in business.',
        'Some people think the superlinear returns for performance are a flaw of capitalism.',
        'Some people think that changing the rules of capitalism would stop the superlinear returns for performance from being true.',
        'Superlinear returns for performance are a feature of the world.',
        'Superlinear returns for performance are not an artifact of rules that humans have invented.',
        'The same pattern of superlinear returns is observed in fame.',
        'The same pattern of superlinear returns is observed in power.',
        'The same pattern of superlinear returns is observed in military victories.',
        'The same pattern of superlinear returns is observed in knowledge.',
        'The same pattern of superlinear returns is observed in benefit to humanity.',
        'In fame, power, military victories, knowledge, and benefit to humanity, the rich get richer.'
    ]
    
    ac.add_propositions(propositions)
    ac.pretty_print_chunks()
    ac.pretty_print_chunk_outline()
    print (ac.get_chunks(get_type='list_of_strings'))

  warn(


TypeError: BaseModel.validate() takes 2 positional arguments but 3 were given

In [80]:
import uuid
import os
from typing import Optional

from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from pydantic import BaseModel  # <-- THIS IS THE CORRECTED LINE
from langchain_core.output_parsers import PydanticOutputParser
# from dotenv import load_dotenv
from rich import print

class AgenticChunker:
    def __init__(self, groq_api_key=None):
        """
        Initializes the AgenticChunker.
        It's recommended to set the GROQ_API_KEY in a .env file.
        """
        # load_dotenv() # Loads environment variables from a .env file
        
        self.chunks = {}
        self.id_truncate_limit = 5

        # Whether or not to update/refine summaries and titles as you get new information
        self.generate_new_metadata_ind = True
        self.print_logging = True

        # if groq_api_key is None:
        #     groq_api_key = os.getenv("GROQ_API_KEY")

        # if groq_api_key is None:
        #     raise ValueError("Groq API key is not provided. Please set it as an environment variable 'GROQ_API_KEY' in a .env file.")

        self.llm  = ChatGroq(
            temperature=0.2,
            model_name="llama3-70b-8192", # Recommended model for this complexity
            api_key="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"
        )

    def add_propositions(self, propositions):
        for proposition in propositions:
            self.add_proposition(proposition)
    
    def add_proposition(self, proposition):
        if self.print_logging:
            print (f"\n[bold]Adding[/bold]: '{proposition}'")

        # If it's your first chunk, just make a new chunk and don't check for others
        if len(self.chunks) == 0:
            if self.print_logging:
                print ("[italic]No chunks exist, creating a new one...[/italic]")
            self._create_new_chunk(proposition)
            return

        chunk_id = self._find_relevant_chunk(proposition)

        # If a chunk was found then add the proposition to it
        if chunk_id:
            if self.print_logging:
                print (f"[green]Chunk Found ({self.chunks[chunk_id]['chunk_id']}), adding to: {self.chunks[chunk_id]['title']}[/green]")
            self.add_proposition_to_chunk(chunk_id, proposition)
            return
        else:
            if self.print_logging:
                print ("[yellow]No relevant chunk found, creating a new one...[/yellow]")
            # If a chunk wasn't found, then create a new one
            self._create_new_chunk(proposition)
        
    def add_proposition_to_chunk(self, chunk_id, proposition):
        # Add proposition
        self.chunks[chunk_id]['propositions'].append(proposition)

        # Update summary and title
        if self.generate_new_metadata_ind:
            self.chunks[chunk_id]['summary'] = self._update_chunk_summary(self.chunks[chunk_id])
            self.chunks[chunk_id]['title'] = self._update_chunk_title(self.chunks[chunk_id])

    def _update_chunk_summary(self, chunk):
        """
        If you add a new proposition to a chunk, you may want to update the summary or else they could get stale
        """
        PROMPT = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.
                    A new proposition was just added to one of your chunks. Generate a very brief, 1-sentence summary for the chunk.
                    A good summary will say what the chunk is about and give clarifying instructions on what else to add.
                    Your summaries should anticipate generalization. If you get a proposition about apples, generalize it to food.
                    Or a month, generalize it to "date and times".

                    Example:
                    Input: Proposition: Greg likes to eat pizza
                    Output: This chunk contains information about the types of food Greg likes to eat.

                    Only respond with the new chunk summary, nothing else.
                    """,
                ),
                ("user", "Chunk's propositions:\n{propositions}\n\nCurrent chunk summary:\n{current_summary}"),
            ]
        )

        runnable = PROMPT | self.llm

        new_chunk_summary = runnable.invoke({
            "propositions": "\n".join(chunk['propositions']),
            "current_summary" : chunk['summary']
        }).content

        return new_chunk_summary
    
    def _update_chunk_title(self, chunk):
        """
        If you add a new proposition to a chunk, you may want to update the title or else it can get stale
        """
        PROMPT = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.
                    A new proposition was just added to one of your chunks. Generate a very brief, updated chunk title.
                    A good title will say what the chunk is about.
                    Your titles should anticipate generalization. If you get a proposition about apples, generalize it to food.
                    Or a month, generalize it to "date and times".

                    Example:
                    Input: Summary: This chunk is about dates and times that the author talks about
                    Output: Date & Times

                    Only respond with the new chunk title, nothing else.
                    """,
                ),
                ("user", "Chunk's propositions:\n{propositions}\n\nChunk summary:\n{current_summary}\n\nCurrent chunk title:\n{current_title}"),
            ]
        )

        runnable = PROMPT | self.llm

        updated_chunk_title = runnable.invoke({
            "propositions": "\n".join(chunk['propositions']),
            "current_summary" : chunk['summary'],
            "current_title" : chunk['title']
        }).content

        return updated_chunk_title

    def _get_new_chunk_summary(self, proposition):
        PROMPT = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.
                    Generate a very brief, 1-sentence summary for a new chunk based on the given proposition.
                    A good summary will say what the chunk is about and give clarifying instructions on what else to add.
                    Your summaries should anticipate generalization. If you get a proposition about apples, generalize it to food.
                    Or a month, generalize it to "date and times".

                    Example:
                    Input: Proposition: Greg likes to eat pizza
                    Output: This chunk contains information about the types of food Greg likes to eat.

                    Only respond with the new chunk summary, nothing else.
                    """,
                ),
                ("user", "Determine the summary of the new chunk that this proposition will go into:\n{proposition}"),
            ]
        )

        runnable = PROMPT | self.llm
        return runnable.invoke({"proposition": proposition}).content
    
    def _get_new_chunk_title(self, summary):
        PROMPT = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    You are the steward of a group of chunks which represent groups of sentences that talk about a similar topic.
                    Generate a very brief, few-word chunk title based on the given chunk summary.
                    A good chunk title is brief but encompasses what the chunk is about.
                    Your titles should anticipate generalization.

                    Example:
                    Input: Summary: This chunk is about dates and times that the author talks about.
                    Output: Date & Times

                    Only respond with the new chunk title, nothing else.
                    """,
                ),
                ("user", "Determine the title of the chunk that this summary belongs to:\n{summary}"),
            ]
        )

        runnable = PROMPT | self.llm
        return runnable.invoke({"summary": summary}).content

    def _create_new_chunk(self, proposition):
        new_chunk_id = str(uuid.uuid4())[:self.id_truncate_limit]
        new_chunk_summary = self._get_new_chunk_summary(proposition)
        new_chunk_title = self._get_new_chunk_title(new_chunk_summary)

        self.chunks[new_chunk_id] = {
            'chunk_id' : new_chunk_id,
            'propositions': [proposition],
            'title' : new_chunk_title,
            'summary': new_chunk_summary,
            'chunk_index' : len(self.chunks)
        }
        if self.print_logging:
            print (f"[bold blue]Created new chunk ({new_chunk_id}): {new_chunk_title}[/bold blue]")
    
    def get_chunk_outline(self):
        """
        Get a string which represents the chunks you currently have.
        """
        if not self.chunks:
            return "No chunks yet."
            
        chunk_outline = ""
        for chunk_id, chunk in self.chunks.items():
            single_chunk_string = f"""Chunk ({chunk['chunk_id']}): {chunk['title']}\nSummary: {chunk['summary']}\n\n"""
            chunk_outline += single_chunk_string
        return chunk_outline

    def _find_relevant_chunk(self, proposition):
        current_chunk_outline = self.get_chunk_outline()

        PROMPT = ChatPromptTemplate.from_messages(
            [
                (
                    "system",
                    """
                    Determine if the "Proposition" belongs to any existing chunks based on semantic similarity.
                    If it belongs, return ONLY the chunk_id.
                    If not, return ONLY the string "No chunks".
                    
                    Example 1:
                    Proposition: "Greg really likes hamburgers"
                    Current Chunks:
                    Chunk (2n4l3): Places in San Francisco
                    Summary: Things to do in San Francisco.
                    
                    Chunk (9383k): Food Greg Likes
                    Summary: Lists of food and dishes that Greg likes.
                    Output: 9383k

                    Example 2:
                    Proposition: "The sky is blue."
                    Current Chunks:
                    Chunk (9383k): Food Greg Likes
                    Summary: Lists of food and dishes that Greg likes.
                    Output: No chunks
                    """,
                ),
                ("user", "Current Chunks:\n---\n{current_chunk_outline}\n---\nProposition to evaluate: '{proposition}'"),
            ]
        )

        runnable = PROMPT | self.llm
        chunk_found_response = runnable.invoke({
            "proposition": proposition,
            "current_chunk_outline": current_chunk_outline
        }).content

        class ChunkID(BaseModel):
            """Extracting the chunk id"""
            chunk_id: Optional[str] = None

        parser = PydanticOutputParser(pydantic_object=ChunkID)

        extraction_prompt = ChatPromptTemplate.from_template(
            """
            Parse the user's text to find a chunk ID. The chunk ID is a short alphanumeric string. 
            If no chunk ID is found, return null for the chunk_id field.
            {format_instructions}
            Text to parse:
            {text_to_parse}
            """,
            partial_variables={"format_instructions": parser.get_format_instructions()},
        )

        extraction_chain = extraction_prompt | self.llm | parser

        try:
            parsed_result = extraction_chain.invoke({"text_to_parse": chunk_found_response})
            chunk_found = parsed_result.chunk_id
        except Exception as e:
            if self.print_logging:
                print(f"[red]Could not parse chunk ID from response: '{chunk_found_response}'. Error: {e}[/red]")
            chunk_found = None

        if chunk_found is None or chunk_found not in self.chunks:
            return None

        return chunk_found
    
    def get_chunks(self, get_type='dict'):
        """
        Returns chunks as a dictionary or a list of joined strings.
        """
        if get_type == 'dict':
            return self.chunks
        if get_type == 'list_of_strings':
            return [" ".join(chunk['propositions']) for chunk in self.chunks.values()]
        return self.chunks
    
    def pretty_print_chunks(self):
        print(f"\n\n[bold underline]Final Chunks ({len(self.chunks)} total)[/bold underline]\n")
        for chunk_id, chunk in self.chunks.items():
            print(f"[bold]Chunk #{chunk['chunk_index']} - {chunk['title']}[/bold]")
            print(f"  [cyan]ID:[/cyan] {chunk_id}")
            print(f"  [cyan]Summary:[/cyan] {chunk['summary']}")
            print(f"  [cyan]Propositions ({len(chunk['propositions'])}):[/cyan]")
            for prop in chunk['propositions']:
                print(f"    - {prop}")
            print("\n")

    def pretty_print_chunk_outline(self):
        print("[bold underline]Final Chunk Outline[/bold underline]\n")
        print(self.get_chunk_outline())


if __name__ == "__main__":
    ac = AgenticChunker()

    propositions = [
        'The month is October.',
        'The year is 2023.',
        "One of the most important things that I didn't understand about the world as a child was the degree to which the returns for performance are superlinear.",
        'Teachers and coaches implicitly told us that the returns were linear.',
        "I heard a thousand times that 'You get out what you put in.'",
        'Teachers and coaches meant well.',
        "The statement that 'You get out what you put in' is rarely true.",
        "If your product is only half as good as your competitor's product, you do not get half as many customers.",
        "You get no customers if your product is only half as good as your competitor's product.",
        'You go out of business if you get no customers.',
        'The returns for performance are superlinear in business.',
        'Some people think the superlinear returns for performance are a flaw of capitalism.',
        'Some people think that changing the rules of capitalism would stop the superlinear returns for performance from being true.',
        'Superlinear returns for performance are a feature of the world.',
        'Superlinear returns for performance are not an artifact of rules that humans have invented.',
        'The same pattern of superlinear returns is observed in fame.',
        'The same pattern of superlinear returns is observed in power.',
        'The same pattern of superlinear returns is observed in military victories.',
        'The same pattern of superlinear returns is observed in knowledge.',
        'The same pattern of superlinear returns is observed in benefit to humanity.',
        'In fame, power, military victories, knowledge, and benefit to humanity, the rich get richer.'
    ]
    
    ac.add_propositions(propositions)
    ac.pretty_print_chunks()
    ac.pretty_print_chunk_outline()

In [None]:
print("hello")