In [1]:
# Import necessary libraries
from langchain_community.document_loaders import ObsidianLoader
from pathlib import Path

# Load every note found under the vault directory.
path = "/home/codyt/Documents/Personal/Research"
loader = ObsidianLoader(path)
docs = loader.load()

# Replace each document's `source` metadata with its extension-less stem.
for doc in docs:
    source_path = Path(doc.metadata.get('source', ''))
    note_name = source_path.stem
    doc.metadata['source'] = note_name
    print(f"Added source: {note_name}")  # Debug print

# Whitespace-delimited word count, accumulated across all documents.
total_words = 0
for doc in docs:
    total_words += len(doc.page_content.split())

# Report how much material was loaded.
print(f"{len(docs)} documents loaded with a total of {total_words:,} words.")


Added source: PKM
Added source: IHAMS
Added source: Balance and Decisions
Added source: Living Knowledge Management
Added source: Mission Control
Added source: Obsidian Journaling Product
6 documents loaded with a total of 58,554 words.


In [2]:
import os

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
# from langchain_community.chat_models import ChatPerplexity

# Read the .env file before any environment variable is looked up.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# PPLX_API_KEY = os.getenv("PPLX_API_KEY")

# fast_model handles the per-chunk summaries and token counting;
# large_model handles the reduce step.
fast_model = ChatOpenAI(model="gpt-4o-mini")
# external_knowledge_model = ChatPerplexity(model="llama-3.1-sonar-large-128k-online")
large_model = ChatOpenAI(model="gpt-4o")

In [3]:
from langchain_core.prompts import ChatPromptTemplate

# System message for the map step. `{topic}` is filled per call by
# generate_summary; the chunk text itself arrives in a separate user message.
knowledge_prompt = """
You are an expert document analyzer tasked with identifying and describing the developmental process of ideas in a structured mini-narrative format on {topic}.

**Task:**  
1. Analyze the text to create a mini-narrative that describes:  
   - What is being discussed in the document, including the central idea(s).  
   - What happens throughout the document, including any key developments, changes, or shifts in the idea(s).  
   - How the document progresses from beginning to middle to end, focusing on the natural flow of the developmental process.  
2. Only include motivations when they are explicitly evident as playing a role in the developmental process. Avoid speculating or forcing interpretations of intent.  
3. Refrain from making introductions, conclusions, or asking further questions about the document. Simply narrate the developmental process based on the provided text.

**Output Format:**  
Provide a structured mini-narrative with these elements:  
- **What is being discussed:** Clearly state the central idea(s) or topic(s) at the core of the document.  
- **What happens:** Describe the key points and developments as they progress throughout the document. Focus on identifying changes, shifts, or refinements in the ideas.  
- **Natural Progression:** Narrate the flow of the document from beginning to middle to end, emphasizing the natural evolution of the ideas.  
"""

# Human message for the reduce step. `{docs}` receives the summaries that
# are to be merged into one narrative (see _reduce and generate_final_summary).
reduce_template = """
You are an expert in synthesizing and consolidating information to create a cohesive narrative.  

**Context:**  
The following are summaries of key developments related to the evolution of an idea:  
{docs}  

**Task:**  
1. Carefully analyze the provided summaries to identify the progression and transformation of the idea over time.  
2. Combine these summaries into a single, cohesive narrative that:  
   - Clearly explains how the central idea evolved.  
   - Highlights key developments, shifts, and refinements in the idea across the summaries.  
   - Maintains logical flow and clarity throughout.  
3. Use a natural storytelling structure that captures the progression from beginning to end, while avoiding repetition or introducing new information.  

**Output Format:**  
Provide a structured, consolidated narrative with the following elements:  
- **Introductory Context:** Briefly introduce the central idea and scope of the summaries.  
- **Evolution of the Idea:** Narrate how the idea progressed or transformed over time, detailing the key developments in a logical manner.  
- **Final State:** Conclude the narrative by summarizing the final state or refined version of the idea based on the summaries provided.
"""

# Map step: the knowledge prompt is the system message and the chunk text
# is delivered in the user turn.
map_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", knowledge_prompt),
        ("user", "Write a concise summary of the following:\n\n{content}"),
    ]
)

# Reduce step: a single human message built from the reduce template.
reduce_prompt = ChatPromptTemplate.from_messages([("human", reduce_template)])

from langchain_text_splitters import CharacterTextSplitter

# Chunk sizes are measured by the tiktoken encoder; neighbouring chunks do not overlap.
splitter_options = {"chunk_size": 3200, "chunk_overlap": 0}
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(**splitter_options)

# Split every loaded note into chunks for the map step.
split_docs = text_splitter.split_documents(docs)
print(f"Generated {len(split_docs)} documents.")

import operator
from typing import Annotated, List, Literal, TypedDict, Dict, Any

from langchain.chains.combine_documents.reduce import (
    acollapse_docs,
    split_list_of_docs,
)
from langchain_core.documents import Document
from langgraph.constants import Send
from langgraph.graph import END, START, StateGraph

# Token budget for the reduce phase: passed to split_list_of_docs when
# collapsing, and compared against the combined summary length in should_collapse.
token_max = 16000


def length_function(documents: List[Document]) -> int:
    """Return the combined token count of all documents' page_content, as counted by fast_model."""
    token_total = 0
    for document in documents:
        token_total += fast_model.get_num_tokens(document.page_content)
    return token_total


# This will be the overall state of the main graph.
# It will contain the input documents, the topic, the per-chunk
# summaries with their sources, and a final summary.
class OverallState(TypedDict):
    # Chunked input documents. map_summaries reads `page_content` and
    # `metadata` from each entry, so these are Documents, not plain strings.
    contents: List[Document]
    # Subject handed to the map prompt. It has to be declared here: graph
    # input keys that are not part of the state schema are not written to the
    # state, which made map_summaries always fall back to "Default Topic".
    topic: str
    # Notice here we use the operator.add
    # This is because we want combine all the summaries we generate
    # from individual nodes back into one list - this is essentially
    # the "reduce" part
    summaries: Annotated[list, operator.add]
    # One entry per summary, appended together with it by generate_summary.
    sources: Annotated[List[str], operator.add]
    # Working set for the reduce phase; replaced wholesale by collapse_summaries.
    collapsed_summaries: List[Document]
    # Final narrative followed by the source list.
    final_summary: str


# This will be the state of the node that we will "map" all
# documents to in order to generate summaries
class SummaryState(TypedDict):
    # Text of one chunk; inserted into the map prompt as {content}.
    content: str
    # Subject inserted into the map prompt as {topic}.
    topic: str
    # Source label of the chunk; generate_summary returns it under "sources".
    source: str


import asyncio
from datetime import datetime, timedelta

class RateLimiter:
    """Sliding-window limiter for coroutines running on one event loop.

    Each call to ``wait`` reserves a start time ("slot") and sleeps until it.
    Reserving the slot *before* sleeping is what keeps concurrent callers
    apart: the previous version recorded the pre-sleep timestamp after
    sleeping, so every caller that found the window full slept the same
    ~65 s and then fired together.

    Attributes:
        calls_per_minute: Maximum number of slots inside any one-minute window.
        calls: Sorted reserved start times (``datetime``); entries may lie in
            the future while their callers are still sleeping.
        last_error_time: ``datetime`` of the most recent upstream error, or
            ``None``. Set by callers; for two minutes afterwards every slot is
            pushed back by a fixed 15 seconds.
    """

    def __init__(self, calls_per_minute=10):  # More conservative limit
        self.calls_per_minute = calls_per_minute
        self.calls = []
        self.last_error_time = None

    async def wait(self):
        """Sleep until this caller's reserved slot, then return."""
        now = datetime.now()
        window = timedelta(minutes=1)

        # Fixed cooldown (not exponential) while an error is recent.
        slot = now
        if self.last_error_time and (now - self.last_error_time) < timedelta(minutes=2):
            slot = now + timedelta(seconds=15)

        # Forget slots that left the window; future reservations are kept.
        self.calls = sorted(call for call in self.calls if now - call < window)

        if len(self.calls) >= self.calls_per_minute:
            # The new slot may not start until the N-th most recent slot has
            # left the window, plus the same 5 s safety buffer as before.
            earliest = self.calls[-self.calls_per_minute] + window + timedelta(seconds=5)
            slot = max(slot, earliest)

        # No await happens between reading and updating self.calls, so
        # coroutines on the same loop cannot reserve the same slot.
        self.calls.append(slot)
        self.calls.sort()

        pause = (slot - now).total_seconds()
        if pause > 0:
            await asyncio.sleep(pause)

# Create rate limiter instance
# Shared by every generate_summary call; overrides the class default of
# 10 calls per minute with 45.
rate_limiter = RateLimiter(calls_per_minute=45)

from openai import RateLimitError  # Add this import at the top

# Map step: summarize one chunk, retrying when the API reports a rate limit
async def generate_summary(state: SummaryState):
    """Summarize a single chunk with ``fast_model``.

    Args:
        state: Mapping with ``content`` (chunk text), ``topic`` (prompt
            subject) and ``source`` (label of the originating note).

    Returns:
        ``{"summaries": [text], "sources": [source]}`` - one-element lists so
        the graph's ``operator.add`` reducers can concatenate them.

    Raises:
        RateLimitError: If the third attempt is still rate limited.
    """
    max_retries = 3
    # The prompt depends only on the state, so build it once rather than per attempt.
    prompt = map_prompt.invoke({
        "content": state["content"],
        "topic": state["topic"]
    })
    for attempt in range(max_retries):
        try:
            await rate_limiter.wait()
            response = await fast_model.ainvoke(prompt)
            return {
                "summaries": [response.content],
                "sources": [state["source"]]
            }
        except RateLimitError:
            # Record the failure so rate_limiter.wait() applies its post-error
            # cooldown; nothing set this before, so that branch never ran.
            rate_limiter.last_error_time = datetime.now()
            if attempt == max_retries - 1:
                raise
            # Linear backoff: 30 s after the first failure, 60 s after the second.
            wait_time = (attempt + 1) * 30
            print(f"Rate limit exceeded. Waiting {wait_time} seconds before retry...")
            await asyncio.sleep(wait_time)


# Here we define the logic to map out over the documents
# We will use this as an edge in the graph
def map_summaries(state: OverallState):
    """Fan the input documents out to the ``generate_summary`` node.

    Args:
        state: Graph state; ``contents`` holds the chunked documents and
            ``topic`` (optional) the subject for the map prompt.

    Returns:
        A list of ``Send`` objects, one per document, each carrying the
        ``content``/``topic``/``source`` payload for ``generate_summary``.
    """
    topic = state.get("topic", "Default Topic")
    sends = []
    for i, doc in enumerate(state["contents"]):
        # Keep only the final path component, then drop a Markdown extension.
        # Path(...).stem is avoided here: on a name that was already stemmed
        # it cuts everything after the last dot ("v1.2 notes" -> "v1").
        source = Path(str(doc.metadata.get('source', ''))).name
        if source.lower().endswith(".md"):
            source = source[:-3]
        sends.append(
            # Each `Send` names the target node and the state to hand to it
            Send("generate_summary", {
                "content": doc.page_content,
                "topic": topic,
                "source": source or f'doc_{i}'
            })
        )
    return sends


def collect_summaries(state: OverallState):
    """Wrap every map-step summary in a Document tagged with its source.

    ``summaries`` and ``sources`` are appended together by generate_summary,
    so they are parallel lists and must be zipped as they are. Zipping against
    a de-duplicated source list (as before) stops at the number of unique
    sources, which silently dropped most summaries and mis-labelled the rest.
    De-duplication for display is done in generate_final_summary.

    Args:
        state: Graph state with the accumulated ``summaries`` and ``sources``.

    Returns:
        ``{"collapsed_summaries": [Document, ...]}`` with one Document per summary.
    """
    return {
        "collapsed_summaries": [
            Document(
                page_content=summary,
                metadata={"source": source}
            )
            for summary, source in zip(state["summaries"], state["sources"])
        ]
    }


async def _reduce(input: dict) -> str:
    """Render reduce_prompt from `input`, send it to large_model and return the reply text."""
    rendered = reduce_prompt.invoke(input)
    reply = await large_model.ainvoke(rendered)
    return reply.content


# Node that shrinks the working set of summaries
async def collapse_summaries(state: OverallState):
    """Group the current summaries with split_list_of_docs and collapse each group via _reduce.

    Returns a state update that replaces ``collapsed_summaries`` with one
    result per group.
    """
    batches = split_list_of_docs(
        state["collapsed_summaries"], length_function, token_max
    )
    # Groups are awaited one after another, in order.
    merged = [await acollapse_docs(batch, _reduce) for batch in batches]
    return {"collapsed_summaries": merged}


# Conditional edge: choose between another collapse pass and the final summary
def should_collapse(
    state: OverallState,
) -> Literal["collapse_summaries", "generate_final_summary"]:
    """Return the name of the next node, based on the token length of the current summaries."""
    over_budget = length_function(state["collapsed_summaries"]) > token_max
    return "collapse_summaries" if over_budget else "generate_final_summary"


# Here we will generate the final summary
async def generate_final_summary(state: OverallState):
    """Reduce the remaining summaries to one narrative and append a source list.

    Args:
        state: Graph state; ``collapsed_summaries`` holds the Documents (or
            nested lists of Documents) to merge, ``sources`` the per-chunk
            source labels recorded during the map step.

    Returns:
        ``{"final_summary": text}`` where ``text`` is the narrative followed
        by a ``## Sources`` section of ``[[wiki-links]]``.
    """
    collapsed_content = []
    metadata_sources = []
    for entry in state["collapsed_summaries"]:
        if isinstance(entry, Document):
            members = [entry]
        elif isinstance(entry, list):
            members = entry
        else:
            continue
        for member in members:
            collapsed_content.append(member.page_content)
            source = member.metadata.get("source")
            if source:
                metadata_sources.append(source)

    # Prefer the labels recorded in the map step. After a collapse pass the
    # Documents' `source` metadata is a comma-joined string of several
    # sources, which would render as one bogus link such as [[A, B]].
    recorded_sources = state.get("sources") or metadata_sources

    # Deduplicate and sort sources
    unique_sources = sorted(set(recorded_sources))

    # Generate summary
    summary = await _reduce({
        "docs": "\n\n".join(collapsed_content)
    })

    source_list = "\n".join(f"- [[{source}]]" for source in unique_sources)

    return {
        "final_summary": f"{summary}\n\n## Sources\n{source_list}"
    }


# Construct the graph
# Nodes:
graph = StateGraph(OverallState)
graph.add_node("generate_summary", generate_summary)
graph.add_node("collect_summaries", collect_summaries)
graph.add_node("collapse_summaries", collapse_summaries)
graph.add_node("generate_final_summary", generate_final_summary)

# Edges:
graph.add_conditional_edges(START, map_summaries, ["generate_summary"])
graph.add_edge("generate_summary", "collect_summaries")
graph.add_conditional_edges("collect_summaries", should_collapse)
graph.add_conditional_edges("collapse_summaries", should_collapse)
graph.add_edge("generate_final_summary", END)

app = graph.compile()

async for step in app.astream(
    {
        "contents": split_docs,
        "topic": "A self improving OS"        
    },
    {"recursion_limit": 30},
):
    
    print(list(step.keys()))

Generated 33 documents.
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['generate_summary']
['collect_summaries']
['generate_final_summary']


In [4]:
from IPython.display import Markdown, display

# Access nested value
# `step` is the loop variable left over from the streaming cell above; once
# that loop has finished it holds the last streamed update, which in the run
# shown above is keyed by 'generate_final_summary'.
display(Markdown(step['generate_final_summary']['final_summary']))


**Introductory Context:**

The central idea explored in these summaries revolves around the optimization of knowledge management and Retrieval-Augmented Generation (RAG) through strategic organization and AI integration. This involves the evolution of note structuring, vectorization strategies, goal management, and AI-driven journaling, with the aim of enhancing information retrieval, personal growth, and business success.

**Evolution of the Idea:**

Initially, the focus was on comparing two vectorization strategies for RAG: large topic-based files and separate files for subtopics. The discussion highlighted the benefits of dense contextual information from large files versus the retrieval efficiency and granularity offered by separate files. As the exploration deepened, the preference shifted towards separate files for subtopics, offering enhanced retrieval efficiency and relevance. This shift marked a pivotal moment in optimizing vectorization processes for Personal Knowledge Management (PKM) systems.

Simultaneously, the framework for organizing knowledge management was established, emphasizing the importance of structured notes, tagging, and metadata. This foundation allowed for the breakdown of complex concepts, like communication, into sub-concepts, enhancing searchability and usability. The discussion evolved towards practical applications, recommending folder organization and integration of project management into the knowledge system, thereby streamlining the management of goals and tasks.

The narrative then moved to goal setting and management within a note-taking system, where each goal was delineated into its own note with projects as subsections. This structure fostered clarity and motivation by connecting projects to overarching objectives. The exploration extended to technical aspects of RAG, such as batch size and parameter settings, underpinning the need for empirical testing to optimize model performance.

Parallelly, the integration of AI in journaling products was introduced, transforming journaling into a more interactive, insightful experience. The focus was on enhancing AI prompts to ensure empathetic, personalized feedback based on users' journal entries. This development underscored the potential of AI to analyze emotions and track personal growth, enriching the journaling experience.

**Final State:**

The culmination of these developments resulted in a sophisticated system that integrates efficient vectorization, structured knowledge management, goal-oriented project planning, and AI-enhanced journaling. The final state of the idea is a cohesive framework that leverages strategic organization and AI capabilities to improve information retrieval, personal development, and business outcomes. This integrated approach signifies a mature, refined solution that addresses the complexities of managing and utilizing personal and professional knowledge in an increasingly digital world.

## Sources
- [[Balance and Decisions]]
- [[IHAMS]]
- [[Living Knowledge Management]]
- [[Mission Control]]
- [[Obsidian Journaling Product]]
- [[PKM]]