In [None]:
# Execute the previous notebook in this kernel so its definitions are available here.
# NOTE(review): route_query, cache_get, cache_set, retrieve_candidates, rerank_documents
# and generate_answer are called further down but never defined in this notebook —
# presumably they all come from this run; confirm before reordering cells.
%run 06_caching.ipynb

In [5]:
from langchain_openai import ChatOpenAI
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
import os
from dotenv import load_dotenv

# Populate the process environment from a .env file so the API key lookup in the
# model cell can find MY_OPENAI_API_KEY; the call's return value is echoed as the cell output.
load_dotenv()


True

In [3]:
# Chat model shared by every chain in this notebook. Temperature 0 keeps sampling
# as non-random as the API allows. The key is read from the environment rather than
# hardcoded; if MY_OPENAI_API_KEY is unset, None is passed through to the client.
llm = ChatOpenAI(
    openai_api_key=os.environ.get("MY_OPENAI_API_KEY"),
    temperature=0,
    model="gpt-4.1-mini",
)


In [6]:
# Message layout sent to the model on every turn:
#   1. a fixed system instruction,
#   2. the prior turns, injected under the "history" variable,
#   3. the current user message, filled from the "input" variable.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant."),
        MessagesPlaceholder(variable_name="history"),
        ("human", "{input}"),
    ]
)

In [7]:
# Compose the pipeline: the formatted prompt is piped into the chat model.
chain = prompt | llm

In [8]:
# Module-level registry mapping session_id -> InMemoryChatMessageHistory.
# Entries are created on demand by the history factory handed to
# RunnableWithMessageHistory; nothing is persisted beyond the kernel's lifetime.
store = {}  # can hold multiple sessions

In [9]:
def _get_session_history(session_id):
    """Return the chat history for ``session_id``, creating it on first use.

    Args:
        session_id: Key identifying one conversation in ``store``.

    Returns:
        The ``InMemoryChatMessageHistory`` registered for that session.
    """
    # Only build a history for sessions we have not seen yet. The previous
    # ``store.setdefault(session_id, InMemoryChatMessageHistory())`` form
    # constructed (and discarded) a fresh object on every single lookup.
    if session_id not in store:
        store[session_id] = InMemoryChatMessageHistory()
    return store[session_id]


# Wrap the chain with per-session history: the user text is read from the
# "input" key and earlier turns are supplied to the prompt under "history".
with_memory = RunnableWithMessageHistory(
    chain,
    _get_session_history,
    input_messages_key="input",
    history_messages_key="history",
)


In [21]:
def memory_chat(query, session_id="session_1"):
    """Send ``query`` through the history-aware chain and return the reply text.

    Args:
        query: The user's message for this turn.
        session_id: Conversation whose history should be used.

    Returns:
        The ``content`` attribute of the object returned by ``with_memory.invoke``.
    """
    # The session is selected through the runnable's "configurable" settings.
    session_config = {"configurable": {"session_id": session_id}}
    payload = {"input": query}
    reply = with_memory.invoke(payload, config=session_config)
    return reply.content


In [32]:
def _remember_exchange(session_id, query, answer):
    """Append one user/assistant turn to the session history without calling the LLM."""
    history = with_memory.get_session_history(session_id)
    history.add_user_message(query)
    history.add_ai_message(answer)


def answer_query(query, session_id="session_1"):
    """Answer ``query`` by routing it to conversational memory or the RAG pipeline.

    Flow:
        1. ``route_query`` picks the mode.
        2. ``"direct"``: answer from the history-aware chat chain.
        3. Otherwise: return a cached answer if one exists, else retrieve,
           rerank (top 6), generate, and cache the result.
        4. Every RAG answer (cached or fresh) is recorded in the session
           history so later conversational questions can refer to it.

    Args:
        query: The user's question.
        session_id: Conversation whose chat history is read and updated.

    Returns:
        The answer text.
    """
    # STEP 0 — ROUTER
    decision = route_query(query)
    print("Router decision:", decision)

    # ---------------------------
    # DIRECT MODE → use MEMORY
    # ---------------------------
    if decision == "direct":
        # Not written to the cache: the reply depends on this session's
        # history, so replaying it for the same text elsewhere would be wrong,
        # and this branch never reads the cache anyway.
        return memory_chat(query, session_id=session_id)

    # ---------------------------
    # RAG MODE
    # ---------------------------

    # STEP 1 — CACHE CHECK
    cached = cache_get(query)
    if cached:
        print("CACHE HIT")
        # A cached answer is still a turn in this conversation.
        _remember_exchange(session_id, query, cached)
        return cached

    print("CACHE MISS → Running full RAG pipeline...")

    # STEP 2 — RETRIEVE
    retrieved = retrieve_candidates(query)

    # STEP 3 — RERANK
    reranked = rerank_documents(query, retrieved, top_k=6)

    # STEP 4 — GENERATE
    answer = generate_answer(query, reranked)

    # STEP 5 — STORE FINAL ANSWER IN CACHE
    cache_set(query, answer)

    # STEP 6 — STORE BOTH QUERY & ANSWER IN MEMORY
    # Write straight to the history object. Invoking the chain here would cost
    # two extra LLM calls and record the model's replies to "USER: ..." /
    # "ASSISTANT: ..." prompts instead of the grounded answer itself.
    _remember_exchange(session_id, query, answer)

    return answer


In [39]:
# Demo: a factual question. In the recorded run below the router chose "rag"
# and the cache missed, so the full retrieve → rerank → generate path ran.
answer_query("What did Amazon report about cloud revenue in 2024?")

Router decision: rag
CACHE MISS → Running full RAG pipeline...
BM25 retrieved: 10 chunks
Semantic retrieved: 10 chunks
Combined unique: 17


'Amazon reported that AWS (Amazon Web Services) revenue increased 19% year-over-year in 2024, from $91 billion to $108 billion. For perspective, just 10 years ago, AWS revenue was $4.6 billion.'

In [40]:
# Demo: a question about the conversation itself. In the recorded run below the
# router chose "direct", so it was answered from the default session's chat history.
answer_query("did i ask about Amazon earlier?")

Router decision: direct


"Yes, you asked about Amazon earlier in this session. Specifically, you inquired about Amazon's overall revenue in 2024 and also about Amazon's cloud revenue (AWS) in 2024. If you have any more questions or need further information, feel free to ask!"