In [1]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from utils.model_loader import ModelLoader

In [2]:
loader = ModelLoader()

{"timestamp": "2025-08-25T16:20:25.579838Z", "level": "info", "event": "Running in LOCAL mode: .env loaded"}
{"timestamp": "2025-08-25T16:20:25.583094Z", "level": "info", "event": "Loaded GROQ_API_KEY from individual env var"}
{"timestamp": "2025-08-25T16:20:25.586522Z", "level": "info", "event": "Loaded GOOGLE_API_KEY from individual env var"}
{"keys": {"GROQ_API_KEY": "gsk_eh...", "GOOGLE_API_KEY": "AIzaSy..."}, "timestamp": "2025-08-25T16:20:25.592517Z", "level": "info", "event": "API keys loaded"}
{"config_keys": ["faiss_db", "embedding_model", "retriever", "llm"], "timestamp": "2025-08-25T16:20:25.602825Z", "level": "info", "event": "YAML config loaded"}


In [3]:
llm = loader.load_llm()
print(f"LLM Loaded: {llm}")
result = llm.invoke("Hello, how are you?")
print(f"LLM Result: {result.content}")

{"provider": "google", "model": "gemini-2.0-flash", "timestamp": "2025-08-25T16:20:26.595381Z", "level": "info", "event": "Loading LLM"}


LLM Loaded: model='models/gemini-2.0-flash' google_api_key=SecretStr('**********') temperature=0.0 max_output_tokens=2048 client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x00000204C8571930> default_metadata=() model_kwargs={}
LLM Result: I am doing well, thank you for asking! As a large language model, I don't experience emotions or feelings like humans do, but I am functioning optimally and ready to assist you. How can I help you today?


In [4]:
parser = JsonOutputParser()

In [5]:
parser.get_format_instructions()

'Return a JSON object.'

In [6]:
prompt = PromptTemplate.from_template(
    "Give me a JSON with keys 'title' and 'summary' for this topic: {topic}\n{format_instructions}"
).partial(format_instructions=parser.get_format_instructions())

In [7]:
prompt

PromptTemplate(input_variables=['topic'], input_types={}, partial_variables={'format_instructions': 'Return a JSON object.'}, template="Give me a JSON with keys 'title' and 'summary' for this topic: {topic}\n{format_instructions}")

In [8]:
chain = prompt | llm | parser

In [9]:
result = chain.invoke({"topic": "LangChain for RAG"})

In [10]:
result

{'title': 'LangChain for Retrieval Augmented Generation (RAG)',
 'summary': 'LangChain simplifies the development of Retrieval Augmented Generation (RAG) applications. It provides modules for connecting to data sources, creating embeddings, indexing data, and orchestrating the retrieval and generation process. By leveraging LangChain, developers can build powerful RAG systems that combine the strengths of large language models with external knowledge to improve accuracy, reduce hallucinations, and provide more contextually relevant responses.'}

In [11]:
from langchain.output_parsers import OutputFixingParser

In [12]:
json_parser = JsonOutputParser()

In [13]:
# Fixing wrapper
fixing_parser = OutputFixingParser.from_llm(parser=json_parser, llm=llm)

In [14]:
# Broken JSON
bad_output = """
title: LangChain RAG
summary LangChain helps with retrieval augmented generation...
"""

In [15]:
fixed_result = fixing_parser.parse(bad_output)

In [16]:
print(fixed_result)

{'title': 'LangChain RAG', 'summary': 'LangChain helps with retrieval augmented generation...'}


In [17]:
# from langchain.output_parsers import OutputFixingParser
# from langchain_core.output_parsers import JsonOutputParser

# json_parser = JsonOutputParser()
# safe_parser = OutputFixingParser.from_llm(parser=json_parser, llm=llm)

# # Use safe_parser instead of json_parser
# chain = prompt | llm | safe_parser

In [18]:
# Wrap with cache
Model_Cache = {}

import time
def cached_model(query):
    start_time = time.time()
    if Model_Cache.get(query):
        print("***CACHE HIT***")
        end_time = time.time()
        elapsed = end_time - start_time
        print(f"EXECUTION TIME: {elapsed:.2f} seconds")
        return Model_Cache.get(query)
    else:
        print("***CACHE MISS – EXECUTING MODEL***")
        start_time = time.time()
        response = llm.invoke(query)
        end_time = time.time()
        elapsed = end_time - start_time
        print(f"EXECUTION TIME: {elapsed:.2f} seconds")
        Model_Cache[query] = response
        return response

In [19]:
query="hi"
response = cached_model(query)
print(response)

***CACHE MISS – EXECUTING MODEL***
EXECUTION TIME: 0.85 seconds
content='Hi there! How can I help you today?' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []} id='run--1d98af07-711d-4d11-a5d7-8802a32efeaa-0' usage_metadata={'input_tokens': 1, 'output_tokens': 11, 'total_tokens': 12, 'input_token_details': {'cache_read': 0}}


In [20]:
query="can you give me 1000 words essay on independence?"
response = cached_model(query)
print(response)

***CACHE MISS – EXECUTING MODEL***
EXECUTION TIME: 9.68 seconds
content="## The Enduring Pursuit of Independence: A Multifaceted Ideal\n\nIndependence, a word that resonates with power and promise, is a cornerstone of human aspiration. It signifies freedom from external control, the ability to make autonomous choices, and the capacity to shape one's own destiny. This concept, however, is far from monolithic. It manifests in diverse forms, from national sovereignty to personal autonomy, and its pursuit is a complex and often challenging journey. Understanding the multifaceted nature of independence, its historical significance, and its ongoing relevance in the modern world is crucial for navigating the complexities of individual and collective existence.\n\nAt its most fundamental level, independence refers to the state of being free from the domination or influence of others. This can apply to nations, individuals, or even organizations. National independence, often achieved through re

In [21]:
query="can you give me 1000 words essay on independence?"
response = cached_model(query)
print(response)

***CACHE HIT***
EXECUTION TIME: 0.00 seconds
content="## The Enduring Pursuit of Independence: A Multifaceted Ideal\n\nIndependence, a word that resonates with power and promise, is a cornerstone of human aspiration. It signifies freedom from external control, the ability to make autonomous choices, and the capacity to shape one's own destiny. This concept, however, is far from monolithic. It manifests in diverse forms, from national sovereignty to personal autonomy, and its pursuit is a complex and often challenging journey. Understanding the multifaceted nature of independence, its historical significance, and its ongoing relevance in the modern world is crucial for navigating the complexities of individual and collective existence.\n\nAt its most fundamental level, independence refers to the state of being free from the domination or influence of others. This can apply to nations, individuals, or even organizations. National independence, often achieved through revolution or negotia

In [22]:
embedding_model = loader.load_embeddings()
print(f"Embedding Model Loaded: {embedding_model}")
result = embedding_model.embed_query("Hello, how are you?")
print(f"Embedding Result: {result}")

{"model": "models/text-embedding-004", "timestamp": "2025-08-25T16:21:04.178817Z", "level": "info", "event": "Loading embedding model"}


Embedding Model Loaded: client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x00000204C87D3D00> async_client=<google.ai.generativelanguage_v1beta.services.generative_service.async_client.GenerativeServiceAsyncClient object at 0x00000204C8828EB0> model='models/text-embedding-004' task_type=None google_api_key=SecretStr('**********') credentials=None client_options=None transport=None request_options=None
Embedding Result: [0.012455824762582779, -0.022220758721232414, -0.050555989146232605, -0.04617856815457344, 0.007134224288165569, 0.03131191432476044, 0.03413437679409981, 0.002971651265397668, -0.010995007120072842, 0.04237418621778488, -0.01806010864675045, 0.02493678405880928, 0.1121494323015213, -0.013102629221975803, -0.0148635134100914, -0.024023527279496193, 0.010264375247061253, -0.0037090936675667763, -0.10687677562236786, -0.001329770078882575, 0.03868139907717705, -0.03769991546869278, 0.03614252060651779, 0.005920

In [23]:
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableMap, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [24]:
from langchain.schema import Document

documents = [
    Document(page_content="The Earth is the third planet from the Sun and the only known planet to support life. It has a diverse climate, ranging from arctic to tropical zones, and supports ecosystems across seven continents and five oceans."),

    Document(page_content="The Industrial Revolution, beginning in the 18th century, drastically transformed human societies by shifting from manual labor to machine-based manufacturing, leading to urbanization and economic expansion globally."),

    Document(page_content="The United Nations, established in 1945 after World War II, is an international organization founded to promote peace, security, human rights, and cooperation among countries. It has 193 member states."),

    Document(page_content="The global economy is an interconnected system involving trade, investment, and financial flows across countries. Major players include the United States, China, the European Union, and emerging markets like India and Brazil."),

    Document(page_content="Climate change refers to long-term shifts in temperatures and weather patterns. It is largely driven by human activities like burning fossil fuels, deforestation, and industrial emissions, leading to global warming and sea level rise."),

    Document(page_content="Democracy is a political system in which citizens exercise power by voting. Modern democracies typically have institutions for free elections, rule of law, freedom of expression, and checks and balances."),

    Document(page_content="The Internet has revolutionized communication, commerce, and education worldwide. Originating from military research in the 1960s, it now connects over 5 billion people, enabling instant global information exchange."),

    Document(page_content="Renewable energy sources like solar, wind, hydro, and geothermal are critical for a sustainable future. They offer alternatives to fossil fuels, reducing carbon emissions and reliance on finite resources."),

    Document(page_content="The World Health Organization (WHO) is a UN agency focused on global health issues. It coordinates international efforts to monitor diseases, set health standards, and respond to pandemics like COVID-19."),

    Document(page_content="Globalization is the process of increasing interaction and integration among people, companies, and governments worldwide. It has led to greater economic growth but also raised concerns about inequality and cultural homogenization.")
]

In [25]:
# Chroma vector DB with persistent storage
vector_store = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory="./chroma_db"  # Disk path for persistence
)

Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [26]:
# Optional: Persist manually (though auto-persistence happens internally)
vector_store.persist()

  vector_store.persist()


In [27]:
retriever = vector_store.as_retriever()

In [28]:
prompt = PromptTemplate.from_template(
    """
    Use the following context to answer the question.
    If you don't know the answer, just say you don't know. Don't try to make up an answer.

    Context:
    {context}

    Question:
    {question}

    Answer:
    """
)

In [29]:
# LCEL RAG Chain step-by-step
rag_chain = (
    RunnableMap({
        "context": retriever | (lambda docs: "\n\n".join([doc.page_content for doc in docs])),
        "question": RunnablePassthrough()
    })
    | prompt
    | llm
    | StrOutputParser()
)

In [30]:
# Wrap with cache
RAG_Cache = {}

In [31]:
import time
def cached_rag_chain(query):
    start_time = time.time()
    if RAG_Cache.get(query):
        print("***CACHE HIT***")
        end_time = time.time()
        elapsed = end_time - start_time
        print(f"EXECUTION TIME: {elapsed:.2f} seconds")
        return RAG_Cache.get(query)
    else:
        print("***CACHE MISS – EXECUTING MODEL***")
        start_time = time.time()
        response = llm.invoke(query)
        end_time = time.time()
        elapsed = end_time - start_time
        print(f"EXECUTION TIME: {elapsed:.2f} seconds")
        RAG_Cache[query] = response
        return response

In [32]:
query = "what is japan economy in 2024 and relation with north korea?"
response = cached_rag_chain(query)
print(response)

***CACHE MISS – EXECUTING MODEL***
EXECUTION TIME: 8.25 seconds
content='Okay, let\'s break down the Japanese economy in 2024 and its relationship with North Korea.\n\n**Japan\'s Economy in 2024: Overview**\n\nJapan\'s economy in 2024 is characterized by a mix of challenges and opportunities. Here\'s a summary:\n\n*   **Slow Growth:** Japan has been experiencing relatively slow economic growth for decades, and this trend is expected to continue in 2024. The IMF and other organizations project modest growth, typically in the range of 0.5% to 1%.\n*   **Aging Population and Declining Birth Rate:** This is a major structural issue. A shrinking workforce puts pressure on the social security system, reduces consumption, and limits potential economic growth.\n*   **Government Debt:** Japan has one of the highest levels of government debt in the world as a percentage of GDP. This limits the government\'s ability to stimulate the economy through fiscal policy.\n*   **Inflation:** After decades

In [33]:
query = "what is langchain framework?"
response = cached_rag_chain(query)
print(response)

***CACHE MISS – EXECUTING MODEL***
EXECUTION TIME: 7.19 seconds
content="LangChain is a framework designed to simplify the development of applications powered by large language models (LLMs). It provides tools, components, and interfaces to connect LLMs to various data sources and other computational resources, enabling developers to build more complex and sophisticated applications.\n\nHere's a breakdown of what LangChain is and its key features:\n\n**Core Concepts:**\n\n* **LLMs (Large Language Models):**  LangChain is built around the idea of leveraging the power of LLMs like GPT-3, GPT-4, PaLM, and others.  It provides a standardized interface for interacting with these models.\n* **Components:** LangChain offers a collection of modular components that can be assembled to create different types of applications. These components include:\n    * **Models:** Abstractions for different types of language models (LLMs, Chat Models, Text Embedding Models).\n    * **Prompts:** Tools for co

In [34]:
query = "Why United Nations, established in 1945 after World War II?"
response = cached_rag_chain(query)
print(response)


***CACHE MISS – EXECUTING MODEL***
EXECUTION TIME: 5.06 seconds
content="The United Nations was established in 1945 after World War II for a multitude of crucial reasons, all stemming from the desire to prevent another global catastrophe and foster international cooperation. Here's a breakdown of the key motivations:\n\n*   **Preventing Future Wars:** The primary and overarching goal was to prevent another world war. The devastation of World War I and especially World War II, with its unprecedented loss of life and widespread destruction, made it clear that a new international organization was needed to address the root causes of conflict and provide mechanisms for peaceful resolution.\n\n*   **Maintaining International Peace and Security:** The UN Charter explicitly states its purpose as maintaining international peace and security. This involves:\n    *   **Collective Security:** The idea that if one member is attacked, the others will come to its defense (though this is more nuanced

# Cache Using LangChain

In [35]:
from langchain.cache import InMemoryCache
from langchain.globals import set_llm_cache
from typing import Any, Dict, Tuple

In [36]:
class DebuggableCache(InMemoryCache):
    def __init__(self):
        super().__init__()
        self._cache: Dict[Tuple[str, str], Any] = {}

    def lookup(self, prompt: str, llm_string: str):
        return self._cache.get((prompt, llm_string))

    def update(self, prompt: str, llm_string: str, return_val: Any):
        self._cache[(prompt, llm_string)] = return_val

    def view_cache(self):  # this is our custom method
        return self._cache

In [37]:
dbg_cache = DebuggableCache()
set_llm_cache(dbg_cache)

In [38]:
response = llm.invoke("What is the capital of France?")

In [39]:
print("LLM Response:", response)


LLM Response: content='The capital of France is **Paris**.' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []} id='run--19e77678-4cd2-47df-a116-6d0a426c4890-0' usage_metadata={'input_tokens': 7, 'output_tokens': 9, 'total_tokens': 16, 'input_token_details': {'cache_read': 0}}


In [40]:
print("\nCache Contents:")
for k, v in dbg_cache.view_cache().items():
    print(f"Prompt: {k[0]} | Cached Output: {v}")


Cache Contents:
Prompt: [{"lc": 1, "type": "constructor", "id": ["langchain", "schema", "messages", "HumanMessage"], "kwargs": {"content": "What is the capital of France?", "type": "human"}}] | Cached Output: [ChatGeneration(text='The capital of France is **Paris**.', generation_info={'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []}, message=AIMessage(content='The capital of France is **Paris**.', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []}, id='run--19e77678-4cd2-47df-a116-6d0a426c4890-0', usage_metadata={'input_tokens': 7, 'output_tokens': 9, 'total_tokens': 16, 'input_token_details': {'cache_read': 0}}))]


In [41]:
response = llm.invoke("What is the capital of France?")

In [42]:
print("\nCache Contents:")
for k, v in dbg_cache.view_cache().items():
    print(f"Prompt: {k[0]} | Cached Output: {v}")


Cache Contents:
Prompt: [{"lc": 1, "type": "constructor", "id": ["langchain", "schema", "messages", "HumanMessage"], "kwargs": {"content": "What is the capital of France?", "type": "human"}}] | Cached Output: [ChatGeneration(text='The capital of France is **Paris**.', generation_info={'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []}, message=AIMessage(content='The capital of France is **Paris**.', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []}, id='run--19e77678-4cd2-47df-a116-6d0a426c4890-0', usage_metadata={'input_tokens': 7, 'output_tokens': 9, 'total_tokens': 16, 'input_token_details': {'cache_read': 0}}))]


In [43]:
response = llm.invoke("What is the capital of india?")
print("LLM Response:", response)

LLM Response: content='The capital of India is **New Delhi**.' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []} id='run--bbc1dfbc-992b-419b-9061-a89e23f0cdc2-0' usage_metadata={'input_tokens': 7, 'output_tokens': 10, 'total_tokens': 17, 'input_token_details': {'cache_read': 0}}


In [44]:
print("\nCache Contents:")
for k, v in dbg_cache.view_cache().items():
    print(f"Prompt: {k[0]} | Cached Output: {v}")


Cache Contents:
Prompt: [{"lc": 1, "type": "constructor", "id": ["langchain", "schema", "messages", "HumanMessage"], "kwargs": {"content": "What is the capital of France?", "type": "human"}}] | Cached Output: [ChatGeneration(text='The capital of France is **Paris**.', generation_info={'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []}, message=AIMessage(content='The capital of France is **Paris**.', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []}, id='run--19e77678-4cd2-47df-a116-6d0a426c4890-0', usage_metadata={'input_tokens': 7, 'output_tokens': 9, 'total_tokens': 16, 'input_token_details': {'cache_read': 0}}))]
Prompt: [{"lc": 1, "type": "constructor", "id": ["langchain", "schema", "messages", "HumanMessage"], "kwargs": {"content": "What is the capital of india?", "type": "human"}}] | Cached Output: [ChatGeneration(tex

In [45]:
response = llm.invoke("What is the capital of india?")
print("LLM Response:", response)

LLM Response: content='The capital of India is **New Delhi**.' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []} id='run--bbc1dfbc-992b-419b-9061-a89e23f0cdc2-0' usage_metadata={'input_tokens': 7, 'output_tokens': 10, 'total_tokens': 17, 'input_token_details': {'cache_read': 0}}


In [46]:
response = llm.invoke("give me 1000 lines of essay on science and give the importance of it regarding mathematics?")
print("LLM Response:", response)

LLM Response: content="## The Indelible Bond: Science, Mathematics, and the Pursuit of Understanding\n\nScience, in its broadest sense, is the systematic pursuit of knowledge about the natural world through observation, experimentation, and analysis. It is a dynamic and ever-evolving process, driven by curiosity and a desire to understand the intricate workings of the universe. From the smallest subatomic particles to the vast expanse of galaxies, science seeks to unravel the mysteries that surround us, providing explanations and predictions that shape our understanding of reality. At the heart of this endeavor lies a powerful and indispensable tool: mathematics. Mathematics provides the language, the structure, and the logical framework that allows science to move beyond mere observation and into the realm of quantifiable understanding and predictive power. The relationship between science and mathematics is not merely symbiotic; it is a fundamental and inseparable bond, a cornerstone

In [47]:
from langchain.callbacks import get_openai_callback
with get_openai_callback() as cb:
    response = llm.invoke("Tell me a joke about LangChain and ECS")
    print("Response:", response.content)
    print("Token Usage Stats:", cb)

Response: Okay, here's a joke about LangChain and ECS:

Why did the LangChain agent refuse to deploy on ECS?

Because it said, "I'm designed to orchestrate complex language models, not container orchestration! I'm already dealing with enough chains, I don't need another one!"
Token Usage Stats: Tokens Used: 75
	Prompt Tokens: 9
		Prompt Tokens Cached: 0
	Completion Tokens: 66
		Reasoning Tokens: 0
Successful Requests: 1
Total Cost (USD): $0.0


In [48]:
questions = [
    "What is Retrieval-Augmented Generation?",
    "How does FAISS indexing work?",
    "What is the difference between fine-tuning and RAG?"
]

with get_openai_callback() as cb:
    for q in questions:
        answer = llm.invoke(q)
        print(f"\nQ: {q}\nA: {answer.content}")

    print("\n=== Token Usage Summary ===")
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Total Cost (USD): ${cb.total_cost:.6f}")


Q: What is Retrieval-Augmented Generation?
A: Retrieval-Augmented Generation (RAG) is a technique that enhances the capabilities of large language models (LLMs) by allowing them to access and incorporate information from external knowledge sources during the generation process.  Think of it as giving an LLM a textbook or a library to consult before answering a question.

Here's a breakdown of the key components and how it works:

**1. The Problem RAG Solves:**

* **LLMs have limited knowledge:** LLMs are trained on massive datasets, but their knowledge is frozen in time. They can't access real-time information or specific domain knowledge that wasn't included in their training data.
* **Hallucinations:** LLMs can sometimes generate incorrect or nonsensical information, often referred to as "hallucinations." This happens because they're trying to answer questions based on patterns learned during training, even if they don't have the actual facts.
* **Lack of Transparency:** It's often 