In [12]:
from langgraph.graph import Graph
import os
from dotenv import load_dotenv
load_dotenv()
os.environ['SERPER_API_KEY']
os.environ['LANGSMITH_API_KEY']
os.environ['LANGSMITH_PROJECT']
os.environ['TAVILY_API_KEY']
from langchain_ollama import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import TextLoader, DirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from typing import TypedDict, Annotated, Sequence
import operator
from langchain_core.messages import BaseMessage
from langchain.prompts import PromptTemplate

# LOAD AND cHUNK dATA

In [13]:
loader=DirectoryLoader(r'../data',glob="./*.txt",loader_cls=TextLoader)
docs=loader.load()

text_splitter=RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=50
)
new_docs = text_splitter.split_documents(documents=docs)
doc_strings2 = [doc.page_content for doc in new_docs]

In [14]:
embeddings = OllamaEmbeddings(
    model="llama3.2",
)
db = Chroma.from_documents(new_docs, embeddings)
retriever = db.as_retriever(search_kwargs={"k": 3})

In [15]:
query="Tell me about India's Industrial Growth?"
sample=retriever.get_relevant_documents(query)
print(sample[0].metadata)
print(sample[0].page_content)

for i in sample:
    print(i)


{'source': '../data/sample.txt'}
consumer spending domestically has driven economic momentum.
page_content='consumer spending domestically has driven economic momentum.' metadata={'source': '../data/sample.txt'}
page_content='Indiaâ€™s last eight years of GDP (hypothetical data):' metadata={'source': '../data/sample.txt'}
page_content='consumer demand for electronics, automobiles, and pharmaceuticals.' metadata={'source': '../data/sample.txt'}


### Functions Pydantiucs Agent State

1. This defines a TypedDict called AgentState that has a single field messages. It uses Python's typing system where:

2. Sequence[BaseMessage] indicates it holds a sequence of BaseMessage objects
3. Annotated with operator.add suggests these messages can be combined/concatenated
4. TypedDict is a special kind of dictionary that specifies types for its keys

In [16]:
class AgentState(TypedDict):
    messages: Annotated[Sequence[BaseMessage], operator.add]

from pydantic import BaseModel , Field
class TopicSelectionParser(BaseModel):
    Topic: str = Field(description='Selected Topic')
    Reasoning: str = Field(description='Reasoning behind topic selection')
from langchain.output_parsers import PydanticOutputParser
parser = PydanticOutputParser(pydantic_object=TopicSelectionParser)
llm=ChatOllama(model="llama3.2",temperature=0)

In [17]:
def find_relevant_text(state):
    message=state['messages']
    question=message[-1]
    print(question)
    template="""
    Your task is to classify the given user query into one of the following categories: [India, Not Related]. 
    Only respond with the category name and nothing else.

    User query: {question}
    {format_instructions}
    """
    prompt=PromptTemplate(template=template,input_variables=[question],partial_variables={
                                        "format_instructions" : parser.get_format_instructions()    }                                
                                    )
    
    chain=prompt |llm |parser
    response = chain.invoke({"question":question,"format_instructions" : parser.get_format_instructions() })
    print(response)
    return {"messages": [response.Topic]}



In [18]:
state={"messages": ["Tell me about India's Industrial Growth"]}
find_relevant_text(state)

Tell me about India's Industrial Growth
Topic="India's Industrial Growth" Reasoning='Not Related'


{'messages': ["India's Industrial Growth"]}