In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_astradb import AstraDBVectorStore
from langchain_core.prompts import ChatPromptTemplate
from langchain.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import StrOutputParser
from langchain import hub
from langchain.schema import Document
from langgraph.graph import START, StateGraph, END
from typing import Literal, List
from typing_extensions import TypedDict
from dotenv import load_dotenv
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a .env file (python-dotenv) before reading them below.
load_dotenv()

# Credentials / connection settings. Each value is None when the variable is unset.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
ASTRA_API_KEY = os.environ.get("ASTRA_API_KEY")
DB_ENDPOINT = os.environ.get("DB_ENDPOINT")
DB_ID = os.environ.get("DB_ID")

In [3]:
# Chat model shared by the routing and grading functions in this notebook.
llm = ChatGoogleGenerativeAI(
    api_key=GOOGLE_API_KEY,
    model="gemini-1.5-pro",
    temperature=1,
)
# Quick smoke test: send one prompt and print the text of the reply.
print(llm.invoke("What is capital of India").content)

The capital of India is **New Delhi**. 



In [4]:
def pdf_2_document(data):
    """Load the PDF files found in a directory as LangChain documents.

    Args:
        data: Path of the directory to scan for files matching ``*.pdf``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory, with
        each matching file read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(path=data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [5]:
# Load the PDFs found under data/ into `docs` for the chunking step.
docs = pdf_2_document("data/")

In [6]:
def text_2_chunks(docs, chunk_size=500, chunk_overlap=100):
    """Split documents into overlapping chunks for embedding.

    Args:
        docs: Documents to split (passed to ``split_documents``).
        chunk_size: Size limit handed to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value this notebook used originally.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 100, the value used originally.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

In [7]:
# Split the loaded documents into chunks; the trailing expression displays
# how many chunks were produced.
chunks = text_2_chunks(docs)
len(chunks)

2306

Data Ingestion Process

In [5]:
# Embedding model handed to the Astra DB vector store below.
gemini_embedding = GoogleGenerativeAIEmbeddings(
    google_api_key=GOOGLE_API_KEY,
    model="models/embedding-001",
)

In [11]:
def ingest(status):
    """Connect to the Astra DB collection and optionally add the chunks to it.

    Args:
        status: Pass ``None`` to add the module-level ``chunks`` to the
            collection. Any other value (for example ``"done"``) skips the
            upload and only builds the store handle.

    Returns:
        The ``AstraDBVectorStore`` for the ``Law_bot`` collection in the
        ``constitution`` namespace.
    """
    vector_store = AstraDBVectorStore(
        embedding=gemini_embedding,
        api_endpoint=DB_ENDPOINT,
        namespace="constitution",
        token=ASTRA_API_KEY,
        collection_name="Law_bot",
    )

    # Identity comparison is the correct test for None. The result of
    # add_documents is intentionally not bound: it was unused, and the old
    # local name `len` shadowed the builtin.
    if status is None:
        vector_store.add_documents(chunks)

    return vector_store

In [12]:
# On a first run, pass None instead of "done" so the chunks are uploaded.
# "done" is used here because the upload was already performed in an earlier run.
# Note: the message below is printed regardless of whether an upload happened.
vector_store = ingest("done")
print("Data Ingested Successfully")

Data Ingested Successfully


In [29]:
# Sanity check: query the vector store with a sample question and display the matches.
vector_store.similarity_search("I had a argument with Police could You tell me what to do according to the indian constituion")

[Document(metadata={'source': 'data\\Constitution.pdf', 'page': 41}, page_content='THE CONSTITUTION OF  INDIA\n(Part III.—Fundamental Rights)11\n(6) Nothing in sub-clause ( g) of the said clause shall affect the operation \nof any existing law in so far as it imposes, or prevent the State from making \nany law imposing, in the interests of the general public, reasonable restrictions \non the exercise of the right conferred by the said sub-clause, and, in particular, \n1[nothing in the said sub-clause shall affect the operation of any existing law in'),
 Document(metadata={'source': 'data\\Constitution.pdf', 'page': 43}, page_content='THE CONSTITUTION OF  INDIA\n(Part III.—Fundamental Rights)13\nProvided that nothing in this sub-clause shall authorise the detention \nof any person beyond the maximum period prescribed by any law made \nby Parliament under sub-clause ( b) of clause (7); or\n(b) such person is detained in accordance with the provisions of any \nlaw made by Parliament under

In [6]:
# Build the store handle directly, with the same settings that ingest() uses.
vector_store = AstraDBVectorStore(
    embedding=gemini_embedding,
    api_endpoint=DB_ENDPOINT,
    namespace="constitution",
    token=ASTRA_API_KEY,
    collection_name="Law_bot",
)

In [28]:
# Retriever view of the vector store. NOTE(review): the name is spelled
# `retriver`; it is kept as-is because other cells may reference it.
retriver = vector_store.as_retriever()

In [8]:
class GraphState(TypedDict):
    """Typed dictionary describing the state passed between graph steps."""

    # The user's question; read by route() and Grade_Docs().
    question: str
    # Retrieved documents; read by Grade_Docs().
    documents: List[str]
    # Presumably the generated answer text -- TODO confirm against later cells.
    generation: str
    # Presumably the routing decision -- TODO confirm against later cells.
    route: str

In [32]:
def route(state: GraphState) -> BaseModel:
    """Ask the LLM whether a question should go to the vector store.

    Args:
        state: Graph state; only ``state["question"]`` is read.

    Returns:
        The structured result of the routing chain: a ``RouteQuery`` whose
        ``datasource`` field is ``"vectorstore"`` or ``"out_of_context"``.
        Note this is the model object, not a plain string, which is why the
        return annotation is ``BaseModel`` rather than ``str``.
    """
    question = state["question"]

    # Output schema that the LLM is constrained to produce.
    class RouteQuery(BaseModel):
        datasource: Literal["vectorstore", "out_of_context"] = Field(
            ...,
            description="For a given User Question find out whether to route it to vectorstore or out_of_context.",
        )

    system = """You are an expert at routing a user question to a vectorstore or out_of_context.
    The vectorstore contains documents of The Constitution of India. NOTE: if any question has the words 'according to Indian Constitution' use 'vectorstore'.
    Use the vectorstore for questions on these topics. Otherwise, use out_of_context."""

    llm_router = llm.with_structured_output(RouteQuery)
    route_prompt = ChatPromptTemplate.from_messages(
        [("system", system), ("human", "{question}")]
    )

    # Prompt piped into the structured-output model.
    question_router = route_prompt | llm_router
    return question_router.invoke({"question": question})

In [31]:
def Grade_Docs(state: GraphState):
    """Ask the LLM whether the retrieved documents are relevant to the question.

    Args:
        state: Graph state; ``state["documents"]`` and ``state["question"]``
            are read.

    Returns:
        The result of invoking the grading chain, expected to be a ``Grading``
        instance whose ``binary_source`` field is ``"yes"`` or ``"no"``.
        Previously the result was computed and discarded, so the function
        always returned ``None``.
    """
    docs = state["documents"]
    question = state["question"]

    # Output schema for the grader. The description must be passed by keyword:
    # pydantic v1's Field accepts only `default` positionally, so the former
    # positional form raised TypeError when this function ran.
    class Grading(BaseModel):
        binary_source: Literal["yes", "no"] = Field(
            ...,
            description="Documents are relevant to question, Yes or no",
        )

    system = """You are a grader assessing relevance of a retrieved document to a user question. \n
    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""

    grade_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system),
            ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
        ]
    )

    llm_grader = llm.with_structured_output(Grading)
    retrieval_grader = grade_prompt | llm_grader

    # The whole `documents` value is substituted into the single {document} slot.
    return retrieval_grader.invoke({"question": question, "document": docs})
    
    

datasource='vectorstore'
