In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain_astradb import AstraDBVectorStore
from langchain_google_genai import ChatGoogleGenerativeAI,GoogleGenerativeAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel,Field
from langchain_core.output_parsers import StrOutputParser
from langchain_community.utilities import ArxivAPIWrapper,WikipediaAPIWrapper
from langchain_community.tools import ArxivQueryRun,WikipediaQueryRun
from langchain import hub
from typing import List,Literal
from typing_extensions import TypedDict
from langchain.schema import Document
from pprint import pprint
from dotenv import load_dotenv
import os

In [2]:
# Pull the key/value pairs from the local .env file into the process environment.
load_dotenv()

LANGCHAIN_API_KEY = os.getenv("LANGSMITH_API_KEY")
LANGCHAIN_PROJECT = "Langgraph_bot"
LANGCHAIN_TRACING_V2 = True
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# LangSmith picks up its configuration from environment variables, not from
# Python names, so the values above have no effect on tracing until they are
# exported. Only export when a key is actually available: assigning None to
# os.environ raises TypeError, and tracing without a key cannot work anyway.
if LANGCHAIN_API_KEY:
    os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
    os.environ["LANGCHAIN_PROJECT"] = LANGCHAIN_PROJECT
    # Environment values must be strings; "true" is the documented flag value.
    os.environ["LANGCHAIN_TRACING_V2"] = "true"

In [3]:
# Chat model used by the cells below (Gemini 1.5 Pro, temperature 0.2).
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    api_key=GOOGLE_API_KEY,
    temperature=0.2,
)

# Smoke test: send one prompt and show the text of the reply.
print(llm.invoke("Who are you?").content)

I am a large language model, trained by Google. 

Here's what that means:

* **I am a computer program:** I don't have feelings, experiences, or a physical body.
* **I am trained on a massive amount of text data:** This allows me to communicate and generate human-like text in response to a wide range of prompts. 
* **I can help you with various tasks:** From answering questions to writing stories, translating languages, and summarizing text, I am here to assist you.

How can I help you today? 



**DATA-INGESTION & CONVERSION**

In [4]:
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are matched
            with the ``*.pdf`` glob and parsed with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for that directory.
    """
    return DirectoryLoader(path=data, glob="*.pdf", loader_cls=PyPDFLoader).load()


extracted_data = load_pdf("data/")

In [5]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping chunks.

    Args:
        extracted_data: Documents to split, passed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
            Defaults to 500, the value this notebook has always used.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.
            Defaults to 20, the value this notebook has always used.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)


text_chunks = text_split(extracted_data)
len(text_chunks)

233

In [6]:
# Reload the .env file so this cell also works when run on its own.
load_dotenv()

# Embedding model used when building the Astra DB vector store.
gemini_embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
)

# Astra DB credentials and endpoint, read from the environment
# (both are None when the variable is not set).
ASTRA_DB_API = os.getenv("ASTRA_API_KEY")
ASTRA_ENDPOINT = os.getenv("DB_ENDPOINT")

In [9]:
def ingest(status):
    """Connect to the Astra DB collection and optionally upload the chunks.

    Args:
        status: ``None`` means no vectors have been created in the database
            yet, so the module-level ``text_chunks`` are added to the store.
            Any other value (for example ``"done"``) skips the upload.

    Returns:
        A ``(vector_store, inserted_ids)`` tuple on every path.
        ``inserted_ids`` is an empty list when the upload was skipped.
        Previously the skip path returned the bare store while the upload
        path returned a tuple, so callers could not handle both uniformly.
    """
    vector_store = AstraDBVectorStore(
        token=ASTRA_DB_API,
        api_endpoint=ASTRA_ENDPOINT,
        embedding=gemini_embeddings,
        namespace="law",
        collection_name="langgraph_RAG",
    )

    inserted_ids = []
    if status is None:  # no vectors created in the DB yet
        inserted_ids = vector_store.add_documents(text_chunks)

    return vector_store, inserted_ids


# IF YOU ARE RUNNING THIS FOR 2nd TIME CHANGE PARAMETER TO "done"
# Unpack the tuple so `vector_store` is the store itself, not (store, ids).
vector_store, inserted_ids = ingest(None)

In [13]:
# Build a store handle for the "langgraph_RAG" collection in the "law"
# namespace; no documents are added in this cell.
vector_store = AstraDBVectorStore(
    token=ASTRA_DB_API,
    api_endpoint=ASTRA_ENDPOINT,
    embedding=gemini_embeddings,
    namespace="law",
    collection_name="langgraph_RAG",
)

In [16]:
# Rebind `vector_store` to a retriever (later cells call retriever methods on
# this name). The guard keeps the cell idempotent: on a re-run the name already
# holds the retriever, which is not expected to expose `as_retriever`, so the
# conversion is skipped instead of raising AttributeError.
if hasattr(vector_store, "as_retriever"):
    vector_store = vector_store.as_retriever()

**ROUTING IN THE GRAPH**

In [26]:
from pydantic import BaseModel, Field
from typing import Literal

class Route_Query(BaseModel):
    # Structured-output schema for the router: the single field must be one of
    # the two allowed datasource names.
    # NOTE(review): documented with comments rather than a docstring because a
    # class docstring may be emitted into the schema sent to the model — confirm.
    datasource: Literal["vectorstore", "wiki_search"] = Field(
        ...,
        description="For a given user question decide whether to route it to vectorstore or wiki_search",
    )

# Ask the chat model to answer in the Route_Query shape.
llm_router = llm.with_structured_output(Route_Query)

# System instructions describing when each datasource should be chosen.
system = """You are an expert at routing a user question to a vectorstore or wiki_search.
The vectorstore contains documents related to Indian law budget 2024.
Use the vectorstore for questions on these topics. Otherwise, use wiki_search."""

route_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

# Prompt piped into the structured-output model.
question_router = route_prompt | llm_router

# Try the router on one sample question.
print(
    question_router.invoke(
        {"question": "Tell me about Shrimp Production & Export in budget 2024?"}
    )
)

Key 'title' is not supported in schema, ignoring
Key 'title' is not supported in schema, ignoring
Key 'title' is not supported in schema, ignoring
Key 'title' is not supported in schema, ignoring


[{'args': {'datasource': 'vectorstore'}, 'type': 'Route_Query'}]


**DOCUMENT GRADER**

In [27]:
class Grade_Docs(BaseModel):
    # Structured-output schema for the relevance grader: a single 'yes'/'no' field.
    # NOTE(review): documented with comments rather than a docstring because a
    # class docstring may be emitted into the schema sent to the model — confirm.
    binary_score: Literal["yes", "no"] = Field(
        ...,
        description="Documents are relevant to the question, 'yes' or 'no'",
    )

# System instructions for the relevance grader.
system = """You are a grader assessing relevance of a retrieved document to a user question. \n
    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""

grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

# Ask the chat model to answer in the Grade_Docs shape.
llm_grader = llm.with_structured_output(Grade_Docs)

retrieval_grader = grade_prompt | llm_grader

# One question drives both retrieval and grading, so keep it in a single place.
sample_question = "Tell me about indian budget 2024"

# `vector_store` holds a retriever at this point. `invoke` replaces the
# deprecated `get_relevant_documents` call, which emitted a deprecation warning.
docs = vector_store.invoke(sample_question)

# Grade the second retrieved document (index 1), as the original cell did.
doc_txt = docs[1].page_content
print(retrieval_grader.invoke({"question": sample_question, "document": doc_txt}))

Key 'title' is not supported in schema, ignoring
Key 'title' is not supported in schema, ignoring
Key 'title' is not supported in schema, ignoring
Key 'title' is not supported in schema, ignoring
  warn_deprecated(


[{'args': {'binary_score': 'yes'}, 'type': 'Grade_Docs'}]
