In [8]:
import os

from dotenv import load_dotenv
load_dotenv()

cohere_api_key = os.environ["COHERE_API_KEY"]
huggingface_api_key = os.environ["HUGGINGFACEHUB_API_TOKEN"]


In [9]:
# import getpass
# import os

# os.environ["COHERE_API_KEY"] = getpass.getpass("Enter your Cohere API key: ")

In [10]:
# data ingestion

In [11]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [12]:
loader = TextLoader(
    "C:/Biprayan - 2/LLMops/project/data/demo.txt",
    autodetect_encoding= True
)

documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
docs = text_splitter.split_documents(documents)

texts = [doc.page_content for doc in docs]

In [13]:
# data retrieval

In [14]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEndpointEmbeddings


embedding_function = HuggingFaceEndpointEmbeddings(
    model="sentence-transformers/all-MiniLM-L6-v2",
    huggingfacehub_api_token= huggingface_api_key
)


In [15]:
embedded_docs = embedding_function.embed_documents(texts=texts)

In [16]:
vector_store = FAISS.from_documents(docs, embedding_function)

In [17]:
retrieved_docs = vector_store.similarity_search("what is Agentic Document Extraction", k=3)

In [18]:
retriever = vector_store.as_retriever()

In [19]:
# promt

In [20]:
from langchain_core.prompts import ChatPromptTemplate

template="""You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use ten sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""

prompt=ChatPromptTemplate.from_template(template)


In [21]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks.\nUse the following pieces of retrieved context to answer the question.\nIf you don't know the answer, just say that you don't know.\nUse ten sentences maximum and keep the answer concise.\nQuestion: {question}\nContext: {context}\nAnswer:\n"), additional_kwargs={})])

In [22]:
from langchain_core.output_parsers import StrOutputParser
output_parser = StrOutputParser()

In [23]:
# ! uv pip install -U langchain-cohere

In [24]:
from langchain_cohere import ChatCohere

llm_model = ChatCohere(
    cohere_api_key=cohere_api_key,
    model="command-a-03-2025",  # model selection can be changed
)

In [26]:
llm_model.invoke("what is Agentic Document Extraction")

AIMessage(content='**Agentic Document Extraction** refers to the process of using **agentic AI systems** to extract relevant information, insights, or specific data points from documents in a more intelligent, context-aware, and goal-oriented manner. Unlike traditional document extraction methods (e.g., rule-based systems or simple keyword searches), agentic extraction leverages advanced AI capabilities, such as natural language understanding (NLU), reasoning, and decision-making, to perform tasks more autonomously and adaptively.\n\n### Key Characteristics of Agentic Document Extraction:\n1. **Autonomy**: The system operates with minimal human intervention, identifying and extracting information based on predefined goals or dynamic instructions.\n2. **Context Awareness**: It understands the context of the document, including nuances, relationships, and implicit meanings, to extract more accurate and relevant data.\n3. **Goal-Oriented**: The extraction process is driven by specific obj

In [27]:
from langchain_core.runnables import RunnablePassthrough

chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm_model
    | output_parser
)

In [28]:
chain.invoke("what is Agentic Document Extraction")

"Agentic Document Extraction (ADE) is an advanced system that builds upon traditional Optical Character Recognition (OCR) technology. It utilizes generative AI to go beyond simple text extraction, aiming to truly understand the content of documents. The provided context suggests ADE is a developing field, with early approaches likely to evolve. It highlights the system's ability to move from basic OCR to a more comprehensive understanding of document content. Anirban Chakraborty appears to be a key figure associated with ADE, as they are mentioned as an author and their work is referenced. The context also mentions a diagram illustrating an early ADE approach, indicating a visual representation of the system's functionality. While the exact details of ADE's capabilities are not fully outlined, it is presented as a significant advancement in document processing technology."

In [29]:
from langchain_core.runnables import RunnablePassthrough

input = {"context": retriever, "question": RunnablePassthrough()}

chain_2 = (
    input | prompt| llm_model| output_parser
)

In [30]:
chain_2.invoke("what is Agentic Document Extraction")

'Agentic Document Extraction (ADE) is an advanced, generative AI-powered system that evolves traditional Optical Character Recognition (OCR) to understand and interpret document content more deeply. It goes beyond simple text extraction to provide meaningful insights and comprehension of the material. ADE represents a significant advancement in document processing technology, leveraging AI to enhance accuracy and utility. The concept is still evolving, with ongoing research addressing key challenges in its development. It aims to bridge the gap between raw text extraction and true content understanding. ADE is discussed in articles by Anirban Chakraborty, highlighting its potential and future directions. The system is designed to improve over time as AI capabilities expand. Its primary goal is to transform how documents are analyzed and utilized in various applications. ADE is a cutting-edge approach to document extraction, promising greater efficiency and intelligence in handling text

In [31]:
chain_2

{
  context: VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEndpointEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001CF31664050>, search_kwargs={}),
  question: RunnablePassthrough()
}
| ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks.\nUse the following pieces of retrieved context to answer the question.\nIf you don't know the answer, just say that you don't know.\nUse ten sentences maximum and keep the answer concise.\nQuestion: {question}\nContext: {context}\nAnswer:\n"), additional_kwargs={})])
| ChatCohere(client=<cohere.client.Client object at 0x000001CEF7B5DE80>, async_client=<cohere.client.AsyncClient object at 0x000001CEF7B5FD70>, model='command-a-03-2025', cohere_api_key=SecretStr('*********

In [36]:
input_data = {"context": retriever, "question": RunnablePassthrough()}
llm_chain = prompt | llm_model | output_parser

final_chain = input_data | llm_chain

# Invoke with a plain string so the RunnablePassthrough receives the question text
final_chain.invoke("what is Agentic Document Extraction")

"Agentic Document Extraction (ADE) is an advanced system that evolves traditional Optical Character Recognition (OCR) into a more sophisticated process. It leverages generative AI to go beyond simple text extraction, aiming for a deeper understanding of document content. ADE represents an early approach that is expected to evolve over time. The system addresses key challenges in extracting and interpreting information from documents. It was introduced by Anirban Chakraborty, as mentioned in the provided context. ADE is designed to enhance the capabilities of OCR by incorporating AI-powered understanding. The exact mechanisms and features of ADE are not fully detailed in the given context. However, it is clear that ADE aims to improve document processing by integrating advanced AI techniques. Further specifics about ADE's functionality or applications are not provided in the available information."

In [37]:
input_data

{'context': VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEndpointEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001CF31664050>, search_kwargs={}),
 'question': RunnablePassthrough()}

In [38]:
final_chain

{
  context: VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEndpointEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001CF31664050>, search_kwargs={}),
  question: RunnablePassthrough()
}
| ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks.\nUse the following pieces of retrieved context to answer the question.\nIf you don't know the answer, just say that you don't know.\nUse ten sentences maximum and keep the answer concise.\nQuestion: {question}\nContext: {context}\nAnswer:\n"), additional_kwargs={})])
| ChatCohere(client=<cohere.client.Client object at 0x000001CEF7B5DE80>, async_client=<cohere.client.AsyncClient object at 0x000001CEF7B5FD70>, model='command-a-03-2025', cohere_api_key=SecretStr('*********