In [43]:
from langchain.schema import HumanMessage, SystemMessage, AIMessage
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.memory import ConversationBufferMemory
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the Google API key needed by the
# Gemini embedding/chat clients used later — confirm.
load_dotenv()

True

In [None]:
# Load the source PDF into a list of LangChain documents.
pdf_path = "data-lake-vs-data-warehouse-vs-database-key-differences-explained.pdf"

pdf_loader = PyPDFLoader(file_path=pdf_path)
pdf_text = pdf_loader.load()

In [25]:
# Split the loaded pages into overlapping chunks sized for embedding.
# Separators are tried in order, coarsest first: paragraph break, line break,
# single space, and finally "" (per-character) as a last resort. Listing ""
# first would make the splitter cut on individual characters and break words
# apart at chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,  # maximum characters per chunk
    chunk_overlap=200,  # characters shared between neighbouring chunks
    length_function=len,  # measure chunk size in characters
    separators=["\n\n", "\n", " ", ""],
)

# Replace the page-level documents with their chunked form.
# NOTE: this rebinds `pdf_text`, so re-running this cell on its own operates
# on the chunks rather than on the originally loaded pages; re-run the loading
# cell first when starting over.
pdf_text = text_splitter.split_documents(pdf_text)

In [34]:
# Embed every chunk with the Google embedding model and index the vectors in
# a Chroma store (no persist directory is passed here).
db = Chroma.from_documents(
    documents=pdf_text,
    embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
)

In [38]:
# Expose the vector store through the retriever interface. No search
# arguments are passed, so the library's default search settings apply.
retriever = db.as_retriever()

In [None]:
# Prompt that restricts the model to the retrieved context. The text is kept
# exactly as authored, including its leading newline and four-space indent.
template = (
    "\n"
    "    Answer the question based only on the following context:\n"
    "    {context}\n"
    "    Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

# Chat model used to answer; temperature is pinned to 0.
model = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0,
    convert_system_message_to_human=True,
)

def format_docs(docs):
    """Render retrieved documents as a numbered, newline-separated string.

    Args:
        docs: Iterable of retrieved documents. Items exposing a
            ``page_content`` attribute contribute only that text; any other
            item (e.g. a plain string) is formatted as-is.

    Returns:
        One ``"<n>. <text>"`` entry per document, numbered from 1 and joined
        by newlines. An empty string when ``docs`` is empty.
    """
    # Interpolate the chunk text rather than the document object itself, whose
    # string form would also push the "page_content=" wrapper and metadata
    # into the prompt context.
    return "\n".join(
        f"{position}. {getattr(doc, 'page_content', doc)}"
        for position, doc in enumerate(docs, start=1)
    )

# RAG pipeline: the incoming question is fanned out to (a) the retriever,
# whose hits are formatted into the prompt context, and (b) a passthrough that
# forwards the question unchanged; the filled prompt goes to the model and the
# reply is reduced to a plain string.
chain = (
    RunnableParallel(
        context=retriever | format_docs,
        question=RunnablePassthrough(),
    )
    | prompt
    | model
    | StrOutputParser()
)

chain.invoke("What is data warehouse?")