In [45]:
## Data Ingestion 
from langchain_community.document_loaders import TextLoader
loader = TextLoader("speech.txt")

In [46]:
text_documents = loader.load()

In [47]:
text_documents

[Document(metadata={'source': 'speech.txt'}, page_content='"The future belongs to those who believe in the beauty of their dreams." Today, as we gather here, let us remember that each of us holds the power to shape our destiny. Our journey is not always easy, and the path to success is often filled with challenges. But it is in overcoming these obstacles that we discover our true strength and resilience.\n\nLet us embrace challenges as opportunities to learn and grow. Let us support one another with kindness, compassion, and understanding, for together we are stronger than we are alone. Each act of encouragement, each moment of collaboration, brings us closer to our shared goals.\n\nAs we look to the future, let us strive for excellence in all that we do. Let us be guided by integrity, fueled by curiosity, and inspired by the dreams that unite us. Remember, greatness is not achieved in isolation, but through the collective efforts of a dedicated and passionate community.\n\nTogether, w

In [48]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")

In [49]:
# # web based loader
# from langchain_community.document_loaders import WebBaseLoader
# import bs4

# loader = WebBaseLoader(web_paths="https://lilianweng.github.io/posts/2023-06-23-agent/",
#                        bs_kwargs=dict(parse_only=bs4.SoupStrainer(
#                            class_=("post-title","post-content","post-header")
#                        )))


# Load

### Document Loading

In [50]:
from langchain_community.document_loaders import PyPDFLoader
pdf_loader = PyPDFLoader("Ultrasound.pdf")

In [51]:
doc = pdf_loader.load()

In [11]:
doc

[Document(metadata={'producer': 'ST4 PDF Engine (Build 13.0.1.0)', 'creator': 'SCHEMA ST4', 'creationdate': '2025-08-04T17:42:33+02:00', 'title': 'RevB_TechnicalSpecification_UWS_7.0_ENG', 'moddate': '2025-08-04T17:42:33+02:00', 'source': 'Ultrasound.pdf', 'total_pages': 24, 'page': 0, 'page_label': '1'}, page_content='Technical\nSpecification\nEnglish\nUltrasound Workspace\n7.0'),
 Document(metadata={'producer': 'ST4 PDF Engine (Build 13.0.1.0)', 'creator': 'SCHEMA ST4', 'creationdate': '2025-08-04T17:42:33+02:00', 'title': 'RevB_TechnicalSpecification_UWS_7.0_ENG', 'moddate': '2025-08-04T17:42:33+02:00', 'source': 'Ultrasound.pdf', 'total_pages': 24, 'page': 1, 'page_label': '2'}, page_content=''),
 Document(metadata={'producer': 'ST4 PDF Engine (Build 13.0.1.0)', 'creator': 'SCHEMA ST4', 'creationdate': '2025-08-04T17:42:33+02:00', 'title': 'RevB_TechnicalSpecification_UWS_7.0_ENG', 'moddate': '2025-08-04T17:42:33+02:00', 'source': 'Ultrasound.pdf', 'total_pages': 24, 'page': 2, 'pa

# Transform

In [52]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)

In [53]:
documents = text_splitter.split_documents(doc)

In [54]:
documents[45:50]

[]

# Embedding

In [66]:
# from langchain.embeddings import OpenAIEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import Chroma

try:
    db = Chroma.from_documents(documents, GoogleGenerativeAIEmbeddings(model="models/embedding-001"))
except Exception as e:
    print(f"Error creating Chroma vector store: {e}")
    db = None

In [67]:
db

<langchain_community.vectorstores.chroma.Chroma at 0x1f4fd51e690>

In [68]:
query = "What are the hardware requirements?"
docs = db.similarity_search(query)

In [69]:
docs

[Document(metadata={'title': 'RevB_TechnicalSpecification_UWS_7.0_ENG', 'producer': 'ST4 PDF Engine (Build 13.0.1.0)', 'creationdate': '2025-08-04T17:42:33+02:00', 'creator': 'SCHEMA ST4', 'total_pages': 24, 'source': 'Ultrasound.pdf', 'page': 2, 'page_label': '3', 'moddate': '2025-08-04T17:42:33+02:00'}, page_content='Client Workstation Requirements ..................................................................................................................  14\nOperating System ............................................................................................................................. \xa0 14\nHardware .......................................................................................................................................... \xa0 15\nDisk Space ......................................................................................................................................... \xa0 15\nConfiguration ..............................................

In [73]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

In [23]:
prompt = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context. 
Think step by step before providing a detailed answer. 
I will tip you $1000 if the user finds the answer helpful. 
<context>
{context}
</context>
Question: {input}""")

In [24]:
# create a chain
from langchain.chains.combine_documents import create_stuff_documents_chain

documents_chain = create_stuff_documents_chain(llm, prompt)

In [25]:
# retrieve relevant documents
retriever = db.as_retriever()
retriever

NameError: name 'db' is not defined

In [81]:
# Retrieval Chain
from langchain.chains import create_retrieval_chain
retrieval_chain = create_retrieval_chain(retriever,documents_chain)

In [83]:
retrieval_chain.invoke({"input": "What are the Client Workstation Requirements?"})

{'input': 'What are the Client Workstation Requirements?',
 'context': [Document(metadata={'page': 2, 'total_pages': 24, 'producer': 'ST4 PDF Engine (Build 13.0.1.0)', 'source': 'Ultrasound.pdf', 'creator': 'SCHEMA ST4', 'moddate': '2025-08-04T17:42:33+02:00', 'creationdate': '2025-08-04T17:42:33+02:00', 'page_label': '3', 'title': 'RevB_TechnicalSpecification_UWS_7.0_ENG'}, page_content='Client Workstation Requirements ..................................................................................................................  14\nOperating System ............................................................................................................................. \xa0 14\nHardware .......................................................................................................................................... \xa0 15\nDisk Space ........................................................................................................................................

## Using PdfPlumber for linearizing data

In [14]:
from langchain_community.document_loaders import PDFPlumberLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 1. Load the document using PDFPlumberLoader
pdf_path = "Ultrasound.pdf"
loader1 = PDFPlumberLoader(pdf_path)
docs1 = loader1.load()

print(f"Loaded {len(docs1)} pages from the document.")

# 2. Initialize the Text Splitter
# This splitter will try to break text into 2000-character chunks
# with a 100-character overlap between chunks to maintain context.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=200
)

# 3. Split the documents into smaller chunks
# The .split_documents() method takes the list of loaded pages
# and returns a new list of smaller chunked documents.
chunked_documents = text_splitter.split_documents(docs1)

print(f"Split the document into {len(chunked_documents)} chunks.")

# --- You can now inspect the resulting chunks ---

# Print the content of the first chunk to see the result
if chunked_documents:
    print("\n--- Content of the first chunk ---")
    print(chunked_documents[0].page_content)
    
    print("\n--- Metadata of the first chunk ---")
    print(chunked_documents[0].metadata)

Loaded 22 pages from the document.
Split the document into 21 chunks.

--- Content of the first chunk ---
Technical
Specification
English
Ultrasound Workspace
7.0

--- Metadata of the first chunk ---
{'source': 'Ultrasound.pdf', 'file_path': 'Ultrasound.pdf', 'page': 0, 'total_pages': 22, 'Producer': 'iLovePDF', 'ModDate': 'D:20250824092020Z'}


In [16]:
# from langchain.embeddings import OpenAIEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import Chroma

try:
    db1 = Chroma.from_documents(chunked_documents, GoogleGenerativeAIEmbeddings(model="models/embedding-001"))
except Exception as e:
    print(f"Error creating Chroma vector store: {e}")
    db1 = None

In [17]:
query = "What are the Client Workstation Requirements?"
docs = db1.similarity_search(query)
docs

[Document(metadata={'Producer': 'iLovePDF', 'file_path': 'Ultrasound.pdf', 'page': 0, 'total_pages': 22, 'source': 'Ultrasound.pdf', 'ModDate': 'D:20250824092020Z'}, page_content='Technical\nSpecification\nEnglish\nUltrasound Workspace\n7.0'),
 Document(metadata={'page': 0, 'Producer': 'iLovePDF', 'total_pages': 22, 'file_path': 'Ultrasound.pdf', 'ModDate': 'D:20250824092020Z', 'source': 'Ultrasound.pdf'}, page_content='Technical\nSpecification\nEnglish\nUltrasound Workspace\n7.0'),
 Document(metadata={'page': 11, 'ModDate': 'D:20250824092020Z', 'source': 'Ultrasound.pdf', 'total_pages': 22, 'file_path': 'Ultrasound.pdf', 'Producer': 'iLovePDF'}, page_content='supported by Microsoft, starting with Windows 10 version 1809 and higher.(1) Microsoft\noperating systems built on the Windows OneCore kernel support the x64 variant.\n1. For more information, see: https://support.microsoft.com/en-us/lifecycle/search'),
 Document(metadata={'ModDate': 'D:20250824092020Z', 'page': 17, 'file_path': 

In [20]:
# retrieve relevant documents
retriever1 = db1.as_retriever()
retriever1

VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000018DD9EE4E10>, search_kwargs={})

In [26]:
# Retrieval Chain
from langchain.chains import create_retrieval_chain
retrieval_chain1 = create_retrieval_chain(retriever1,documents_chain)

In [27]:
retrieval_chain1.invoke({"input": "What are the Client Workstation Requirements?"})

{'input': 'What are the Client Workstation Requirements?',
 'context': [Document(metadata={'source': 'Ultrasound.pdf', 'total_pages': 22, 'file_path': 'Ultrasound.pdf', 'ModDate': 'D:20250824092020Z', 'page': 0, 'Producer': 'iLovePDF'}, page_content='Technical\nSpecification\nEnglish\nUltrasound Workspace\n7.0'),
  Document(metadata={'ModDate': 'D:20250824092020Z', 'Producer': 'iLovePDF', 'page': 0, 'file_path': 'Ultrasound.pdf', 'total_pages': 22, 'source': 'Ultrasound.pdf'}, page_content='Technical\nSpecification\nEnglish\nUltrasound Workspace\n7.0'),
  Document(metadata={'Producer': 'iLovePDF', 'ModDate': 'D:20250824092020Z', 'total_pages': 22, 'file_path': 'Ultrasound.pdf', 'page': 11, 'source': 'Ultrasound.pdf'}, page_content='supported by Microsoft, starting with Windows 10 version 1809 and higher.(1) Microsoft\noperating systems built on the Windows OneCore kernel support the x64 variant.\n1. For more information, see: https://support.microsoft.com/en-us/lifecycle/search'),
  Do