In [1]:
from langchain_community.document_loaders import UnstructuredPDFLoader

In [2]:
# Loader for test1.pdf. Unstructured reports `mode="paged"` as deprecated in
# favor of the 'by_page' chunking strategy (see the warning emitted on load).
loader = UnstructuredPDFLoader(
    "test1.pdf",
    mode="paged",
    strategy="auto",
    encoding="utf-8",
)

In [3]:
# Run the loader; the resulting documents are reused by the cells below.
docs = loader.load()

  from .autonotebook import tqdm as notebook_tqdm
`mode='paged'` is deprecated in favor of the 'by_page' chunking strategy. Learn more about chunking here: https://docs.unstructured.io/open-source/core-functionality/chunking


In [4]:
# Print the complete extracted text of every loaded document, each under a
# numbered heading and followed by a separator line.
for doc_number, loaded_doc in enumerate(docs, start=1):
    print(f"Document {doc_number}:")
    print(loaded_doc.page_content)
    print("\n---\n")

Document 1:
AN887 APPLICATION NOTE

MICROCONTROLLERS MADE EASY by Microcontroller Division Applications

WHAT IS A MICROCONTROLLER?

A few years ago, system control functions were implemented using logic components and were usually large, heavy boxes. Later on, microprocessors were used and the entire con- troller could fit onto a small circuit board. As the process of miniaturization continued, all of the components needed for a controller were built right onto one chip. By only including the fea- tures specific to the task, cost is relatively low.

A typical microcontroller has bit manipulation instructions, easy7 and direct access to I/O, and quick and efficient interrupt processing. Therefore, a microcontroller is a highly integrated device which includes, on one chip, all or most of the parts needed to perform an application control function.

Microcontrollers come in many varieties. Depending on the power and features that are needed, customers might choose a 4, 8, 16, or 32 bit 

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

# Two splitters with different granularity:
#   - parent_splitter produces the larger chunks that are handed back as context
#   - child_splitter produces the smaller chunks that are embedded for vector search
parent_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
)


In [6]:
# Cut the loaded documents into parent-sized chunks, then print each chunk's
# text under a numbered heading for inspection.
parent_chunk = parent_splitter.split_documents(docs)
for chunk_number, parent_doc in enumerate(parent_chunk, start=1):
    print(f"Parent Chunk {chunk_number}:")
    print(parent_doc.page_content)
    print("\n---\n")

Parent Chunk 1:
AN887 APPLICATION NOTE

MICROCONTROLLERS MADE EASY by Microcontroller Division Applications

WHAT IS A MICROCONTROLLER?

A few years ago, system control functions were implemented using logic components and were usually large, heavy boxes. Later on, microprocessors were used and the entire con- troller could fit onto a small circuit board. As the process of miniaturization continued, all of the components needed for a controller were built right onto one chip. By only including the fea- tures specific to the task, cost is relatively low.

A typical microcontroller has bit manipulation instructions, easy7 and direct access to I/O, and quick and efficient interrupt processing. Therefore, a microcontroller is a highly integrated device which includes, on one chip, all or most of the parts needed to perform an application control function.

---

Parent Chunk 2:
Microcontrollers come in many varieties. Depending on the power and features that are needed, customers might choo

In [7]:
# In-memory store, passed to the retriever below as its docstore.
store = InMemoryStore()



In [8]:
from langchain_community.vectorstores.utils import filter_complex_metadata
# Split the parent chunks into the smaller child chunks, then run them through
# filter_complex_metadata so only vector-store-friendly metadata remains.
child_docs = child_splitter.split_documents(parent_chunk)
filter_child_docs = filter_complex_metadata(child_docs)

In [9]:
# Embedding model used by the vector store (nothing is embedded in this cell).
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [10]:
from langchain.vectorstores import Chroma


# Vector store that holds the child-chunk embeddings.
#
# It is created empty on purpose: ParentDocumentRetriever.add_documents() splits
# every parent chunk, tags each child with its parent's id and indexes it in this
# store. Do not seed the collection with Chroma.from_documents(): that indexes
# the same child chunks a second time without a parent id, and such hits take up
# search slots but can never be resolved to a parent document.
#
# NOTE(review): the collection is persisted under "chroma_db" while the parent
# docstore is in-memory only; confirm that entries left over from earlier runs
# are cleared, otherwise their parent ids point at nothing after a restart.
vector_store = Chroma(
    collection_name="my_collection",
    embedding_function=embeddings,
    persist_directory="chroma_db",
)


In [11]:
# Retriever that searches child chunks in `vector_store` and looks up the
# matching parent documents in `store`. No parent splitter is passed, so the
# documents handed to add_documents() are used as the parents as-is.
retreiver = ParentDocumentRetriever(
    docstore=store,
    vectorstore=vector_store,
    child_splitter=child_splitter,
)

In [12]:
from langchain_community.vectorstores.utils import filter_complex_metadata

# Strip complex metadata from the parent chunks before registering them with
# the retriever.
filtered_parent_chunk = filter_complex_metadata(parent_chunk)

# add_documents() works by side effect and returns None, so its result must not
# be bound to a name.
retreiver.add_documents(filtered_parent_chunk)

# Correctly spelled alias for the same retriever object.
retriever = retreiver

In [13]:
# Sanity check: query the vector store directly (bypassing the retriever) and
# print the text of the best-ranked hit out of the three returned.
result = vector_store.similarity_search(
    "What is the main topic of the document?",
    k=3,
)
print(result[0].page_content)

WITH RESPECT TO ANY CLAIMS ARISING FROM THE CONTENT OF SUCH A NOTE AND/OR THE USE MADE BY CUSTOMERS OF THE INFORMATION CONTAINED HEREIN IN CONNEXION WITH THEIR PRODUCTS.”


In [14]:
# Prompt: answer from the supplied context only, otherwise say "I don't know".
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
Use the following context to answer the question. 
If you don't know the answer, just say "I don't know"—don't try to make up an answer.

Context:
{context}

Question: {question}
Answer:
""",
)

In [15]:
# Chat model used to generate the final answer.
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
import os

# Load variables from a .env file first so GEMINI_API_KEY can be read below.
load_dotenv()

model = ChatGoogleGenerativeAI(
    api_key=os.getenv("GEMINI_API_KEY"),
    model="gemini-1.5-flash",
    temperature=0.2,
)

In [16]:
# Output parser placed at the end of the chain to turn the model's reply into
# a plain string.
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

In [17]:
from langchain.schema.runnable import RunnableSequence

In [18]:
def format_docs(retreiver):
    """Join the text of the given documents into one context string.

    Args:
        retreiver: Despite the name, an iterable of document objects, each
            exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in retreiver]
    return "\n\n".join(page_texts)

# RAG pipeline. The first step maps the incoming question to the prompt inputs:
#   "context"  - retriever results joined into one string by format_docs
#   "question" - the question itself, passed through unchanged
# The filled-in prompt then goes to the model, and the parser handles its reply.
chain = RunnableSequence(
    {
        "question": lambda question: question,
        "context": retreiver | format_docs,
    },
    prompt,
    model,
    parser,
)


In [19]:
# Ask a sample question through the whole chain.
response = chain.invoke("what is serial interface?")

In [20]:
# Show the answer produced by the chain for the sample question.
print(response)

Serial interfaces are used to exchange data with the external world.  Many microcontrollers have both asynchronous (SCI or UART) and synchronous (SPI) serial communication peripherals built in.
