In [53]:
import os
from llama_parse import LlamaParse
from dotenv import load_dotenv

load_dotenv()

True

In [None]:


API_KEY = os.getenv("LLAMA_PARSE_API_KEY")

# Input PDF and output directory
pdf_path = "test1.pdf"
output_dir = "../data/parsed_md"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "test1.md")

# Skip parsing if file already exists
if os.path.exists(output_file):
    print(f"Markdown file already exists at {output_file}. Skipping parsing.")
else:
    # Initialize parser with support for tables and images
    parser = LlamaParse(
        api_key=API_KEY,
        result_type="markdown",
        verbose=True,
        parsing_instruction="""
            Please extract all content including tables, images, lists, and headers 
            in clean and readable markdown format.
        """
    )

  
        
           


In [55]:
  # Parse PDF
docs = parser.load_data(pdf_path)


Started parsing the file under job_id 5f4a44f0-87e0-41b6-bfe7-f96ec913dceb
...

In [57]:
# Save parsed markdown
with open(output_file, "w", encoding="utf-8") as f:
    for i, doc in enumerate(docs):
        f.write(doc.text)  # Use .text attribute to get string content
        f.write("\n\n---\n\n")  # Add separator between pages/docs

print(f"✅ PDF successfully parsed and saved to {output_file}")

✅ PDF successfully parsed and saved to ../data/parsed_md\test1.md


In [88]:
## now split the markdown file into chunks 
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Step 1: Convert LlamaParse docs to LangChain Documents
lc_docs = [Document(page_content=doc.text) for doc in docs]

## Step 2: Define splitters
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

## Step 3: Split into large "parent" chunks (for context)
parent_chunks = parent_splitter.split_documents(lc_docs)

## child chunks (for retrieval)
child_chunk=child_splitter.split_documents(lc_docs)

In [89]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

## Step 4: Create vector store
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


persist_directory = "../data/chroma_db"
os.makedirs("../data", exist_ok=True)

vector_store = Chroma(
    collection_name="child_chunks",
    embedding_function=embedding_model,
    persist_directory=persist_directory
)

## Step 5: Add small "child" chunks to vector store
vector_store.add_documents(child_chunk)

['d604637f-7755-4d47-bc2f-fb025fc234e9',
 '79be8e51-7d78-46e8-bba7-d40b75cc169b',
 '0a6ac53e-188b-4e97-ac92-c119e118d0e3',
 '51838858-48d5-41f8-94e2-1690b8446d87',
 '6daf79e6-a506-4bdc-b5b1-aa5160555272',
 '963d904a-1974-4de9-a196-6ec8e2204def',
 '369fb695-3d5d-4de2-aa27-fc21e648fb13',
 '51d38437-e496-4fcd-a547-72ea7951f1db',
 'dafceb91-8208-42e5-883f-dd524e361959',
 '790fc6ea-9094-4238-96bb-f07e84fa42c9',
 '42ba919f-f6ba-4107-b5b7-09d82cb773d1',
 '77fa63f3-6597-44d1-ab31-f21e53d7bf55',
 '87ff57c6-1d86-403e-b088-bd365295d50a',
 'f01a2903-1717-4719-aa02-27b8b3962ae1',
 '49a9bec0-390e-4fa1-a87a-90c5018f8f31',
 '4c5e36e6-644c-4305-9786-e3167ef3d211',
 '56ed13d4-ca30-4fdb-8676-54acde06afa5',
 '36963738-64ce-4311-9a85-9a309aed682e',
 'f2a46cef-7511-454d-973a-7e41b1749cd4',
 '693e026b-b040-4b55-8556-492750e5ebe5',
 '2f477b56-7ce8-451c-b032-bf6afc8ccd4d',
 '3d6c82a8-97e5-4f6f-8294-1ff0f1b216ae',
 'da9acfa6-a03e-4a32-ac69-b850dfa3f1c9',
 'fc3ed9b5-df47-4f5f-ad6b-370cee44fd77',
 '7ab0944f-6483-

In [None]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryByteStore
## Step 6: Set up ParentDocumentRetriever
store = InMemoryByteStore()
retriever = ParentDocumentRetriever(
    vectorstore=vector_store,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)

## Step 7: Add large chunks to retriever
retriever.add_documents(parent_chunks)




✅ Vector store & retriever setup complete.


In [93]:
result=vector_store.similarity_search("What is the main topic of the document?")
print(f"Result: {result[0].page_content}")

Result: It is the brain of the system that processes all data and their travel along the bus. For example, in order to execute a program, the CPU will read the first instruction from program memory. This instruction is decoded by the CPU and executed. At the completion of the execution of the instruction, the next instruction is fetched from memory and is executed. This procedure is repeated until the


In [111]:
from langchain_core.prompts import PromptTemplate


prompt=PromptTemplate(
   template="""
You are an AI assistant that answers questions based on provided document content.

Use the following context from the document to answer the question accurately.

If the question is unrelated to the context, respond with:
"sorry, I don't know related to this topic"

Context:
{context}

Question:
{question}

Answer:
""",
    input_variables=["context", "question"]
)



summarize_prompt=PromptTemplate(

    template="""You are an AI assistant that summarizes documents.
Use the following context to create a concise summary.  
Context:{context}
Summary:""",
    input_variables=["context"]
)


In [114]:
from langchain_google_genai import ChatGoogleGenerativeAI

# initialize gemini model
api_key=os.getenv("GEIMINI_API_KEY")
model=ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0.2,
    google_api_key=api_key
)

In [115]:
from langchain_core.output_parsers import StrOutputParser
# Output parser
parser=StrOutputParser()

In [133]:
from langchain.schema.runnable import RunnableLambda

# Retrieve context using question
context_retriever = RunnableLambda(
    lambda x: {"context": retriever.get_relevant_documents(x.get("question", "")), **x}
)

# Format for QA Chain
chat_formatter = RunnableLambda(lambda x: {
    "context": "\n\n".join([doc.page_content for doc in x["context"]]),
    "question": x["question"]
})

# Format for Summarize Chain
summarize_formatter = RunnableLambda(lambda x: {
    "context": "\n\n".join([doc.page_content for doc in x["context"]])
})

In [135]:
from langchain.schema.runnable import RunnableSequence



# QA Chain
chat_chain = RunnableSequence(
    context_retriever,
    chat_formatter,
    prompt,
    model,
    parser
)

# Summary Chain
summarize_chain = RunnableSequence(
    context_retriever,
    summarize_formatter,
    summarize_prompt,
    model,
    parser
)

In [136]:
from langchain.schema.runnable import RunnableBranch
def is_chat_mode(x):
    return x.get("mode", "").lower() == "chat"

def is_summarize_mode(x):
    return x.get("mode", "").lower() == "summarize"

rag_mode_chain = RunnableBranch(
    (is_chat_mode, chat_chain),
    (is_summarize_mode, summarize_chain),
    RunnableLambda(lambda x: "❌ Invalid mode selected. Choose 'chat' or 'summarize'.")
)

In [138]:
# ✅ Chat Mode
response_chat = rag_mode_chain.invoke({
    "mode": "chat",
    "question": "who is binisha?"
})
print("🗣 Chat Response:\n", response_chat)

# ✅ Summarize Mode
response_summary = rag_mode_chain.invoke({
    "mode": "summarize",
    "question": "Summarize the document"  # used only to retrieve context
})
print("\n🧾 Summary:\n", response_summary)

🗣 Chat Response:
 sorry, I don't know related to this topic

🧾 Summary:
 This document primarily discusses the use of microcontrollers (MCUs), particularly STMicroelectronics' products, in various applications.  The automotive industry is highlighted as a major driver of MCU development, demanding high performance and reliability in challenging conditions.  The document then contrasts current and future home applications of MCUs, ranging from consumer electronics to smart home technologies.  Finally, it includes a disclaimer regarding liability, intellectual property, and usage restrictions, along with a list of STMicroelectronics' global offices.


ValueError: RunnableBranch requires at least two branches