In [1]:
import os
from dotenv import load_dotenv
from llama_parse import LlamaParse
from langchain_core.documents import Document

In [2]:
load_dotenv()
LLAMA_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")

In [3]:
parsing_instruction = """
You are a document parser. Extract the content into clean, structured markdown format.
- Preserve headings, subheadings, paragraphs clearly.
- Convert tables into proper markdown table syntax.
- Represent images with markdown image syntax ![Description](image_placeholder).
- If image data is missing, describe the image briefly in place.
- Keep lists, bullet points, and code blocks formatted.
- Avoid extra line breaks or broken markdown syntax.
"""

def parse_single_pdf(pdf_path: str) -> list:
    parser = LlamaParse(
        api_key=LLAMA_API_KEY,
        result_type="markdown",
        verbose=True,
        parsing_instruction=parsing_instruction,
    )
    return parser.load_data(pdf_path)

def save_markdown(docs, folder="markdown"):
    os.makedirs(folder, exist_ok=True)
    for i, doc in enumerate(docs):
        with open(os.path.join(folder, f"doc_{i+1}.md"), "w", encoding="utf-8") as f:
            f.write(doc.text)
    print(f" Saved {len(docs)} markdown files to '{folder}'")

pdf_path = "sample.pdf"  #  Provide full path to single PDF
docs = parse_single_pdf(pdf_path)
save_markdown(docs)

Error while parsing the file 'sample.pdf': [Errno 2] No such file or directory: 'd:/ML(ExtraClass Project)/RAG_PROJECT/PDF-AI-Assistant-Chat-Summarize-Quiz-from-PDF/notebooks/sample.pdf'
 Saved 0 markdown files to 'markdown'


In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document as LCDocument

In [5]:
# Load markdown as LangChain documents
def load_markdown_docs(folder="markdown"):
    loaded_docs = []
    for filename in os.listdir(folder):
        if filename.endswith(".md"):
            with open(os.path.join(folder, filename), "r", encoding="utf-8") as f:
                text = f.read()
            loaded_docs.append(LCDocument(page_content=text, metadata={"source": filename}))
    return loaded_docs

markdown_docs = load_markdown_docs()
print(f" Loaded {len(markdown_docs)} markdown documents")

 Loaded 12 markdown documents


In [6]:
child_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)

chunked_docs = []
for doc in markdown_docs:
    for i, chunk in enumerate(child_splitter.split_text(doc.page_content)):
        chunked_docs.append(LCDocument(
            page_content=chunk,
            metadata={"source": doc.metadata["source"], "chunk": i}
        ))

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
persist_directory = "chroma_db"

vectorstore = Chroma(
    collection_name="md_chunks",
    embedding_function=embedding_model,
    persist_directory=persist_directory
)

vectorstore.add_documents(chunked_docs)
print(f"Stored {len(chunked_docs)} chunks in Chroma vector store.")


  from .autonotebook import tqdm as notebook_tqdm
  vectorstore = Chroma(


Stored 71 chunks in Chroma vector store.


In [8]:

from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

docstore = InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=docstore,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)
retriever.add_documents(markdown_docs)
print(" ParentDocumentRetriever ready.")

 ParentDocumentRetriever ready.


In [9]:
from langchain_core.prompts import PromptTemplate

chat_prompt = PromptTemplate.from_template("""
You are a helpful assistant. Use the context below to answer the question.
If the answer is not found, say so.

Context:
{context}

Question:
{question}
""")

summarize_prompt = PromptTemplate.from_template("""
You are an expert summarizer. Read the context and summarize key points, headings, and lists.

Context:
{context}

Summary:
""")

quiz_prompt = PromptTemplate.from_template("""
You are a quiz generator. Create {num_questions} MCQs from the context.

Context:
{context}
""")


In [10]:
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser

llm = ChatGroq(model_name="llama-3.1-8b-instant", temperature=0.2)
parser = StrOutputParser()

In [11]:
from langchain.schema.runnable import RunnableSequence, RunnableLambda, RunnableBranch

In [12]:
# Formatters
context_retriever = RunnableLambda(lambda x: {"context": retriever.get_relevant_documents(x.get("question", "")), **x})

chat_formatter = RunnableLambda(lambda x: {
    "context": "\n\n".join(doc.page_content for doc in x.get("context", [])),
    "question": x.get("question", "")
})

summarize_formatter = RunnableLambda(lambda x: {
    "context": "\n\n".join(doc.page_content for doc in x.get("context", []))
})

quiz_formatter = RunnableLambda(lambda x: {
    "context": "\n\n".join(doc.page_content for doc in x.get("context", [])),
    "num_questions": x.get("num_questions", 5)
})


In [13]:
# Chains
chat_chain = RunnableSequence(context_retriever, chat_formatter, chat_prompt, llm, parser)
summarize_chain = RunnableSequence(context_retriever, summarize_formatter, summarize_prompt, llm, parser)
quiz_chain = RunnableSequence(context_retriever, quiz_formatter, quiz_prompt, llm, parser)


In [14]:
def is_chat(x): return x.get("mode") == "chat"
def is_summary(x): return x.get("mode") == "summarize"
def is_quiz(x): return x.get("mode") == "quiz"

In [15]:
rag_chain = RunnableBranch(
    (is_chat, chat_chain),
    (is_summary, summarize_chain),
    (is_quiz, quiz_chain),
    RunnableLambda(lambda _: " Invalid mode. Choose 'chat', 'summarize', or 'quiz'.")
)


In [16]:
chat_response = rag_chain.invoke({
    "mode": "chat",
    "question": "What is a microcontroller?"
})
print("🗣️ Chat:", chat_response)

summary_response = rag_chain.invoke({
    "mode": "summarize",
    "question": "Summarize this"  # for retrieval
})
print("\n🧾 Summary:", summary_response)

quiz_response = rag_chain.invoke({
    "mode": "quiz",
    "question": "Generate quiz",
    "num_questions": 3
})
print("\n🧠 Quiz:\n", quiz_response)

  context_retriever = RunnableLambda(lambda x: {"context": retriever.get_relevant_documents(x.get("question", "")), **x})


🗣️ Chat: A microcontroller is a highly integrated device which includes, on one chip, all or most of the parts needed to perform an application control function. It typically has bit manipulation instructions, easy and direct access to I/O, and quick and efficient interrupt processing.

🧾 Summary: **Summary of Key Points and Headings**

**Microcontrollers Made Easy**

The article discusses the components and organization of microcontrollers, including:

1. **Memory Types**:
	* Flash: electrically erasable and programmable memory
	* RAM (Random Access Memory): stores data temporarily during program execution
	* EEPROM (Electrically Erasable Programmable Read Only Memory): stores data that must be saved through a power down cycle
2. **CPU (Central Processing Unit)**: the brain of the system that processes data and executes instructions
3. **Communication**:
	* CAN (Controller Area Network) and J1850: multiplexed wiring schemes for automotive and industrial control applications
	* CAN spe