In [1]:
import os
from dotenv import load_dotenv
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader

In [2]:
# Load variables from a local .env file into the process environment, then
# read the LlamaParse key from it (None when the variable is not set).
load_dotenv()
LLAMA_API_KEY = os.environ.get("LLAMA_CLOUD_API_KEY")

In [3]:
# Natural-language instruction handed to LlamaParse (see load_pdf_as_markdown)
# describing how the parsed content should be laid out as markdown.
parsing_instruction = """
You are a document parser. Extract the content into clean, structured markdown format.
- Preserve headings, subheadings, paragraphs clearly.
- Convert tables into proper markdown table syntax.
- Represent images with markdown image syntax ![Description](image_placeholder).
- If image data is missing, describe the image briefly in place.
- Keep lists, bullet points, and code blocks formatted.
- Avoid extra line breaks or broken markdown syntax.
"""

# %%
# Step 5: Define a function to load and parse PDFs using LlamaParse

def load_pdf_as_markdown(pdf_folder: str):
    """
    Parse the contents of a folder into markdown documents.

    A LlamaParse client configured for markdown output, and driven by the
    module-level ``parsing_instruction``, is registered as the extractor for
    ``.pdf`` files; the folder is then read through SimpleDirectoryReader.

    Args:
        pdf_folder (str): Path to folder with PDFs.

    Returns:
        Whatever ``SimpleDirectoryReader.load_data()`` returns for the folder
        (the notebook uses each item's ``.text`` attribute).
    """
    # NOTE(review): only ".pdf" is routed to LlamaParse; how other file types
    # in the folder are handled depends on SimpleDirectoryReader - confirm.
    markdown_parser = LlamaParse(
        api_key=LLAMA_API_KEY,
        result_type="markdown",
        verbose=True,
        parsing_instruction=parsing_instruction,
    )
    extractors = {".pdf": markdown_parser}

    return SimpleDirectoryReader(
        input_dir=pdf_folder,
        file_extractor=extractors,
    ).load_data()

In [4]:

def save_docs_as_markdown(docs, save_folder="markdown"):
    """
    Write each parsed document to its own markdown file.

    Files are named ``doc_1.md``, ``doc_2.md``, ... following the order of
    ``docs``. The target folder is created when missing and existing files
    with the same name are overwritten.

    Args:
        docs (list): Objects exposing a ``.text`` attribute.
        save_folder (str): Folder the markdown files are written to.
    """
    os.makedirs(save_folder, exist_ok=True)

    for position, document in enumerate(docs, start=1):
        target_path = os.path.join(save_folder, f"doc_{position}.md")
        with open(target_path, "w", encoding="utf-8") as handle:
            handle.write(document.text)

    print(f"✅ Saved {len(docs)} markdown files to '{save_folder}' folder.")


In [5]:

pdf_folder_path = "../data"  # Your folder containing PDF files
docs = load_pdf_as_markdown(pdf_folder_path)

# Fail with a clear message instead of announcing success and then hitting an
# IndexError on docs[0] when nothing was parsed from the folder.
if not docs:
    raise ValueError(f"No documents were parsed from '{pdf_folder_path}'.")

print(f"✅ Successfully loaded and parsed {len(docs)} document(s).")

# Show the first 1000 characters of the first document as a sanity check.
print("\n--- Document 1 Preview (Markdown) ---\n")
print(docs[0].text[:1000])

# Save all parsed docs as markdown files in "markdown" folder
save_docs_as_markdown(docs)

Started parsing the file under job_id 9e6033f0-f03e-4b7a-a2ca-1d02fb1c1e2e
✅ Successfully loaded and parsed 12 document(s).

--- Document 1 Preview (Markdown) ---

# AN887
## APPLICATION NOTE

### MICROCONTROLLERS MADE EASY
#### by Microcontroller Division Applications

### WHAT IS A MICROCONTROLLER?
A few years ago, system control functions were implemented using logic components and were usually large, heavy boxes. Later on, microprocessors were used and the entire controller could fit onto a small circuit board. As the process of miniaturization continued, all of the components needed for a controller were built right onto one chip. By only including the features specific to the task, cost is relatively low.

A typical microcontroller has bit manipulation instructions, easy and direct access to I/O, and quick and efficient interrupt processing. Therefore, a microcontroller is a highly integrated device which includes, on one chip, all or most of the parts needed to perform an applic

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document


In [7]:
def load_markdown_docs(folder="markdown"):
    """
    Read every ``.md`` file in ``folder`` into a LangChain ``Document``.

    Entries are processed in the order produced by ``sorted(os.listdir(...))``
    (plain string ordering), and each document records its file name under the
    ``"source"`` metadata key.

    Args:
        folder (str): Directory containing the markdown files.

    Returns:
        list: One ``Document`` per markdown file.
    """
    markdown_names = [name for name in sorted(os.listdir(folder)) if name.endswith(".md")]

    loaded = []
    for markdown_name in markdown_names:
        with open(os.path.join(folder, markdown_name), "r", encoding="utf-8") as handle:
            content = handle.read()
        loaded.append(Document(page_content=content, metadata={"source": markdown_name}))
    return loaded

# Read back the markdown files from the default "markdown" folder and report
# how many were found.
markdown_docs = load_markdown_docs()
print(f"Loaded {len(markdown_docs)} markdown documents")

Loaded 12 markdown documents


In [8]:
# child_splitter (chunk_size=500, overlap=50) is used for the manual chunking
# below and is also passed to ParentDocumentRetriever as its child splitter.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# parent_splitter (chunk_size=1500, overlap=200) is passed to
# ParentDocumentRetriever as its parent splitter.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)


In [9]:
# Split every markdown document with the child splitter and wrap each piece in
# a Document tagged with its source file name and its index within that file.
chunked_docs = [
    Document(
        page_content=piece,
        metadata={"source": md_doc.metadata["source"], "chunk": piece_index},
    )
    for md_doc in markdown_docs
    for piece_index, piece in enumerate(child_splitter.split_text(md_doc.page_content))
]


In [10]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

In [11]:
# Sentence-transformers MiniLM model; it is passed to the Chroma store below
# as its embedding function.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
persist_directory = "../chroma_md"  # Folder to save vector DB

# Chroma collection "markdown_chunks" backed by persist_directory, embedding
# with embedding_model.
# NOTE(review): because the collection is persisted, re-running the cells that
# add documents presumably appends to what is already on disk - confirm, and
# consider clearing the folder between runs.
vectorstore = Chroma(
    collection_name="markdown_chunks",
    embedding_function=embedding_model,
    persist_directory=persist_directory
)

  vectorstore = Chroma(


In [13]:
# Add chunks to vector store
# NOTE(review): the ParentDocumentRetriever built later is given this same
# vectorstore and adds its own child chunks to it. The chunks inserted here
# only carry "source"/"chunk" metadata (no parent id), so they presumably
# compete with the retriever's chunks in similarity search without ever
# resolving to a parent document - confirm whether this manual insert is
# still wanted.
vectorstore.add_documents(chunked_docs)

['8790f183-b84f-4719-a79e-b6d493330e3a',
 'ac3bbcb2-81a7-4f86-82c1-54c1050c6c94',
 '0a384dab-a5c1-4679-b42a-5f34d74b76a8',
 'c89b3e70-487a-425a-b910-596d486cf64e',
 '6a91b78a-31cb-4bc2-bbab-329f01d582e5',
 'ff41f57f-156d-4035-8117-592532356aac',
 '2b86c6b1-0e4c-4101-b6df-113bd889fa9b',
 'c5823b8b-2201-4368-a7d0-8003d587a441',
 '309e8dfc-efaf-455a-b762-807a580d6068',
 '15c31661-6e34-4850-a64d-7253f81dafa0',
 '613d8110-75f3-4717-8a9c-0ea1032b7673',
 '534c1fec-996f-49d8-ac26-7bd4ccfb737e',
 '54825d07-cda1-4ed4-bb7c-88e3e43791d8',
 '3dfdee2c-e20d-4365-9370-5b059629add9',
 '5c12a50b-3a18-4845-b07a-5108d27a9b9c',
 'fc1536e2-cda9-46bc-96a5-4019cd5e124b',
 '40ac397b-74fc-44c7-80c9-94c28680dc96',
 'd72b7faa-9d40-4f13-9024-8fa7d4d85717',
 'bbf31a28-4425-4d74-9c8c-41095df12ff2',
 'f3225c22-9f73-4cfa-901b-80298fba9538',
 'b861ae97-33ca-49c9-a1e4-bffffabe4c4d',
 '11092d5e-a3ed-4b8c-b954-46c2435f6a9d',
 '89afcb3b-57df-4957-9791-b97d02eb040d',
 '2241ce00-fced-4581-8b2e-751abd02d8b2',
 '025a3552-06d0-

In [14]:
# Reports the length of the in-memory chunk list; it does not query the
# vector store for its actual size.
print(f"Stored {len(chunked_docs)} chunks in Chroma vector store.")

Stored 71 chunks in Chroma vector store.


In [15]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

In [16]:
# NOTE(review): this store lives only in kernel memory while the Chroma
# collection is persisted to disk, so child chunks persisted by an earlier
# session presumably have no matching parent here after a restart - confirm.
docstore = InMemoryStore()  # For storing full parent docs

In [17]:
# Wire the retriever to the shared Chroma vectorstore and the in-memory
# docstore, using the child and parent splitters defined earlier in the
# notebook.
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=docstore,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)

In [18]:
# Feed the full markdown documents to the retriever, which was configured with
# both the parent and the child splitter.
retriever.add_documents(markdown_docs)  # Add full markdown docs as parents
print("✅ ParentDocumentRetriever is ready.")

✅ ParentDocumentRetriever is ready.


In [19]:
from langchain_core.prompts import PromptTemplate

In [20]:

# 1. Chat Prompt
# Template inputs: context, question.
chat_prompt = PromptTemplate.from_template("""
You are a helpful assistant. Use the context below to answer the question.
If the answer is not found, say so.

Context:
{context}

Question:
{question}
""")

# 2. Summarization Prompt
# Template input: context.
summarize_prompt = PromptTemplate.from_template("""
You are an expert summarizer. Your task is to read the provided document context and generate a comprehensive, well-structured summary that captures all key points, main ideas, and important details.

Instructions:
- Organize the summary with clear headings and bullet points where appropriate.
- Highlight major sections, concepts, and any lists or processes described in the context.
- Use concise language, but do not omit critical information.
- If the context includes tables, describe their content in summary form.
- If there are images or diagrams referenced, briefly mention their purpose or content.
- The summary should be easy to read and suitable for someone who needs a quick but thorough understanding of the document.

Context:
{context}

Summary:
""")

# 3. Quiz Prompt
# Template inputs: num_questions, context.
quiz_prompt = PromptTemplate.from_template("""
You are a quiz generator. Create {num_questions} MCQs from the context below.

Context:
{context}
""")


In [21]:
from dotenv import load_dotenv
# load_dotenv() was already called near the top of the notebook; this second
# call repeats it before the Groq client is created.
load_dotenv()  # Load environment variables from .env file

True

In [22]:
from langchain_groq import ChatGroq

In [23]:
# Chat model shared by all three chains.
# NOTE(review): no API key is passed explicitly; presumably ChatGroq reads it
# from the environment populated by load_dotenv() - confirm.
llm = ChatGroq(model_name="llama-3.1-8b-instant", temperature=0.2)

In [24]:
from langchain_core.output_parsers import StrOutputParser

In [25]:
# Final step of every chain below; placed after the llm step.
parser = StrOutputParser()

In [26]:
# Building a chain for our RAG system

In [27]:
from langchain.schema.runnable import RunnableSequence, RunnableLambda, RunnablePassthrough , RunnableBranch

In [28]:
# ✅ Retrieve context using question
# Adds the documents retrieved for the request's "question" (empty string when
# absent) under the "context" key and passes every other input key through.
# `retriever.invoke` replaces the deprecated `get_relevant_documents`, which
# triggers a LangChain deprecation warning.
# Because `**x` is unpacked last, a caller-supplied "context" key takes
# precedence over the retrieved documents.
context_retriever = RunnableLambda(
    lambda x: {"context": retriever.invoke(x.get("question", "")), **x}
)

# ✅ Formatters
def _join_context(payload):
    """Join the page content of the documents under "context" with blank lines."""
    return "\n\n".join(doc.page_content for doc in payload.get("context", []))


# Each formatter reduces the chain input to exactly the variables its prompt
# template expects.
chat_formatter = RunnableLambda(
    lambda x: {"context": _join_context(x), "question": x.get("question", "")}
)

summarize_formatter = RunnableLambda(lambda x: {"context": _join_context(x)})

# "num_questions" falls back to 5 when the caller does not provide it.
quiz_formatter = RunnableLambda(
    lambda x: {"context": _join_context(x), "num_questions": x.get("num_questions", 5)}
)

In [29]:

def _build_chain(formatter, prompt):
    """Assemble retrieve -> format -> prompt -> llm -> parse for one mode."""
    return RunnableSequence(context_retriever, formatter, prompt, llm, parser)


# The three chains differ only in the formatter/prompt pair they use.
chat_chain = _build_chain(chat_formatter, chat_prompt)

summarize_chain = _build_chain(summarize_formatter, summarize_prompt)

quiz_chain = _build_chain(quiz_formatter, quiz_prompt)

In [None]:

# ✅ SAFE conditions
def _normalized_mode(x):
    """Return the request's "mode" as a trimmed, lower-cased string ("" if unset)."""
    # `or ""` covers a missing key and an explicit None; str() covers
    # non-string values, so the predicates below never raise on a malformed
    # mode and simply report "no match".
    return str(x.get("mode") or "").strip().lower()


def is_chat_mode(x):
    """Return True when the request dict selects the "chat" mode."""
    return _normalized_mode(x) == "chat"


def is_summarize_mode(x):
    """Return True when the request dict selects the "summarize" mode."""
    return _normalized_mode(x) == "summarize"


def is_quiz_mode(x):
    """Return True when the request dict selects the "quiz" mode."""
    return _normalized_mode(x) == "quiz"




# Route the request to the chain matching its "mode"; the predicates are
# listed in the order chat, summarize, quiz. When none matches, the final
# entry returns a fixed error string instead of calling any chain.
rag_mode_chain = RunnableBranch(
    (is_chat_mode, chat_chain),
    (is_summarize_mode, summarize_chain),
    (is_quiz_mode, quiz_chain),
    RunnableLambda(lambda x: " Invalid mode selected. Choose 'chat', 'summarize', or 'quiz'.")
)

In [31]:
# CHAT
chat_request = {
    "mode": "chat",
    "question": "What is a microcontroller?",
}
response_chat = rag_mode_chain.invoke(chat_request)
print("🗣️ Chat Response:\n", response_chat)

# SUMMARIZE
summary_request = {
    "mode": "summarize",
    "question": "Summarize the document",  # used only to retrieve relevant context
}
response_summary = rag_mode_chain.invoke(summary_request)
print("\n🧾 Summary:\n", response_summary)

# QUIZ (dynamic question count)
quiz_request = {
    "mode": "quiz",
    "question": "Generate quiz",
    "num_questions": 3,  # user-controlled question count
}
response_quiz = rag_mode_chain.invoke(quiz_request)
print("\n🧠 Quiz:\n", response_quiz)


  lambda x: {"context": retriever.get_relevant_documents(x.get("question", "")), **x}


🗣️ Chat Response:
 A microcontroller is a highly integrated device which includes, on one chip, all or most of the parts needed to perform an application control function. It typically has bit manipulation instructions, easy and direct access to I/O, and quick and efficient interrupt processing.

🧾 Summary:
 **Microcontrollers Made Easy: Communication and Automotive Market**

**2.4 Communication: CAN & J1850**

* **CAN (Controller Area Network)**: A multiplexed wiring scheme developed by BOSH and Intel for automotive applications.
* **J1850**: The SAE multiplexed automotive wiring standard used in North America.
* **CAN Specification**: Widely used in industrial control in North America and Europe.
* **Lower Cost Microcontrollers**: Supporting CAN has the potential to increase its adoption.

**CAN Principle**

* The table illustrates the different systems and their corresponding speeds:
	+ **Fast Speed**: ABS/ASR, >125Kb/s
	+ **Slow Speed**: >125Kb/s
	+ **Inter System**: Motor, Dashboa