In [1]:
from dotenv import load_dotenv
import os

In [3]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Prefer the project-specific "llamaParse" variable; otherwise fall back to
# LLAMA_CLOUD_API_KEY, the variable name the LlamaCloud SDK documents.
# Failing here gives a clear message instead of an obscure authentication
# error later, when the client is first used.
mykey = os.getenv("llamaParse") or os.getenv("LLAMA_CLOUD_API_KEY")
if not mykey:
    raise RuntimeError(
        "No LlamaCloud API key found: set 'llamaParse' (or LLAMA_CLOUD_API_KEY) "
        "in the environment or in a .env file."
    )


In [28]:
from llama_cloud_services import LlamaExtract
from pydantic import BaseModel, Field
from typing import List

# Build the LlamaExtract client, authenticating with the key held in `mykey`
extractor = LlamaExtract(
    api_key=mykey,
)

# Schema for one extracted question. It is nested inside QuestionSet, which is
# what gets passed to the extraction agent as its data schema.
# (Documented with '#' comments on purpose: a class docstring would become
# part of the pydantic JSON schema.)
class Question(BaseModel):
    # Wording of the question itself
    question_text: str = Field(
        description="The text of the question",
    )
    # Answer choices, including their option identifiers
    options: List[str] = Field(
        description="The answer choices for the question with the option_id (empty if not MCQ)",
    )
    # Correct answer for the question
    answer: str = Field(
        description="The correct answer, sometimes the answers are in a table format so map it to the respective question (eg. A. Processor)",
    )
    # Department the question belongs to
    department: str = Field(
        description="The department that will use the question",
    )
    # Kind of question
    question_type: str = Field(
        description="Type of question, e.g. 'MCQ', 'Fill-in', 'True/False'",
    )

# Top-level schema: a container holding every Question extracted from a document
class QuestionSet(BaseModel):
    questions: List[Question] = Field(
        description="A list of extracted questions",
    )



In [None]:
from llama_cloud.types import ExtractConfig, ExtractMode

# Extraction settings: multimodal extraction mode with reasoning switched on
config = ExtractConfig(
    extraction_mode=ExtractMode.MULTIMODAL,
    use_reasoning=True,
)

In [None]:
# Get or create the extraction agent.
# create_agent() is called with a fixed name, so running this cell a second
# time would try to register the same name again. Look for an existing agent
# with that name first and only create one when none is found.
AGENT_NAME = "question-parser_v1"
agent = next(
    (existing for existing in extractor.list_agents() if existing.name == AGENT_NAME),
    None,
)
if agent is None:
    agent = extractor.create_agent(
        name=AGENT_NAME,
        config=config,
        data_schema=QuestionSet,
    )
# NOTE(review): a reused agent keeps whatever schema/config it was created
# with. If QuestionSet or config changed since then, delete the old agent or
# pick a new AGENT_NAME so the changes take effect.



In [25]:
# Alternative agent setup (disabled): same QuestionSet schema, plus an explicit extraction instruction
# agent = extractor.create_agent(
#     name="question-parser_v1",
#     data_schema=QuestionSet,  # Changed from Question to QuestionSet
#     extraction_instruction="""
#     Extract ALL questions from the ENTIRE document. 
#     Process every page thoroughly and don't stop after finding a few questions.
#     Look for all types of questions: MCQ, fill-in-the-blank, true/false, short answer.
#     Include all answer choices and map answers to questions.
#     """
# )

In [None]:
# Run the extraction agent on CS_Questions.pdf and print the structured result
pdf_path = "./CS_Questions.pdf"
result = agent.extract(pdf_path)
print(result.data)

Uploading files: 100%|██████████| 1/1 [00:06<00:00,  6.62s/it]
Uploading files: 100%|██████████| 1/1 [00:06<00:00,  6.62s/it]]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  1.17it/s]
Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00,  1.17it/s]
Extracting files: 100%|██████████| 1/1 [02:35<00:00, 155.66s/it]

{'questions': [{'question_text': 'UNIVAC is', 'options': ['A) Universal Automatic Computer', 'B) Universal Array Computer', 'C) Unique Automatic Computer', 'D) Unvalued Automatic Computer'], 'answer': 'A) Universal Automatic Computer', 'department': '', 'question_type': 'MCQ'}, {'question_text': 'The basic operations performed by a computer are', 'options': ['A) Arithmetic operation', 'B) Logical operation', 'C) Storage and relative', 'D) All the above'], 'answer': 'D) All the above', 'department': '', 'question_type': 'MCQ'}, {'question_text': 'The two major types of computer chips are', 'options': ['A) External memory chip', 'B) Primary memory chip', 'C) Microprocessor chip', 'D) Both b and c'], 'answer': 'D) Both b and c', 'department': '', 'question_type': 'MCQ'}, {'question_text': 'Microprocessors as switching devices are for which generation computers', 'options': ['A) First Generation', 'B) Second Generation', 'C) Third Generation', 'D) Fourth Generation'], 'answer': 'D) Fourth 




Uploading files: 100%|██████████| 1/1 [00:06<00:00,  6.59s/it]

Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.71s/it]

Extracting files: 100%|██████████| 1/1 [00:38<00:00, 38.05s/it]



In [32]:
# Display the `data` attribute of the current `result` as the cell output
result.data

{'questions': [{'question_text': 'UNIVAC is',
   'options': ['A) Universal Automatic Computer',
    'B) Universal Array Computer',
    'C) Unique Automatic Computer',
    'D) Unvalued Automatic Computer'],
   'answer': 'A) Universal Automatic Computer',
   'department': '',
   'question_type': 'MCQ'},
  {'question_text': 'The basic operations performed by a computer are',
   'options': ['A) Arithmetic operation',
    'B) Logical operation',
    'C) Storage and relative',
    'D) All the above'],
   'answer': 'D) All the above',
   'department': '',
   'question_type': 'MCQ'},
  {'question_text': 'The two major types of computer chips are',
   'options': ['A) External memory chip',
    'B) Primary memory chip',
    'C) Microprocessor chip',
    'D) Both b and c'],
   'answer': 'D) Both b and c',
   'department': '',
   'question_type': 'MCQ'},
  {'question_text': 'Microprocessors as switching devices are for which generation computers',
   'options': ['A) First Generation',
    'B) Seco

In [33]:
# Run the extraction agent on biolset2.pdf; `result` now refers to this document
pdf_path = "./biolset2.pdf"
result = agent.extract(pdf_path)

# Print the structured output for a quick look
print(result.data)

{'questions': [{'question_text': 'The adult human of average age and size has approximately how many quarts of blood?', 'options': ['4', '6', '8', '10'], 'answer': '6', 'department': 'Biology', 'question_type': 'MCQ'}, {'question_text': 'Once the erythrocytes enter the blood in humans, it is estimated that they have an average lifetime of how many days?', 'options': ['10 days', '120 days', '200 days', '360 days'], 'answer': '120 days', 'department': 'Biology', 'question_type': 'MCQ'}, {'question_text': 'Which mechanisms are important in the death of erythrocytes in human blood?', 'options': ['phagocytosis', 'hemolysis', 'mechanical damage', 'all of the above'], 'answer': 'all of the above', 'department': 'Biology', 'question_type': 'MCQ'}, {'question_text': 'Surplus red blood cells, needed to meet an emergency, are MAINLY stored in what organ of the human body?', 'options': ['pancreas', 'spleen', 'liver', 'kidneys'], 'answer': 'spleen', 'department': 'Biology', 'question_type': 'MCQ'},

In [34]:
# Display the `data` attribute of the current `result` as the cell output
result.data

{'questions': [{'question_text': 'The adult human of average age and size has approximately how many quarts of blood?',
   'options': ['4', '6', '8', '10'],
   'answer': '6',
   'department': 'Biology',
   'question_type': 'MCQ'},
  {'question_text': 'Once the erythrocytes enter the blood in humans, it is estimated that they have an average lifetime of how many days?',
   'options': ['10 days', '120 days', '200 days', '360 days'],
   'answer': '120 days',
   'department': 'Biology',
   'question_type': 'MCQ'},
  {'question_text': 'Which mechanisms are important in the death of erythrocytes in human blood?',
   'options': ['phagocytosis',
    'hemolysis',
    'mechanical damage',
    'all of the above'],
   'answer': 'all of the above',
   'department': 'Biology',
   'question_type': 'MCQ'},
  {'question_text': 'Surplus red blood cells, needed to meet an emergency, are MAINLY stored in what organ of the human body?',
   'options': ['pancreas', 'spleen', 'liver', 'kidneys'],
   'answer':

In [8]:
from collections import defaultdict

# Give every extracted question a sequential, per-department ID and a
# display string. The question dicts inside result.data are updated in place,
# so later cells that read result.data see the added keys.
dept_counters = defaultdict(int)  # running count of questions per department prefix

for q in result.data['questions']:
    # Three-letter prefix from the department name (e.g. "Biology" -> "BIO").
    # The extractor can return an empty department; fall back to "UNK" so the
    # ID never degenerates to "-001". Surrounding whitespace is stripped so
    # " Biology" and "Biology" share one prefix.
    dept_prefix = (q.get("department") or "").strip()[:3].upper() or "UNK"

    # Next number in this prefix's sequence
    dept_counters[dept_prefix] += 1

    # question_id looks like BIO-001
    q["question_id"] = f"{dept_prefix}-{dept_counters[dept_prefix]:03d}"

    # public_text is the question, followed by its options when there are any
    public_text = q["question_text"]
    options = q.get("options") or []  # tolerate a missing or null options list
    if options:
        public_text += " Options: " + ", ".join(options)
    q["public_text"] = public_text

In [9]:
# Display result.data again, now including the added question_id / public_text keys
result.data

{'questions': [{'question_text': 'The adult human of average age and size has approximately how many quarts of blood?',
   'options': ['4', '6', '8', '10'],
   'answer': '6',
   'department': 'Biology',
   'question_type': 'MCQ',
   'question_id': 'BIO-001',
   'public_text': 'The adult human of average age and size has approximately how many quarts of blood? Options: 4, 6, 8, 10'},
  {'question_text': 'Once the erythrocytes enter the blood in humans, it is estimated that they have an average lifetime of how many days.',
   'options': ['10 days', '120 days', '200 days', '360 days'],
   'answer': '120 days',
   'department': 'Biology',
   'question_type': 'MCQ',
   'question_id': 'BIO-002',
   'public_text': 'Once the erythrocytes enter the blood in humans, it is estimated that they have an average lifetime of how many days. Options: 10 days, 120 days, 200 days, 360 days'},
  {'question_text': 'Which mechanisms are important in the death of erythrocytes in human blood?',
   'options': [

In [None]:
import json

# Write the current result.data to biolset2.txt as JSON indented by 4 spaces
with open("biolset2.txt", "w") as out_file:
    out_file.write(json.dumps(result.data, indent=4))

print("Structured data saved to biolset2.txt")

Structured data saved to biolset2.txt


In [None]:
# Display the `data` attribute of the current `result` as the cell output
result.data

{'questions': [{'question_text': 'The adult human of average age and size has approximately how many quarts of blood?',
   'options': ['4', '6', '8', '10'],
   'answer': '6',
   'department': 'Biology',
   'question_type': 'MCQ',
   'question_id': 'BIO-001',
   'public_text': 'The adult human of average age and size has approximately how many quarts of blood? Options: 4, 6, 8, 10'},
  {'question_text': 'Once the erythrocytes enter the blood in humans, it is estimated that they have an average lifetime of how many days?',
   'options': ['10 days', '120 days', '200 days', '360 days'],
   'answer': '120 days',
   'department': 'Biology',
   'question_type': 'MCQ',
   'question_id': 'BIO-002',
   'public_text': 'Once the erythrocytes enter the blood in humans, it is estimated that they have an average lifetime of how many days? Options: 10 days, 120 days, 200 days, 360 days'},
  {'question_text': 'Which mechanisms are important in the death of erythrocytes in human blood?',
   'options': [

In [10]:
from pymongo import MongoClient
from collections import defaultdict

# 1. Connect to MongoDB
# MongoClient connects lazily: constructing it, or indexing a database or
# collection, never contacts the server. Ping first so the success messages
# below are only printed once a server has actually answered.
client = MongoClient(
    "mongodb://localhost:27017/",  # or your Atlas URI
    serverSelectionTimeoutMS=5000,  # give up after 5 s if no server is reachable
)
client.admin.command("ping")  # raises if the server cannot be reached
print("connection successful 0")
db = client["quiz_db"]
print("connection successful 1")
collection = db["questions"]
print("connection successful2")

connection successful 0
connection successful 1
connection successful2


In [11]:

# 2. Rebuild the per-prefix counters from what is already stored, so new
#    question_ids continue each sequence instead of restarting at 001.
#    An empty collection yields no documents, so it needs no special case.
dept_counters = defaultdict(int)
for stored in collection.find({}, {"question_id": 1}):
    # IDs look like 'BIO-001': text before the last '-' is the prefix,
    # text after it is the sequence number.
    prefix, sep, number = str(stored.get("question_id", "")).rpartition("-")
    if not sep:
        continue  # no '-' at all: not an ID in the expected format
    try:
        dept_counters[prefix] = max(dept_counters[prefix], int(number))
    except ValueError:
        # Non-numeric suffix: not an ID in the expected format, ignore it
        continue


# 3. Process and insert each question
for q in result.data['questions']:
    # Normalize department name
    dept_clean = (q.get("department") or "").strip().title()

    # Three-letter prefix. The extractor can return an empty department, so
    # fall back to "UNK" rather than producing IDs like "-001".
    dept_prefix = dept_clean[:3].upper() or "UNK"

    # Skip questions that are already stored, so re-running this cell does not
    # insert the same question again under a fresh ID.
    already_stored = collection.find_one(
        {"department": dept_clean, "question_text": q["question_text"]},
        {"_id": 1},
    )
    if already_stored is not None:
        print(f"Skipped (already stored) → {q['question_text']}")
        continue

    # Increment the counter only for questions that will actually be inserted
    dept_counters[dept_prefix] += 1

    # Build question_id (e.g., BIO-001)
    question_id = f"{dept_prefix}-{dept_counters[dept_prefix]:03d}"

    # Build public_text: the question plus its options, when there are any
    options = q.get("options") or []  # tolerate a missing or null options list
    public_text = q["question_text"]
    if options:
        public_text += " Options: " + ", ".join(options)

    # Final document
    doc = {
        "question_id": question_id,
        "question_text": q["question_text"],
        "options": options,
        "answer": q["answer"],
        "department": dept_clean,
        "question_type": q["question_type"],
        "public_text": public_text
    }

    # Insert into MongoDB
    collection.insert_one(doc)
    print(f"Inserted {question_id} → {q['question_text']}")

Inserted BIO-001 → The adult human of average age and size has approximately how many quarts of blood?
Inserted BIO-002 → Once the erythrocytes enter the blood in humans, it is estimated that they have an average lifetime of how many days.
Inserted BIO-003 → Which mechanisms are important in the death of erythrocytes in human blood?
Inserted BIO-004 → Surplus red blood cells, needed to meet an emergency, are MAINLY stored in what organ of the human body?
Inserted BIO-005 → When a human donor gives a pint of blood, it usually requires how many weeks for the body RESERVE of red corpuscles to be replaced?
Inserted BIO-006 → There are three substances found in human blood which carry oxygen and which begin with the letter "H". Name two of these substances.
Inserted BIO-007 → The several types of white blood cells are sometimes collectively referred to as:
Inserted BIO-008 → The condition in which there is a DECREASE in the number of white blood cells in humans is known as:
Inserted BIO-009

In [12]:
from pymongo import MongoClient

# Health check: open a client with a 5-second server-selection timeout and
# print the server's reply to the "ping" admin command.
# Note: this rebinds `client`.
client = MongoClient("mongodb://127.0.0.1:27017", serverSelectionTimeoutMS=5000)
ping_reply = client.admin.command("ping")
print(ping_reply)


{'ok': 1.0}


In [None]:
# Run the extraction agent on /content/chemistry.pdf; `result` now refers to this document
pdf_path = "/content/chemistry.pdf"
result = agent.extract(pdf_path)

# Print the structured output for a quick look
print(result.data)

Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.60s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.80s/it]
Extracting files: 100%|██████████| 1/1 [01:05<00:00, 65.06s/it]

{'questions': [{'question_text': 'Which of following is not a Colligative property?', 'options': ['Relative lowering of vapour pressure', 'Elevation in boiling point', 'Osmotic pressure', 'Atmospheric pressure'], 'answer': 'Atmospheric pressure', 'department': 'Chemistry', 'question_type': 'MCQ'}, {'question_text': 'If solute do not undergo neither association nor dissociation, its Van’t Hoff factor (i) will be', 'options': ['less than unity', 'greater than unity', 'unity', 'zero'], 'answer': 'unity', 'department': 'Chemistry', 'question_type': 'MCQ'}, {'question_text': 'Who proposed the law of independent migration of ions of an electrolyte?', 'options': ['Raoult', 'Van’t Hoff', 'Kohlrausch', 'Hess'], 'answer': 'Kohlrausch', 'department': 'Chemistry', 'question_type': 'MCQ'}, {'question_text': 'Units for rate constant (k) for all pseudo first order reactions is', 'options': ['mol⁻¹ L s⁻¹', 'mol⁻¹ L⁻¹ s⁻¹', 'mol L⁻¹ s⁻¹', 's⁻¹'], 'answer': 's⁻¹', 'department': 'Chemistry', 'question_ty




In [None]:
# Display the `data` attribute of the current `result` as the cell output
result.data

{'questions': [{'question_text': 'Which of following is not a Colligative property?',
   'options': ['Relative lowering of vapour pressure',
    'Elevation in boiling point',
    'Osmotic pressure',
    'Atmospheric pressure'],
   'answer': 'Atmospheric pressure',
   'department': 'Chemistry',
   'question_type': 'MCQ'},
  {'question_text': 'If solute do not undergo neither association nor dissociation, its Van’t Hoff factor (i) will be',
   'options': ['less than unity', 'greater than unity', 'unity', 'zero'],
   'answer': 'unity',
   'department': 'Chemistry',
   'question_type': 'MCQ'},
  {'question_text': 'Who proposed the law of independent migration of ions of an electrolyte?',
   'options': ['Raoult', 'Van’t Hoff', 'Kohlrausch', 'Hess'],
   'answer': 'Kohlrausch',
   'department': 'Chemistry',
   'question_type': 'MCQ'},
  {'question_text': 'Units for rate constant (k) for all pseudo first order reactions is',
   'options': ['mol⁻¹ L s⁻¹', 'mol⁻¹ L⁻¹ s⁻¹', 'mol L⁻¹ s⁻¹', 's⁻¹'],

In [None]:
from collections import defaultdict

# Give every extracted question a sequential, per-department ID and a
# display string. The question dicts inside result.data are updated in place,
# so later cells that read result.data see the added keys.
dept_counters = defaultdict(int)  # running count of questions per department prefix

for q in result.data['questions']:
    # Three-letter prefix from the department name (e.g. "Chemistry" -> "CHE").
    # Fall back to "UNK" when the department is empty so the ID never
    # degenerates to "-001". Surrounding whitespace is stripped first.
    dept_prefix = (q.get("department") or "").strip()[:3].upper() or "UNK"

    # Next number in this prefix's sequence
    dept_counters[dept_prefix] += 1

    # question_id looks like CHE-001
    q["question_id"] = f"{dept_prefix}-{dept_counters[dept_prefix]:03d}"

    # public_text is the question, followed by its options when there are any
    public_text = q["question_text"]
    options = q.get("options") or []  # tolerate a missing or null options list
    if options:
        public_text += " Options: " + ", ".join(options)
    q["public_text"] = public_text

In [None]:
# Display result.data again, now including the added question_id / public_text keys
result.data

{'questions': [{'question_text': 'Which of following is not a Colligative property?',
   'options': ['Relative lowering of vapour pressure',
    'Elevation in boiling point',
    'Osmotic pressure',
    'Atmospheric pressure'],
   'answer': 'Atmospheric pressure',
   'department': 'Chemistry',
   'question_type': 'MCQ',
   'question_id': 'CHE-001',
   'public_text': 'Which of following is not a Colligative property? Options: Relative lowering of vapour pressure, Elevation in boiling point, Osmotic pressure, Atmospheric pressure'},
  {'question_text': 'If solute do not undergo neither association nor dissociation, its Van’t Hoff factor (i) will be',
   'options': ['less than unity', 'greater than unity', 'unity', 'zero'],
   'answer': 'unity',
   'department': 'Chemistry',
   'question_type': 'MCQ',
   'question_id': 'CHE-002',
   'public_text': 'If solute do not undergo neither association nor dissociation, its Van’t Hoff factor (i) will be Options: less than unity, greater than unity,

In [None]:
# Run the extraction agent on /content/chemistry-6-53.pdf; `result` now refers to this document
pdf_path = "/content/chemistry-6-53.pdf"
result = agent.extract(pdf_path)

# Print the structured output for a quick look
print(result.data)

Uploading files: 100%|██████████| 1/1 [00:01<00:00,  1.89s/it]
Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00,  1.40s/it]
Extracting files: 100%|██████████| 1/1 [01:17<00:00, 77.82s/it]

{'questions': [{'question_text': 'Which of the following aqueous solutions would have the highest boiling point?', 'options': ['(a) 1.0 M NaOH', '(b) 1.0 M Na₂SO₄', '(c) 1.0 M NH₄NO₃', '(d) 1.0 M KNO₃'], 'answer': '(b) 1.0 M Na₂SO₄', 'department': 'Chemistry', 'question_type': 'MCQ'}, {'question_text': 'The set with the correct order of acidity is:', 'options': ['(a) HClO < HClO₂ < HClO₃ < HClO₄', '(b) HClO₄ < HClO₃ < HClO₂ < HClO', '(c) HClO₄ < HClO₃, HClO, HClO₂', '(d) HClO₃ < HClO₂ < HClO < HClO₄'], 'answer': '(a) HClO < HClO₂ < HClO₃ < HClO₄', 'department': 'Chemistry', 'question_type': 'MCQ'}, {'question_text': 'Each polypeptide in a protein has amino acids linked with each other in a specific sequence. This sequence of amino acids is said to be:', 'options': ['(a) Secondary structure of Protein', '(b) Tertiary structure of Protein', '(c) Primary structure of Protein', '(d) Quaternary structure of Protein'], 'answer': '(c) Primary structure of Protein', 'department': 'Chemistry', 




In [None]:
# Display the `data` attribute of the current `result` as the cell output
result.data

{'questions': [{'question_text': 'Which of the following aqueous solutions would have the highest boiling point?',
   'options': ['(a) 1.0 M NaOH',
    '(b) 1.0 M Na₂SO₄',
    '(c) 1.0 M NH₄NO₃',
    '(d) 1.0 M KNO₃'],
   'answer': '(b) 1.0 M Na₂SO₄',
   'department': 'Chemistry',
   'question_type': 'MCQ'},
  {'question_text': 'The set with the correct order of acidity is:',
   'options': ['(a) HClO < HClO₂ < HClO₃ < HClO₄',
    '(b) HClO₄ < HClO₃ < HClO₂ < HClO',
    '(c) HClO₄ < HClO₃, HClO, HClO₂',
    '(d) HClO₃ < HClO₂ < HClO < HClO₄'],
   'answer': '(a) HClO < HClO₂ < HClO₃ < HClO₄',
   'department': 'Chemistry',
   'question_type': 'MCQ'},
  {'question_text': 'Each polypeptide in a protein has amino acids linked with each other in a specific sequence. This sequence of amino acids is said to be:',
   'options': ['(a) Secondary structure of Protein',
    '(b) Tertiary structure of Protein',
    '(c) Primary structure of Protein',
    '(d) Quaternary structure of Protein'],
   'an