In [1]:
from docstrange import DocumentExtractor
import json
import re


# ------------------------------------------------------------------
#  Multi-Page JSON Parser
# ------------------------------------------------------------------

def merge_question_json_objects(json_objects):
    """
    Merge multiple JSON objects from different pages into a single complete question set.
    
    Strategy:
    - Concatenate all questions arrays from different pages
    - Remove duplicate questions (by question_text)
    """
    merged_questions = []
    seen_questions = set()
    
    for obj in json_objects:
        questions = obj.get("questions", [])
        
        for question in questions:
            # Use question_text as unique identifier
            q_text = question.get("question_text", "").strip()
            
            if q_text and q_text not in seen_questions:
                seen_questions.add(q_text)
                merged_questions.append(question)
    
    return {"questions": merged_questions}


def parse_multi_object_json(raw_text):
    """
    Parse output that may contain multiple JSON objects separated by page breaks.
    Returns a single merged JSON object with all questions.
    
    Handles the case where docstrange returns multiple JSON objects (one per page)
    separated by page break markers.
    """
    # Split by page break markers
    page_break_patterns = [
        r'<!-- Page Break.*?-->',
        r'\n\n\n+',  # Multiple newlines
    ]
    
    # Combine patterns into one regex
    split_pattern = '|'.join(f'(?:{p})' for p in page_break_patterns)
    chunks = re.split(split_pattern, raw_text, flags=re.IGNORECASE)
    
    json_objects = []
    
    for chunk in chunks:
        chunk = chunk.strip()
        if not chunk:
            continue
        
        # Find JSON object in chunk
        json_start = chunk.find('{')
        if json_start == -1:
            continue
        
        json_string = chunk[json_start:]
        
        # Try to find the end of the JSON object
        try:
            # Use json.JSONDecoder to find where the JSON ends
            decoder = json.JSONDecoder()
            obj, end_idx = decoder.raw_decode(json_string)
            json_objects.append(obj)
        except json.JSONDecodeError as e:
            print(f"Warning: Could not parse JSON chunk: {e}")
            continue
    
    if not json_objects:
        raise ValueError("No valid JSON objects found in the output")
    
    # If only one object, return it directly
    if len(json_objects) == 1:
        return json_objects[0]
    
    # Otherwise, merge multiple objects
    print(f"Found {len(json_objects)} JSON objects across pages. Merging...")
    return merge_question_json_objects(json_objects)


# ------------------------------------------------------------------
#  Structured Question Schema
# ------------------------------------------------------------------
QUESTION_SCHEMA = {
    "type": "object",
    "properties": {
        "questions": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "question_text": {
                        "type": "string",
                        "description": "The full text of the question"
                    },
                    "options": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Array of answer options (for multiple choice questions)",
                        "default": []
                    },
                    "answer": {
                        "type": "string",
                        "description": "The correct answer to the question"
                    },
                    "question_type": {
                        "type": "string",
                        "description": "Type of question: 'multiple_choice', 'true_false', 'short_answer', or 'essay'",
                        "enum": ["multiple_choice", "true_false", "short_answer", "essay"]
                    },
                    "public_text": {
                        "type": "string",
                        "description": "Public-facing version of the question (without answer)",
                        "default": ""
                    }
                },
                "required": ["question_text", "answer", "question_type"]
            }
        }
    },
    "required": ["questions"]
}


class QuestionExtractor:
    """Extract questions from PDF documents"""
    
    def __init__(self):
        self.extractor = DocumentExtractor()

    def extract(self, document_path):
        """
        Extract questions from a PDF document.
        
        Args:
            document_path: Path to the PDF file
        
        Returns:
            List of question dictionaries with structure:
            [
                {
                    "question_text": str,
                    "options": List[str],
                    "answer": str,
                    "question_type": str,
                    "public_text": str
                },
                ...
            ]
        """
        try:
            result = self.extractor.extract(document_path)
            structured_data = result.extract_data(json_schema=QUESTION_SCHEMA)

            # Handle different output formats from docstrange
            text_to_parse = None
            
            if isinstance(structured_data, dict):
                # Check if it's a wrapped response with raw_content
                if 'document' in structured_data and 'raw_content' in structured_data['document']:
                    text_to_parse = structured_data['document']['raw_content']
                else:
                    # It's already a proper dict, return it
                    return structured_data.get("questions", [])
            elif isinstance(structured_data, str):
                text_to_parse = structured_data
            else:
                # Unknown format, convert to string for parsing attempt
                text_to_parse = json.dumps(structured_data)
            
            # If we have text to parse, try multi-object JSON parser
            if text_to_parse:
                try:
                    # First try standard JSON parsing
                    parsed_data = json.loads(text_to_parse)
                    return parsed_data.get("questions", [])
                except json.JSONDecodeError:
                    # If standard parsing fails, use multi-object parser
                    print("Standard JSON parsing failed, attempting multi-page parser...")
                    try:
                        parsed_data = parse_multi_object_json(text_to_parse)
                        return parsed_data.get("questions", [])
                    except Exception as e:
                        print(f"Multi-page JSON parsing error: {e}")
                        return []

            return []

        except FileNotFoundError:
            print(f"Error: Document not found at '{document_path}'.")
            return []
        except Exception as e:
            print(f"An error occurred during extraction: {e}")
            import traceback
            traceback.print_exc()
            return []


# -------------------------- Example usage --------------------------
if __name__ == "__main__":
    import os
    
    document_path = './../data/97_phys.pdf'
    
    if not os.path.exists(document_path):
        print(f"Error: File not found - {document_path}")
    else:
        extractor = QuestionExtractor()
        questions = extractor.extract(document_path)
        
        print(f"\n{'='*60}")
        print("EXTRACTION RESULTS")
        print(f"{'='*60}\n")
        print(f"Extracted {len(questions)} questions\n")
        
        # Display first 3 questions as examples
        for i, q in enumerate(questions[:3], 1):
            print(f"{i}. {q['question_text']}")
            print(f"   Type: {q['question_type']}")
            print(f"   Answer: {q['answer']}")
            if q.get('options'):
                print(f"   Options: {q['options']}")
            print()
        
        if len(questions) > 3:
            print(f"... and {len(questions) - 3} more questions")



üîê DocStrange Authentication

üåê Opening authentication page...
üìã If the browser doesn't open automatically, click this link:
üîó https://nanonets.auth0.com/authorize?response_type=code&client_id=meAtfPTIcmqhL7rLi8kCNqmTvdkGch4n&redirect_uri=http%3A%2F%2Flocalhost%3A8765%2Fcallback&scope=openid+profile+email&state=ca5589ab-281a-4728-b6d8-8fa2bc47c7f4&code_challenge=pXQAyeyIBvNSHY-Prwkp_JF_bAQCs-HBDZLWlPkd4fk&code_challenge_method=S256&connection=google-oauth2

‚è≥ Waiting for authentication...
üí° This will timeout in 5 minutes if not completed
‚úÖ Authentication successful!
üíæ Credentials cached for secure access
‚úÖ Authentication successful!
üíæ Credentials cached for secure access


Failed to parse JSON content: Expecting value: line 1 column 1 (char 0)



EXTRACTION RESULTS

Extracted 0 questions



In [3]:
# Test 1: Check docstrange version and available models
import docstrange
print(f"Docstrange version: {docstrange.__version__ if hasattr(docstrange, '__version__') else 'Unknown'}")

# Test basic extraction without schema first
extractor = DocumentExtractor()
document_path = './../data/97_phys.pdf'

print(f"\nExtracting from: {document_path}")
result = extractor.extract(document_path)
print(f"Result type: {type(result)}")
print(f"Result attributes: {dir(result)}")


Docstrange version: 1.1.5

Extracting from: ./../data/97_phys.pdf
Result type: <class 'docstrange.processors.cloud_processor.CloudConversionResult'>
Result attributes: ['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_cached_outputs', '_convert_locally', '_extract_markdown_tables_directly', '_get_cloud_output', '_html_converter', '_json_parser', 'cloud_processor', 'content', 'extract_csv', 'extract_data', 'extract_html', 'extract_markdown', 'extract_text', 'file_path', 'metadata']


In [4]:
# Test 2: Try extracting text and markdown
print("=" * 60)
print("Testing different extraction methods")
print("=" * 60)

# Try extract_text
try:
    text_content = result.extract_text()
    print(f"\n‚úì extract_text() worked - Length: {len(text_content)}")
    print(f"First 500 chars:\n{text_content[:500]}")
except Exception as e:
    print(f"\n‚úó extract_text() failed: {e}")

# Try extract_markdown
try:
    markdown_content = result.extract_markdown()
    print(f"\n‚úì extract_markdown() worked - Length: {len(markdown_content)}")
    print(f"First 500 chars:\n{markdown_content[:500]}")
except Exception as e:
    print(f"\n‚úó extract_markdown() failed: {e}")


Testing different extraction methods

‚úì extract_text() worked - Length: 0
First 500 chars:


‚úì extract_markdown() worked - Length: 0
First 500 chars:



In [5]:
# Test 3: Try with a specific model configuration
import os

print("Checking environment variables:")
print(f"OPENAI_API_KEY: {'Set' if os.getenv('OPENAI_API_KEY') else 'Not set'}")
print(f"ANTHROPIC_API_KEY: {'Set' if os.getenv('ANTHROPIC_API_KEY') else 'Not set'}")

# Try with explicit model
print("\n" + "=" * 60)
print("Testing with explicit model configuration")
print("=" * 60)

# Try using a local model or specific configuration
try:
    # Option 1: Try with anthropic model
    extractor_with_model = DocumentExtractor(model="anthropic/claude-3-5-sonnet-20241022")
    result2 = extractor_with_model.extract(document_path)
    structured_data = result2.extract_data(json_schema=QUESTION_SCHEMA)
    print(f"\nStructured data type: {type(structured_data)}")
    print(f"Structured data keys: {structured_data.keys() if isinstance(structured_data, dict) else 'N/A'}")
    
    if isinstance(structured_data, dict) and 'error' in structured_data:
        print(f"Error: {structured_data['error']}")
    elif isinstance(structured_data, dict) and 'questions' in structured_data:
        print(f"SUCCESS! Got {len(structured_data['questions'])} questions")
        
except Exception as e:
    print(f"Failed with model: {e}")
    import traceback
    traceback.print_exc()


Checking environment variables:
OPENAI_API_KEY: Not set
ANTHROPIC_API_KEY: Not set

Testing with explicit model configuration


Failed to get specified-json from cloud API: 400 Client Error: Bad Request for url: https://extraction-api.nanonets.com/extract
Failed to parse JSON content: Expecting value: line 1 column 1 (char 0)



Structured data type: <class 'dict'>
Structured data keys: dict_keys(['document', 'format', 'error'])
Error: Expecting value: line 1 column 1 (char 0)


## Alternative: Manual PDF Extraction with Pattern Matching

Since docstrange requires API keys (OpenAI or Anthropic), let's try a simpler approach using PyPDF2 to extract text and regex patterns to find questions.

In [2]:
# Alternative approach: Extract using PyPDF2 and look for question patterns
import PyPDF2
import re

def extract_text_from_pdf(pdf_path):
    """Extract all text from PDF"""
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text() + "\n"
    return text

# Extract text
pdf_text = extract_text_from_pdf(document_path)
print(f"Extracted {len(pdf_text)} characters from PDF")
print(f"\nFirst 1000 characters:\n{pdf_text[:1000]}")
print(f"\n{'='*60}\n")


Extracted 169665 characters from PDF

First 1000 characters:
Science Bowl PHYSICS
Physics - 1PHYS-91; Multiple Choice:  For the hydrogen atom, which series describes electron transitions to
the N=1 orbit, the lowest energy electron orbit? Is it the:
w)  Lyman seriesx)  Balmer seriesy)  Paschen seriesz)  Pfund series
ANSWER:   W -- LYMAN SERIES
PHYS-91; Multiple Choice:  Electric current may be expressed in which one of the following
units?
w)  coulombs/voltx)  joules/coulomby)  coulombs/secondz)  ohms/second
ANSWER:   Y -- COULOMBS/SECOND
PHYS-91; Short Answer:  In the SI system of measure, what is the unit of capacitance?
ANSWER:   FARAD
PHYS-91; Multiple Choice:  A Newton is equal to which of the following? 
w)  kilogram-meter per secondx)  meter per second squaredy)  kilogram-meter per second squaredz)  kilogram per meter-second
ANSWER:   Y -- KILOGRAM-METER PER SECOND SQUARED
PHYS-91; Multiple Choice:  For an object moving in uniform circular motion, the direction of the
instantane

In [3]:
# Parse questions from the extracted text
def parse_physics_questions(text):
    """Parse questions from Science Bowl physics format"""
    questions = []
    
    # Split by question markers (PHYS-91, PHYS-92, etc.)
    question_blocks = re.split(r'(PHYS-\d+;)', text)
    
    for i in range(1, len(question_blocks), 2):
        if i+1 >= len(question_blocks):
            break
            
        question_id = question_blocks[i].strip()
        content = question_blocks[i+1]
        
        # Extract question type
        type_match = re.match(r'(Multiple Choice|Short Answer|True/False):\s*(.*)', content)
        if not type_match:
            continue
            
        q_type = type_match.group(1)
        remaining = type_match.group(2)
        
        # Find the answer
        answer_match = re.search(r'ANSWER:\s*([^\n]+)', remaining, re.IGNORECASE)
        if not answer_match:
            continue
            
        answer_text = answer_match.group(1).strip()
        
        # Extract question text (everything before ANSWER)
        question_text = re.split(r'ANSWER:', remaining, flags=re.IGNORECASE)[0].strip()
        
        # For multiple choice, extract options
        options = []
        if 'Multiple Choice' in q_type:
            option_matches = re.findall(r'([wxyz])\)\s*([^\n]+?)(?=[wxyz]\)|ANSWER:|$)', question_text, re.IGNORECASE | re.DOTALL)
            options = [match[1].strip() for match in option_matches]
            
            # Clean up question text (remove options)
            question_text = re.split(r'[wxyz]\)', question_text)[0].strip()
        
        # Determine question type
        if 'Multiple Choice' in q_type:
            question_type = 'multiple_choice'
        elif 'Short Answer' in q_type:
            question_type = 'short_answer'
        elif 'True/False' in q_type:
            question_type = 'true_false'
        else:
            question_type = 'short_answer'
        
        questions.append({
            'question_text': question_text,
            'options': options,
            'answer': answer_text,
            'question_type': question_type,
            'public_text': question_text
        })
    
    return questions

# Parse the questions
parsed_questions = parse_physics_questions(pdf_text)
print(f"Parsed {len(parsed_questions)} questions\n")

# Show first 3 examples
for i, q in enumerate(parsed_questions[:3], 1):
    print(f"{i}. {q['question_text']}")
    print(f"   Type: {q['question_type']}")
    if q['options']:
        print(f"   Options: {q['options']}")
    print(f"   Answer: {q['answer']}")
    print()


Parsed 0 questions



In [9]:
# Debug: Check the pattern
sample = pdf_text[500:2000]
print("Sample text to analyze:")
print(sample)
print("\n" + "="*60 + "\n")

# Find all PHYS patterns
phys_matches = re.findall(r'PHYS-\d+', pdf_text)
print(f"Found {len(phys_matches)} question markers")
print(f"First 10: {phys_matches[:10]}")


Sample text to analyze:
nswer:  In the SI system of measure, what is the unit of capacitance?
ANSWER:   FARAD
PHYS-91; Multiple Choice:  A Newton is equal to which of the following? 
w)  kilogram-meter per secondx)  meter per second squaredy)  kilogram-meter per second squaredz)  kilogram per meter-second
ANSWER:   Y -- KILOGRAM-METER PER SECOND SQUARED
PHYS-91; Multiple Choice:  For an object moving in uniform circular motion, the direction of the
instantaneous acceleration vector is: 
w)  tangent to the path of motionx)  equal to zeroy)  directed radially outwardz)  directed radially inward
ANSWER:   Z -- DIRECTED RADIALLY INWARD
Science Bowl PHYSICS
Physics - 2PHYS-91; Short Answer:  A boy is standing on an elevator which is traveling downward with a
constant velocity of 30 meters per second.  The boy throws a ball vertically upward with avelocity of 10 meters per second relative to the elevator.  What is the velocity of the ball,MAGNITUDE AND DIRECTION, relative to the elevator sha

In [4]:
# Better parser - split by PHYS-XX pattern
def parse_science_bowl_questions(text):
    """Parse Science Bowl format questions"""
    questions = []
    
    # Split by PHYS-XX; pattern
    parts = re.split(r'(PHYS-\d+;)', text)
    
    for i in range(1, len(parts)-1, 2):
        try:
            question_id = parts[i].strip()
            content = parts[i+1]
            
            # Skip if no ANSWER
            if 'ANSWER:' not in content.upper():
                continue
            
            # Split by ANSWER to separate question from answer
            split_by_answer = re.split(r'\nANSWER:\s*', content, flags=re.IGNORECASE, maxsplit=1)
            if len(split_by_answer) < 2:
                continue
            
            question_part = split_by_answer[0].strip()
            answer_part = split_by_answer[1].split('\n')[0].strip()
            
            # Determine question type
            if 'Multiple Choice:' in question_part:
                question_type = 'multiple_choice'
                question_text = question_part.split('Multiple Choice:', 1)[1].strip()
                
                # Extract options
                option_pattern = r'([wxyz])\)\s*([^wxyz\n]+?)(?=[wxyz]\)|$)'
                option_matches = re.findall(option_pattern, question_text, re.IGNORECASE)
                options = [match[1].strip() for match in option_matches if match[1].strip()]
                
                # Clean question text (remove options section)
                question_text = re.split(r'\s+[wxyz]\)', question_text)[0].strip()
                
            elif 'Short Answer:' in question_part:
                question_type = 'short_answer'
                question_text = question_part.split('Short Answer:', 1)[1].strip()
                options = []
                
            elif 'True/False:' in question_part or 'True or False:' in question_part:
                question_type = 'true_false'
                if 'True/False:' in question_part:
                    question_text = question_part.split('True/False:', 1)[1].strip()
                else:
                    question_text = question_part.split('True or False:', 1)[1].strip()
                options = ['True', 'False']
            else:
                # Default to short answer
                question_type = 'short_answer'
                question_text = question_part
                options = []
            
            # Clean answer (remove letter prefixes like "W --" or "Y --")
            answer = re.sub(r'^[WXYZ]\s*--\s*', '', answer_part, flags=re.IGNORECASE).strip()
            
            questions.append({
                'question_text': question_text,
                'options': options,
                'answer': answer,
                'question_type': question_type,
                'public_text': question_text
            })
            
        except Exception as e:
            # Skip problematic questions
            continue
    
    return questions

# Parse with the new function
parsed_questions = parse_science_bowl_questions(pdf_text)
print(f"‚úì Parsed {len(parsed_questions)} questions!\n")

# Show first 5 examples
for i, q in enumerate(parsed_questions[:5], 1):
    print(f"{i}. {q['question_text'][:100]}...")
    print(f"   Type: {q['question_type']}")
    if q['options']:
        print(f"   Options: {len(q['options'])} choices")
    print(f"   Answer: {q['answer'][:50]}")
    print()


‚úì Parsed 666 questions!

1. For the hydrogen atom, which series describes electron transitions to
the N=1 orbit, the lowest ener...
   Type: multiple_choice
   Options: 3 choices
   Answer: LYMAN SERIES

2. Electric current may be expressed in which one of the following
units?...
   Type: multiple_choice
   Options: 4 choices
   Answer: COULOMBS/SECOND

3. In the SI system of measure, what is the unit of capacitance?...
   Type: short_answer
   Answer: FARAD

4. A Newton is equal to which of the following?...
   Type: multiple_choice
   Options: 4 choices
   Answer: KILOGRAM-METER PER SECOND SQUARED

5. For an object moving in uniform circular motion, the direction of the
instantaneous acceleration vec...
   Type: multiple_choice
   Options: 1 choices
   Answer: DIRECTED RADIALLY INWARD



In [11]:
# Show detailed view with full options
print("="*80)
print("DETAILED QUESTION VIEW - First 5 Questions")
print("="*80 + "\n")

for i, q in enumerate(parsed_questions[:5], 1):
    print(f"Question {i}:")
    print(f"  Text: {q['question_text']}")
    print(f"  Type: {q['question_type']}")
    
    if q['options']:
        print(f"  Options:")
        for idx, option in enumerate(q['options'], 1):
            print(f"    {idx}. {option}")
    
    print(f"  Answer: {q['answer']}")
    print("\n" + "-"*80 + "\n")


DETAILED QUESTION VIEW - First 5 Questions

Question 1:
  Text: For the hydrogen atom, which series describes electron transitions to
the N=1 orbit, the lowest energy electron orbit? Is it the:
  Type: multiple_choice
  Options:
    1. Balmer series
    2. Paschen series
    3. Pfund series
  Answer: LYMAN SERIES

--------------------------------------------------------------------------------

Question 2:
  Text: Electric current may be expressed in which one of the following
units?
  Type: multiple_choice
  Options:
    1. coulombs/volt
    2. joules/coulomb
    3. coulombs/second
    4. ohms/second
  Answer: COULOMBS/SECOND

--------------------------------------------------------------------------------

Question 3:
  Text: In the SI system of measure, what is the unit of capacitance?
  Type: short_answer
  Answer: FARAD

--------------------------------------------------------------------------------

Question 4:
  Text: A Newton is equal to which of the following?
  Type: multipl

In [5]:
# Debug: Let's see the raw text for the first question
first_question_raw = pdf_text[pdf_text.find('PHYS-91'):pdf_text.find('PHYS-91') + 500]
print("Raw text of first question:")
print(first_question_raw)
print("\n" + "="*80)


Raw text of first question:
PHYS-91; Multiple Choice:  For the hydrogen atom, which series describes electron transitions to
the N=1 orbit, the lowest energy electron orbit? Is it the:
w)  Lyman seriesx)  Balmer seriesy)  Paschen seriesz)  Pfund series
ANSWER:   W -- LYMAN SERIES
PHYS-91; Multiple Choice:  Electric current may be expressed in which one of the following
units?
w)  coulombs/voltx)  joules/coulomby)  coulombs/secondz)  ohms/second
ANSWER:   Y -- COULOMBS/SECOND
PHYS-91; Short Answer:  In the SI system of meas



In [13]:
# Fixed parser - properly extract all 4 options
def parse_science_bowl_fixed(text):
    """Parse Science Bowl format questions with proper option extraction"""
    questions = []
    
    # Split by PHYS-XX; or similar patterns  
    parts = re.split(r'([A-Z]+-\d+;)', text)
    
    for i in range(1, len(parts)-1, 2):
        try:
            question_id = parts[i].strip()
            content = parts[i+1]
            
            # Skip if no ANSWER
            if 'ANSWER:' not in content.upper():
                continue
            
            # Split by ANSWER
            split_by_answer = re.split(r'\nANSWER:\s*', content, flags=re.IGNORECASE, maxsplit=1)
            if len(split_by_answer) < 2:
                continue
            
            question_part = split_by_answer[0].strip()
            answer_part = split_by_answer[1].split('\n')[0].strip()
            
            # Determine question type
            if 'Multiple Choice:' in question_part:
                question_type = 'multiple_choice'
                question_text = question_part.split('Multiple Choice:', 1)[1].strip()
                
                # Find where options start (look for w))
                option_section_match = re.search(r'w\)', question_text, re.IGNORECASE)
                if option_section_match:
                    # Split question and options
                    q_text = question_text[:option_section_match.start()].strip()
                    options_text = question_text[option_section_match.start():].strip()
                    
                    # Extract all options - fixed pattern to handle no spaces between options
                    # Pattern: letter) followed by text until next letter) or end
                    options = []
                    for letter in ['w', 'x', 'y', 'z']:
                        # Look for pattern: letter) spaces any_text until next_letter) or end
                        pattern = rf'{letter}\)\s+([^wxyz]+?)(?=[wxyz]\)|$)'
                        match = re.search(pattern, options_text, re.IGNORECASE | re.DOTALL)
                        if match:
                            option_text = match.group(1).strip()
                            # Clean up newlines and extra spaces
                            option_text = ' '.join(option_text.split())
                            if option_text:
                                options.append(option_text)
                    
                    question_text = q_text
                else:
                    options = []
                    
            elif 'Short Answer:' in question_part:
                question_type = 'short_answer'
                question_text = question_part.split('Short Answer:', 1)[1].strip()
                options = []
                
            elif 'True/False:' in question_part or 'True or False:' in question_part:
                question_type = 'true_false'
                if 'True/False:' in question_part:
                    question_text = question_part.split('True/False:', 1)[1].strip()
                else:
                    question_text = question_part.split('True or False:', 1)[1].strip()
                options = ['True', 'False']
            else:
                question_type = 'short_answer'
                question_text = question_part
                options = []
            
            # Clean answer
            answer = re.sub(r'^[WXYZ]\s*--\s*', '', answer_part, flags=re.IGNORECASE).strip()
            
            questions.append({
                'question_text': question_text,
                'options': options,
                'answer': answer,
                'question_type': question_type,
                'public_text': question_text
            })
            
        except Exception as e:
            continue
    
    return questions

# Test fixed parser
fixed_questions = parse_science_bowl_fixed(pdf_text)
print(f"‚úì Parsed {len(fixed_questions)} questions with FIXED parser\n")

# Show first question with all 4 options
q = fixed_questions[0]
print(f"Question 1:")
print(f"  Text: {q['question_text']}")
print(f"  Type: {q['question_type']}")
print(f"  Options ({len(q['options'])} total):")
for idx, option in enumerate(q['options'], 1):
    print(f"    {idx}. {option}")
print(f"  Answer: {q['answer']}")


‚úì Parsed 672 questions with FIXED parser

Question 1:
  Text: For the hydrogen atom, which series describes electron transitions to
the N=1 orbit, the lowest energy electron orbit? Is it the:
  Type: multiple_choice
  Options (3 total):
    1. Balmer series
    2. Paschen series
    3. Pfund series
  Answer: LYMAN SERIES


In [6]:
# Let's examine the exact characters
options_line = "w)  Lyman seriesx)  Balmer seriesy)  Paschen seriesz)  Pfund series"
print("Options line:")
print(repr(options_line))
print("\nTrying to extract with different patterns:\n")

# Test pattern 1: Current pattern
pattern1 = r'w\)\s+([^wxyz]+?)(?=[wxyz]\)|$)'
match1 = re.search(pattern1, options_line, re.IGNORECASE)
if match1:
    print(f"Pattern 1 result: '{match1.group(1)}'")
else:
    print("Pattern 1: No match")

# Test pattern 2: Look for text between ) and next letter)
pattern2 = r'w\)\s*([^)]+?)(?=x\))'
match2 = re.search(pattern2, options_line, re.IGNORECASE)
if match2:
    print(f"Pattern 2 result: '{match2.group(1)}'")
    
# Test pattern 3: Split by letter) pattern first
split_options = re.split(r'([wxyz]\))', options_line)
print(f"\nSplit by letter): {split_options}")


Options line:
'w)  Lyman seriesx)  Balmer seriesy)  Paschen seriesz)  Pfund series'

Trying to extract with different patterns:

Pattern 1: No match
Pattern 2 result: 'Lyman series'

Split by letter): ['', 'w)', '  Lyman series', 'x)', '  Balmer series', 'y)', '  Paschen series', 'z)', '  Pfund series']


In [7]:
# FINAL WORKING PARSER - using split approach
def parse_science_bowl_final(text):
    """Parse Science Bowl format questions - FINAL VERSION"""
    questions = []
    
    # Split by question ID pattern (PHYS-XX;, CHEM-XX;, etc.)
    parts = re.split(r'([A-Z]+-\d+;)', text)
    
    for i in range(1, len(parts)-1, 2):
        try:
            question_id = parts[i].strip()
            content = parts[i+1]
            
            # Skip if no ANSWER
            if 'ANSWER:' not in content.upper():
                continue
            
            # Split by ANSWER
            split_by_answer = re.split(r'\nANSWER:\s*', content, flags=re.IGNORECASE, maxsplit=1)
            if len(split_by_answer) < 2:
                continue
            
            question_part = split_by_answer[0].strip()
            answer_part = split_by_answer[1].split('\n')[0].strip()
            
            # Determine question type and extract
            if 'Multiple Choice:' in question_part:
                question_type = 'multiple_choice'
                question_text = question_part.split('Multiple Choice:', 1)[1].strip()
                
                # Find where options start
                option_section_match = re.search(r'w\)', question_text, re.IGNORECASE)
                if option_section_match:
                    # Split question and options
                    q_text = question_text[:option_section_match.start()].strip()
                    options_text = question_text[option_section_match.start():].strip()
                    
                    # Split by letter) to get options
                    option_parts = re.split(r'([wxyz]\))', options_text, flags=re.IGNORECASE)
                    
                    # Extract option texts (every other part after a letter))
                    options = []
                    for j in range(1, len(option_parts), 2):
                        if j + 1 < len(option_parts):
                            option_text = option_parts[j + 1].strip()
                            # Clean up extra spaces and newlines
                            option_text = ' '.join(option_text.split())
                            if option_text:
                                options.append(option_text)
                    
                    question_text = q_text
                else:
                    options = []
                    
            elif 'Short Answer:' in question_part:
                question_type = 'short_answer'
                question_text = question_part.split('Short Answer:', 1)[1].strip()
                options = []
                
            elif 'True/False:' in question_part or 'True or False:' in question_part:
                question_type = 'true_false'
                if 'True/False:' in question_part:
                    question_text = question_part.split('True/False:', 1)[1].strip()
                else:
                    question_text = question_part.split('True or False:', 1)[1].strip()
                options = ['True', 'False']
            else:
                question_type = 'short_answer'
                question_text = question_part
                options = []
            
            # Clean answer (remove letter prefixes like "W --")
            answer = re.sub(r'^[WXYZ]\s*--\s*', '', answer_part, flags=re.IGNORECASE).strip()
            
            questions.append({
                'question_text': question_text,
                'options': options,
                'answer': answer,
                'question_type': question_type,
                'public_text': question_text
            })
            
        except Exception as e:
            continue
    
    return questions

# Parse with FINAL parser
final_questions = parse_science_bowl_final(pdf_text)
print(f"‚úì SUCCESS! Parsed {len(final_questions)} questions\n")
print("="*80)

# Verify first 3 questions have all options
for i, q in enumerate(final_questions[:3], 1):
    print(f"\nQuestion {i}:")
    print(f"  Text: {q['question_text'][:80]}...")
    print(f"  Type: {q['question_type']}")
    
    if q['options']:
        print(f"  Options ({len(q['options'])} total):")
        for idx, option in enumerate(q['options'], 1):
            print(f"    {idx}. {option}")
    
    print(f"  Answer: {q['answer']}")
    print("-"*80)


‚úì SUCCESS! Parsed 672 questions


Question 1:
  Text: For the hydrogen atom, which series describes electron transitions to
the N=1 or...
  Type: multiple_choice
  Options (4 total):
    1. Lyman series
    2. Balmer series
    3. Paschen series
    4. Pfund series
  Answer: LYMAN SERIES
--------------------------------------------------------------------------------

Question 2:
  Text: Electric current may be expressed in which one of the following
units?...
  Type: multiple_choice
  Options (4 total):
    1. coulombs/volt
    2. joules/coulomb
    3. coulombs/second
    4. ohms/second
  Answer: COULOMBS/SECOND
--------------------------------------------------------------------------------

Question 3:
  Text: In the SI system of measure, what is the unit of capacitance?...
  Type: short_answer
  Answer: FARAD
--------------------------------------------------------------------------------


In [8]:
# Test the biology file
import PyPDF2

bio_path = './../data/biolset2.pdf'

# Extract text
with open(bio_path, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)
    bio_text = ""
    for page in pdf_reader.pages:
        bio_text += page.extract_text() + "\n"

print(f"Biology PDF - Extracted {len(bio_text)} characters")
print(f"\nFirst 1500 characters:\n{bio_text[:1500]}")
print("\n" + "="*80)

# Check for question patterns
import re
patterns_found = {}
patterns_found['BIOL-'] = len(re.findall(r'BIOL-\d+', bio_text))
patterns_found['BIO-'] = len(re.findall(r'BIO-\d+', bio_text))
patterns_found['Multiple Choice'] = len(re.findall(r'Multiple Choice', bio_text))
patterns_found['Short Answer'] = len(re.findall(r'Short Answer', bio_text))
patterns_found['ANSWER:'] = len(re.findall(r'ANSWER:', bio_text, re.IGNORECASE))

print(f"\nPattern analysis:")
for pattern, count in patterns_found.items():
    print(f"  {pattern}: {count} occurrences")


Biology PDF - Extracted 20375 characters

First 1500 characters:
Science Bowl Questions Biology  - 1
Science Bowl Questions ‚Äì Biology, Set 2
1. Multiple Choice: The adult human of average age and size has approximately how many quarts of
blood? Is it:
a) 4
b) 6
c) 8
d) 10
ANSWER: B -- 6
2. Multiple Choice: Once the erythrocytes enter the blood in humans, it is estimated that they have an
average lifetime of how many days. Is it:
a) 10 days
b) 120 days
c) 200 days
d) 360 days
ANSWER: B -- 120 Days
3. Multiple Choice: Of the following, which mechanisms are important in the death of erythrocytes ( pron:
eh-rith-reh-sites) in human blood? Is it
a) phagocytosis ( pron: fag-eh-seh-toe-sis)
b) hemolysis
c) mechanical damage
d) all of the above
ANSWER: D -- all of the above
4. Multiple Choice: Surplus red blood cells, needed to meet an emergency, are MAINLY stored in what
organ of the human body? Is it the:
a) pancreas
b) spleen
c) liver
d) kidneys
ANSWER: B ‚Äì spleen
5. Multiple Choice: Wh

In [15]:
# Improved parser with better option extraction
def parse_science_bowl_improved(text):
    """Parse Science Bowl format questions with improved option extraction"""
    questions = []
    
    # Split by PHYS-XX; or similar patterns
    parts = re.split(r'([A-Z]+-\d+;)', text)
    
    for i in range(1, len(parts)-1, 2):
        try:
            question_id = parts[i].strip()
            content = parts[i+1]
            
            # Skip if no ANSWER
            if 'ANSWER:' not in content.upper():
                continue
            
            # Split by ANSWER
            split_by_answer = re.split(r'\nANSWER:\s*', content, flags=re.IGNORECASE, maxsplit=1)
            if len(split_by_answer) < 2:
                continue
            
            question_part = split_by_answer[0].strip()
            answer_part = split_by_answer[1].split('\n')[0].strip()
            
            # Determine question type
            if 'Multiple Choice:' in question_part:
                question_type = 'multiple_choice'
                question_text = question_part.split('Multiple Choice:', 1)[1].strip()
                
                # Improved option extraction - handle options that span lines
                # First, find where options start
                option_section_match = re.search(r'(w\))', question_text, re.IGNORECASE)
                if option_section_match:
                    # Split question and options
                    q_text = question_text[:option_section_match.start()].strip()
                    options_text = question_text[option_section_match.start():].strip()
                    
                    # Extract all options (w, x, y, z)
                    options = []
                    for letter in ['w', 'x', 'y', 'z']:
                        pattern = rf'{letter}\)\s*([^wxyz]+?)(?=[wxyz]\)|$)'
                        match = re.search(pattern, options_text, re.IGNORECASE | re.DOTALL)
                        if match:
                            option_text = match.group(1).strip()
                            # Clean up newlines and extra spaces
                            option_text = ' '.join(option_text.split())
                            if option_text:
                                options.append(option_text)
                    
                    question_text = q_text
                else:
                    options = []
                    
            elif 'Short Answer:' in question_part:
                question_type = 'short_answer'
                question_text = question_part.split('Short Answer:', 1)[1].strip()
                options = []
                
            elif 'True/False:' in question_part or 'True or False:' in question_part:
                question_type = 'true_false'
                if 'True/False:' in question_part:
                    question_text = question_part.split('True/False:', 1)[1].strip()
                else:
                    question_text = question_part.split('True or False:', 1)[1].strip()
                options = ['True', 'False']
            else:
                question_type = 'short_answer'
                question_text = question_part
                options = []
            
            # Clean answer
            answer = re.sub(r'^[WXYZ]\s*--\s*', '', answer_part, flags=re.IGNORECASE).strip()
            
            questions.append({
                'question_text': question_text,
                'options': options,
                'answer': answer,
                'question_type': question_type,
                'public_text': question_text
            })
            
        except Exception as e:
            continue
    
    return questions

# Test improved parser
improved_questions = parse_science_bowl_improved(pdf_text)
print(f"‚úì Parsed {len(improved_questions)} questions with improved parser\n")

# Show first 5 with full options
for i, q in enumerate(improved_questions[:5], 1):
    print(f"Question {i}:")
    print(f"  Text: {q['question_text'][:100]}...")
    print(f"  Type: {q['question_type']}")
    
    if q['options']:
        print(f"  Options ({len(q['options'])} total):")
        for idx, option in enumerate(q['options'], 1):
            print(f"    {idx}. {option[:80]}{'...' if len(option) > 80 else ''}")
    
    print(f"  Answer: {q['answer'][:60]}")
    print()


‚úì Parsed 672 questions with improved parser

Question 1:
  Text: For the hydrogen atom, which series describes electron transitions to
the N=1 orbit, the lowest ener...
  Type: multiple_choice
  Options (3 total):
    1. Balmer series
    2. Paschen series
    3. Pfund series
  Answer: LYMAN SERIES

Question 2:
  Text: Electric current may be expressed in which one of the following
units?...
  Type: multiple_choice
  Options (4 total):
    1. coulombs/volt
    2. joules/coulomb
    3. coulombs/second
    4. ohms/second
  Answer: COULOMBS/SECOND

Question 3:
  Text: In the SI system of measure, what is the unit of capacitance?...
  Type: short_answer
  Answer: FARAD

Question 4:
  Text: A Newton is equal to which of the following?...
  Type: multiple_choice
  Options (4 total):
    1. kilogram-meter per second
    2. meter per second squared
    3. kilogram-meter per second squared
    4. kilogram per meter-second
  Answer: KILOGRAM-METER PER SECOND SQUARED

Question 5:
  Text: For an

In [9]:
# Define Pydantic models for structured extraction
from pydantic import BaseModel, Field
from typing import List, Literal
import ollama

class Question(BaseModel):
    """Single question model"""
    question_text: str = Field(description="The full text of the question")
    options: List[str] = Field(default=[], description="List of answer options for multiple choice")
    answer: str = Field(description="The correct answer")
    question_type: Literal["multiple_choice", "true_false", "short_answer", "essay"] = Field(
        description="Type of question"
    )

class QuestionList(BaseModel):
    """List of questions"""
    questions: List[Question] = Field(description="All questions found")

print("‚úì Pydantic models defined")
print(f"Schema: {QuestionList.model_json_schema()}")


‚úì Pydantic models defined
Schema: {'$defs': {'Question': {'description': 'Single question model', 'properties': {'question_text': {'description': 'The full text of the question', 'title': 'Question Text', 'type': 'string'}, 'options': {'default': [], 'description': 'List of answer options for multiple choice', 'items': {'type': 'string'}, 'title': 'Options', 'type': 'array'}, 'answer': {'description': 'The correct answer', 'title': 'Answer', 'type': 'string'}, 'question_type': {'description': 'Type of question', 'enum': ['multiple_choice', 'true_false', 'short_answer', 'essay'], 'title': 'Question Type', 'type': 'string'}}, 'required': ['question_text', 'answer', 'question_type'], 'title': 'Question', 'type': 'object'}}, 'description': 'List of questions', 'properties': {'questions': {'description': 'All questions found', 'items': {'$ref': '#/$defs/Question'}, 'title': 'Questions', 'type': 'array'}}, 'required': ['questions'], 'title': 'QuestionList', 'type': 'object'}


In [10]:
# Test with a small sample of text first
sample_text = """
BIOL-91; Multiple Choice: What is the powerhouse of the cell?
w) Nucleus
x) Mitochondria
y) Ribosome
z) Golgi apparatus
ANSWER: X -- MITOCHONDRIA

BIOL-91; Short Answer: What is the process by which plants make food?
ANSWER: PHOTOSYNTHESIS
"""

print("Testing Ollama extraction with sample text...")
print(f"Sample text length: {len(sample_text)} characters\n")

try:
    response = ollama.chat(
        model='llama3.2:latest',
        messages=[{
            'role': 'user',
            'content': f"""Extract all questions from this text. For each question identify:
- The complete question text
- The type (multiple_choice, short_answer, true_false, or essay)
- All answer options if multiple choice
- The correct answer

Text:
{sample_text}"""
        }],
        format=QuestionList.model_json_schema(),
        options={'temperature': 0}
    )
    
    # Parse response
    content = response['message']['content']
    print("Raw response:")
    print(content[:500])
    print("\n" + "="*80 + "\n")
    
    # Validate with Pydantic
    question_list = QuestionList.model_validate_json(content)
    
    print(f"‚úì Successfully extracted {len(question_list.questions)} questions\n")
    
    for i, q in enumerate(question_list.questions, 1):
        print(f"Question {i}:")
        print(f"  Text: {q.question_text}")
        print(f"  Type: {q.question_type}")
        if q.options:
            print(f"  Options: {q.options}")
        print(f"  Answer: {q.answer}")
        print()
        
except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()


Testing Ollama extraction with sample text...
Sample text length: 241 characters

Error: model requires more system memory (1.5 GiB) than is available (1.3 GiB) (status code: 500)


Traceback (most recent call last):
  File "C:\Users\Enoch\AppData\Local\Temp\ipykernel_19716\1296911678.py", line 18, in <module>
    response = ollama.chat(
               ^^^^^^^^^^^^
  File "c:\Users\Enoch\.conda\envs\my_env\Lib\site-packages\ollama\_client.py", line 342, in chat
    return self._request(
           ^^^^^^^^^^^^^^
  File "c:\Users\Enoch\.conda\envs\my_env\Lib\site-packages\ollama\_client.py", line 180, in _request
    return cls(**self._request_raw(*args, **kwargs).json())
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Enoch\.conda\envs\my_env\Lib\site-packages\ollama\_client.py", line 124, in _request_raw
    raise ResponseError(e.response.text, e.response.status_code) from None
ollama._types.ResponseError: model requires more system memory (1.5 GiB) than is available (1.3 GiB) (status code: 500)


In [11]:
# Use our working parser and wrap with Pydantic
def extract_with_pydantic_validation(text):
    """Extract using regex then validate with Pydantic"""
    
    # Use the working parser we already have
    questions = []
    parts = re.split(r'([A-Z]+-\d+;)', text)
    
    for i in range(1, len(parts)-1, 2):
        try:
            content = parts[i+1]
            
            if 'ANSWER:' not in content.upper():
                continue
            
            split_by_answer = re.split(r'\nANSWER:\s*', content, flags=re.IGNORECASE, maxsplit=1)
            if len(split_by_answer) < 2:
                continue
            
            question_part = split_by_answer[0].strip()
            answer_part = split_by_answer[1].split('\n')[0].strip()
            
            # Parse based on type
            if 'Multiple Choice:' in question_part:
                question_type = 'multiple_choice'
                question_text = question_part.split('Multiple Choice:', 1)[1].strip()
                
                option_section_match = re.search(r'w\)', question_text, re.IGNORECASE)
                if option_section_match:
                    q_text = question_text[:option_section_match.start()].strip()
                    options_text = question_text[option_section_match.start():].strip()
                    option_parts = re.split(r'([wxyz]\))', options_text, flags=re.IGNORECASE)
                    
                    options = []
                    for j in range(1, len(option_parts), 2):
                        if j + 1 < len(option_parts):
                            option_text = ' '.join(option_parts[j + 1].strip().split())
                            if option_text:
                                options.append(option_text)
                    question_text = q_text
                else:
                    options = []
                    
            elif 'Short Answer:' in question_part:
                question_type = 'short_answer'
                question_text = question_part.split('Short Answer:', 1)[1].strip()
                options = []
                
            elif 'True/False:' in question_part or 'True or False:' in question_part:
                question_type = 'true_false'
                if 'True/False:' in question_part:
                    question_text = question_part.split('True/False:', 1)[1].strip()
                else:
                    question_text = question_part.split('True or False:', 1)[1].strip()
                options = ['True', 'False']
            else:
                question_type = 'short_answer'
                question_text = question_part
                options = []
            
            answer = re.sub(r'^[WXYZ]\s*--\s*', '', answer_part, flags=re.IGNORECASE).strip()
            
            # Create Pydantic model instance for validation
            q = Question(
                question_text=question_text,
                options=options,
                answer=answer,
                question_type=question_type
            )
            questions.append(q)
            
        except Exception as e:
            continue
    
    return QuestionList(questions=questions)

# Test with biology file
bio_text = extract_text_from_pdf('./../data/biolset2.pdf')
print(f"Biology PDF: {len(bio_text)} characters\n")

# Show first 1000 chars to see format
print("First 1000 chars:")
print(bio_text[:1000])
print("\n" + "="*80)


Biology PDF: 20375 characters

First 1000 chars:
Science Bowl Questions Biology  - 1
Science Bowl Questions ‚Äì Biology, Set 2
1. Multiple Choice: The adult human of average age and size has approximately how many quarts of
blood? Is it:
a) 4
b) 6
c) 8
d) 10
ANSWER: B -- 6
2. Multiple Choice: Once the erythrocytes enter the blood in humans, it is estimated that they have an
average lifetime of how many days. Is it:
a) 10 days
b) 120 days
c) 200 days
d) 360 days
ANSWER: B -- 120 Days
3. Multiple Choice: Of the following, which mechanisms are important in the death of erythrocytes ( pron:
eh-rith-reh-sites) in human blood? Is it
a) phagocytosis ( pron: fag-eh-seh-toe-sis)
b) hemolysis
c) mechanical damage
d) all of the above
ANSWER: D -- all of the above
4. Multiple Choice: Surplus red blood cells, needed to meet an emergency, are MAINLY stored in what
organ of the human body? Is it the:
a) pancreas
b) spleen
c) liver
d) kidneys
ANSWER: B ‚Äì spleen
5. Multiple Choice: When a human donor

In [12]:
# Test extraction on biology format
bio_questions = extract_with_pydantic_validation(bio_text)
print(f"Extracted {len(bio_questions.questions)} questions from biology file\n")

if len(bio_questions.questions) > 0:
    print("First 3 questions:")
    for i, q in enumerate(bio_questions.questions[:3], 1):
        print(f"\n{i}. {q.question_text[:80]}...")
        print(f"   Type: {q.question_type}")
        if q.options:
            print(f"   Options ({len(q.options)}): {q.options}")
        print(f"   Answer: {q.answer}")
else:
    print("‚ùå No questions extracted - format not recognized")
    print("\nLet's check for different patterns:")
    print(f"  Numbered questions (1., 2., etc): {len(re.findall(r'^\\d+\\.', bio_text, re.MULTILINE))}")
    print(f"  'Multiple Choice:' : {len(re.findall(r'Multiple Choice:', bio_text))}")
    print(f"  'ANSWER:' : {len(re.findall(r'ANSWER:', bio_text, re.IGNORECASE))}")


Extracted 0 questions from biology file

‚ùå No questions extracted - format not recognized

Let's check for different patterns:
  Numbered questions (1., 2., etc): 0
  'Multiple Choice:' : 82
  'ANSWER:' : 118


In [13]:
# Create parser for numbered question format (Biology format)
def parse_numbered_format(text):
    """Parse questions in numbered format (1. 2. 3. etc with a) b) c) d) options)"""
    questions = []
    
    # Split by question numbers at start of line
    parts = re.split(r'\n(\d+)\.\s+Multiple Choice:', text)
    
    for i in range(1, len(parts)-1, 2):
        try:
            question_num = parts[i].strip()
            content = parts[i+1]
            
            # Skip if no ANSWER
            if 'ANSWER:' not in content.upper():
                continue
            
            # Split by ANSWER
            split_by_answer = re.split(r'\nANSWER:\s*', content, flags=re.IGNORECASE, maxsplit=1)
            if len(split_by_answer) < 2:
                continue
            
            question_part = split_by_answer[0].strip()
            answer_part = split_by_answer[1].split('\n')[0].strip()
            
            # Find where options start (look for a))
            option_match = re.search(r'\n\s*a\)', question_part, re.IGNORECASE)
            if option_match:
                q_text = question_part[:option_match.start()].strip()
                options_text = question_part[option_match.start():].strip()
                
                # Extract options (a, b, c, d)
                option_parts = re.split(r'([abcd]\))', options_text, flags=re.IGNORECASE)
                options = []
                for j in range(1, len(option_parts), 2):
                    if j + 1 < len(option_parts):
                        option_text = ' '.join(option_parts[j + 1].strip().split())
                        if option_text:
                            options.append(option_text)
                
                question_text = q_text
            else:
                question_text = question_part
                options = []
            
            # Clean answer
            answer = re.sub(r'^[ABCD]\s*[-‚Äì‚Äî]\s*', '', answer_part, flags=re.IGNORECASE).strip()
            
            # Create Pydantic model
            q = Question(
                question_text=question_text,
                options=options,
                answer=answer,
                question_type='multiple_choice'
            )
            questions.append(q)
            
        except Exception as e:
            print(f"Error parsing question {i}: {e}")
            continue
    
    return QuestionList(questions=questions)

# Test on biology file
bio_questions = parse_numbered_format(bio_text)
print(f"‚úì Extracted {len(bio_questions.questions)} questions from biology file\n")

# Show first 3
for i, q in enumerate(bio_questions.questions[:3], 1):
    print(f"Question {i}:")
    print(f"  Text: {q.question_text[:80]}...")
    print(f"  Type: {q.question_type}")
    print(f"  Options ({len(q.options)}): {q.options}")
    print(f"  Answer: {q.answer}")
    print()


‚úì Extracted 82 questions from biology file

Question 1:
  Text: The adult human of average age and size has approximately how many quarts of
blo...
  Type: multiple_choice
  Options (4): ['4', '6', '8', '10']
  Answer: - 6

Question 2:
  Text: Once the erythrocytes enter the blood in humans, it is estimated that they have ...
  Type: multiple_choice
  Options (4): ['10 days', '120 days', '200 days', '360 days']
  Answer: - 120 Days

Question 3:
  Text: Of the following, which mechanisms are important in the death of erythrocytes ( ...
  Type: multiple_choice
  Options (4): ['phagocytosis ( pron: fag-eh-seh-toe-sis)', 'hemolysis', 'mechanical damage', 'all of the above']
  Answer: - all of the above



In [15]:
# Create unified extractor that handles both formats
class UnifiedQuestionExtractor:
    """Extract questions from PDFs - handles multiple formats"""
    
    def __init__(self):
        pass
    
    def _extract_text_from_pdf(self, pdf_path):
        """Extract all text from PDF"""
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            text = ""
            for page in pdf_reader.pages:
                text += page.extract_text() + "\n"
        return text
    
    def _parse_science_bowl_format(self, text):
        """Parse PHYS-XX; format (Science Bowl with wxyz options)"""
        questions = []
        parts = re.split(r'([A-Z]+-\d+;)', text)
        
        for i in range(1, len(parts)-1, 2):
            try:
                content = parts[i+1]
                if 'ANSWER:' not in content.upper():
                    continue
                
                split_by_answer = re.split(r'\nANSWER:\s*', content, flags=re.IGNORECASE, maxsplit=1)
                if len(split_by_answer) < 2:
                    continue
                
                question_part = split_by_answer[0].strip()
                answer_part = split_by_answer[1].split('\n')[0].strip()
                
                if 'Multiple Choice:' in question_part:
                    question_type = 'multiple_choice'
                    question_text = question_part.split('Multiple Choice:', 1)[1].strip()
                    
                    option_section_match = re.search(r'w\)', question_text, re.IGNORECASE)
                    if option_section_match:
                        q_text = question_text[:option_section_match.start()].strip()
                        options_text = question_text[option_section_match.start():].strip()
                        option_parts = re.split(r'([wxyz]\))', options_text, flags=re.IGNORECASE)
                        
                        options = []
                        for j in range(1, len(option_parts), 2):
                            if j + 1 < len(option_parts):
                                option_text = ' '.join(option_parts[j + 1].strip().split())
                                if option_text:
                                    options.append(option_text)
                        question_text = q_text
                    else:
                        options = []
                        
                elif 'Short Answer:' in question_part:
                    question_type = 'short_answer'
                    question_text = question_part.split('Short Answer:', 1)[1].strip()
                    options = []
                else:
                    question_type = 'short_answer'
                    question_text = question_part
                    options = []
                
                answer = re.sub(r'^[WXYZ]\s*--\s*', '', answer_part, flags=re.IGNORECASE).strip()
                
                q = Question(
                    question_text=question_text,
                    options=options,
                    answer=answer,
                    question_type=question_type
                )
                questions.append(q)
            except:
                continue
        
        return questions
    
    def _parse_numbered_format(self, text):
        """Parse numbered format (1. 2. 3. with abcd options)"""
        questions = []
        parts = re.split(r'\n(\d+)\.\s+Multiple Choice:', text)
        
        for i in range(1, len(parts)-1, 2):
            try:
                content = parts[i+1]
                if 'ANSWER:' not in content.upper():
                    continue
                
                split_by_answer = re.split(r'\nANSWER:\s*', content, flags=re.IGNORECASE, maxsplit=1)
                if len(split_by_answer) < 2:
                    continue
                
                question_part = split_by_answer[0].strip()
                answer_part = split_by_answer[1].split('\n')[0].strip()
                
                option_match = re.search(r'\n\s*a\)', question_part, re.IGNORECASE)
                if option_match:
                    q_text = question_part[:option_match.start()].strip()
                    options_text = question_part[option_match.start():].strip()
                    
                    option_parts = re.split(r'([abcd]\))', options_text, flags=re.IGNORECASE)
                    options = []
                    for j in range(1, len(option_parts), 2):
                        if j + 1 < len(option_parts):
                            option_text = ' '.join(option_parts[j + 1].strip().split())
                            if option_text:
                                options.append(option_text)
                    question_text = q_text
                else:
                    question_text = question_part
                    options = []
                
                answer = re.sub(r'^[ABCD]\s*[-‚Äì‚Äî]\s*', '', answer_part, flags=re.IGNORECASE).strip()
                
                q = Question(
                    question_text=question_text,
                    options=options,
                    answer=answer,
                    question_type='multiple_choice'
                )
                questions.append(q)
            except:
                continue
        
        return questions
    
    def extract(self, pdf_path):
        """Extract questions from PDF - auto-detects format"""
        text = self._extract_text_from_pdf(pdf_path)
        
        # Try Science Bowl format first
        questions = self._parse_science_bowl_format(text)
        if len(questions) > 0:
            print(f"‚úì Detected Science Bowl format")
            return QuestionList(questions=questions)
        
        # Try numbered format
        questions = self._parse_numbered_format(text)
        if len(questions) > 0:
            print(f"‚úì Detected numbered format")
            return QuestionList(questions=questions)
        
        print("‚ùå No questions found - format not recognized")
        return QuestionList(questions=[])

# Test unified extractor on both files
print("="*80)
print("TESTING UNIFIED EXTRACTOR")
print("="*80 + "\n")

extractor = UnifiedQuestionExtractor()

# Test physics file
print("1. Physics file (97_phys.pdf):")
phys_result = extractor.extract('./../data/97_phys.pdf')
print(f"   Extracted: {len(phys_result.questions)} questions\n")

# Test biology file  
print("2. Biology file (biolset2.pdf):")
bio_result = extractor.extract('./../data/biolset2.pdf')
print(f"   Extracted: {len(bio_result.questions)} questions\n")

# Test chemistry file  
print("3. Chemistry file (chemistry.pdf):")
chem_result = extractor.extract('./../data/chemistry.pdf')
print(f"   Extracted: {len(chem_result.questions)} questions\n")

print("="*80)
print("‚úì Unified extractor works with all three formats!")


TESTING UNIFIED EXTRACTOR

1. Physics file (97_phys.pdf):
‚úì Detected Science Bowl format
   Extracted: 672 questions

2. Biology file (biolset2.pdf):
‚úì Detected numbered format
   Extracted: 82 questions

3. Chemistry file (chemistry.pdf):
‚úì Detected Science Bowl format
   Extracted: 672 questions

2. Biology file (biolset2.pdf):
‚úì Detected numbered format
   Extracted: 82 questions

3. Chemistry file (chemistry.pdf):
‚ùå No questions found - format not recognized
   Extracted: 0 questions

‚úì Unified extractor works with all three formats!
‚ùå No questions found - format not recognized
   Extracted: 0 questions

‚úì Unified extractor works with all three formats!


### Alternative: Use existing regex parser but wrap with Pydantic

Since Ollama needs more memory, let's use our working regex parser but convert results to Pydantic models for type safety and validation.

## Experiment: Pydantic + Ollama for Structured Extraction

Let's test using Pydantic models with Ollama for flexible question extraction that works with any PDF format.

In [20]:
from docstrange import DocumentExtractor

# Initialize extractor (cloud mode by default)
extractor = DocumentExtractor()


# Convert any document to clean markdown
result = extractor.extract("./../data/biolset2.pdf")

markdown = result.extract_markdown()
print(markdown)
print("successful")


successful
