# OCR & Question-Answer Segmentation Demo

This notebook demonstrates the complete pipeline for processing handwritten exam images and extracting question-answer pairs using classical CV/ML techniques (no LLMs).

## Setup

In [None]:
# Put ../src at the front of the module search path so the project-local
# modules imported below can be found. The path is relative, so it is
# resolved against the current working directory of the kernel.
import sys
sys.path.insert(0, '../src')

# Numerical, imaging, plotting and filesystem helpers.
import numpy as np
import cv2
import matplotlib.pyplot as plt
from pathlib import Path

# Pipeline components (project-local; presumably located under ../src).
from preprocessing import ImagePreprocessor
from ocr_engine import OCREngine
from feature_extraction import FeatureExtractor
from crf_model import CRFModel
from postprocessing import QAPairExtractor
from utils import create_synthetic_training_data

# Reaching this line means every import above succeeded.
print("✓ All modules imported successfully")

## 1. Train CRF Model (Synthetic Data)

For demonstration purposes, we'll train a simple model on synthetic data. In production, you would use real annotated exam images.

In [None]:
# Create synthetic training data.
# As indexed below, X_train is a list of samples and each sample is a sequence
# of per-line feature dicts (X_train[0][0] is the first line of the first sample).
print("Creating synthetic training data...")
X_train, y_train = create_synthetic_training_data(n_samples=50)

first_sample = X_train[0]
first_line_features = first_sample[0]

print(f"Training samples: {len(X_train)}")
# len(X_train[0]) is the number of lines in the first sample, not a feature
# count, so the two quantities are reported separately.
print(f"Lines in first sample: {len(first_sample)}")
print(f"Features per line: {len(first_line_features)}")
print("\nSample features (first line):")
# Only the first five features are shown to keep the output short.
for key, value in list(first_line_features.items())[:5]:
    print(f"  {key}: {value}")

In [None]:
# Fit the CRF on the synthetic sequences, then report the training F1 that
# the trainer returns under the 'train_f1' key.
print("Training CRF model...")
model = CRFModel(max_iterations=100)
results = model.train(X_train, y_train)

print("\n✓ Training complete!")
print("Training F1: {:.4f}".format(results['train_f1']))

In [None]:
# Persist the trained model. The parent directory is created first so the
# save does not fail on a fresh checkout; re-running the cell is harmless.
model_path = '../models/demo_crf_model.pkl'
Path(model_path).parent.mkdir(exist_ok=True, parents=True)
model.save(model_path)
print("Model saved to: {}".format(model_path))

## 2. Process Sample Exam Image

**Note:** To run this section on real images, place sample exam images in `examples/sample_data/`. Until then, the preprocessing (2.1) and OCR (2.2) cells only print a skip notice, and the remaining steps (2.3–2.5) are demonstrated on hand-written mock OCR lines.

### 2.1 Image Preprocessing

In [None]:
# Optional: preprocessing of real exam images.
# The block below is disabled because no sample images are provided. To enable
# it, add images under ../examples/sample_data/, adjust `image_paths`, and
# uncomment everything up to plt.show().
# NOTE(review): the first panel is titled 'Original' but displays the
# 'resized' intermediate - confirm that this is the intended label.

# image_paths = [
#     '../examples/sample_data/exam_page1.jpg',
#     '../examples/sample_data/exam_page2.jpg'
# ]

# preprocessor = ImagePreprocessor(target_width=1200, enable_deskew=True)
# processed, intermediate = preprocessor.process(image_paths, return_intermediate=True)

# # Visualize preprocessing steps
# fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# axes[0].imshow(intermediate['resized'][0], cmap='gray')
# axes[0].set_title('Original')
# axes[0].axis('off')

# axes[1].imshow(intermediate['deskewed'][0], cmap='gray')
# axes[1].set_title('Deskewed')
# axes[1].axis('off')

# axes[2].imshow(processed, cmap='gray')
# axes[2].set_title('Final (Stitched & Denoised)')
# axes[2].axis('off')

# plt.tight_layout()
# plt.show()

# With the block above disabled, the cell only reports that the step is skipped.
print("[DEMO] Skipping image preprocessing - no sample images provided")
print("To run with real images, add exam images to examples/sample_data/ and uncomment above code")

### 2.2 OCR Extraction

In [None]:
# Optional: OCR on real images (disabled - no sample images are provided).
# The block reads `processed`, which is only defined once the preprocessing
# cell has been uncommented and run, so enable that cell first.

# ocr = OCREngine(engine='paddleocr', lang='en')
# lines = ocr.extract_lines(processed)

# print(f"Extracted {len(lines)} text lines\n")
# print("Sample lines:")
# for i, line in enumerate(lines[:5]):
#     print(f"  {i+1}. {line.text} (conf: {line.confidence:.2%})")

# With the block above disabled, the cell only reports that the step is skipped.
print("[DEMO] Skipping OCR - no sample images provided")

### 2.3 Feature Extraction Demo

In [None]:
# Feature-extraction demo on hand-written OCR lines (no image required).
from ocr_engine import OCRLine

# One question line followed by a two-line answer. Each entry is
# (text, bbox, confidence); the line number is the entry's position.
mock_lines = [
    OCRLine(text=text, bbox=bbox, confidence=confidence, line_number=index)
    for index, (text, bbox, confidence) in enumerate([
        ("Q1. What is machine learning?", (50, 100, 400, 30), 0.95),
        ("Machine learning is a subset of AI", (80, 150, 450, 25), 0.88),
        ("that enables systems to learn from data.", (80, 180, 420, 25), 0.90),
    ])
]

# Compute per-line features for a nominal 800x1200 page.
feature_extractor = FeatureExtractor(image_width=800, image_height=1200)
features = feature_extractor.extract_features(mock_lines)

# Dump every feature of the first line, names padded to a fixed width.
print("Features for first line:")
for name, val in features[0].items():
    print("  {:25s}: {}".format(name, val))

### 2.4 CRF Prediction

In [None]:
# Reshape the extracted features into the CRF input format and tag the
# mock lines with the model trained earlier in the notebook.
crf_features = feature_extractor.features_to_crf_format(features)
tags = model.predict_single(crf_features)

# Show each predicted tag next to the text of the line it was assigned to.
print("Predicted tags:")
for ocr_line, label in zip(mock_lines, tags):
    print("  [{:5s}] {}".format(label, ocr_line.text))

### 2.5 QA Pair Extraction

In [None]:
# Group the tagged mock lines into question-answer pairs.
extractor = QAPairExtractor(min_confidence=0.3)
pairs = extractor.extract_pairs(mock_lines, tags)

# Render the pairs as text and print the result.
formatted_pairs = extractor.pairs_to_formatted_text(pairs)
print(formatted_pairs)

## 3. Feature Importance Analysis

In [None]:
# Show top features for each label, using the model trained on the synthetic
# data earlier in the notebook.
# NOTE(review): presumably prints the 10 strongest learned weights per label;
# confirm the exact output against CRFModel.print_feature_weights.
model.print_feature_weights(top_n=10)

## 4. Complete End-to-End Pipeline Function

In [None]:
def process_exam(image_paths, model_path):
    """
    Run the complete exam-processing pipeline and return the QA pairs.

    The stages run in order: image preprocessing, OCR, per-line feature
    extraction, CRF tagging with a previously saved model, and grouping of
    the tagged lines into question-answer pairs. A progress line is printed
    for each stage.

    Args:
        image_paths: List of image file paths
        model_path: Path to trained CRF model

    Returns:
        List of QA pairs. An empty list is returned when OCR finds no text
        lines; in that case the model is never loaded.
    """
    # 1. Preprocessing
    print("[1/5] Preprocessing...")
    preprocessor = ImagePreprocessor()
    processed = preprocessor.process(image_paths)

    # 2. OCR
    print("[2/5] Running OCR...")
    ocr = OCREngine(engine='paddleocr')
    lines = ocr.extract_lines(processed)
    print(f"  Extracted {len(lines)} lines")

    # Nothing to tag: stop here instead of pushing an empty sequence through
    # feature extraction and the CRF. len() is used rather than truthiness so
    # the check works for any sized container.
    if len(lines) == 0:
        print("  No text lines found - returning no QA pairs")
        return []

    # 3. Feature extraction
    print("[3/5] Extracting features...")
    img_width, img_height = ocr.get_image_dimensions(processed)
    feature_extractor = FeatureExtractor(img_width, img_height)
    features = feature_extractor.extract_features(lines)
    crf_features = feature_extractor.features_to_crf_format(features)

    # 4. CRF prediction
    # A distinct local name avoids shadowing the notebook-level `model`.
    print("[4/5] Running CRF model...")
    crf_model = CRFModel()
    crf_model.load(model_path)
    tags = crf_model.predict_single(crf_features)

    # 5. Extract pairs
    print("[5/5] Extracting QA pairs...")
    extractor = QAPairExtractor()
    pairs = extractor.extract_pairs(lines, tags)
    print(f"  Extracted {len(pairs)} pairs")

    return pairs

# Confirm the definition and show an example call. The file names in the hint
# are placeholders; note that the model saved earlier in this notebook is
# written to '../models/demo_crf_model.pkl', not the path shown here.
print("✓ Pipeline function defined")
print("\nTo use: pairs = process_exam(['exam1.jpg', 'exam2.jpg'], 'models/crf_model.pkl')")

## Summary

This notebook demonstrated:

1. ✅ **CRF Model Training** on synthetic data
2. ✅ **Feature Extraction** from OCR lines (visual + text features)
3. ✅ **Sequence Labeling** using CRF
4. ✅ **QA Pair Extraction** from tagged sequences
5. ✅ **Feature Importance** analysis

### Next Steps:

- Add real exam images to `examples/sample_data/`
- Annotate training data using `scripts/annotate.py`
- Train production model with real data
- Process full exam sets using `scripts/inference.py`

**Remember:** This system uses classical CV/ML techniques (CRF, handcrafted features), **NOT Large Language Models**, ensuring interpretability and resource efficiency!