# OCR Exploration for Receipt Processing

This notebook explores OCR capabilities for extracting text from receipts.

## Objectives:
1. Test OCR accuracy with different receipt types
2. Compare preprocessing techniques
3. Evaluate text extraction quality
4. Optimize parameters for best results

In [None]:
# Import required libraries
# Standard library first; `sys` is only needed for the path tweak below.
import sys
# Add the parent directory to the module search path so the project-local
# `backend` package imported below can be resolved. The relative '..' is
# interpreted against the kernel's working directory - presumably the folder
# this notebook lives in; verify if the `backend` imports fail.
sys.path.append('..')

# Third-party libraries: image I/O (cv2), arrays, plotting, paths, tables.
import cv2
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import pandas as pd

# Project-local OCR service object and image-processing helpers.
# NOTE(review): `deskew_image` is imported but not called in any cell visible
# in this notebook - keep only if a later cell needs it.
from backend.services.ocr_service import ocr_service
from backend.utils.image_processing import (
    preprocess_image,
    enhance_receipt_image,
    deskew_image
)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size as (width, height) in inches; cells that pass their own
# `figsize=` override this value.
plt.rcParams['figure.figsize'] = (15, 10)

## 1. Load Sample Receipts

In [None]:
# Collect the sample receipt images (JPG and PNG) from the raw-data folder.
data_dir = Path('../data/raw')

# Path.glob yields entries in arbitrary, filesystem-dependent order. Later
# cells rely on sample_receipts[0] and sample_receipts[:10], so sort the
# combined list to make them pick the same files on every machine and on
# every "Restart & Run All".
sample_receipts = sorted(
    list(data_dir.glob('*.jpg')) + list(data_dir.glob('*.png'))
)

print(f"Found {len(sample_receipts)} receipt images")
# Preview only the first few file names to keep the cell output short.
for receipt in sample_receipts[:5]:
    print(f"  - {receipt.name}")

## 2. Test Basic OCR

In [None]:
# Basic OCR smoke test on the first receipt: show the image, then print the
# text, confidence and block count reported by the OCR service.
if sample_receipts:
    test_receipt = str(sample_receipts[0])

    # cv2.imread does not raise for a missing or undecodable file - it returns
    # None, which would make cvtColor below fail with an opaque OpenCV
    # assertion. Fail early with a message that names the offending file.
    img = cv2.imread(test_receipt)
    if img is None:
        raise OSError(f"cv2.imread could not read or decode: {test_receipt}")

    # imread's result is converted BGR -> RGB before handing it to matplotlib.
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Display image
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.imshow(img_rgb)
    ax.set_title('Original Receipt')
    ax.axis('off')
    plt.show()

    # Extract text. The service is given the file path, not the decoded array.
    result = ocr_service.extract_text(test_receipt)

    print("\nExtracted Text:")
    print("=" * 50)
    print(result['full_text'])
    print("=" * 50)
    # The ':.2%' format renders the confidence as a percentage, so the value
    # is presumably a 0-1 fraction - TODO confirm against the OCR service.
    print(f"\nConfidence: {result['confidence']:.2%}")
    print(f"Number of blocks: {result['num_blocks']}")

## 3. Compare Preprocessing Techniques

In [None]:
# Side-by-side comparison of the raw receipt against the two preprocessing
# helpers, all applied to the first sample receipt.
if sample_receipts:
    test_image = str(sample_receipts[0])

    # cv2.imread returns None (instead of raising) when the file is missing or
    # cannot be decoded; check explicitly so the failure names the file rather
    # than surfacing as an OpenCV assertion inside cvtColor.
    original = cv2.imread(test_image)
    if original is None:
        raise OSError(f"cv2.imread could not read or decode: {test_image}")

    # Different preprocessing. Both helpers receive the file path.
    preprocessed = preprocess_image(test_image)
    enhanced = enhance_receipt_image(test_image)

    # One (title, image, colormap) entry per panel. The original is converted
    # BGR -> RGB for matplotlib. cmap='gray' only takes effect on
    # single-channel arrays; presumably both helpers return grayscale images
    # - TODO confirm.
    panels = [
        ('Original', cv2.cvtColor(original, cv2.COLOR_BGR2RGB), None),
        ('Preprocessed', preprocessed, 'gray'),
        ('Enhanced', enhanced, 'gray'),
    ]

    # Display comparison
    fig, axes = plt.subplots(1, len(panels), figsize=(18, 6))
    for ax, (title, image, cmap) in zip(axes, panels):
        ax.imshow(image, cmap=cmap)
        ax.set_title(title)
        ax.axis('off')

    plt.tight_layout()
    plt.show()

## 4. Batch Processing and Accuracy Analysis

In [None]:
# Run OCR over a small batch of receipts and summarise per-file metrics.

# Column order of the summary table. Declaring it up front keeps the frame's
# schema stable even when no receipt was processed.
summary_columns = ['filename', 'confidence', 'num_blocks', 'text_length', 'success']

results = []

for receipt_path in sample_receipts[:10]:  # Process first 10
    result = ocr_service.extract_text(str(receipt_path))

    results.append({
        'filename': receipt_path.name,
        'confidence': result['confidence'],
        'num_blocks': result['num_blocks'],
        'text_length': len(result['full_text']),
        'success': result['success']
    })

# Create DataFrame. Without explicit columns an empty `results` list yields a
# frame with no columns at all, and the column selection below would raise
# KeyError.
df_results = pd.DataFrame(results, columns=summary_columns)

if df_results.empty:
    # No images were found or processed; there is nothing to describe.
    print("No receipts were processed; nothing to summarise.")
else:
    print("OCR Results Summary:")
    print(df_results)
    print("\nStatistics:")
    print(df_results[['confidence', 'num_blocks', 'text_length']].describe())

## 5. Visualize OCR Confidence Distribution

In [None]:
# Histogram of per-receipt OCR confidence with the mean marked as a dashed line.
if results:
    confidence = df_results['confidence']
    mean_confidence = confidence.mean()

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(confidence, bins=20, edgecolor='black')
    ax.set_xlabel('Confidence Score')
    ax.set_ylabel('Frequency')
    ax.set_title('OCR Confidence Distribution')

    # Vertical reference line at the mean; its label feeds the legend.
    ax.axvline(
        mean_confidence,
        color='red',
        linestyle='--',
        label=f'Mean: {mean_confidence:.2f}',
    )
    ax.legend()
    ax.grid(alpha=0.3)
    plt.show()

## 6. Test Layout Extraction

In [None]:
# Layout-aware extraction on the first receipt: list the leading text blocks
# with their confidence, one per line.
if sample_receipts:
    first_receipt = str(sample_receipts[0])
    blocks = ocr_service.extract_with_layout(first_receipt)

    # The heading states the blocks are ordered by position; that ordering is
    # whatever the service returns - nothing is re-sorted here.
    print("Text Blocks (ordered by position):")
    print("=" * 60)

    # Show first 20, numbered from 1; text is padded to 40 characters so the
    # confidence column lines up.
    for rank, block in enumerate(blocks[:20], start=1):
        text = block['text']
        conf = block['confidence']
        print(f"{rank:2d}. {text:40s} (conf: {conf:.2f})")

## 7. Recommendations

Based on the analysis above:

1. **Optimal Preprocessing**: [To be filled based on results]
2. **Confidence Threshold**: [To be determined]
3. **Best Practices**: [To be documented]

## Next Steps

1. Implement automatic quality detection
2. Add retry logic for low-confidence extractions
3. Fine-tune preprocessing parameters
4. Create validation pipeline