# Data Exploration - Supermarket Brochures

This notebook explores the collected brochure data and analyzes OCR results.

In [None]:
import os
import sys
from pathlib import Path
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import cv2

# Add src to path
sys.path.append(str(Path.cwd().parent / 'src'))

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

%matplotlib inline

## 1. Data Collection Summary

In [None]:
# Check available data: count the collected brochure files and group them
# by supermarket (the name of the directory that directly contains each file).
data_dir = Path('../data/raw')
brochure_suffixes = {'.jpg', '.jpeg', '.png', '.pdf'}

# Defined up front so that later cells can safely test `if image_files:`
# even when nothing has been collected yet.
files = []
image_files = []
supermarkets = {}

if data_dir.exists():
    # The recursive glob also yields directories; keep regular files only so
    # the totals are real file counts. Sorting makes "the first file" stable
    # across runs and platforms.
    files = sorted(p for p in data_dir.glob('**/*') if p.is_file())
    image_files = [p for p in files if p.suffix.lower() in brochure_suffixes]
    
    print(f"Total files: {len(files)}")
    print(f"Image/PDF files: {len(image_files)}")
    
    # Group by supermarket
    for path in image_files:
        supermarkets.setdefault(path.parent.name, []).append(path)
    
    print("\nFiles by supermarket:")
    # Dedicated loop variable so the full `files` listing is not overwritten.
    for sm, sm_files in supermarkets.items():
        print(f"  {sm}: {len(sm_files)} files")
else:
    print("No data collected yet. Run the scraper first.")

## 2. Sample Brochure Visualization

In [None]:
# Display a sample brochure.
# `image_files` may also contain PDFs, which PIL cannot open for display,
# so pick the first entry that is not a PDF.
sample_path = next((p for p in image_files if p.suffix.lower() != '.pdf'), None)

if sample_path is not None:
    # Context manager closes the underlying file once the figure is drawn.
    with Image.open(sample_path) as sample_image:
        plt.figure(figsize=(12, 8))
        plt.imshow(sample_image)
        plt.axis('off')
        plt.title(f"Sample Brochure: {sample_path.name}")
        plt.show()
else:
    print("No images to display")

## 3. OCR Results Analysis

In [None]:
# Load OCR results if available and print a short summary of the first file.
processed_dir = Path('../data/processed')

# Defaults so that downstream cells can test `if json_files:` and call
# `ocr_result.get(...)` even when no OCR output exists yet.
json_files = []
ocr_result = {}

if processed_dir.exists():
    # Sorted so the "sample" result is the same file on every run.
    json_files = sorted(processed_dir.glob('**/*.json'))
    
    if json_files:
        # Load a sample result. Explicit UTF-8 so non-ASCII OCR text is not
        # decoded with a platform-dependent default encoding.
        with open(json_files[0], 'r', encoding='utf-8') as f:
            ocr_result = json.load(f)
        
        text_boxes = ocr_result.get('text_boxes', [])
        
        print(f"OCR Engine: {ocr_result.get('ocr_engine', 'N/A')}")
        # Fall back to the actual number of boxes when 'num_boxes' is absent.
        print(f"Number of text boxes: {ocr_result.get('num_boxes', len(text_boxes))}")
        
        # Display first few text boxes
        print("\nSample extracted text:")
        for i, box in enumerate(text_boxes[:10]):
            print(f"{i+1}. {box['text']} (confidence: {box['confidence']:.2f})")
    else:
        print("No OCR results found. Run the OCR pipeline first.")
else:
    print("No processed data yet.")

## 4. Text Confidence Distribution

In [None]:
# Analyze confidence scores of the loaded OCR result: histogram plus
# summary statistics.
if json_files:
    confidences = [box['confidence'] for box in ocr_result.get('text_boxes', [])]
    
    if confidences:
        # Computed once and reused for the marker line, legend and summary.
        mean_confidence = np.mean(confidences)
        
        plt.figure(figsize=(10, 5))
        plt.hist(confidences, bins=20, edgecolor='black', alpha=0.7)
        plt.xlabel('Confidence Score')
        plt.ylabel('Frequency')
        plt.title('Distribution of OCR Confidence Scores')
        plt.axvline(mean_confidence, color='red', linestyle='--', label=f'Mean: {mean_confidence:.2f}')
        plt.legend()
        plt.show()
        
        print(f"Mean confidence: {mean_confidence:.3f}")
        print(f"Median confidence: {np.median(confidences):.3f}")
        print(f"Min confidence: {np.min(confidences):.3f}")
        print(f"Max confidence: {np.max(confidences):.3f}")
    else:
        # np.min / np.max raise on an empty sequence and np.mean yields NaN,
        # so report the situation instead of plotting.
        print("No text boxes with confidence scores in this OCR result.")

## 5. Bounding Box Visualization

In [None]:
# Visualize bounding boxes on image, coloured by OCR confidence.
# NOTE(review): this pairs image_files[0] with the OCR result loaded from
# json_files[0]; nothing here checks that they describe the same brochure.
if image_files and json_files:
    # Load image
    img = cv2.imread(str(image_files[0]))
    
    if img is None:
        # cv2.imread returns None (it does not raise) when the file cannot be
        # decoded, e.g. when the first entry is a PDF.
        print(f"Could not read {image_files[0].name} as an image; skipping bounding boxes.")
    else:
        # OpenCV loads BGR; matplotlib expects RGB.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        # Draw bounding boxes
        for box in ocr_result.get('text_boxes', []):
            bbox = box['bbox']
            # The colour formula assumes a confidence in [0, 1]; clamp so an
            # out-of-range value cannot produce an invalid colour component.
            confidence = min(max(float(box['confidence']), 0.0), 1.0)
            
            # Color based on confidence (green = high, red = low)
            color = (int(255 * (1 - confidence)), int(255 * confidence), 0)
            
            # cv2.rectangle needs integer pixel coordinates; JSON may carry floats.
            top_left = (int(bbox['x_min']), int(bbox['y_min']))
            bottom_right = (int(bbox['x_max']), int(bbox['y_max']))
            
            cv2.rectangle(img, top_left, bottom_right, color, 2)
        
        plt.figure(figsize=(15, 10))
        plt.imshow(img)
        plt.axis('off')
        plt.title('OCR Bounding Boxes (Green = High Confidence, Red = Low Confidence)')
        plt.show()

## 6. Entity Pattern Analysis

In [None]:
# Pattern analysis for prices, products, etc.
import re

if json_files:
    texts = [box['text'] for box in ocr_result.get('text_boxes', [])]
    
    # Find potential prices: digits, a comma or dot, two decimals and an
    # optional euro sign. The euro sign is written as the escape \u20ac so the
    # pattern cannot be corrupted by a re-encoding of the notebook file.
    price_pattern = re.compile(r'\d+[,.]\d{2}\s*\u20ac?')
    prices = [text for text in texts if price_pattern.search(text)]
    
    # Find percentages (discounts)
    percent_pattern = re.compile(r'\d+\s*%')
    discounts = [text for text in texts if percent_pattern.search(text)]
    
    print(f"Potential prices found: {len(prices)}")
    print("Sample prices:", prices[:10])
    print(f"\nPotential discounts found: {len(discounts)}")
    print("Sample discounts:", discounts[:10])

## Next Steps

1. Run the data collection script to download more brochures.
2. Process the images with the OCR pipeline.
3. Manually annotate a sample of the data for training.
4. Train the entity recognition model.
5. Evaluate the results and refine.