# In [None]:
import cv2
import numpy as np
from io import BytesIO
from PIL import Image
from typing import Tuple, Dict, Any
from scipy import stats

class WhiteOutDetector:
    """Physical document fraud detection: flags probable white-out patches.

    The detector looks for abrupt brightness changes inside the bright
    ("white") part of a scanned document. The working assumption, taken from
    the original author, is that paper and correction fluid sit at different
    brightness levels (roughly 220-240 vs 245-255).

    Input images are assumed to be 8-bit (uint8); the thresholds and the
    colour-map calls below are only meaningful for that depth.
    """

    def __init__(self,
                 white_threshold: int = 220,
                 brightness_jump_threshold: int = 15,
                 min_region_size: int = 50,
                 morph_kernel_size: int = 5):
        """
        Initialize detector focused on brightness discontinuities in white areas

        Args:
            white_threshold: Minimum brightness to consider as "white" region
            brightness_jump_threshold: Minimum brightness difference to flag as suspicious
            min_region_size: Minimum pixel area for suspicious regions
            morph_kernel_size: Kernel size for morphological operations
        """
        self.white_threshold = white_threshold
        self.brightness_jump_threshold = brightness_jump_threshold
        self.min_region_size = min_region_size
        self.kernel = cv2.getStructuringElement(
            cv2.MORPH_ELLIPSE, (morph_kernel_size, morph_kernel_size)
        )

    @staticmethod
    def _prepare_image(image: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Validate the input and return it as a (BGR image, grayscale image) pair.

        Raises:
            ValueError: if ``image`` is not a non-empty array (e.g. the ``None``
                that ``cv2.imread`` returns for an unreadable path) or has an
                unsupported channel layout.
        """
        # Validation happens before any OpenCV call so a failed load produces
        # a clear message instead of an opaque cvtColor assertion.
        if not isinstance(image, np.ndarray) or image.size == 0:
            raise ValueError(
                "image must be a non-empty numpy array "
                "(cv2.imread returns None when the file cannot be read)"
            )

        if image.ndim == 2:
            # Already single-channel: keep it as the gray plane and build a
            # 3-channel copy so the composite panels can be stacked.
            return cv2.cvtColor(image, cv2.COLOR_GRAY2BGR), image

        if image.ndim == 3 and image.shape[2] == 4:
            # Drop the alpha channel; the colour-mapped panels are 3-channel.
            image = cv2.cvtColor(image, cv2.COLOR_BGRA2BGR)
        elif image.ndim != 3 or image.shape[2] != 3:
            raise ValueError(
                f"unsupported image shape {image.shape}; "
                "expected grayscale, BGR or BGRA"
            )

        return image, cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    def _detect_brightness_discontinuities(self, gray: np.ndarray, white_mask: np.ndarray) -> Tuple[np.ndarray, float]:
        """
        Detect sharp brightness changes within white regions
        This is the KEY indicator of white-out: paper (220-240) vs correction fluid (245-255)

        Args:
            gray: Grayscale image.
            white_mask: Mask (non-zero = white) restricting where gradients count.

        Returns:
            strong_jumps: 0/255 mask of pixels whose local gradient exceeds
                ``brightness_jump_threshold``, after close/open clean-up.
            jump_score: Mean of the non-zero masked gradients, 0.0 if none.
        """
        # Morphological gradient = local max - local min over the kernel
        # footprint, i.e. the brightness swing around each pixel.
        dilated = cv2.dilate(gray, self.kernel, iterations=1)
        eroded = cv2.erode(gray, self.kernel, iterations=1)
        morphological_gradient = cv2.subtract(dilated, eroded)

        # Only consider gradients in white regions.
        # NOTE(review): the gradient is computed on the full image and masked
        # afterwards, so white pixels that border dark ink also get a large
        # gradient here — confirm this is acceptable for text-heavy documents.
        white_gradients = cv2.bitwise_and(
            morphological_gradient, morphological_gradient, mask=white_mask
        )

        # Flag strong brightness jumps (paper -> white-out boundaries)
        strong_jumps = (white_gradients > self.brightness_jump_threshold).astype(np.uint8) * 255

        # Clean up noise: close small gaps first, then remove isolated specks
        strong_jumps = cv2.morphologyEx(strong_jumps, cv2.MORPH_CLOSE, self.kernel)
        strong_jumps = cv2.morphologyEx(strong_jumps, cv2.MORPH_OPEN, self.kernel)

        # Jump intensity score: average over the pixels that have any gradient
        positive_gradients = white_gradients[white_gradients > 0]
        jump_score = float(np.mean(positive_gradients)) if positive_gradients.size else 0.0

        return strong_jumps, jump_score

    def _find_brightness_clusters(self, gray: np.ndarray, white_mask: np.ndarray) -> Tuple[np.ndarray, Dict[str, float]]:
        """
        Identify distinct brightness levels in white regions
        Natural paper has consistent brightness, white-out creates 2+ distinct levels

        Args:
            gray: Grayscale image.
            white_mask: Mask (non-zero = white) selecting the pixels to analyse.

        Returns:
            cluster_map: Image the size of ``gray``; when two or more peaks are
                found, pixels within +/-5 of each peak get a distinct grey
                value, otherwise it is all zeros.
            info: Dict with ``num_clusters``, ``brightness_range`` and
                ``cluster_peaks`` (always present, possibly empty).
        """
        # Extract white pixel values
        white_pixels = gray[white_mask > 0]

        if white_pixels.size == 0:
            # Same keys as the normal return so callers can index uniformly.
            return np.zeros_like(gray), {
                "num_clusters": 0,
                "brightness_range": 0.0,
                "cluster_peaks": [],
            }

        # One bin per integer grey level in [white_threshold, 255]. A fixed
        # bin count only lines up with integer levels for one threshold value
        # (36 bins <-> 220); uneven bins would create artificial peaks.
        num_bins = max(1, 256 - int(self.white_threshold))
        hist, bins = np.histogram(white_pixels, bins=num_bins, range=(self.white_threshold, 256))

        # Smooth the histogram to reduce noise. GaussianBlur needs a float
        # input (np.histogram returns integers) and its ksize is
        # (width, height), so the histogram must lie along a single ROW for
        # the 5-tap kernel to run along the brightness axis.
        hist_row = hist.astype(np.float32).reshape(1, -1)
        hist_smooth = cv2.GaussianBlur(hist_row, (5, 1), 0).flatten()

        # Local maxima above 10% of the tallest bin = distinct white levels.
        # Padding with -inf lets the first and last bins qualify, so a
        # saturated 255 level is not silently ignored.
        peak_floor = float(np.max(hist_smooth)) * 0.1
        padded = np.concatenate(([-np.inf], hist_smooth, [-np.inf]))
        centre = padded[1:-1]
        is_peak = (centre > padded[:-2]) & (centre > padded[2:]) & (centre > peak_floor)
        # bins[i] is the left edge of bin i; stored as plain floats.
        peaks = [float(edge) for edge in bins[:-1][is_peak]]

        num_clusters = len(peaks)
        brightness_range = float(np.max(white_pixels)) - float(np.min(white_pixels))

        # Create visualization of brightness clusters
        cluster_map = np.zeros_like(gray)
        if num_clusters >= 2:
            # Multiple brightness levels detected - suspicious
            is_white = white_mask > 0
            for i, peak in enumerate(peaks):
                near_peak = (gray >= peak - 5) & (gray <= peak + 5) & is_white
                cluster_map[near_peak] = int((i + 1) * (255 / num_clusters))

        return cluster_map, {
            "num_clusters": num_clusters,
            "brightness_range": brightness_range,
            "cluster_peaks": peaks
        }

    def analyze(self, image: np.ndarray) -> Tuple[np.ndarray, Dict[str, Any]]:
        """
        Analyze document for white-out through brightness discontinuities

        Args:
            image: BGR image as returned by ``cv2.imread``. Grayscale and BGRA
                arrays are also accepted and converted to BGR internally.

        Returns:
            composite_result: 2x2 panel (original | detections over
                discontinuity map | cluster map)
            metrics: Dictionary with analysis metrics

        Raises:
            ValueError: if ``image`` is None, empty, or has an unsupported shape.
        """
        image, gray = self._prepare_image(image)
        h, w = gray.shape

        # 1. Identify white regions
        white_mask = (gray >= self.white_threshold).astype(np.uint8) * 255

        # 2. Detect brightness discontinuities (KEY FEATURE)
        discontinuity_map, jump_score = self._detect_brightness_discontinuities(gray, white_mask)

        # 3. Identify brightness clusters
        cluster_map, cluster_info = self._find_brightness_clusters(gray, white_mask)

        # 4. Find suspicious regions (areas with strong brightness jumps)
        contours, _ = cv2.findContours(discontinuity_map, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        suspicious_regions = []
        result_overlay = image.copy()
        padding = 10  # grow each box a little to capture the full region

        for contour in contours:
            area = cv2.contourArea(contour)
            if area < self.min_region_size:
                continue

            x, y, w_box, h_box = cv2.boundingRect(contour)
            x1 = max(0, x - padding)
            y1 = max(0, y - padding)
            x2 = min(w, x + w_box + padding)
            y2 = min(h, y + h_box + padding)

            # Analyze the white pixels around the discontinuity
            region_gray = gray[y1:y2, x1:x2]
            region_white = white_mask[y1:y2, x1:x2]
            white_pixels_in_region = region_gray[region_white > 0]
            if white_pixels_in_region.size == 0:
                continue

            min_brightness = float(np.min(white_pixels_in_region))
            max_brightness = float(np.max(white_pixels_in_region))
            brightness_range = max_brightness - min_brightness
            mean_brightness = float(np.mean(white_pixels_in_region))

            # Confidence: larger brightness range = more suspicious
            confidence = min(100, (brightness_range / self.brightness_jump_threshold) * 50)

            suspicious_regions.append({
                'x': x1, 'y': y1, 'w': x2 - x1, 'h': y2 - y1,
                'area': area,
                'min_brightness': min_brightness,
                'max_brightness': max_brightness,
                'brightness_range': brightness_range,
                'mean_brightness': mean_brightness,
                'confidence': confidence
            })

            # Draw bounding box with confidence-based color (BGR)
            if confidence > 70:
                color = (0, 0, 255)  # Red = high confidence
            elif confidence > 40:
                color = (0, 165, 255)  # Orange = medium
            else:
                color = (0, 255, 255)  # Yellow = low

            cv2.rectangle(result_overlay, (x1, y1), (x2, y2), color, 2)
            # Put the label above the box, or just inside it when the box
            # touches the top edge and the text would fall off the canvas.
            label_y = y1 - 5 if y1 - 5 >= 15 else y1 + 20
            cv2.putText(result_overlay, f"{confidence:.0f}%", (x1, label_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

        # 5. Create clean visualization
        # Discontinuity heatmap (main indicator)
        discontinuity_colored = cv2.applyColorMap(discontinuity_map, cv2.COLORMAP_HOT)

        # Cluster visualization (shows different brightness levels); falls
        # back to the plain grayscale image when no clusters were mapped.
        if np.max(cluster_map) > 0:
            cluster_colored = cv2.applyColorMap(cluster_map, cv2.COLORMAP_JET)
        else:
            cluster_colored = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR)

        # Composite: Original | Detected | Discontinuities | Clusters
        top_row = np.hstack([image, result_overlay])
        bottom_row = np.hstack([discontinuity_colored, cluster_colored])
        composite_result = np.vstack([top_row, bottom_row])

        # 6. Calculate metrics
        total_white_pixels = int(np.count_nonzero(white_mask))
        white_percentage = float(total_white_pixels / gray.size * 100) if gray.size > 0 else 0.0

        discontinuity_pixels = int(np.count_nonzero(discontinuity_map))
        discontinuity_percentage = (
            float(discontinuity_pixels / total_white_pixels * 100)
            if total_white_pixels > 0 else 0.0
        )

        # Anomaly score based on key indicators
        anomaly_score = (
            len(suspicious_regions) * 20 +  # Number of suspicious regions
            discontinuity_percentage * 5 +  # Extent of brightness jumps
            jump_score * 2 +  # Intensity of jumps
            max(0, (cluster_info["num_clusters"] - 1)) * 15  # Multiple brightness levels
        )

        # Determine fraud likelihood
        if anomaly_score > 100 or len(suspicious_regions) >= 2:
            fraud_likelihood = "HIGH"
        elif anomaly_score > 40 or len(suspicious_regions) >= 1:
            fraud_likelihood = "MEDIUM"
        else:
            fraud_likelihood = "LOW"

        metrics = {
            "white_percentage": white_percentage,
            "discontinuity_percentage": discontinuity_percentage,
            "jump_score": jump_score,
            "suspicious_region_count": len(suspicious_regions),
            "suspicious_regions": suspicious_regions,
            "brightness_clusters": cluster_info["num_clusters"],
            "brightness_range": cluster_info["brightness_range"],
            "cluster_peaks": cluster_info.get("cluster_peaks", []),
            "anomaly_score": float(anomaly_score),
            "fraud_likelihood": fraud_likelihood
        }

        return composite_result, metrics

# In [None]:
# Build a detector with the constructor defaults (no thresholds overridden).
detector = WhiteOutDetector()

# In [None]:
# Load one sample document and run the detector on it.
image_path = "/Users/Pablo.Vargas2/Documents/isitreal-pablo/data/non_fraud/download (13).jpg"
image = cv2.imread(image_path)
# cv2.imread does not raise on a missing or unreadable file; it returns None,
# so fail here with the offending path instead of deep inside analyze().
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
# Left as a bare trailing expression so a notebook cell displays the
# (composite_result, metrics) tuple.
detector.analyze(image)