<a href="https://colab.research.google.com/github/123DS9472396/123DS9472396/blob/main/Forgery.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install opencv-python-headless numpy matplotlib scipy scikit-image pillow



In [None]:
!apt-get install tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 29 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 2s (2,629 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 126209 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

In [1]:
# Install required libraries
!pip install opencv-python-headless numpy matplotlib scipy scikit-image pillow pytesseract

# Import necessary libraries
import cv2
import numpy as np
import os
import logging
import time
import json
from PIL import Image
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor
from scipy.ndimage import sobel
from datetime import datetime
from skimage.feature import local_binary_pattern
import pytesseract

# Configure logging
# force=True replaces handlers that are already attached to the root logger.
# Without it basicConfig() does nothing when the root logger is pre-configured
# (common inside notebook kernels), and the INFO messages emitted by the
# detector would never be shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

class DocumentForgeryDetector:
    def __init__(self, output_dir=None, max_workers=4):
        """Initialize the document forgery detector with configurable parameters"""
        self.original_image = None
        self.marked_image = None
        self.gray_image = None
        self.height, self.width = None, None
        self.max_workers = max_workers
        self.suspicious_regions = []
        self.most_tampered_region = None
        self.max_tampering_score = 0
        self.results = {}
        self.compression_level = None

        # Set up output directory with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.output_dir = output_dir or f"forgery_results_{timestamp}"
        os.makedirs(self.output_dir, exist_ok=True)

        # Class-wide visualizations
        self.heatmap = None
        self.analysis_images = {}

    def load_image(self, file_path, max_dimension=1500):
        """Load and preprocess image for analysis"""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Image not found at {file_path}")

        try:
            with Image.open(file_path) as img:
                if img.format not in ['JPEG', 'PNG', 'BMP', 'TIFF']:
                    raise ValueError(f"Unsupported image format: {img.format}")
        except Exception as e:
            raise ValueError(f"Invalid image file: {e}")

        self.original_image = cv2.imread(file_path)
        if self.original_image is None:
            raise ValueError("Failed to load image")

        height, width = self.original_image.shape[:2]
        scale = min(max_dimension / max(height, width), 1.0)
        if scale < 1.0:
            self.original_image = cv2.resize(
                self.original_image, (int(width * scale), int(height * scale)),
                interpolation=cv2.INTER_AREA
            )

        self.marked_image = self.original_image.copy()
        self.gray_image = cv2.cvtColor(self.original_image, cv2.COLOR_BGR2GRAY)
        self.image_path = file_path
        self.height, self.width = self.original_image.shape[:2]
        self.heatmap = np.zeros((self.height, self.width), dtype=np.float32)

        self.analyze_image_statistics()
        logger.info(f"Loaded: {os.path.basename(file_path)} ({self.width}x{self.height})")
        return self.original_image

    def analyze_image_statistics(self):
        """Measure global properties of the loaded image.

        Stores on the instance: ``min_val``/``max_val``/``contrast_range`` of
        the grayscale image, ``baseline_noise`` (mean absolute difference to a
        5x5 Gaussian blur), ``edge_density`` (fraction of Canny edge pixels)
        and ``compression_level`` (size of a quality-90 JPEG re-encode divided
        by the size of the source file, capped at 1.0).
        """
        gray = self.gray_image

        # Intensity extremes and dynamic range.
        self.min_val = np.min(gray)
        self.max_val = np.max(gray)
        self.contrast_range = self.max_val - self.min_val

        # What a light blur removes is taken as the noise estimate.
        smoothed = cv2.GaussianBlur(gray, (5, 5), 0)
        self.baseline_noise = np.mean(cv2.absdiff(gray, smoothed))

        edge_map = cv2.Canny(gray, 100, 200)
        pixel_count = self.height * self.width
        self.edge_density = np.count_nonzero(edge_map) / pixel_count

        # Estimate compression level
        temp_path = f"{self.output_dir}/temp_comp.jpg"
        cv2.imwrite(temp_path, self.original_image, [cv2.IMWRITE_JPEG_QUALITY, 90])
        comp_size = os.path.getsize(temp_path)
        if os.path.exists(self.image_path):
            orig_size = os.path.getsize(self.image_path)
        else:
            # Source file is gone: fall back to twice the re-encoded size.
            orig_size = comp_size * 2
        if orig_size > 0:
            self.compression_level = min(1.0, comp_size / orig_size)
        else:
            self.compression_level = 1.0
        os.remove(temp_path)

        logger.info(f"Image statistics - Contrast: {self.contrast_range}, Noise: {self.baseline_noise:.2f}, "
                    f"Edge density: {self.edge_density:.4f}, Compression: {self.compression_level:.2f}")

    def preprocess_image(self):
        """Build the derived images used by the analyses and save each one.

        Populates ``self.analysis_images`` with the keys ``grayscale``,
        ``enhanced`` (CLAHE), ``noise`` (difference to a Gaussian blur),
        ``edges`` (Canny on the enhanced image), ``texture`` (uniform LBP),
        ``ela`` (difference to a JPEG re-encode) and ``color_diff`` (mean
        pairwise channel difference). A JPEG of every image is written to
        ``self.output_dir`` using the input file's base name as prefix.
        """
        base = os.path.splitext(os.path.basename(self.image_path))[0]
        self.analysis_images['grayscale'] = self.gray_image
        cv2.imwrite(f"{self.output_dir}/{base}_grayscale.jpg", self.gray_image)

        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(self.gray_image)
        self.analysis_images['enhanced'] = enhanced
        cv2.imwrite(f"{self.output_dir}/{base}_enhanced.jpg", enhanced)

        blurred = cv2.GaussianBlur(self.gray_image, (5, 5), 0)
        noise = cv2.absdiff(self.gray_image, blurred)
        self.analysis_images['noise'] = noise
        # Amplify for display with saturation. Multiplying the uint8 array
        # directly (noise * 5) wraps around modulo 256 and turns strong
        # residuals into dark pixels.
        noise_vis = cv2.convertScaleAbs(noise, alpha=5)
        cv2.imwrite(f"{self.output_dir}/{base}_noise.jpg", cv2.applyColorMap(noise_vis, cv2.COLORMAP_JET))

        edges = cv2.Canny(enhanced, 50, 200)
        self.analysis_images['edges'] = edges
        cv2.imwrite(f"{self.output_dir}/{base}_edges.jpg", edges)

        lbp = local_binary_pattern(self.gray_image, 8, 1, method='uniform')
        self.analysis_images['texture'] = lbp
        lbp_max = np.max(lbp)
        if lbp_max > 0:
            lbp_norm = (lbp * (255 / lbp_max)).astype(np.uint8)
        else:
            # Avoid dividing by zero when every LBP code is 0.
            lbp_norm = np.zeros(lbp.shape, dtype=np.uint8)
        cv2.imwrite(f"{self.output_dir}/{base}_texture.jpg", lbp_norm)

        temp_path = f"{self.output_dir}/temp_ela.jpg"
        quality = int(85 * self.compression_level)  # Adjust based on detected compression
        cv2.imwrite(temp_path, self.original_image, [cv2.IMWRITE_JPEG_QUALITY, max(50, quality)])
        compressed = cv2.imread(temp_path)
        ela = cv2.absdiff(self.original_image, compressed)
        ela_gray = cv2.cvtColor(ela, cv2.COLOR_BGR2GRAY)
        self.analysis_images['ela'] = ela_gray
        # Same saturation concern as the noise map: ela_gray * 15 would wrap.
        ela_vis = cv2.convertScaleAbs(ela_gray, alpha=15)
        cv2.imwrite(f"{self.output_dir}/{base}_ela.jpg", cv2.applyColorMap(ela_vis, cv2.COLORMAP_JET))
        os.remove(temp_path)

        # Mean of the three pairwise absolute channel differences. The result is
        # symmetric in the channels, so the channel order does not matter here.
        first, second, third = (c.astype(np.float32) for c in cv2.split(self.original_image))
        diff_12 = cv2.absdiff(first, second)
        diff_13 = cv2.absdiff(first, third)
        diff_23 = cv2.absdiff(second, third)
        color_diff = ((diff_12 + diff_13 + diff_23) / 3).astype(np.uint8)
        self.analysis_images['color_diff'] = color_diff
        cv2.imwrite(f"{self.output_dir}/{base}_color_diff.jpg", cv2.applyColorMap(color_diff, cv2.COLORMAP_JET))

        logger.info(f"Generated preprocessed images in {self.output_dir}")

    def grayscale_analysis(self, block_size=32):
        """Score blocks by the variance of their denoising residual.

        The grayscale image is denoised with non-local means; the absolute
        residual is scanned with ``block_size`` windows that overlap by half.
        Each block's residual variance is divided by ``baseline_noise`` squared
        and, when above the threshold, turned into a 0-100 score. Blocks scoring
        above 25 feed the heatmaps; those above 40 are recorded as suspicious
        regions. The summary is stored in ``self.results['grayscale_analysis']``.

        Args:
            block_size: Side length of the square analysis window, in pixels.
        """
        logger.info("Starting grayscale analysis...")
        base = os.path.splitext(os.path.basename(self.image_path))[0]
        denoised = cv2.fastNlMeansDenoising(self.gray_image, None, 10, 7, 21)
        noise = cv2.absdiff(self.gray_image, denoised)
        noise_heatmap = np.zeros((self.height, self.width), dtype=np.float32)
        step = block_size // 2
        tampering_scores = []
        positions = []

        for top in range(0, self.height - block_size + 1, step):
            bottom = top + block_size
            for left in range(0, self.width - block_size + 1, step):
                right = left + block_size
                block_var = np.var(noise[top:bottom, left:right])
                norm_var = block_var / (self.baseline_noise * self.baseline_noise + 1e-10)
                threshold = 2.0 if self.baseline_noise < 10 else 3.0
                if norm_var > threshold:
                    score = min(100, norm_var * 20)
                else:
                    score = 0
                if not score > 25:
                    continue
                tampering_scores.append(score)
                positions.append((left, top, right, bottom))
                intensity = score / 100.0
                noise_heatmap[top:bottom, left:right] += intensity
                # Contribution of this method to the combined heatmap.
                self.heatmap[top:bottom, left:right] += intensity * 0.3

        heat_peak = np.max(noise_heatmap)
        if heat_peak > 0:
            norm_heatmap = noise_heatmap / heat_peak
            cv2.imwrite(f"{self.output_dir}/{base}_grayscale_heatmap.jpg", cv2.applyColorMap((norm_heatmap * 255).astype(np.uint8), cv2.COLORMAP_JET))

        if tampering_scores:
            best = np.argmax(tampering_scores)
            if tampering_scores[best] > self.max_tampering_score:
                self.max_tampering_score = tampering_scores[best]
                self.most_tampered_region = positions[best]
            for box, box_score in zip(positions, tampering_scores):
                if box_score > 40:
                    self.suspicious_regions.append({"type": "grayscale", "region": box, "score": float(box_score)})

        if tampering_scores:
            peak_score = max(tampering_scores)
            summary = {
                'max_tampering_score': float(peak_score),
                'suspicious': bool(peak_score > 50),
                'suspicious_count': sum(1 for s in tampering_scores if s > 40)
            }
        else:
            summary = {'max_tampering_score': 0.0, 'suspicious': False, 'suspicious_count': 0}
        self.results['grayscale_analysis'] = summary
        logger.info(f"Grayscale analysis completed: Max score = {self.results['grayscale_analysis']['max_tampering_score']}")

    def error_level_analysis(self, block_size=32):
        """Flag blocks whose JPEG re-compression error is an outlier.

        Uses the ``ela`` image from ``preprocess_image``. The mean error of
        every half-overlapping ``block_size`` window is collected; blocks with
        an edge density above 0.3 (from the ``edges`` image, when present) are
        damped. Blocks above ``mean + 3 * std`` of all block means are scored
        0-100. Scores above 15 feed the heatmaps; scores above 40 become
        suspicious regions. The summary is stored in
        ``self.results['error_level_analysis']``.

        Args:
            block_size: Side length of the square analysis window, in pixels.
        """
        logger.info("Starting Error Level Analysis...")
        base = os.path.splitext(os.path.basename(self.image_path))[0]
        ela_gray = self.analysis_images['ela']
        ela_heatmap = np.zeros((self.height, self.width), dtype=np.float32)
        step = block_size // 2
        edge_map = self.analysis_images.get('edges')

        # Pass 1: per-block mean error, damped where the block is edge-heavy.
        ela_data = []
        for top in range(0, self.height - block_size + 1, step):
            bottom = top + block_size
            for left in range(0, self.width - block_size + 1, step):
                right = left + block_size
                ela_mean = np.mean(ela_gray[top:bottom, left:right])
                if edge_map is not None:
                    edge_block = edge_map[top:bottom, left:right]
                    edge_density = np.count_nonzero(edge_block) / edge_block.size
                    if edge_density > 0.3:
                        ela_mean *= (1 - edge_density * 0.5)
                ela_data.append((ela_mean, (left, top, right, bottom)))

        ela_means = [block_mean for block_mean, _ in ela_data]
        mean_ela = np.mean(ela_means)
        std_ela = np.std(ela_means)
        threshold = mean_ela + 3 * std_ela  # Stricter threshold

        # Pass 2: score the outliers and paint them into the heatmaps.
        tampering_scores = []
        positions = []
        for ela_mean, position in ela_data:
            if not ela_mean > threshold:
                continue
            score = min(100, (ela_mean - threshold) / (std_ela * 2) * 100)
            if not score > 15:  # Minimum score threshold
                continue
            tampering_scores.append(score)
            positions.append(position)
            intensity = score / 100.0
            left, top = position[0], position[1]
            ela_heatmap[top:top+block_size, left:left+block_size] += intensity
            self.heatmap[top:top+block_size, left:left+block_size] += intensity * 0.4

        heat_peak = np.max(ela_heatmap)
        if heat_peak > 0:
            norm_heatmap = ela_heatmap / heat_peak
            cv2.imwrite(f"{self.output_dir}/{base}_ela_heatmap.jpg", cv2.applyColorMap((norm_heatmap * 255).astype(np.uint8), cv2.COLORMAP_JET))

        if tampering_scores:
            best = np.argmax(tampering_scores)
            if tampering_scores[best] > self.max_tampering_score:
                self.max_tampering_score = tampering_scores[best]
                self.most_tampered_region = positions[best]
            for box, box_score in zip(positions, tampering_scores):
                if box_score > 40:
                    self.suspicious_regions.append({"type": "ela", "region": box, "score": float(box_score)})

        if tampering_scores:
            peak_score = max(tampering_scores)
            summary = {
                'max_tampering_score': float(peak_score),
                'suspicious': bool(peak_score > 50),
                'suspicious_count': sum(1 for s in tampering_scores if s > 40)
            }
        else:
            summary = {'max_tampering_score': 0.0, 'suspicious': False, 'suspicious_count': 0}
        self.results['error_level_analysis'] = summary
        logger.info(f"ELA completed: Max score = {self.results['error_level_analysis']['max_tampering_score']}")

    def color_channel_analysis(self, block_size=32):
        """Flag blocks whose colour balance deviates from the whole image.

        For every half-overlapping ``block_size`` window the channel-mean
        ratios (R/G, R/B, G/B) are compared with the image-wide ratios and
        combined with the mean pairwise channel difference into an anomaly
        value. Blocks above ``mean + 3 * std`` of all anomaly values are scored
        0-100. Scores above 15 feed the heatmaps; scores above 40 become
        suspicious regions. The summary is stored in
        ``self.results['color_channel_analysis']``.

        Args:
            block_size: Side length of the square analysis window, in pixels.
        """
        logger.info("Starting color channel analysis...")
        base = os.path.splitext(os.path.basename(self.image_path))[0]
        b, g, r = cv2.split(self.original_image)
        color_heatmap = np.zeros((self.height, self.width), dtype=np.float32)
        step = block_size // 2

        # Image-wide reference ratios (+1 in the denominator avoids dividing by zero).
        r_global, g_global, b_global = np.mean(r), np.mean(g), np.mean(b)
        rg_ref = r_global / (g_global + 1)
        rb_ref = r_global / (b_global + 1)
        gb_ref = g_global / (b_global + 1)
        rg_norm, rb_norm, gb_norm = max(1, rg_ref), max(1, rb_ref), max(1, gb_ref)

        anomaly_data = []
        for top in range(0, self.height - block_size + 1, step):
            bottom = top + block_size
            for left in range(0, self.width - block_size + 1, step):
                right = left + block_size
                r_block = r[top:bottom, left:right]
                g_block = g[top:bottom, left:right]
                b_block = b[top:bottom, left:right]
                r_mean, g_mean, b_mean = np.mean(r_block), np.mean(g_block), np.mean(b_block)

                rg_dev = abs(r_mean / (g_mean + 1) - rg_ref) / rg_norm
                rb_dev = abs(r_mean / (b_mean + 1) - rb_ref) / rb_norm
                gb_dev = abs(g_mean / (b_mean + 1) - gb_ref) / gb_norm
                diff_mean = np.mean(cv2.absdiff(r_block, g_block)) + np.mean(cv2.absdiff(r_block, b_block)) + np.mean(cv2.absdiff(g_block, b_block))
                anomaly_score = (rg_dev + rb_dev + gb_dev) * 50 + (diff_mean / 128) * 50
                anomaly_data.append((anomaly_score, (left, top, right, bottom)))

        anomaly_scores = [value for value, _ in anomaly_data]
        mean_anomaly = np.mean(anomaly_scores)
        std_anomaly = np.std(anomaly_scores)
        threshold = mean_anomaly + 3 * std_anomaly

        tampering_scores = []
        positions = []
        for anomaly_score, position in anomaly_data:
            if not anomaly_score > threshold:
                continue
            score = min(100, (anomaly_score - threshold) / (std_anomaly * 2) * 100)
            if not score > 15:
                continue
            tampering_scores.append(score)
            positions.append(position)
            intensity = score / 100.0
            left, top = position[0], position[1]
            color_heatmap[top:top+block_size, left:left+block_size] += intensity
            self.heatmap[top:top+block_size, left:left+block_size] += intensity * 0.3

        heat_peak = np.max(color_heatmap)
        if heat_peak > 0:
            norm_heatmap = color_heatmap / heat_peak
            cv2.imwrite(f"{self.output_dir}/{base}_color_heatmap.jpg", cv2.applyColorMap((norm_heatmap * 255).astype(np.uint8), cv2.COLORMAP_JET))

        if tampering_scores:
            best = np.argmax(tampering_scores)
            if tampering_scores[best] > self.max_tampering_score:
                self.max_tampering_score = tampering_scores[best]
                self.most_tampered_region = positions[best]
            for box, box_score in zip(positions, tampering_scores):
                if box_score > 40:
                    self.suspicious_regions.append({"type": "color", "region": box, "score": float(box_score)})

        if tampering_scores:
            peak_score = max(tampering_scores)
            summary = {
                'max_tampering_score': float(peak_score),
                'suspicious': bool(peak_score > 50),
                'suspicious_count': sum(1 for s in tampering_scores if s > 40)
            }
        else:
            summary = {'max_tampering_score': 0.0, 'suspicious': False, 'suspicious_count': 0}
        self.results['color_channel_analysis'] = summary
        logger.info(f"Color analysis completed: Max score = {self.results['color_channel_analysis']['max_tampering_score']}")

    def texture_consistency_analysis(self, block_size=32):
        """Flag blocks whose LBP histogram entropy differs from the image's.

        Uses the ``texture`` image from ``preprocess_image``. The entropy of a
        10-bin histogram is computed for the whole image and for every
        half-overlapping ``block_size`` window; the relative difference is the
        block's anomaly. Blocks above ``mean + 3 * std`` of all anomalies are
        scored 0-100. Scores above 15 feed the heatmaps; scores above 40 become
        suspicious regions. The summary is stored in
        ``self.results['texture_consistency_analysis']``.

        Args:
            block_size: Side length of the square analysis window, in pixels.
        """
        logger.info("Starting texture consistency analysis...")
        base = os.path.splitext(os.path.basename(self.image_path))[0]
        lbp = self.analysis_images['texture']
        texture_heatmap = np.zeros((self.height, self.width), dtype=np.float32)
        step = block_size // 2

        def histogram_entropy(codes):
            # Entropy (base 2) of the density histogram; 1e-10 keeps log2 finite.
            hist, _ = np.histogram(codes, bins=10, range=(0, 10), density=True)
            return -np.sum(hist * np.log2(hist + 1e-10))

        global_entropy = histogram_entropy(lbp)

        texture_data = []
        for top in range(0, self.height - block_size + 1, step):
            bottom = top + block_size
            for left in range(0, self.width - block_size + 1, step):
                right = left + block_size
                entropy = histogram_entropy(lbp[top:bottom, left:right])
                anomaly = abs(entropy - global_entropy) / max(1e-10, global_entropy)
                texture_data.append((anomaly, (left, top, right, bottom)))

        anomalies = [value for value, _ in texture_data]
        mean_anomaly = np.mean(anomalies)
        std_anomaly = np.std(anomalies)
        threshold = mean_anomaly + 3 * std_anomaly

        tampering_scores = []
        positions = []
        for anomaly, position in texture_data:
            if not anomaly > threshold:
                continue
            score = min(100, (anomaly - threshold) / (std_anomaly * 2) * 100)
            if not score > 15:
                continue
            tampering_scores.append(score)
            positions.append(position)
            intensity = score / 100.0
            left, top = position[0], position[1]
            texture_heatmap[top:top+block_size, left:left+block_size] += intensity
            self.heatmap[top:top+block_size, left:left+block_size] += intensity * 0.3

        heat_peak = np.max(texture_heatmap)
        if heat_peak > 0:
            norm_heatmap = texture_heatmap / heat_peak
            cv2.imwrite(f"{self.output_dir}/{base}_texture_heatmap.jpg", cv2.applyColorMap((norm_heatmap * 255).astype(np.uint8), cv2.COLORMAP_JET))

        if tampering_scores:
            best = np.argmax(tampering_scores)
            if tampering_scores[best] > self.max_tampering_score:
                self.max_tampering_score = tampering_scores[best]
                self.most_tampered_region = positions[best]
            for box, box_score in zip(positions, tampering_scores):
                if box_score > 40:
                    self.suspicious_regions.append({"type": "texture", "region": box, "score": float(box_score)})

        if tampering_scores:
            peak_score = max(tampering_scores)
            summary = {
                'max_tampering_score': float(peak_score),
                'suspicious': bool(peak_score > 50),
                'suspicious_count': sum(1 for s in tampering_scores if s > 40)
            }
        else:
            summary = {'max_tampering_score': 0.0, 'suspicious': False, 'suspicious_count': 0}
        self.results['texture_consistency_analysis'] = summary
        logger.info(f"Texture analysis completed: Max score = {self.results['texture_consistency_analysis']['max_tampering_score']}")

    def font_alignment_analysis(self, block_size=32):
        """Flag blocks whose OCR box heights vary unusually.

        OCR is run once on the full image to obtain reference boxes, then on
        every half-overlapping ``block_size`` window of the grayscale image.
        A block's anomaly is the variance of its OCR box heights divided by the
        variance of the reference box heights. Blocks above ``mean + 3 * std``
        of the positive anomalies are scored 0-100. Scores above 15 feed the
        heatmaps; scores above 40 become suspicious regions. The summary is
        stored in ``self.results['font_alignment_analysis']``.

        A zeroed summary is stored when the full-image OCR fails, when it finds
        no boxes, or when no block yields a positive anomaly.

        Args:
            block_size: Side length of the square analysis window, in pixels.
        """
        logger.info("Starting font and alignment analysis...")
        base = os.path.splitext(os.path.basename(self.image_path))[0]
        text_heatmap = np.zeros((self.height, self.width), dtype=np.float32)
        step = block_size // 2
        text_data = []
        empty_result = {'max_tampering_score': 0.0, 'suspicious': False, 'suspicious_count': 0}

        try:
            text = pytesseract.image_to_data(self.original_image, output_type=pytesseract.Output.DICT)
            text_boxes = [(x, y, x + w, y + h) for x, y, w, h in zip(text['left'], text['top'], text['width'], text['height']) if w > 0 and h > 0]
        except Exception as e:
            logger.warning(f"OCR failed: {e}")
            self.results['font_alignment_analysis'] = dict(empty_result)
            return

        if not text_boxes:
            # Without reference boxes the reference variance is NaN, so every
            # block anomaly would be NaN and discarded. Skip the per-block OCR.
            logger.info("Font analysis skipped: OCR found no text boxes")
            self.results['font_alignment_analysis'] = dict(empty_result)
            return

        # Loop-invariant: variance of the reference box heights.
        reference_height_var = np.var([box[3] - box[1] for box in text_boxes])

        failed_blocks = 0
        for i in range(0, self.height - block_size + 1, step):
            for j in range(0, self.width - block_size + 1, step):
                block = self.gray_image[i:i+block_size, j:j+block_size]
                try:
                    block_text = pytesseract.image_to_data(block, output_type=pytesseract.Output.DICT)
                except Exception:
                    # One failing block must not abort the whole analysis.
                    failed_blocks += 1
                    continue
                heights = [h for h in block_text['height'] if h > 0]
                if heights:
                    height_var = np.var(heights)
                    anomaly = height_var / (reference_height_var + 1e-10)
                    text_data.append((anomaly, (j, i, j + block_size, i + block_size)))

        if failed_blocks:
            logger.warning(f"OCR failed on {failed_blocks} block(s); those blocks were skipped")

        anomalies = [data[0] for data in text_data if data[0] > 0]
        if not anomalies:
            self.results['font_alignment_analysis'] = dict(empty_result)
            return

        mean_anomaly = np.mean(anomalies)
        std_anomaly = np.std(anomalies)
        threshold = mean_anomaly + 3 * std_anomaly

        tampering_scores = []
        positions = []
        for anomaly, position in text_data:
            if anomaly > threshold:
                score = min(100, (anomaly - threshold) / (std_anomaly * 2) * 100)
                if score > 15:
                    tampering_scores.append(score)
                    positions.append(position)
                    intensity = score / 100.0
                    i_start, j_start = position[1], position[0]
                    text_heatmap[i_start:i_start+block_size, j_start:j_start+block_size] += intensity
                    self.heatmap[i_start:i_start+block_size, j_start:j_start+block_size] += intensity * 0.3

        if np.max(text_heatmap) > 0:
            norm_heatmap = text_heatmap / np.max(text_heatmap)
            cv2.imwrite(f"{self.output_dir}/{base}_text_heatmap.jpg", cv2.applyColorMap((norm_heatmap * 255).astype(np.uint8), cv2.COLORMAP_JET))

        if tampering_scores:
            max_idx = np.argmax(tampering_scores)
            if tampering_scores[max_idx] > self.max_tampering_score:
                self.max_tampering_score = tampering_scores[max_idx]
                self.most_tampered_region = positions[max_idx]
            for idx, score in enumerate(tampering_scores):
                if score > 40:
                    self.suspicious_regions.append({"type": "font", "region": positions[idx], "score": float(score)})

        self.results['font_alignment_analysis'] = {
            'max_tampering_score': float(max(tampering_scores)) if tampering_scores else 0.0,
            'suspicious': bool(max(tampering_scores) > 50) if tampering_scores else False,
            'suspicious_count': len([s for s in tampering_scores if s > 40]) if tampering_scores else 0
        }
        logger.info(f"Font analysis completed: Max score = {self.results['font_alignment_analysis']['max_tampering_score']}")

    def merge_suspicious_regions(self):
        """Merge overlapping suspicious regions"""
        if not self.suspicious_regions:
            return []

        sorted_regions = sorted(self.suspicious_regions, key=lambda x: x['score'], reverse=True)

        def overlap_ratio(box1, box2):
            x1_1, y1_1, x2_1, y2_1 = box1
            x1_2, y1_2, x2_2, y2_2 = box2
            x_left = max(x1_1, x1_2)
            y_top = max(y1_1, y1_2)
            x_right = min(x2_1, x2_2)
            y_bottom = min(y2_1, y2_2)
            if x_right < x_left or y_bottom < y_top:
                return 0.0
            intersection = (x_right - x_left) * (y_bottom - y_top)
            box1_area = (x2_1 - x1_1) * (y2_1 - y1_1)
            box2_area = (x2_2 - x1_2) * (y2_2 - y1_2)
            return intersection / min(box1_area, box2_area)

        merged_regions = []
        used = [False] * len(sorted_regions)
        for i in range(len(sorted_regions)):
            if used[i]:
                continue
            used[i] = True
            current = sorted_regions[i]
            base_region = list(current['region'])
            score = current['score']
            evidence_types = [current['type']]
            count = 1

            for j in range(i + 1, len(sorted_regions)):
                if used[j]:
                    continue
                if overlap_ratio(base_region, sorted_regions[j]['region']) > 0.3:
                    region_j = sorted_regions[j]['region']
                    base_region[0] = min(base_region[0], region_j[0])
                    base_region[1] = min(base_region[1], region_j[1])
                    base_region[2] = max(base_region[2], region_j[2])
                    base_region[3] = max(base_region[3], region_j[3])
                    score += sorted_regions[j]['score'] * 0.5
                    evidence_types.append(sorted_regions[j]['type'])
                    count += 1
                    used[j] = True

            final_score = min(100, score / (1 + 0.25 * (count - 1)))
            unique_evidence = set(evidence_types)
            if len(unique_evidence) > 1:
                final_score = min(100, final_score + min(20, 10 * (len(unique_evidence) - 1)))

            merged_regions.append({
                'region': tuple(base_region),
                'score': final_score,
                'evidence_types': list(unique_evidence),
                'evidence_count': count
            })

        return sorted(merged_regions, key=lambda x: x['score'], reverse=True)

    def visualize_results(self):
        """Render the findings as annotated images and a summary figure.

        Writes to ``self.output_dir`` (prefixed with the input's base name):
        ``_marked.jpg`` with a coloured rectangle and label per merged region,
        ``_heatmap.jpg`` and ``_summary.jpg`` when the combined heatmap is not
        empty, ``_analysis_summary.png`` (2x2 matplotlib figure) and
        ``_original.jpg``.

        Returns:
            Tuple ``(marked_image_path, merged_regions)``.
        """
        logger.info("Generating result visualizations...")
        base = os.path.splitext(os.path.basename(self.image_path))[0]
        merged_regions = self.merge_suspicious_regions()
        marked_image = self.original_image.copy()

        for region in merged_regions:
            x1, y1, x2, y2 = region['region']
            score = region['score']
            # Colour encodes severity (tuples are in OpenCV's BGR order).
            if score >= 70:
                color = (0, 0, 255)
            elif score >= 50:
                color = (0, 165, 255)
            else:
                color = (0, 255, 255)
            cv2.rectangle(marked_image, (x1, y1), (x2, y2), color, 2)
            label = f"{score:.1f}% ({','.join(region['evidence_types'])})"
            # Keep the label inside the image when the box touches the top edge.
            label_origin = (x1, max(y1 - 10, 10))
            cv2.putText(marked_image, label, label_origin, cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        marked_image_path = f"{self.output_dir}/{base}_marked.jpg"
        cv2.imwrite(marked_image_path, marked_image)

        heat_peak = np.max(self.heatmap)
        has_heat = heat_peak > 0
        if has_heat:
            norm_heatmap = self.heatmap / heat_peak
            heat_colored = cv2.applyColorMap((norm_heatmap * 255).astype(np.uint8), cv2.COLORMAP_JET)
            heatmap_overlay = cv2.addWeighted(self.original_image, 0.7, heat_colored, 0.3, 0)
            cv2.imwrite(f"{self.output_dir}/{base}_heatmap.jpg", heatmap_overlay)
            combined = np.hstack((self.original_image, marked_image, heatmap_overlay))
            cv2.imwrite(f"{self.output_dir}/{base}_summary.jpg", combined)

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        ax_original, ax_marked, ax_heat, ax_scores = axes.ravel()

        ax_original.set_title("Original Image")
        ax_original.imshow(cv2.cvtColor(self.original_image, cv2.COLOR_BGR2RGB))
        ax_original.axis('off')

        ax_marked.set_title("Detected Forgeries")
        ax_marked.imshow(cv2.cvtColor(marked_image, cv2.COLOR_BGR2RGB))
        ax_marked.axis('off')

        ax_heat.set_title("Tampering Heatmap")
        heat_panel = norm_heatmap if has_heat else np.zeros_like(self.gray_image)
        ax_heat.imshow(heat_panel, cmap='jet')
        ax_heat.axis('off')

        scores = [region['score'] for region in merged_regions]
        if scores:
            ax_scores.set_title("Tampering Score Distribution")
            ax_scores.hist(scores, bins=10, range=(0, 100), alpha=0.7)
            ax_scores.set_xlabel('Confidence Score (%)')
            ax_scores.set_ylabel('Count')
        else:
            ax_scores.set_title("No Suspicious Regions Detected")
            ax_scores.axis('off')

        fig.tight_layout()
        fig.savefig(f"{self.output_dir}/{base}_analysis_summary.png", dpi=150)
        plt.close(fig)

        cv2.imwrite(f"{self.output_dir}/{base}_original.jpg", self.original_image)
        return marked_image_path, merged_regions

    def run_all_analyses(self):
        """Run all forgery detection analyses in parallel"""
        if self.original_image is None:
            raise ValueError("No image loaded. Call load_image() first.")
        logger.info("Starting comprehensive analysis...")

        self.preprocess_image()
        analyses = [
            self.grayscale_analysis,
            self.error_level_analysis,
            self.color_channel_analysis,
            self.texture_consistency_analysis,
            self.font_alignment_analysis
        ]

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            executor.map(lambda func: func(), analyses)

        scores = [result.get('max_tampering_score', 0) for result in self.results.values()]
        avg_score = np.mean(scores) if scores else 0
        total_suspicious = sum(1 for analysis in self.results.values() if analysis.get('suspicious', False))
        merged_regions = self.merge_suspicious_regions()
        region_count = len([r for r in merged_regions if r['score'] > 50])

        # Stricter verdict logic
        if total_suspicious >= 3 and avg_score >= 50 and region_count > 1:
            verdict = "LIKELY FORGED"
            confidence = min(100, avg_score + total_suspicious * 10 + region_count * 5)
        elif total_suspicious >= 2 and avg_score >= 40:
            verdict = "SUSPICIOUS"
            confidence = min(100, avg_score + total_suspicious * 10)
        else:
            verdict = "LIKELY AUTHENTIC"
            confidence = min(100, max(0, 100 - avg_score - total_suspicious * 5))

        self.results['summary'] = {
            'average_score': float(avg_score),
            'verdict': verdict,
            'confidence': float(confidence),
            'most_tampered_region': self.most_tampered_region,
            'max_tampering_score': float(self.max_tampering_score),
            'total_suspicious_methods': total_suspicious,
            'suspicious_region_count': region_count,
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

        logger.info(f"Analysis completed - Verdict: {verdict} (Confidence: {confidence:.1f}%)")
        marked_image_path, merged_regions = self.visualize_results()

        self.results['merged_regions'] = [
            {'region': r['region'], 'score': float(r['score']), 'evidence_types': r['evidence_types'], 'evidence_count': r['evidence_count']}
            for r in merged_regions
        ]

        with open(f"{self.output_dir}/{os.path.splitext(os.path.basename(self.image_path))[0]}_report.json", 'w') as f:
            json.dump(self.results, f, indent=4)

        return marked_image_path, self.results

    def print_summary(self):
        """Print a summary of the analysis results"""
        if not self.results or 'summary' not in self.results:
            logger.error("No analysis results available. Run analysis first.")
            return

        summary = self.results['summary']
        print("\n" + "="*70)
        print("DOCUMENT FORGERY DETECTION RESULTS")
        print("="*70)
        print(f"\nVERDICT: {summary['verdict']} (Confidence: {summary['confidence']:.1f}%)")

        print("\nANALYSIS BREAKDOWN:")
        print("-"*70)
        print(f"{'Analysis Method':<30} {'Score':<10} {'Suspicious'}")
        print("-"*70)
        for method, results in self.results.items():
            if method in ['summary', 'merged_regions']:
                continue
            score = results.get('max_tampering_score', 0)
            suspicious = "✓" if results.get('suspicious', False) else "-"
            print(f"{method.replace('_', ' ').title():<30} {score:<10.1f} {suspicious}")

        if 'merged_regions' in self.results and self.results['merged_regions']:
            print("\nSUSPICIOUS REGIONS:")
            print("-"*70)
            print(f"{'Region (x1,y1,x2,y2)':<30} {'Score':<10} {'Evidence Types'}")
            print("-"*70)
            for idx, region in enumerate(self.results['merged_regions'][:5]):
                region_str = f"({region['region'][0]},{region['region'][1]},{region['region'][2]},{region['region'][3]})"
                evidence = ", ".join(region['evidence_types'])
                print(f"{region_str:<30} {region['score']:<10.1f} {evidence}")
            if len(self.results['merged_regions']) > 5:
                print(f"...and {len(self.results['merged_regions']) - 5} more regions")

        print("\n" + "-"*70)
        print(f"Detailed results and visualizations saved in: {self.output_dir}/")
        print("="*70 + "\n")

def process_image(file_path, output_dir=None, max_dimension=2000):
    """Run the forgery-detection pipeline on one image and print its summary.

    Args:
        file_path: Path of the image; passed to ``DocumentForgeryDetector.load_image``.
        output_dir: Forwarded unchanged to the ``DocumentForgeryDetector``
            constructor (how ``None`` is handled is decided there).
        max_dimension: Forwarded to ``load_image`` as ``max_dimension``
            (presumably a resize cap in pixels -- confirm against ``load_image``).

    Returns:
        The ``(marked_image_path, results)`` pair returned by
        ``run_all_analyses``.

    Raises:
        Exception: Any error raised while loading, analysing or summarising is
            logged and then re-raised unchanged.
    """
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by wall-clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()
    detector = DocumentForgeryDetector(output_dir=output_dir)
    try:
        detector.load_image(file_path, max_dimension=max_dimension)
        marked_image_path, results = detector.run_all_analyses()
        detector.print_summary()
        logger.info(f"Processing completed in {time.perf_counter() - start_time:.2f} seconds")
        return marked_image_path, results
    except Exception as e:
        # Log for the notebook reader, then let the caller decide what to do.
        logger.error(f"Processing failed: {str(e)}")
        raise

def main():
    """Colab entry point: upload an image and run forgery detection on it.

    Opens the Colab upload dialog, processes the first uploaded file with
    ``process_image`` and prints where the marked image was saved.

    Returns:
        int: 0 on success; 1 when no file was uploaded or processing failed.
    """
    # Imported lazily: google.colab only exists inside a Colab runtime.
    from google.colab import files

    uploaded = files.upload()
    if not uploaded:
        # A cancelled/empty upload used to raise IndexError outside the try
        # block; report it through the same "Error: ..." path instead.
        print("Error: No file was uploaded")
        return 1

    # Only the first uploaded file is analysed; any others are ignored.
    image_path = next(iter(uploaded))
    try:
        marked_image_path, _ = process_image(image_path)
        print(f"\nMarked image saved at: {marked_image_path}")
    except Exception as e:
        print(f"Error: {e}")
        return 1
    return 0

if __name__ == "__main__":
    # `exit` is a helper injected for interactive sessions and is not meant for
    # programs; inside an IPython/Jupyter kernel it is the kernel's own exit
    # helper, so calling it unconditionally requests shutdown even after a
    # successful run. Only turn a failure status into a process exit; a
    # successful run simply falls through (exit status 0 when run as a script).
    exit_code = main()
    if exit_code:
        raise SystemExit(exit_code)

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


Saving dipesh_aadhar.png to dipesh_aadhar.png





DOCUMENT FORGERY DETECTION RESULTS

VERDICT: LIKELY AUTHENTIC (Confidence: 75.0%)

ANALYSIS BREAKDOWN:
----------------------------------------------------------------------
Analysis Method                Score      Suspicious
----------------------------------------------------------------------
Error Level Analysis           0.0        -
Font Alignment Analysis        0.0        -
Color Channel Analysis         100.0      ✓
Texture Consistency Analysis   0.0        -
Grayscale Analysis             0.0        -

SUSPICIOUS REGIONS:
----------------------------------------------------------------------
Region (x1,y1,x2,y2)           Score      Evidence Types
----------------------------------------------------------------------
(0,80,96,112)                  100.0      color
(208,80,304,112)               100.0      color

----------------------------------------------------------------------
Detailed results and visualizations saved in: forgery_results_20250320_231812/


Marked image