# In [None]:  (notebook cell marker left over from export)
import cv2
import numpy as np
import pytesseract
from pytesseract import Output
import re
import os
import fitz  # PyMuPDF for PDF rendering

# -----------------------------
# CONFIGURATION
# -----------------------------
# Template source: either a single file (image or PDF) or a directory holding
# several templates, e.g. one per country. All of them are tried and the best
# match wins.
ID_TEMPLATE_PATH = "id_template1.pdf"
# Document to analyze (image or PDF); point this at a different input as needed.
IMAGE_PATH = "sample_ids/id_scan4.pdf"
OUTPUT_DIR = "outputs"

# --- OCR patterns ---
# Legacy slash-separated DOB pattern. Country-specific patterns come from get_dob_pattern().
DOB_PATTERN = r"\d{2}/\d{2}/\d{4}"
# ID number: simple form (2 letters + 6 digits) or PAN form (AAAAA9999A).
ID_PATTERN = r"([A-Z]{2}\d{6}|[A-Z]{5}\d{4}[A-Z])"

# --- Thresholds ---
# Minimum normalized correlation score [0..1] for a template match;
# kept slightly low so legitimate variation is still accepted.
TEMPLATE_MATCH_THRESHOLD = 0.35
# Maximum fraction of pixels that may be edge pixels.
EDGE_RATIO_THRESHOLD = 0.20
# Local variance above which a pixel counts as noisy, and the maximum
# fraction of such pixels.
NOISE_VARIANCE_THRESHOLD = 50.0
NOISE_FRACTION_THRESHOLD = 0.35

# --- Scoring weights ---
WEIGHT_EDGES = 20
WEIGHT_NOISE = 15
WEIGHT_OCR = 10
WEIGHT_TEMPLATE = 20
MAX_SCORE = sum((WEIGHT_EDGES, WEIGHT_NOISE, WEIGHT_OCR, WEIGHT_TEMPLATE))

# Canvas size (width, height) that perspective-warped documents are normalized to.
WARP_SIZE = (900, 600)

# Make sure every output sub-directory exists before anything is written.
for _subdir in ("edges", "noise", "annotated"):
    os.makedirs(os.path.join(OUTPUT_DIR, _subdir), exist_ok=True)

# -----------------------------
# HELPER FUNCTIONS
# -----------------------------
def detect_edges(image_gray):
    """Run Canny edge detection on a blurred copy of a grayscale image.

    Returns a tuple ``(edges, edge_count, edge_ratio)``: the Canny output,
    the number of non-zero pixels in it, and that number divided by the
    image's height * width.
    """
    smoothed = cv2.GaussianBlur(image_gray, (5, 5), 0)
    edge_map = cv2.Canny(smoothed, 50, 150)
    n_edge_pixels = int(np.count_nonzero(edge_map))
    rows, cols = image_gray.shape[:2]
    return edge_map, n_edge_pixels, n_edge_pixels / float(rows * cols)

def save_edge_heatmap(edges, filename="edges/edge_heatmap.jpg"):
    """Write a JET-colormapped rendering of ``edges`` under OUTPUT_DIR.

    Returns the path the image was written to.
    """
    out_path = os.path.join(OUTPUT_DIR, filename)
    cv2.imwrite(out_path, cv2.applyColorMap(edges, cv2.COLORMAP_JET))
    return out_path

def local_noise_variance(image_gray, ksize=5):
    """Return the per-pixel variance over a ``ksize`` x ``ksize`` box window.

    Computed as E[x^2] - (E[x])^2, with both expectations taken by a box blur
    of the float-converted image.
    """
    pixels = image_gray.astype(float)
    window = (ksize, ksize)
    local_mean = cv2.blur(pixels, window)
    local_mean_of_squares = cv2.blur(pixels ** 2, window)
    return local_mean_of_squares - local_mean ** 2

def save_noise_heatmap(noise_map, filename="noise/noise_heatmap.jpg"):
    """Write a JET-colormapped rendering of a noise map under OUTPUT_DIR.

    The map is min-max scaled to 0..255 and cast to uint8 before the colormap
    is applied. Returns the path the image was written to.
    """
    scaled = cv2.normalize(noise_map, None, 0, 255, cv2.NORM_MINMAX)
    colored = cv2.applyColorMap(scaled.astype(np.uint8), cv2.COLORMAP_JET)
    out_path = os.path.join(OUTPUT_DIR, filename)
    cv2.imwrite(out_path, colored)
    return out_path

def perform_ocr(image_color):
    """Return the text pytesseract extracts from ``image_color``.

    Three-dimensional arrays are converted from BGR to RGB first; any other
    array is passed to pytesseract unchanged.
    """
    ocr_input = image_color
    if image_color.ndim == 3:
        ocr_input = cv2.cvtColor(image_color, cv2.COLOR_BGR2RGB)
    return pytesseract.image_to_string(ocr_input)

def check_ocr_fields(text, dob_pattern):
    """Check OCR text for a date of birth and an ID number.

    Returns ``(missing_fields, dob_found, id_found)``. The check is lenient:
    ``missing_fields`` is True only when neither ``dob_pattern`` nor
    ID_PATTERN matches anywhere in ``text``.
    """
    has_dob = re.search(dob_pattern, text) is not None
    has_id = re.search(ID_PATTERN, text) is not None
    nothing_found = not has_dob and not has_id
    return nothing_found, has_dob, has_id

def highlight_ocr_fields(image, dob_pattern=DOB_PATTERN, id_pattern=ID_PATTERN, filename="annotated/ocr_fields.jpg"):
    """Draw a box around every OCR word and save the annotated image.

    Words that match ``dob_pattern`` or ``id_pattern`` are boxed in green,
    all other words in red.

    Args:
        image: Image array to OCR and annotate; 3-D arrays are treated as BGR.
        dob_pattern: Regex searched for in each OCR token.
        id_pattern: Regex searched for in each OCR token.
        filename: Output path relative to OUTPUT_DIR.

    Returns:
        ``(annotated_path, annotated_image)``.
    """
    # Run OCR in RGB for consistency
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) if len(image.shape) == 3 else image
    data = pytesseract.image_to_data(rgb, output_type=Output.DICT)
    dob_re = re.compile(dob_pattern)
    id_re = re.compile(id_pattern)
    img_copy = image.copy()
    rows = zip(data['text'], data['left'], data['top'], data['width'], data['height'])
    for token, x, y, w, h in rows:
        # image_to_data also reports page/block/paragraph/line rows whose text
        # is empty; boxing those would paint red frames around whole regions
        # (including the full page) instead of around actual words.
        if not str(token).strip():
            continue
        # Use search instead of fullmatch as OCR tokens are often partial segments
        if dob_re.search(token) or id_re.search(token):
            color = (0, 255, 0)  # Green = valid
        else:
            color = (0, 0, 255)  # Red = invalid/missing
        cv2.rectangle(img_copy, (x, y), (x + w, y + h), color, 2)
    annotated_path = os.path.join(OUTPUT_DIR, filename)
    cv2.imwrite(annotated_path, img_copy)
    return annotated_path, img_copy

def _is_image_file(path):
    """Return True when ``path`` has a known raster-image extension (case-insensitive)."""
    _, extension = os.path.splitext(path)
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff', '.webp'}
    return extension.lower() in image_extensions

def _load_pdf_pages_as_grayscale_images(pdf_path, dpi=200):
    """Render every page of a PDF to a 2-D grayscale uint8 array.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        dpi: Render resolution; pages are scaled by ``dpi / 72``.

    Returns:
        List of grayscale images, one per page, in page order.
    """
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)  # same for every page, so build it once
    pages = []
    doc = fitz.open(pdf_path)
    try:
        for page_index in range(len(doc)):
            page = doc[page_index]
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
            if pix.n == 4:
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)
            elif pix.n == 3:
                img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
            else:
                # Already grayscale: drop the trailing channel axis so the result
                # is 2-D like the converted branches, and copy so it no longer
                # aliases the read-only pixmap buffer.
                img = img[:, :, 0].copy()
            pages.append(img)
    finally:
        # Release the document even if rendering a page fails.
        doc.close()
    return pages

def _load_templates_from_file(path, dpi):
    """Load the template(s) held in one file as a list of (name, gray_image) tuples."""
    name = os.path.basename(path)
    if os.path.splitext(path)[1].lower() == '.pdf':
        pages = _load_pdf_pages_as_grayscale_images(path, dpi=dpi)
        return [(f"{name}#p{i}", page) for i, page in enumerate(pages, start=1)]
    if _is_image_file(path):
        img = cv2.imread(path, 0)
        # imread returns None for unreadable files; such files are skipped.
        if img is not None:
            return [(name, img)]
    return []


def load_template_images(template_path_or_dir=ID_TEMPLATE_PATH, dpi=200):
    """Load one or more grayscale template images.

    - If directory: load every image and every page of every PDF directly
      inside it (sub-directories are skipped), in sorted file-name order.
    - If file: load it as an image, or load all of its pages if it is a PDF.
    - If None: return an empty list.

    PDF pages are named ``<file name>#p<page number>`` (1-based); images are
    named by their file name.

    Returns list of tuples: (template_name, gray_image)
    """
    if template_path_or_dir is None:
        return []
    if not os.path.isdir(template_path_or_dir):
        return _load_templates_from_file(template_path_or_dir, dpi)
    templates = []
    for fname in sorted(os.listdir(template_path_or_dir)):
        fpath = os.path.join(template_path_or_dir, fname)
        if os.path.isdir(fpath):
            continue
        templates.extend(_load_templates_from_file(fpath, dpi))
    return templates

def _order_quad_points(pts):
    """Order four (x, y) points as top-left, top-right, bottom-right, bottom-left.

    Top-left / bottom-right are the points with the smallest / largest x + y;
    top-right / bottom-left are those with the smallest / largest y - x.
    Returns a (4, 2) float32 array.
    """
    corners = np.array(pts, dtype="float32")
    coord_sum = corners.sum(axis=1)
    coord_diff = np.diff(corners, axis=1)
    top_left = corners[np.argmin(coord_sum)]
    top_right = corners[np.argmin(coord_diff)]
    bottom_right = corners[np.argmax(coord_sum)]
    bottom_left = corners[np.argmax(coord_diff)]
    ordered = np.zeros((4, 2), dtype="float32")
    ordered[0], ordered[1], ordered[2], ordered[3] = top_left, top_right, bottom_right, bottom_left
    return ordered

def warp_and_resize(gray_img, target_size=WARP_SIZE):
    """Find a document-like quad and warp it to a fixed size.

    The ten largest external Canny contours are examined; the first one that
    approximates to four points is perspective-warped onto a canvas of
    ``target_size``. If no such contour exists, or OpenCV rejects the input
    during detection, the image is instead resized with preserved aspect
    ratio and centered on a white canvas.

    Args:
        gray_img: Grayscale image array.
        target_size: ``(width, height)`` of the output.

    Returns:
        Array of shape ``(height, width)``.
    """
    tw, th = target_size
    try:
        # Edge detection and contour finding
        blurred = cv2.GaussianBlur(gray_img, (5, 5), 0)
        edges = cv2.Canny(blurred, 50, 150)
        # findContours returns (contours, hierarchy) on OpenCV 4.x but
        # (image, contours, hierarchy) on 3.x; [-2] is the contour list on both,
        # so a version difference no longer silently disables the warp.
        contours = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
        for cnt in sorted(contours, key=cv2.contourArea, reverse=True)[:10]:
            peri = cv2.arcLength(cnt, True)
            approx = cv2.approxPolyDP(cnt, 0.02 * peri, True)
            if len(approx) != 4:
                continue
            rect = _order_quad_points(approx.reshape(4, 2))
            dst = np.array([[0, 0], [tw - 1, 0], [tw - 1, th - 1], [0, th - 1]], dtype="float32")
            M = cv2.getPerspectiveTransform(rect, dst)
            return cv2.warpPerspective(gray_img, M, (tw, th))
    except cv2.error:
        # Best effort: quad detection failing inside OpenCV is not fatal,
        # fall through to the plain resize below.
        pass
    # Fallback: simple resize with preserved aspect into target canvas
    h, w = gray_img.shape[:2]
    scale = min(tw / w, th / h)
    nw, nh = max(1, int(w * scale)), max(1, int(h * scale))
    resized = cv2.resize(gray_img, (nw, nh), interpolation=cv2.INTER_AREA)
    canvas = np.full((th, tw), 255, dtype=np.uint8)
    y0 = (th - nh) // 2
    x0 = (tw - nw) // 2
    canvas[y0:y0 + nh, x0:x0 + nw] = resized
    return canvas

def get_dob_pattern(country: str = "others") -> str:
    """Build the date-of-birth regex for a country.

    - USA (also "us", "united states", "united states of america"): MM--DD--YYYY
    - Anything else, including None or an empty string: DD--MM--YYYY

    The separator is a double hyphen, as required. The country name is
    matched case-insensitively after stripping surrounding whitespace.
    """
    day = r"(0[1-9]|[12]\d|3[01])"   # 01-31
    month = r"(0[1-9]|1[0-2])"       # 01-12
    usa_aliases = ("usa", "us", "united states", "united states of america")
    normalized = (country or "").strip().lower()
    if normalized in usa_aliases:
        first, second = month, day
    else:
        first, second = day, month
    return rf"\b{first}--{second}--\d{{4}}\b"

def template_match_multi(image_gray, templates, threshold=TEMPLATE_MATCH_THRESHOLD):
    """Score an image against several templates and report the best one.

    templates: list of (name, gray_template); entries whose template is None
    are skipped.
    Returns: (match_ok, best_score, best_name). With no image or no templates
    the result is (True, 0.0, None) so that a missing template is not penalized.
    """
    if image_gray is None or not templates:
        return True, 0.0, None

    # Image and templates are both warped onto the same standard canvas, so
    # matchTemplate produces a single 1x1 response per template.
    query = warp_and_resize(image_gray, target_size=WARP_SIZE)
    top_score, top_name = -1.0, None
    for tpl_name, tpl_gray in templates:
        if tpl_gray is None:
            continue
        candidate = warp_and_resize(tpl_gray, target_size=WARP_SIZE)
        response = cv2.matchTemplate(query, candidate, cv2.TM_CCOEFF_NORMED)
        score = float(response[0, 0])
        if score > top_score:
            top_score, top_name = score, tpl_name
    return bool(top_score >= threshold), top_score, top_name

def analyze_image_arrays(img_color, img_gray, label="img", source="image", dob_pattern=DOB_PATTERN):
    """Run all tamper checks on one image and return the collected results.

    Four checks are performed on a perspective-normalized grayscale version of
    the input: edge density, local noise variance, OCR field presence, and
    template matching. Each failed check adds its weight to a raw score, which
    is then scaled to 0-100.

    Args:
        img_color: Color version of the image. NOTE(review): this argument is
            not read anywhere in the body; the color image used for OCR is
            rebuilt from the warped grayscale.
        img_gray: Grayscale image array to analyze.
        label: Suffix used in output file names and in the printed heading.
        source: "pdf" raises the edge threshold by 0.05 and the minimum OCR
            text length from 20 to 60; any other value uses the base values.
        dob_pattern: Regex used to detect a date of birth in the OCR text.

    Returns:
        dict with the heatmap/annotation paths, per-check metrics and flags,
        'tamper_score_raw', 'tamper_score' (0-100), 'flags_count' and
        'tamper_warning'.

    Side effects:
        Writes edge/noise heatmaps and an annotated OCR image through the
        save/highlight helpers, reloads the templates from ID_TEMPLATE_PATH,
        and prints a summary.
    """
    tamper_score = 0
    results = {}

    # Normalize input via perspective warp to reduce layout variance and page noise
    working_gray = warp_and_resize(img_gray, target_size=WARP_SIZE)
    # Create a pseudo-color version for OCR/annotations
    working_color = cv2.cvtColor(working_gray, cv2.COLOR_GRAY2BGR)

    # ---- Edge Analysis ----
    edges, edge_count, edge_ratio = detect_edges(working_gray)
    edge_path = save_edge_heatmap(edges, filename=f"edges/edge_heatmap_{label}.jpg")
    results['edges_heatmap'] = edge_path
    results['edge_count'] = edge_count
    results['edge_ratio'] = round(edge_ratio, 4)
    # PDF sources get a slightly more permissive edge threshold
    edge_thresh = EDGE_RATIO_THRESHOLD + (0.05 if source == "pdf" else 0.0)
    if edge_ratio > edge_thresh:
        tamper_score += WEIGHT_EDGES
        results['edges_flag'] = True
    else:
        results['edges_flag'] = False

    # ---- Noise Analysis ----
    noise_map = local_noise_variance(working_gray)
    noise_path = save_noise_heatmap(noise_map, filename=f"noise/noise_heatmap_{label}.jpg")
    results['noise_heatmap'] = noise_path
    # Fraction of pixels whose local variance exceeds the noise threshold
    high_noise_mask = (noise_map > NOISE_VARIANCE_THRESHOLD)
    high_noise_area = int(np.sum(high_noise_mask))
    h, w = working_gray.shape[:2]
    noise_fraction = high_noise_area / float(h * w)
    results['noise_fraction'] = round(noise_fraction, 4)
    if noise_fraction > NOISE_FRACTION_THRESHOLD:
        tamper_score += WEIGHT_NOISE
        results['noise_flag'] = True
    else:
        results['noise_flag'] = False

    # ---- OCR + Field Checks ----
    ocr_text = perform_ocr(working_color)
    missing_fields, dob_found, id_found = check_ocr_fields(ocr_text, dob_pattern)
    annotated_path, annotated_img = highlight_ocr_fields(working_color, dob_pattern=dob_pattern, filename=f"annotated/ocr_fields_{label}.jpg")
    results['ocr_annotated'] = annotated_path
    results['ocr_text'] = ocr_text
    # If OCR text is very short, don't penalize (could be low-quality scan)
    ocr_len = len(ocr_text.strip()) if ocr_text else 0
    # Relax OCR penalty for PDFs due to noisy full-page text
    min_len = 60 if source == "pdf" else 20
    if missing_fields and ocr_len >= min_len:
        tamper_score += WEIGHT_OCR
        results['ocr_flag'] = True
    else:
        results['ocr_flag'] = False

    # ---- Template Matching (multi-template support) ----
    # Templates are reloaded from ID_TEMPLATE_PATH on every call.
    templates = load_template_images(ID_TEMPLATE_PATH, dpi=200)
    # The un-warped img_gray is passed here; template_match_multi applies its own warp.
    template_ok, template_score, template_name = template_match_multi(img_gray, templates)
    results['template_ok'] = template_ok
    results['template_score'] = round(template_score, 3)
    results['template_name'] = template_name
    if not template_ok:
        tamper_score += WEIGHT_TEMPLATE
        results['template_flag'] = True
    else:
        results['template_flag'] = False

    # ---- Final Tampering Score ----
    # Normalize to 0-100
    results['tamper_score_raw'] = tamper_score
    normalized_score = int(round(100.0 * tamper_score / float(MAX_SCORE))) if MAX_SCORE > 0 else 0

    # Compute flags count before finalization
    flags_count = int(results['edges_flag']) + int(results['noise_flag']) + int(results['ocr_flag']) + int(results['template_flag'])

    # Critical template mismatch override: force the score to 100 when the
    # (rounded) template score is at or below 0.12.
    # NOTE(review): the warning below still requires flags_count >= 2, so a
    # template-only mismatch yields score 100 without a warning - confirm intended.
    if (not results['template_ok']) and (results.get('template_score', 0.0) <= 0.12):
        normalized_score = 100

    results['tamper_score'] = normalized_score
    results['flags_count'] = flags_count
    results['tamper_warning'] = (normalized_score >= 70) and (flags_count >= 2)

    # Print summary for this label
    print(f"\n=== Tampered Document Analysis ({label}) ===")
    print(f"Tampering Score: {results['tamper_score']}/100 (raw={results['tamper_score_raw']}/{MAX_SCORE})")
    print(f"Edge Anomaly: {results['edges_flag']}")
    print(f"Noise Anomaly: {results['noise_flag']}")
    print(f"OCR Field Issue: {results['ocr_flag']}")
    if results.get('template_name'):
        print(f"Best Template: {results['template_name']} (score={results['template_score']})")
    print(f"Template Mismatch: {results['template_flag']}")
    if results['tamper_warning']:
        print("WARNING: Document potentially tampered!")
    else:
        print("Document appears normal.")

    return results

def render_pdf_to_images(pdf_path, dpi=200):
    """Render each page of a PDF to a BGR image array using PyMuPDF.

    Args:
        pdf_path: Path of the PDF to open.
        dpi: Render resolution; pages are scaled by ``dpi / 72``.

    Returns:
        List of BGR images, one per page, in page order.
    """
    zoom = dpi / 72.0
    mat = fitz.Matrix(zoom, zoom)  # same for every page, so build it once
    images = []
    doc = fitz.open(pdf_path)
    try:
        for page_index in range(len(doc)):
            page = doc[page_index]
            pix = page.get_pixmap(matrix=mat, alpha=False)
            # Convert to numpy BGR image
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
            if pix.n == 4:
                img = cv2.cvtColor(img, cv2.COLOR_RGBA2BGR)
            elif pix.n == 3:
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
            else:
                # Grayscale
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
            images.append(img)
    finally:
        # Release the document even if rendering a page fails.
        doc.close()
    return images

# -----------------------------
# MAIN PIPELINE
# -----------------------------
def analyze_document(image_path=IMAGE_PATH, country: str = "others"):
    """Analyze an image or a PDF for signs of tampering.

    Images are analyzed directly and the per-image results dict is returned.
    PDFs are rendered at 200 dpi and analyzed page by page; the returned dict
    holds the highest page 'tamper_score', whether any page raised a
    'tamper_warning', and the per-page results under 'pages'.

    Raises FileNotFoundError when a non-PDF path cannot be read as an image.
    """
    # Build country-specific DOB pattern
    dob_pattern = get_dob_pattern(country)

    is_pdf = os.path.splitext(image_path)[1].lower() == ".pdf"
    if not is_pdf:
        # Plain image: read it once as grayscale and once as color.
        img_gray = cv2.imread(image_path, 0)
        img_color = cv2.imread(image_path)
        if img_gray is None or img_color is None:
            raise FileNotFoundError(f"Could not read image at path: {image_path}")
        return analyze_image_arrays(img_color, img_gray, label="img", dob_pattern=dob_pattern)

    page_results = []
    for page_no, page_bgr in enumerate(render_pdf_to_images(image_path, dpi=200), start=1):
        page_gray = cv2.cvtColor(page_bgr, cv2.COLOR_BGR2GRAY)
        page_results.append(
            analyze_image_arrays(page_bgr, page_gray, label=f"p{page_no}", source="pdf", dob_pattern=dob_pattern)
        )

    # Aggregate: worst page score, and a warning if any single page warns.
    overall = {
        'tamper_score': max((r.get('tamper_score', 0) for r in page_results), default=0),
        'tamper_warning': any(r.get('tamper_warning', False) for r in page_results),
        'pages': page_results,
    }
    print("\n=== Overall PDF Analysis Summary ===")
    print(f"Max Tampering Score across pages: {overall['tamper_score']}")
    print(f"Any Page Warning: {overall['tamper_warning']}")
    return overall

# -----------------------------
# RUN STANDALONE
# -----------------------------
# Run the pipeline only when executed as a script, not when imported.
# The dunder names need double underscores: `_name_` is undefined and raised NameError.
if __name__ == "__main__":
    analyze_document()