# Document Fraud Detection — EfficientNet-B3 Training on Colab

Trains on **CASIA v2.0** (~7,491 genuine + 5,123 tampered) and **COVERAGE** (100+100) datasets.
Expected result: val AUC > 0.90 by epoch 30.

In [None]:
# Cell 1: GPU check
# Reports whether a CUDA device is visible to PyTorch and, if so, its name and
# total memory; otherwise tells the user how to switch the Colab runtime.
import torch

has_cuda = torch.cuda.is_available()
print('CUDA available:', has_cuda)
if not has_cuda:
    print('WARNING: No GPU detected. Go to Runtime > Change runtime type > GPU')
else:
    print('GPU:', torch.cuda.get_device_name(0))
    # total_memory is reported in bytes; divide by 1e9 to show decimal gigabytes.
    total_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f'VRAM: {total_mem:.1f} GB')

In [None]:
# Cell 2: Install dependencies
# Shell magic: installs the packages below with pip; `-q` reduces pip's output.
# NOTE(review): no versions are pinned, so pip resolves whatever is current at
# run time — pin versions here if reproducible environments matter.
!pip install -q torch torchvision scipy scikit-learn loguru PyMuPDF opencv-python-headless pillow

In [None]:
# Cell 3: Mount Google Drive (for saving model checkpoints)
import os

from google.colab import drive

# Destination folder on Drive; Cell 9 copies the trained model into it.
DRIVE_SAVE_PATH = '/content/drive/MyDrive/fraud_detection_models'

drive.mount('/content/drive')

# exist_ok=True makes re-running this cell harmless once the folder exists.
os.makedirs(DRIVE_SAVE_PATH, exist_ok=True)
print(f'Model will be saved to: {DRIVE_SAVE_PATH}')

In [None]:
# Cell 4: Download CASIA v2.0 via Kaggle API
# Step 1: Upload your kaggle.json file
from google.colab import files
print('Upload your kaggle.json API key file:')
uploaded = files.upload()  # Upload kaggle.json

import os
os.makedirs(os.path.expanduser('~/.kaggle'), exist_ok=True)
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download CASIA v2.0
# Dataset: https://www.kaggle.com/datasets/divg07/casia-20-image-tampering-detection-dataset
!kaggle datasets download -d divg07/casia-20-image-tampering-detection-dataset -p /content/casia_raw
!unzip -q /content/casia_raw/*.zip -d /content/casia_raw/
print('CASIA v2.0 downloaded.')

In [None]:
# Cell 5: Clone COVERAGE dataset (100 genuine + 100 tampered)
!git clone https://github.com/wenbihan/coverage.git /content/coverage_raw

import os
coverage_image_dir = '/content/coverage_raw/image'
genuine_files = [f for f in os.listdir(coverage_image_dir)
                 if f.endswith('.tif') and not f[:-4].endswith('f')]
tampered_files = [f for f in os.listdir(coverage_image_dir)
                  if f.endswith('.tif') and f[:-4].endswith('f')]
print(f'COVERAGE: {len(genuine_files)} genuine, {len(tampered_files)} tampered')

In [None]:
# Cell 6: Dataset preparation — stratified 75/12.5/12.5 train/val/test split
import os, shutil, random
from pathlib import Path

OUTPUT_DIR = '/content/dataset'

def prepare_dataset(casia_au_dir, casia_tp_dir, coverage_image_dir, output_dir,
                    train_ratio=0.75, val_ratio=0.125,
                    coverage_tampered_suffixes=('f', 't')):
    """Build a per-class train/val/test split from CASIA v2.0 + COVERAGE.

    Images are gathered per class (genuine / tampered), shuffled with the
    module-level ``random`` generator, split by ratio, and copied to
    ``<output_dir>/<split>/<class>/``. Seed ``random`` before calling to get
    a reproducible split.

    Args:
        casia_au_dir: Directory of genuine CASIA images.
        casia_tp_dir: Directory of tampered CASIA images.
        coverage_image_dir: COVERAGE ``image`` directory. Only ``.tif`` files
            are used; a file is treated as tampered when its stem ends with
            one of ``coverage_tampered_suffixes``, otherwise as genuine.
        output_dir: Root directory that receives the split folders.
        train_ratio: Fraction of each class assigned to ``train``.
        val_ratio: Fraction of each class assigned to ``val``. Whatever is
            left after train and val goes to ``test``.
        coverage_tampered_suffixes: Stem suffixes that mark a COVERAGE image
            as tampered. NOTE(review): the original code only accepted 'f';
            COVERAGE's forged images are believed to be named ``<N>t.tif`` —
            confirm against the cloned dataset.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1:
        raise ValueError(
            'train_ratio and val_ratio must be non-negative and sum to at most 1 '
            f'(got train_ratio={train_ratio}, val_ratio={val_ratio})')

    valid_exts = {'.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff'}
    tampered_suffixes = tuple(coverage_tampered_suffixes)

    def collect(directory):
        # os.listdir returns entries in arbitrary order; sorting makes the
        # pre-shuffle order (and therefore the seeded split) reproducible.
        return [os.path.join(directory, name) for name in sorted(os.listdir(directory))
                if Path(name).suffix.lower() in valid_exts]

    genuine_files = collect(casia_au_dir)
    tampered_files = collect(casia_tp_dir)

    # Add COVERAGE images
    for name in sorted(os.listdir(coverage_image_dir)):
        if Path(name).suffix.lower() != '.tif':
            continue
        path = os.path.join(coverage_image_dir, name)
        if Path(name).stem.endswith(tampered_suffixes):
            tampered_files.append(path)
        else:
            genuine_files.append(path)

    print(f'Total: {len(genuine_files)} genuine, {len(tampered_files)} tampered')

    def assign_names(paths):
        # Map every source path to a destination file name that is unique
        # within its class. Files are copied by name into shared folders, so
        # two sources with the same basename (e.g. one from CASIA and one from
        # COVERAGE) would otherwise silently overwrite each other. Names are
        # assigned before shuffling so they do not depend on the seed.
        taken = set()
        names = {}
        for src in paths:
            name = os.path.basename(src)
            if name in taken:
                stem, ext = os.path.splitext(name)
                prefix = Path(src).parent.name
                name = f'{prefix}_{stem}{ext}'
                counter = 1
                while name in taken:
                    name = f'{prefix}_{stem}_{counter}{ext}'
                    counter += 1
            taken.add(name)
            names[src] = name
        return names

    dest_names = {
        'genuine': assign_names(genuine_files),
        'tampered': assign_names(tampered_files),
    }

    def split_files(paths):
        # Shuffle a copy so the caller's list is left untouched.
        shuffled = list(paths)
        random.shuffle(shuffled)
        n = len(shuffled)
        n_train = int(n * train_ratio)
        n_val = int(n * val_ratio)
        return (shuffled[:n_train],
                shuffled[n_train:n_train + n_val],
                shuffled[n_train + n_val:])

    g_train, g_val, g_test = split_files(genuine_files)
    t_train, t_val, t_test = split_files(tampered_files)

    splits = {
        'train': {'genuine': g_train, 'tampered': t_train},
        'val':   {'genuine': g_val,   'tampered': t_val},
        'test':  {'genuine': g_test,  'tampered': t_test},
    }

    for split, classes in splits.items():
        for cls, paths in classes.items():
            dest_dir = os.path.join(output_dir, split, cls)
            os.makedirs(dest_dir, exist_ok=True)
            for src in paths:
                shutil.copy2(src, os.path.join(dest_dir, dest_names[cls][src]))
        g_count = len(classes['genuine'])
        t_count = len(classes['tampered'])
        print(f'  {split}: {g_count} genuine, {t_count} tampered')

    print(f'Dataset prepared at {output_dir}')

# Adjust paths based on actual CASIA zip structure after extraction
COVERAGE_IMG = '/content/coverage_raw/image'
CASIA_TP  = '/content/casia_raw/Tp'
CASIA_AU  = '/content/casia_raw/Au'

# Seed the module-level RNG that prepare_dataset's shuffle draws from.
random.seed(42)
prepare_dataset(
    casia_au_dir=CASIA_AU,
    casia_tp_dir=CASIA_TP,
    coverage_image_dir=COVERAGE_IMG,
    output_dir=OUTPUT_DIR,
)

In [None]:
# Cell 7: Clone or upload project code
# Option A: clone from GitHub (replace with your repo URL)
# !git clone https://github.com/YOUR_USERNAME/document_fraud_ai.git /content/document_fraud_ai

# Option B: upload the project zip
from google.colab import files
print('Upload your document_fraud_ai project zip:')
uploaded = files.upload()
zip_name = list(uploaded.keys())[0]
!unzip -q "{zip_name}" -d /content/

import sys
sys.path.insert(0, '/content/document_fraud_ai')
print('Project code ready.')

In [None]:
# Cell 8: Train EfficientNet-B3
# Shell magic: changes into the project folder and runs train_model.py.
# --data_dir is the split dataset written by Cell 6 (/content/dataset), and
# --output_dir is the folder Cells 9 and 10 read
# fraud_efficientnet_b3_best.pth from.
# NOTE(review): the meaning of the remaining flags is defined inside
# train_model.py, which is not part of this notebook — confirm there before
# changing them.
!cd /content/document_fraud_ai && python train_model.py \
    --data_dir /content/dataset \
    --epochs 50 \
    --batch_size 16 \
    --freeze_epochs 5 \
    --patience 10 \
    --output_dir /content/document_fraud_ai/fraud_model

In [None]:
# Cell 9: Copy trained model to Google Drive
import shutil, os

model_src = '/content/document_fraud_ai/fraud_model/fraud_efficientnet_b3_best.pth'

if not os.path.exists(model_src):
    # Nothing to copy: training did not leave a checkpoint at the expected path.
    print('ERROR: Model file not found. Check training output above.')
else:
    # DRIVE_SAVE_PATH is the Drive folder created in Cell 3.
    dest = os.path.join(DRIVE_SAVE_PATH, 'fraud_efficientnet_b3_best.pth')
    shutil.copy2(model_src, dest)
    # Report the size of the copied file in decimal megabytes.
    size_mb = os.path.getsize(dest) / 1e6
    print(f'Model saved to Drive: {dest} ({size_mb:.1f} MB)')

In [None]:
# Cell 10: Evaluate on test split
# Loads the best checkpoint, scores every test image, and prints accuracy,
# AUC-ROC, F1 and the confusion-matrix cells.
import sys, os
sys.path.insert(0, '/content/document_fraud_ai')

import torch
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, confusion_matrix
from torch.utils.data import DataLoader

from fraud_model.cnn_model import FraudEfficientNetB3, IMAGENET_MEAN, IMAGENET_STD
from train_model import ELADataset

model_path = '/content/document_fraud_ai/fraud_model/fraud_efficientnet_b3_best.pth'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the network without pretrained weights, then load the trained ones.
model = FraudEfficientNetB3(pretrained=False)
state_dict = torch.load(model_path, map_location=device, weights_only=True)
model.load_state_dict(state_dict)
model.to(device)
model.eval()

test_set = ELADataset('/content/dataset', split='test')
test_loader = DataLoader(test_set, batch_size=16, shuffle=False, num_workers=0)

all_preds = []
all_labels = []
with torch.no_grad():
    for images, labels in test_loader:
        batch_scores = model(images.to(device))
        # Drop dimension 1 so each sample contributes a single score.
        outputs = batch_scores.squeeze(1).cpu().numpy()
        all_preds += outputs.tolist()
        all_labels += labels.numpy().tolist()

all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# NOTE(review): thresholding at 0.5 assumes the model already outputs
# probabilities rather than raw logits — confirm in FraudEfficientNetB3.
binary_preds = (all_preds > 0.5).astype(int)

cm   = confusion_matrix(all_labels, binary_preds)
f1   = f1_score(all_labels, binary_preds)
auc  = roc_auc_score(all_labels, all_preds)
acc  = accuracy_score(all_labels, binary_preds)

# Rows of the matrix are true labels, columns are predicted labels.
tn, fp, fn, tp = cm[0,0], cm[0,1], cm[1,0], cm[1,1]

print(f'Test Accuracy : {acc:.4f} ({acc*100:.1f}%)')
print(f'Test AUC-ROC  : {auc:.4f}')
print(f'Test F1       : {f1:.4f}')
print('Confusion Matrix:')
print(f'  TN={tn}  FP={fp}')
print(f'  FN={fn}  TP={tp}')