In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D, Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split
from pathlib import Path
import json

# Step 1: Define paths and hyperparameters

# Dataset location and the square edge length images are resized to.
DATA_ROOT = "/content/drive/MyDrive/NTU-Roselab-Dataset"
IMG_SIZE = 224

# Training schedule.
BATCH_SIZE, EPOCHS, N_FOLDS = 16, 8, 5
LEARNING_RATE = 1e-4

# Run identity; MODEL_NAME is embedded in every output file name.
MODEL_NAME = "RGB_EfficientNetB0"
PREPROCESSING = "RGB"

# SPLIT_DIR holds the cached train/test arrays; per-model results and the
# resume checkpoint live beneath it.
SPLIT_DIR = "/content/drive/MyDrive/Recapture_Photo_Detection"
OUTPUT_DIR = f"{SPLIT_DIR}/{MODEL_NAME}/results"
CHECKPOINT_FILE = f"{OUTPUT_DIR}/checkpoint.json"

# Recorded alongside every results JSON; dropout_rate is also read by
# create_model().
HYPERPARAMETERS = dict(
    learning_rate=LEARNING_RATE,
    batch_size=BATCH_SIZE,
    optimizer="Adam",
    epochs=EPOCHS,
    n_folds=N_FOLDS,
    dropout_rate=0.4,
)

# Step 2: Mount Google Drive and verify dataset path
# The dataset and every output path live on Google Drive, so the script
# refuses to run outside Colab rather than failing later on a missing path.
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
except ImportError:
    colab_required_msg = "This script must be run in Google Colab with Google Drive mounted."
    raise ImportError(colab_required_msg)

# Fail fast when the dataset folder is absent from the mounted drive.
dataset_found = os.path.exists(DATA_ROOT)
if not dataset_found:
    raise FileNotFoundError(f"Dataset directory {DATA_ROOT} does not exist. Please check the path.")

# Step 3: Check dataset balance
def check_dataset_balance(data_root):
    """Count the files under the two class folders and report the totals.

    Every file found by a recursive walk of ``<data_root>/originals`` and
    ``<data_root>/recaptures`` is counted, whatever its extension. A folder
    that does not exist contributes zero.

    Args:
        data_root: Directory containing the ``originals`` and ``recaptures``
            sub-folders.

    Returns:
        Tuple ``(originals_count, recaptures_count)``.
    """
    def _files_below(folder):
        # os.walk yields (dirpath, dirnames, filenames); only filenames count.
        total = 0
        for _dirpath, _dirnames, filenames in os.walk(folder):
            total += len(filenames)
        return total

    originals_count = _files_below(os.path.join(data_root, 'originals'))
    recaptures_count = _files_below(os.path.join(data_root, 'recaptures'))
    print(f"Dataset Balance: {originals_count} originals, {recaptures_count} recaptures")
    return originals_count, recaptures_count

# Report the per-class file counts once at start-up; the counts are kept at
# module level.
originals_count, recaptures_count = check_dataset_balance(DATA_ROOT)

# Step 4: Define preprocessing function
@tf.function
def preprocess(img, label):
    """Resize images to the network input size and apply EfficientNet preprocessing.

    Args:
        img: Image tensor (a single image or a batch).
        label: Label tensor, returned untouched.

    Returns:
        Tuple ``(preprocessed_img, label)``.
    """
    # NOTE(review): Keras documents efficientnet.preprocess_input as a
    # pass-through (scaling is built into the model) - confirm for the
    # installed version.
    resized = tf.image.resize(img, (IMG_SIZE, IMG_SIZE))
    return tf.keras.applications.efficientnet.preprocess_input(resized), label

# Step 5: Define custom rotation function
@tf.function
def random_rotation(img, max_angle=0.1):
    """Rotate ``img`` by a random multiple of 90 degrees.

    A single integer ``k`` is drawn uniformly from {0, 1, 2, 3} and the input
    is rotated counter-clockwise by ``k * 90`` degrees with
    ``tf.image.rot90``. Because ``k`` is a scalar, a batched input has the
    same rotation applied to every image in the batch.

    Args:
        img: Image tensor (a single image or a batch).
        max_angle: Ignored. Kept only so existing callers that pass it keep
            working; this function never performed small-angle rotation.

    Returns:
        The rotated image tensor.
    """
    # Four possible quarter turns: 0, 90, 180 and 270 degrees.
    k = tf.random.uniform(shape=(), minval=0, maxval=4, dtype=tf.int32)
    return tf.image.rot90(img, k)

# Step 6: Define data augmentation
@tf.function
def augment_image(img, label):
    """Apply the training-time augmentations: horizontal flip, then rotation.

    Args:
        img: Image tensor (a single image or a batch).
        label: Label tensor, returned untouched.

    Returns:
        Tuple ``(augmented_img, label)``.
    """
    flipped = tf.image.random_flip_left_right(img)
    augmented = random_rotation(flipped, max_angle=0.1)
    return augmented, label

# Step 7: Load or create train-test split
def load_or_create_split():
    """Return the cached train/test arrays, building and caching them if needed.

    The split is stored in ``SPLIT_DIR`` as four ``.npy`` files. When all four
    exist they are loaded as-is. Otherwise every image under ``DATA_ROOT`` is
    read into memory, split 80/20 with stratification (fixed seed 42), and the
    four arrays are written to ``SPLIT_DIR`` for later runs.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.
    """
    names = ('X_train', 'X_test', 'y_train', 'y_test')
    paths = {name: os.path.join(SPLIT_DIR, f'{name}.npy') for name in names}

    if all(os.path.exists(paths[name]) for name in names):
        print("Loading existing train-test split...")
        X_train, X_test, y_train, y_test = [np.load(paths[name]) for name in names]
        return X_train, X_test, y_train, y_test

    print("Creating new train-test split...")
    dataset = image_dataset_from_directory(
        DATA_ROOT,
        labels='inferred',
        label_mode='binary',
        image_size=(IMG_SIZE, IMG_SIZE),
        batch_size=BATCH_SIZE,
        shuffle=True,
        seed=42
    )

    # Materialise the whole dataset so scikit-learn can split it.
    batches = [(img_batch.numpy(), label_batch.numpy()) for img_batch, label_batch in dataset]
    images = np.concatenate([pair[0] for pair in batches], axis=0)
    # Flatten the label batches into a single 1-D vector.
    labels = np.concatenate([pair[1] for pair in batches], axis=0).flatten()

    X_train, X_test, y_train, y_test = train_test_split(
        images, labels, test_size=0.2, stratify=labels, random_state=42
    )

    Path(SPLIT_DIR).mkdir(parents=True, exist_ok=True)
    for name, array in zip(names, (X_train, X_test, y_train, y_test)):
        np.save(paths[name], array)

    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = load_or_create_split()

# Create test dataset
# Held-out data: batched and preprocessed, with no augmentation.
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_ds = test_ds.batch(BATCH_SIZE).map(preprocess)
test_ds = test_ds.prefetch(tf.data.AUTOTUNE)

# Step 8: Define function to create RGB EfficientNetB0 model
def create_model():
    """Build and compile the binary EfficientNetB0 classifier.

    An ImageNet-pretrained EfficientNetB0 backbone (without its top) feeds a
    global-average-pool -> dropout -> Dense(128, relu) -> Dense(1, sigmoid)
    head. Only the last 20 backbone layers are fine-tuned, and among those the
    BatchNormalization layers stay frozen, as in the Keras EfficientNet
    fine-tuning example, to avoid updating their statistics on small batches.

    Returns:
        A compiled ``tf.keras.Model`` using Adam at ``LEARNING_RATE``,
        binary cross-entropy loss and an accuracy metric.
    """
    input_layer = Input(shape=(IMG_SIZE, IMG_SIZE, 3), name='rgb_input')
    base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))

    # Freeze everything except the non-BatchNorm layers among the last 20.
    first_trainable = len(base_model.layers) - 20
    for index, layer in enumerate(base_model.layers):
        is_batch_norm = isinstance(layer, tf.keras.layers.BatchNormalization)
        layer.trainable = index >= first_trainable and not is_batch_norm

    x = GlobalAveragePooling2D()(base_model(input_layer))
    x = Dropout(HYPERPARAMETERS['dropout_rate'])(x)
    x = Dense(128, activation='relu')(x)
    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=input_layer, outputs=out)
    model.compile(optimizer=tf.keras.optimizers.Adam(LEARNING_RATE),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Step 9: Convert NumPy types to JSON-serializable types
def convert_to_serializable(obj):
    """Convert a NumPy scalar or array into a plain Python equivalent.

    Args:
        obj: Any value. NumPy booleans, integers and floats become ``bool``,
            ``int`` and ``float``; arrays become (nested) lists.

    Returns:
        The converted value, or ``obj`` itself when it is not a NumPy type
        handled here.
    """
    # np.bool_ is not a subclass of np.integer and json cannot encode it, so
    # it needs its own branch.
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj

# Step 10: Define function to save results
def save_model_results(model, dataset, history, model_name, output_dir, fold=None, preprocessing='None', hyperparameters=None):
    """Evaluate ``model`` on ``dataset`` and write all result artefacts.

    Files written to ``output_dir`` (suffixed with ``_fold_<n>`` when ``fold``
    is given): classification report CSV, confusion matrix PNG and CSV,
    results JSON and, when ``history`` is provided, accuracy/loss curves.
    For the final model (``fold is None``) the model summary text file and
    the full ``.keras`` model are saved as well.

    Args:
        model: Trained Keras model with a single sigmoid output.
        dataset: Iterable of ``(images, labels)`` batches to evaluate on.
        history: Keras ``History`` from ``fit``, or ``None`` to skip curves.
        model_name: Name used in file names and in the results record.
        output_dir: Directory for all artefacts; created if missing.
        fold: Cross-validation fold number, or ``None`` for the final model.
        preprocessing: Free-text label stored in the results record.
        hyperparameters: Optional dict merged into the results record.

    Returns:
        Dict of JSON-serializable metrics and metadata for this evaluation.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    fold_str = f"_fold_{fold}" if fold is not None else ""
    class_ids = [0, 1]
    class_names = ['originals', 'recaptured']

    # Evaluate on dataset. predict_on_batch avoids the per-call overhead of
    # model.predict when looping over already-batched data.
    y_true, y_pred = [], []
    for imgs, labels in dataset:
        probs = np.asarray(model.predict_on_batch(imgs))
        y_pred.extend((probs > 0.5).astype(int).ravel())
        y_true.extend(labels.numpy().astype(int).ravel())

    # Check prediction distribution
    originals_pred = sum(1 for p in y_pred if p == 0)
    recaptures_pred = sum(1 for p in y_pred if p == 1)
    print(f"Predictions {fold_str}: {originals_pred} originals, {recaptures_pred} recaptures")

    # Classification report. Passing labels explicitly keeps both classes in
    # the report even when one is absent from the data or the predictions;
    # otherwise target_names would not match and scikit-learn would raise.
    class_report = classification_report(
        y_true, y_pred, labels=class_ids, target_names=class_names,
        output_dict=True, zero_division=0
    )
    class_report_df = pd.DataFrame(class_report).transpose()
    class_report_df.to_csv(f'{output_dir}/{model_name}_classification_report{fold_str}.csv')

    # Some scikit-learn versions emit 'micro avg' instead of 'accuracy' when
    # the given labels differ from the observed ones; fall back to computing
    # the accuracy directly in that case.
    accuracy = class_report.get('accuracy')
    if accuracy is None:
        accuracy = float(np.mean(np.asarray(y_true) == np.asarray(y_pred)))

    # Confusion matrix (always 2x2 thanks to the explicit labels)
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title(f'Confusion Matrix - {model_name}{fold_str}')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(f'{output_dir}/{model_name}_confusion_matrix{fold_str}.png')
    plt.close()

    # Save confusion matrix as CSV
    cm_df = pd.DataFrame(cm, index=['True_originals', 'True_recaptured'], columns=['Pred_originals', 'Pred_recaptured'])
    cm_df.to_csv(f'{output_dir}/{model_name}_confusion_matrix{fold_str}.csv')

    # Model summary (only for final model). The summary may contain
    # non-ASCII table characters, so the encoding is fixed to UTF-8.
    if fold is None:
        summary_file = f'{output_dir}/{model_name}_summary.txt'
        with open(summary_file, 'w', encoding='utf-8') as f:
            model.summary(print_fn=lambda x: f.write(x + '\n'))

    # Calculate total and trainable parameters
    total_params = model.count_params()
    trainable_params = sum(tf.keras.backend.count_params(w) for w in model.trainable_weights)

    # Aggregate results
    results = {
        'Model': model_name,
        'Preprocessing': preprocessing,
        'Accuracy': accuracy,
        'Total_Parameters': total_params,
        'Trainable_Parameters': trainable_params,
        'Fold': fold if fold is not None else 'Final'
    }
    if hyperparameters:
        results.update(hyperparameters)
    # Per-class and averaged rows are dicts; the scalar 'accuracy' entry is
    # skipped here because it was recorded above.
    for label, metrics in class_report.items():
        if isinstance(metrics, dict):
            results.update({
                f'Precision_{label}': metrics['precision'],
                f'Recall_{label}': metrics['recall'],
                f'F1-Score_{label}': metrics['f1-score'],
                f'Support_{label}': metrics['support']
            })

    # Convert NumPy types to JSON-serializable types
    results = {k: convert_to_serializable(v) for k, v in results.items()}

    # Save results to JSON
    with open(f'{output_dir}/{model_name}_results{fold_str}.json', 'w') as f:
        json.dump(results, f, indent=4)

    # Plot and save accuracy/loss curves
    if history is not None:
        plt.figure(figsize=(10, 4))
        plt.subplot(1, 2, 1)
        plt.plot(history.history['accuracy'], label='Train Accuracy')
        if 'val_accuracy' in history.history:
            plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
        plt.title(f'Accuracy Curve - {model_name}{fold_str}')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.grid(True)

        plt.subplot(1, 2, 2)
        plt.plot(history.history['loss'], label='Train Loss')
        if 'val_loss' in history.history:
            plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.title(f'Loss Curve - {model_name}{fold_str}')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(f'{output_dir}/{model_name}_accuracy_loss_curve{fold_str}.png')
        plt.close()

    # Save full model for final model (using .keras format)
    if fold is None:
        model.save(f'{output_dir}/{model_name}_model.keras', overwrite=True)

    return results

# Step 11: Checkpointing and resumption logic
def load_checkpoint():
    """Return how far a previous run got, as an integer.

    Reads ``CHECKPOINT_FILE`` if it exists. The stored value is either the
    number of the last completed fold or the string ``'final'``, which
    ``save_checkpoint('final')`` writes once the final model is trained.
    ``'final'`` is returned as ``N_FOLDS + 1`` so that callers can compare
    the result with fold numbers; comparing the raw string with an int would
    raise ``TypeError``.

    Returns:
        ``0`` when there is no checkpoint, the last completed fold number,
        or ``N_FOLDS + 1`` when the final model has already been trained.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        return 0

    with open(CHECKPOINT_FILE, 'r') as f:
        checkpoint = json.load(f)
    last_completed_fold = checkpoint.get('last_completed_fold', 0)
    print(f"Resuming from checkpoint: Last completed fold = {last_completed_fold}")

    if last_completed_fold == 'final':
        return N_FOLDS + 1
    return int(last_completed_fold)

def save_checkpoint(fold):
    """Record ``fold`` as the last completed stage in ``CHECKPOINT_FILE``.

    Args:
        fold: Value stored under ``'last_completed_fold'``; callers in this
            script pass a fold number or the string ``'final'``.
    """
    Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
    payload = json.dumps({'last_completed_fold': fold}, indent=4)
    with open(CHECKPOINT_FILE, 'w') as f:
        f.write(payload)

# Step 12: Perform 5-fold cross-validation with checkpointing
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
fold_results = []
last_completed_fold = load_checkpoint()

# The checkpoint file holds the string 'final' once the final model has been
# trained. Comparing that string with an integer fold number raises
# TypeError, so it is treated here as "every fold is done".
# last_completed_fold itself is left untouched because later steps read it.
folds_done = N_FOLDS if last_completed_fold == 'final' else int(last_completed_fold)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    fold_str = f"_fold_{fold}"

    if fold <= folds_done:
        print(f"Skipping fold {fold} (already completed)")
        # Load results from previous run
        result_file = f'{OUTPUT_DIR}/{MODEL_NAME}_results{fold_str}.json'
        if os.path.exists(result_file):
            with open(result_file, 'r') as f:
                fold_results.append(json.load(f))
        else:
            # Make the gap visible: the CV summary will be computed from
            # fewer folds than N_FOLDS.
            print(f"Warning: results file {result_file} not found; fold {fold} is excluded from the summary")
        continue

    print(f"\nTraining Fold {fold}/{N_FOLDS}")

    # Release graph/state left over from the previous fold's model so memory
    # does not grow with every fold.
    tf.keras.backend.clear_session()

    # Create train and validation datasets for this fold
    X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
    y_fold_train, y_fold_val = y_train[train_idx], y_train[val_idx]

    # Augmentation is applied to the training data only.
    train_ds = tf.data.Dataset.from_tensor_slices((X_fold_train, y_fold_train)).batch(BATCH_SIZE).map(preprocess).map(augment_image).prefetch(tf.data.AUTOTUNE)
    val_ds = tf.data.Dataset.from_tensor_slices((X_fold_val, y_fold_val)).batch(BATCH_SIZE).map(preprocess).prefetch(tf.data.AUTOTUNE)

    # Create model and load weights if available (an interrupted fold resumes
    # from the best weights it had saved).
    model = create_model()
    checkpoint_path = f'{OUTPUT_DIR}/{MODEL_NAME}_model{fold_str}.weights.h5'
    if os.path.exists(checkpoint_path):
        print(f"Loading weights for fold {fold} from {checkpoint_path}")
        model.load_weights(checkpoint_path)

    # Train model with early stopping and checkpointing
    checkpoint_callback = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True, save_weights_only=True)
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS, class_weight={0: 1.0, 1: 1.5}, verbose=1, callbacks=[early_stopping, checkpoint_callback])

    # Save results for this fold
    results = save_model_results(
        model, val_ds, history, MODEL_NAME, OUTPUT_DIR,
        fold=fold, preprocessing=PREPROCESSING, hyperparameters=HYPERPARAMETERS
    )
    fold_results.append(results)

    # Update checkpoint only after the fold's results are on disk.
    save_checkpoint(fold)

# Step 13: Train final model on full training set if not already done
# A finished final stage is recorded as the string 'final' (written by
# save_checkpoint below). An integer beyond N_FOLDS is accepted as well.
# Testing for the string first avoids the TypeError that an int/str ordering
# comparison would raise.
final_already_trained = last_completed_fold == 'final' or (
    isinstance(last_completed_fold, int) and last_completed_fold > N_FOLDS
)
final_model_path = f'{OUTPUT_DIR}/{MODEL_NAME}_model.keras'

if final_already_trained and os.path.exists(final_model_path):
    print(f"Final model already saved at {final_model_path}, skipping training")
    # Load results from previous run
    result_file = f'{OUTPUT_DIR}/{MODEL_NAME}_results.json'
    if os.path.exists(result_file):
        with open(result_file, 'r') as f:
            results = json.load(f)
        print(f"Final Results for {MODEL_NAME}:", results)
else:
    print("\nTraining final model on full training set")

    # Drop state left over from the cross-validation models.
    tf.keras.backend.clear_session()

    train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(BATCH_SIZE).map(preprocess).map(augment_image).prefetch(tf.data.AUTOTUNE)
    model = create_model()
    checkpoint_path = f'{OUTPUT_DIR}/{MODEL_NAME}_model.weights.h5'
    if os.path.exists(checkpoint_path):
        print(f"Loading weights for final model from {checkpoint_path}")
        model.load_weights(checkpoint_path)

    # No validation split here, so the best weights are chosen by training loss.
    checkpoint_callback = ModelCheckpoint(checkpoint_path, monitor='loss', save_best_only=True, save_weights_only=True)
    history = model.fit(train_ds, epochs=EPOCHS, class_weight={0: 1.0, 1: 1.5}, verbose=1, callbacks=[checkpoint_callback])

    # Save results for final model (evaluated on the held-out test set)
    results = save_model_results(
        model, test_ds, history, MODEL_NAME, OUTPUT_DIR,
        preprocessing=PREPROCESSING, hyperparameters=HYPERPARAMETERS
    )
    print(f"Final Results for {MODEL_NAME}:", results)

    # Update checkpoint
    save_checkpoint('final')

# Step 14: Aggregate cross-validation results
if fold_results:
    # One row per fold; columns are the keys of each fold's results dict.
    fold_df = pd.DataFrame(fold_results)
    accuracy_column = fold_df['Accuracy']

    mean_results = {
        'Model': MODEL_NAME,
        'Preprocessing': PREPROCESSING,
        'Mean_Accuracy': accuracy_column.mean(),
        'Std_Accuracy': accuracy_column.std(),
    }
    # Remaining summary entries are plain per-column means across folds.
    averaged_columns = (
        'Precision_recaptured',
        'Recall_recaptured',
        'F1-Score_recaptured',
        'Total_Parameters',
        'Trainable_Parameters',
    )
    for column_name in averaged_columns:
        mean_results[f'Mean_{column_name}'] = fold_df[column_name].mean()

    # Convert NumPy types in mean_results
    mean_results = {key: convert_to_serializable(value) for key, value in mean_results.items()}

    summary_path = f'{OUTPUT_DIR}/{MODEL_NAME}_cv_summary.json'
    with open(summary_path, 'w') as f:
        json.dump(mean_results, f, indent=4)
    fold_df.to_csv(f'{OUTPUT_DIR}/{MODEL_NAME}_cv_results.csv', index=False)
    print("\nCross-Validation Summary:", mean_results)

Mounted at /content/drive
Dataset Balance: 1202 originals, 1199 recaptures
Loading existing train-test split...

Training Fold 1/5
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
9406464/9406464 ━━━━━━━━━━━━━━━━━━━━ 0s 0us/step
Epoch 1/8
96/96 ━━━━━━━━━━━━━━━━━━━━ 34s 92ms/step - accuracy: 0.5709 - loss: 0.8704 - val_accuracy: 0.6771 - val_loss: 0.6718
Epoch 2/8
96/96 ━━━━━━━━━━━━━━━━━━━━ 5s 56ms/step - accuracy: 0.7658 - loss: 0.5653 - val_accuracy: 0.6823 - val_loss: 0.8121
Epoch 3/8
96/96 ━━━━━━━━━━━━━━━━━━━━ 4s 37ms/step - accuracy: 0.8637 - loss: 0.4165 - val_accuracy: 0.7135 - val_loss: 0.8620
Epoch 4/8
96/96 ━━━━━━━━━━━━━━━━━━━━ 4s 39ms/step - accuracy: 0.8940 - loss: 0.3503 - val_accuracy: 0.7083 - 

Final Results for RGB_MobileNetV2: {'Model': 'RGB_MobileNetV2', 'Preprocessing': 'RGB', 'Accuracy': 0.8565488565488566, 'Total_Parameters': 2422081, 'Trainable_Parameters': 1370177, 'Fold': 'Final', 'learning_rate': 0.0001, 'batch_size': 16, 'optimizer': 'Adam', 'epochs': 8, 'n_folds': 5, 'dropout_rate': 0.4, 'Precision_originals': 0.8138686131386861, 'Recall_originals': 0.9253112033195021, 'F1-Score_originals': 0.8660194174757282, 'Support_originals': 241.0, 'Precision_recaptured': 0.9130434782608695, 'Recall_recaptured': 0.7875, 'F1-Score_recaptured': 0.8456375838926175, 'Support_recaptured': 240.0, 'Precision_macro avg': 0.8634560456997777, 'Recall_macro avg': 0.856405601659751, 'F1-Score_macro avg': 0.8558285006841728, 'Support_macro avg': 481.0, 'Precision_weighted avg': 0.8633529533243909, 'Recall_weighted avg': 0.8565488565488566, 'F1-Score_weighted avg': 0.8558496876213695, 'Support_weighted avg': 481.0}

Cross-Validation Summary: {'Model': 'RGB_MobileNetV2', 'Preprocessing': '