In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# os.walk yields nothing for a directory that does not exist, so outside a
# Kaggle session (no /kaggle/input mount) this loop simply prints nothing.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show the GPU state reported by the driver (shell escape), then list the GPUs
# TensorFlow can see. This is informational only: configure_cpu() in a later
# cell hides all GPUs from TensorFlow.
# NOTE(review): TensorFlow is imported here before the `pip install` cell below
# runs; if that install changes the installed packages, the kernel must be
# restarted for the change to take effect — confirm the intended cell order.
!nvidia-smi
import tensorflow as tf
print("GPUs detected:", tf.config.list_physical_devices('GPU'))


In [None]:
# WARNING: deletes everything under /kaggle/working. Later cells read from that
# directory (the cached `minimal_preprocessed` face crops, the saved .keras
# models, history/results JSON and the Caffe face-detector files), so running
# this cell forces the whole pipeline to start from scratch.
rm -rf /kaggle/working/*


# **Pre-processing + Model training**

In [None]:
# ========== Install Required Packages with Correct Protobuf ==============
!pip install protobuf==3.20.3 mediapipe tensorflow opencv-python-headless scikit-learn matplotlib seaborn albumentations -q

In [None]:
# ======================================================================
# PROTOBUF FIX 
# ======================================================================
import sys
import subprocess

def fix_protobuf():
    """Check the installed protobuf version and downgrade it when it is too new.

    Every protobuf release with a major version of 4 or higher is treated as
    incompatible with MediaPipe and replaced by ``protobuf==3.20.3`` via pip.

    Returns:
        bool: ``False`` when a downgrade was installed (the kernel has to be
        restarted before continuing); ``True`` when the installed version is
        accepted or when the check itself failed.
    """
    try:
        import google.protobuf
        protobuf_version = google.protobuf.__version__
        print(f"Current protobuf version: {protobuf_version}")

        # Compare the numeric major version instead of string prefixes, so that
        # releases newer than 5.x (6.x and later) are caught as well.
        major_version = int(protobuf_version.split('.')[0])

        if major_version >= 4:
            print("⚠️  Incompatible protobuf version detected. Downgrading...")
            subprocess.check_call([sys.executable, "-m", "pip", "install",
                                   "protobuf==3.20.3", "-q"])
            print("✓ Protobuf downgraded to 3.20.3")
            print("⚠️  Please restart the kernel and run again!")
            return False

        print(f"✓ Protobuf version {protobuf_version} is compatible")
        return True
    except Exception as e:
        # Best effort: a failed check (protobuf missing, unparsable version,
        # pip error) is reported but does not block the rest of the notebook.
        print(f"Error checking protobuf: {e}")
        return True

# fix_protobuf() returns False only right after it installed a downgrade; the
# new protobuf is picked up by a fresh kernel, so stop this cell here.
if not fix_protobuf():
    print("\n" + "="*80)
    print("KERNEL RESTART REQUIRED")
    print("="*80)
    sys.exit(0)  # raises SystemExit, so the imports below are not executed

# ======================================================================
# IMPORT ALL LIBRARIES INCLUDING TF BEFORE USING tf.config
# ======================================================================
import os
import glob
import cv2
import numpy as np
import tensorflow as tf
import gc
import json
from collections import defaultdict
from tqdm import tqdm

# Only AFTER TensorFlow is imported:
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input as efficientnet_preprocess
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, BatchNormalization, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras import regularizers

import mediapipe as mp
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns

print("✓ All libraries imported successfully")


In [None]:

def configure_cpu(num_threads=2):
    """Configure TensorFlow for CPU-only operation.

    Hides all GPUs from TensorFlow, sets both the intra-op and inter-op thread
    pools to ``num_threads`` and turns XLA JIT compilation off.

    Args:
        num_threads: Size used for both TensorFlow thread pools (default 2).

    Returns:
        bool: ``True`` when the configuration was applied; ``False`` when
        TensorFlow refused it because its runtime was already initialised
        (for example when this cell is re-run after a model was built).
    """
    print("\n" + "="*80)
    print("CPU CONFIGURATION")
    print("="*80)

    try:
        # Disable GPU completely
        tf.config.set_visible_devices([], 'GPU')

        # Set CPU threading for stability
        tf.config.threading.set_intra_op_parallelism_threads(num_threads)
        tf.config.threading.set_inter_op_parallelism_threads(num_threads)
    except RuntimeError as exc:
        # These settings can only be changed before the TF runtime starts.
        print(f"⚠️  Could not apply CPU configuration: {exc}")
        print("⚠️  Restart the kernel and run this cell before any model is built.")
        return False

    # Disable XLA JIT compilation
    tf.config.optimizer.set_jit(False)

    print("✓ CPU-only mode enabled")
    print(f"✓ Threading: {num_threads} intra-op, {num_threads} inter-op")
    print("✓ XLA disabled for stability")

    return True

configure_cpu()


In [None]:
import platform
import tensorflow as tf
import numpy as np

# Report the interpreter and core library versions used by this run.
for _label, _version in (
    ("Python version:", platform.python_version()),
    ("TensorFlow version:", tf.__version__),
    ("NumPy version:", np.__version__),
):
    print(_label, _version)


In [None]:
class MinimalConfig:
    """Ultra-minimal configuration for CPU training.

    All settings are class attributes. The pipeline functions in this notebook
    read them either from the class itself (it is their default ``config``
    argument) or from the instance created in ``run_minimal_pipeline``.
    """
    
    # Paths
    BASE_INPUT_PATH = "/kaggle/input/deep-fake-detection-dfd-entire-original-dataset"  # dataset root; the real/fake video folders are joined onto it
    BASE_OUTPUT_PATH = '/kaggle/working'  # face crops, model files, history/results JSON and plots are written here
    
    # Preprocessing 
    FRAMES_PER_VIDEO = 10  # evenly spaced frames sampled per video; a video must yield this many face crops to be kept
    TARGET_SIZE = (224, 224)  # size passed to cv2.resize for each crop and used as the model input size
    FACE_PADDING = 0.2  # fraction of the detected box width/height added on each side
    MIN_FACE_SIZE = 100  # minimum crop width and height in pixels before resizing
    FACE_CONFIDENCE = 0.5  # min_detection_confidence given to MediaPipe FaceDetection
    JPEG_QUALITY = 90  # cv2.IMWRITE_JPEG_QUALITY used when saving crops
    
    # Training 
    BATCH_SIZE = 8 
    EPOCHS = 15
    LEARNING_RATE = 1e-4  # initial Adam learning rate
    EARLY_STOP_PATIENCE = 4  # EarlyStopping patience (monitors val_auc)
    REDUCE_LR_PATIENCE = 2  # ReduceLROnPlateau patience (monitors val_loss)
    VALIDATION_SPLIT = 0.2  # fraction of videos (not frames) assigned to validation
    TEST_SPLIT = 0.2  # fraction of videos (not frames) assigned to test
    
    # Model
    BACKBONE = 'EfficientNetB0'  # informational: the visible code never reads it, build_minimal_model always uses EfficientNetB0
    UNFREEZE_LAYERS = 10  # build_minimal_model unfreezes layers from the last UNFREEZE_LAYERS of the backbone
    DROPOUT_RATE = 0.3  # rate of both Dropout layers in the classification head
    L2_REGULARIZATION = 0.0001  # L2 kernel regularisation of the dense layer 'fc1'
    DENSE_UNITS = 128  # width of the dense layer 'fc1'
    
    # Memory management
    MAX_VIDEOS_TO_PROCESS = None  # optional per-class cap on the number of videos; None (or 0) means no cap


In [None]:

# ============================================================================
# SIMPLE PREDICTION MONITOR
# ============================================================================

class SimplePredictionMonitor(Callback):
    """Keras callback that prints prediction statistics after every epoch.

    The model is run on the first ``max_samples`` validation images and the
    mean and standard deviation of the predicted scores are printed. A very
    small standard deviation is reported as a possible prediction collapse.

    Args:
        val_files: Paths of validation face images.
        val_labels: Labels matching ``val_files`` (stored, not used for the
            statistics).
        max_samples: Number of leading validation samples to monitor.
    """

    def __init__(self, val_files, val_labels, max_samples=50):
        super().__init__()
        # Slicing already limits the work per epoch to max_samples images, so
        # no additional hard-coded cap is needed in on_epoch_end.
        self.val_files = val_files[:max_samples]
        self.val_labels = val_labels[:max_samples]

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the monitored images and print mean/std of the scores."""
        predictions = []
        failed = 0
        last_error = None

        for filepath in self.val_files:
            img = cv2.imread(filepath)
            if img is None:
                # Unreadable or missing file: skip quietly.
                continue

            try:
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img = img.astype(np.float32)
                img = np.expand_dims(img, axis=0)
                img = efficientnet_preprocess(img)

                pred = self.model.predict(img, verbose=0)[0][0]
            except Exception as exc:
                # Monitoring is best effort and must not abort training, but
                # unlike a bare `except:` this lets KeyboardInterrupt through.
                failed += 1
                last_error = exc
                continue

            predictions.append(pred)

        if failed:
            print(f"  [Pred] skipped {failed} sample(s); last error: {last_error!r}")

        if predictions:
            pred_array = np.array(predictions)
            print(f"  [Pred] mean={pred_array.mean():.4f}, std={pred_array.std():.4f}")

            if pred_array.std() < 0.05:
                print("WARNING: Prediction collapse detected")


In [None]:
# ============================================================================
# MINIMAL FACE EXTRACTOR
# ============================================================================

class MinimalFaceExtractor:
    """Extracts fixed-size face crops from videos using MediaPipe face detection.

    ``stats`` is a counter dictionary with the keys ``videos_processed``,
    ``videos_no_faces`` and ``faces_saved`` (updated by ``process_video``).

    Args:
        config: Object providing FACE_CONFIDENCE, FRAMES_PER_VIDEO,
            FACE_PADDING, MIN_FACE_SIZE, TARGET_SIZE and JPEG_QUALITY.
    """

    def __init__(self, config=MinimalConfig):
        self.config = config
        self.mp_face = mp.solutions.face_detection
        self.detector = self.mp_face.FaceDetection(
            model_selection=1,
            min_detection_confidence=config.FACE_CONFIDENCE
        )
        self.stats = defaultdict(int)

    def _crop_face(self, frame):
        """Return the resized crop of the first detected face in a BGR frame, or None."""
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.detector.process(rgb)

        if not results.detections:
            return None

        bbox = results.detections[0].location_data.relative_bounding_box

        h, w = frame.shape[:2]
        padding = self.config.FACE_PADDING

        # The box is given relative to the frame size; enlarge it by the
        # padding fraction on every side and clip it to the frame.
        x = max(0, int((bbox.xmin - padding * bbox.width) * w))
        y = max(0, int((bbox.ymin - padding * bbox.height) * h))
        bw = int(bbox.width * (1 + 2 * padding) * w)
        bh = int(bbox.height * (1 + 2 * padding) * h)

        x2 = min(w, x + bw)
        y2 = min(h, y + bh)

        if (x2 - x) < self.config.MIN_FACE_SIZE or (y2 - y) < self.config.MIN_FACE_SIZE:
            return None

        face = frame[y:y2, x:x2]
        if face.size == 0:
            return None

        return cv2.resize(
            face,
            self.config.TARGET_SIZE,
            interpolation=cv2.INTER_LINEAR
        )

    def extract_faces_minimal(self, video_path):
        """Sample FRAMES_PER_VIDEO evenly spaced frames and crop one face from each.

        Returns:
            A list of exactly FRAMES_PER_VIDEO face crops (BGR arrays of
            TARGET_SIZE), or ``None`` when the video cannot be opened, has too
            few frames, or any sampled frame yields no usable face.
        """
        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                return None

            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if total_frames < self.config.FRAMES_PER_VIDEO:
                return None

            # Calculate frame indices to extract
            frame_indices = np.linspace(0, total_frames - 1, self.config.FRAMES_PER_VIDEO, dtype=int)

            faces = []

            for target_idx in frame_indices:
                cap.set(cv2.CAP_PROP_POS_FRAMES, target_idx)
                ret, frame = cap.read()

                # Only a complete set of crops is accepted, so the first frame
                # that cannot be read or has no usable face already decides
                # the result; stop instead of decoding the remaining frames.
                if not ret or frame is None:
                    return None

                face = self._crop_face(frame)
                if face is None:
                    return None

                faces.append(face)

            return faces
        finally:
            # Release the capture on every path and free frame buffers early.
            cap.release()
            gc.collect()

    def process_video(self, video_path, output_dir, label, video_id):
        """Process video and save exactly FRAMES_PER_VIDEO face crops.

        Args:
            video_path: Path of the video file.
            output_dir: Directory that receives the JPEG crops.
            label: Label prefix used in the file name ('real' or 'fake').
            video_id: Integer id formatted into the file name.

        Returns:
            int: Number of crops written (0 when the video was rejected).
        """
        faces = self.extract_faces_minimal(video_path)

        if faces is None:
            self.stats['videos_no_faces'] += 1
            return 0

        self.stats['videos_processed'] += 1
        saved_count = 0

        for idx, face_img in enumerate(faces):
            # The loader parses label and video id back out of this name.
            filename = f"{label}_{video_id:08d}_{idx:04d}_face.jpg"
            face_path = os.path.join(output_dir, filename)

            success = cv2.imwrite(
                face_path,
                face_img,
                [cv2.IMWRITE_JPEG_QUALITY, self.config.JPEG_QUALITY]
            )

            if success:
                saved_count += 1
                self.stats['faces_saved'] += 1

        del faces
        gc.collect()

        return saved_count


In [None]:

# ============================================================================
# MINIMAL DATA GENERATOR
# ============================================================================

class UltraMinimalGenerator(tf.keras.utils.Sequence):
    """One-image-at-a-time generator - no batch preloading.

    Each batch is read from disk on demand, converted from BGR to RGB,
    optionally flipped horizontally and passed through the EfficientNet
    preprocessing function.

    Args:
        file_paths: Sequence of image paths.
        labels: Sequence of labels aligned with ``file_paths``.
        batch_size: Number of samples per batch.
        shuffle: Reshuffle the sample order at the end of every epoch.
        augment: Apply a random horizontal flip to each image.
        **kwargs: Forwarded to the Keras base class constructor.
    """

    def __init__(self, file_paths, labels, batch_size=8, shuffle=True, augment=False, **kwargs):
        # Newer Keras versions expect the base constructor to be called.
        super().__init__(**kwargs)
        self.file_paths = file_paths
        self.labels = labels
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.augment = augment
        self.indices = np.arange(len(file_paths))
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(len(self.file_paths) / self.batch_size))

    def __getitem__(self, index):
        """Return ``(X_batch, y_batch)`` for batch ``index``; unreadable images are dropped."""
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        X_batch = []
        y_batch = []

        for idx in batch_indices:
            face_path = self.file_paths[idx]
            label = self.labels[idx]

            # Load ONE image at a time
            face_img = cv2.imread(face_path)

            if face_img is None:
                continue

            face_img = cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB)

            # Only horizontal flip augmentation
            if self.augment and np.random.rand() > 0.5:
                face_img = cv2.flip(face_img, 1)

            face_img = face_img.astype(np.float32)

            X_batch.append(face_img)
            y_batch.append(label)

        if len(y_batch) == 0:
            # Every image of this batch was unreadable: return one all-zero
            # placeholder labelled 0 so that Keras still receives a batch.
            dummy_img = np.zeros((224, 224, 3), dtype=np.float32)
            return np.array([dummy_img]), np.array([0], dtype=np.float32)

        X_batch = np.array(X_batch, dtype=np.float32)
        y_batch = np.array(y_batch, dtype=np.float32)

        # Preprocess
        X_batch = efficientnet_preprocess(X_batch)

        return X_batch, y_batch

    def on_epoch_end(self):
        """Reshuffle the sample order when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)


In [None]:

# ============================================================================
# LIGHTWEIGHT MODEL BUILDER
# ============================================================================

def build_minimal_model(config=MinimalConfig):
    """Build EfficientNetB0 model for CPU.

    An ImageNet-pretrained EfficientNetB0 backbone (without its top) is
    followed by global average pooling and a small dense head ending in a
    single sigmoid unit.

    Args:
        config: Object providing TARGET_SIZE, UNFREEZE_LAYERS, DROPOUT_RATE,
            DENSE_UNITS and L2_REGULARIZATION.

    Returns:
        The uncompiled Keras ``Model``.
    """
    input_layer = Input(shape=(*config.TARGET_SIZE, 3), name='face_input')

    base = EfficientNetB0(
        weights='imagenet',
        include_top=False,
        input_tensor=input_layer
    )

    # Freeze most layers; clamp so an UNFREEZE_LAYERS larger than the backbone
    # simply means "unfreeze everything".
    total_layers = len(base.layers)
    freeze_until = max(0, total_layers - config.UNFREEZE_LAYERS)

    for i, layer in enumerate(base.layers):
        # BatchNormalization layers of the backbone stay frozen even inside
        # the unfrozen window: updating their statistics during fine-tuning
        # (here with very small batches) would disturb the pretrained features.
        layer.trainable = (i >= freeze_until) and not isinstance(layer, BatchNormalization)

    x = GlobalAveragePooling2D(name='gap')(base.output)
    x = BatchNormalization(name='bn1')(x)
    x = Dropout(config.DROPOUT_RATE, name='dropout1')(x)

    x = Dense(
        config.DENSE_UNITS,
        activation='relu',
        kernel_regularizer=regularizers.l2(config.L2_REGULARIZATION),
        name='fc1'
    )(x)
    x = BatchNormalization(name='bn2')(x)
    x = Dropout(config.DROPOUT_RATE, name='dropout2')(x)

    output = Dense(1, activation='sigmoid', name='output')(x)

    model = Model(inputs=input_layer, outputs=output, name='DeepfakeDetector_B0_CPU')

    return model

In [None]:

# ============================================================================
# CPU-FRIENDLY TRAINING
# ============================================================================

def train_minimal_model(train_gen, val_gen, val_files, val_labels, config=MinimalConfig):
    """Build, compile and fit the detector.

    Args:
        train_gen: Training batch generator passed to ``model.fit``.
        val_gen: Validation batch generator.
        val_files: Validation image paths handed to the prediction monitor.
        val_labels: Validation labels handed to the prediction monitor.
        config: Object providing LEARNING_RATE, EPOCHS, BATCH_SIZE,
            EARLY_STOP_PATIENCE, REDUCE_LR_PATIENCE and BASE_OUTPUT_PATH.

    Returns:
        tuple: ``(model, history)`` as returned by ``model.fit``.
    """
    print("\nBuilding minimal model...")
    model = build_minimal_model(config)

    n_trainable = sum(tf.keras.backend.count_params(weight) for weight in model.trainable_weights)
    n_total = model.count_params()

    print(f"Total params: {n_total:,}")
    print(f"Trainable params: {n_trainable:,}")

    model.compile(
        optimizer=Adam(learning_rate=config.LEARNING_RATE),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
    )

    # Early stopping and checkpointing both follow validation AUC, while the
    # learning-rate schedule reacts to validation loss.
    prediction_monitor = SimplePredictionMonitor(val_files, val_labels, max_samples=50)
    early_stopping = EarlyStopping(
        monitor='val_auc',
        mode='max',
        patience=config.EARLY_STOP_PATIENCE,
        restore_best_weights=True,
        verbose=1
    )
    lr_on_plateau = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=config.REDUCE_LR_PATIENCE,
        min_lr=1e-7,
        verbose=1
    )
    checkpoint_path = os.path.join(config.BASE_OUTPUT_PATH, 'best_model_cpu.keras')
    best_checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_auc',
        mode='max',
        save_best_only=True,
        verbose=1
    )

    rule = "="*80
    print("\n" + rule)
    print("TRAINING ON CPU")
    print(f"  Batch Size: {config.BATCH_SIZE}")
    print(f"  Epochs: {config.EPOCHS}")
    print(rule + "\n")

    history = model.fit(
        train_gen,
        validation_data=val_gen,
        epochs=config.EPOCHS,
        callbacks=[prediction_monitor, early_stopping, lr_on_plateau, best_checkpoint],
        verbose=1
    )

    # Free memory
    gc.collect()

    return model, history

In [None]:

# ============================================================================
# SIMPLE DATA LOADING
# ============================================================================

def load_preprocessed_data_simple(preproc_dir):
    """Collect the saved face crops in ``preproc_dir`` with labels and video ids.

    File names are expected to look like ``<label>_<video id>_<frame>_face.jpg``
    with ``<label>`` being ``real`` (label 0) or ``fake`` (label 1); files that
    do not follow this scheme are ignored. The files themselves are not opened.

    Returns:
        tuple: ``(file_paths, labels, video_groups)`` where ``file_paths`` is a
        sorted list, ``labels`` a NumPy array of 0/1 and ``video_groups`` a
        NumPy array of ``"<label>_<video id>"`` strings; ``(None, None, None)``
        when no ``*_face.jpg`` file exists.
    """
    candidates = sorted(glob.glob(os.path.join(preproc_dir, "*_face.jpg")))

    if not candidates:
        print("ERROR: No face files found!")
        return None, None, None

    label_codes = {'real': 0, 'fake': 1}
    records = []

    for candidate in candidates:
        fields = os.path.basename(candidate).replace('_face.jpg', '').split('_')

        # Need at least label, video id and frame index.
        if len(fields) < 3:
            continue

        kind, video_number = fields[0], fields[1]
        if kind not in label_codes:
            continue

        records.append((candidate, label_codes[kind], f"{kind}_{video_number}"))

    file_paths = [record[0] for record in records]
    labels = [record[1] for record in records]
    video_groups = [record[2] for record in records]

    print(f"\nLoaded {len(file_paths)} samples")
    print(f"  Real: {labels.count(0)}, Fake: {labels.count(1)}")

    return file_paths, np.array(labels), np.array(video_groups)

In [None]:


# ============================================================================
# SIMPLE FILENAME-BASED SPLIT
# ============================================================================

def simple_split_by_video(file_paths, labels, video_groups, test_size=0.2, val_size=0.2):
    """Split samples into train/val/test so that each video lands in one split only.

    The unique video ids are sorted, shuffled with NumPy's global random
    generator after seeding it with 42 (a side effect on the global state),
    and then cut into test, validation and train portions in that order.

    Args:
        file_paths: Sample paths.
        labels: Labels aligned with ``file_paths``.
        video_groups: Video id of every sample, aligned with ``file_paths``.
        test_size: Fraction of videos for the test split.
        val_size: Fraction of videos for the validation split.

    Returns:
        tuple: ``((train_paths, train_labels), (val_paths, val_labels),
        (test_paths, test_labels))`` with paths as lists and labels as arrays.
    """
    shuffled_ids = sorted(set(video_groups))
    np.random.seed(42)
    np.random.shuffle(shuffled_ids)

    n_videos = len(shuffled_ids)
    n_test = int(n_videos * test_size)
    n_val = int(n_videos * val_size)

    # Map every video id to its split. Later entries win, which reproduces the
    # lookup priority train > val > test.
    split_of = {}
    for split_name, members in (
        ('test', shuffled_ids[:n_test]),
        ('val', shuffled_ids[n_test:n_test + n_val]),
        ('train', shuffled_ids[n_test + n_val:]),
    ):
        for member in members:
            split_of[member] = split_name

    collected = {'train': ([], []), 'val': ([], []), 'test': ([], [])}

    for position, video_id in enumerate(video_groups):
        split_name = split_of.get(video_id)
        if split_name is None:
            continue
        split_paths, split_labels = collected[split_name]
        split_paths.append(file_paths[position])
        split_labels.append(labels[position])

    train_paths, train_labels = collected['train']
    val_paths, val_labels = collected['val']
    test_paths, test_labels = collected['test']

    print(f"\nSplit complete:")
    print(f"  Train: {len(train_paths)} samples")
    print(f"  Val:   {len(val_paths)} samples")
    print(f"  Test:  {len(test_paths)} samples")

    return (train_paths, np.array(train_labels)), (val_paths, np.array(val_labels)), (test_paths, np.array(test_labels))


In [None]:
# ============================================================================
# SIMPLE EVALUATION (NO TTA)
# ============================================================================

def evaluate_simple(model, test_gen, config=MinimalConfig):
    """Simple evaluation without TTA.

    Runs the model over every batch of ``test_gen``, picks the decision
    threshold with the best accuracy from a small grid, prints the metrics and
    saves a confusion-matrix plot to ``config.BASE_OUTPUT_PATH``.

    Note that the threshold is selected on the same samples the metrics are
    reported on, so the reported accuracy is an optimistic estimate.

    Args:
        model: Object with a Keras-style ``predict`` method.
        test_gen: Indexable batch source returning ``(X_batch, y_batch)``.
        config: Object providing BASE_OUTPUT_PATH.

    Returns:
        dict: JSON-serialisable results with the keys ``accuracy``, ``auc``
        (``None`` when undefined), ``best_threshold``, ``confusion_matrix``,
        ``classification_report``, ``y_true`` and ``y_pred_proba``.

    Raises:
        ValueError: If ``test_gen`` yields no samples.
    """
    print("\nEvaluating model...")

    y_pred_list = []
    y_true_list = []

    for i in tqdm(range(len(test_gen)), desc="Evaluating"):
        X_batch, y_batch = test_gen[i]
        pred_batch = model.predict(X_batch, verbose=0)

        y_pred_list.extend(pred_batch.flatten())
        y_true_list.extend(y_batch)

    y_pred_proba = np.array(y_pred_list, dtype=np.float64)
    # The generator yields float labels; the metrics below expect 0/1 classes.
    y_true = np.array(y_true_list).astype(int)

    if y_true.size == 0:
        raise ValueError("test_gen produced no samples to evaluate")

    # Find best threshold
    best_threshold = 0.5
    best_acc = 0

    for thresh in np.linspace(0.3, 0.7, 9):
        y_pred_thresh = (y_pred_proba > thresh).astype(int)
        acc = accuracy_score(y_true, y_pred_thresh)
        if acc > best_acc:
            best_acc = acc
            best_threshold = thresh

    y_pred = (y_pred_proba > best_threshold).astype(int)

    accuracy = accuracy_score(y_true, y_pred)

    # ROC AUC is undefined when only one class is present.
    if np.unique(y_true).size < 2:
        print("WARNING: only one class present in the test labels; AUC is undefined")
        auc_score = float('nan')
    else:
        auc_score = roc_auc_score(y_true, y_pred_proba)

    # Fix the label set so the matrix is always 2x2 and the report always has
    # both classes, even if one of them is missing from the data.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    report = classification_report(
        y_true, y_pred,
        labels=[0, 1],
        target_names=['Real', 'Fake'],
        output_dict=True,
        zero_division=0
    )

    print("\n" + "="*80)
    print("EVALUATION RESULTS")
    print("="*80)
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"AUC:       {auc_score:.4f}")
    print(f"Threshold: {best_threshold:.2f}")
    print(f"\nConfusion Matrix:")
    print(f"  TN: {cm[0,0]:4d}  FP: {cm[0,1]:4d}")
    print(f"  FN: {cm[1,0]:4d}  TP: {cm[1,1]:4d}")
    print(f"\nReal - Precision: {report['Real']['precision']:.4f}, Recall: {report['Real']['recall']:.4f}")
    print(f"Fake - Precision: {report['Fake']['precision']:.4f}, Recall: {report['Fake']['recall']:.4f}")
    print("="*80 + "\n")

    # Simple confusion matrix plot
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Real', 'Fake'], yticklabels=['Real', 'Fake'])
    plt.title(f'Confusion Matrix\nAcc: {accuracy:.4f} | AUC: {auc_score:.4f}')
    plt.savefig(os.path.join(config.BASE_OUTPUT_PATH, 'confusion_matrix.png'), dpi=100)
    plt.close()

    results = {
        'accuracy': float(accuracy),
        # None instead of NaN keeps the saved JSON standards-compliant.
        'auc': None if np.isnan(auc_score) else float(auc_score),
        'best_threshold': float(best_threshold),
        'confusion_matrix': cm.tolist(),
        'classification_report': report,
        # Raw labels and scores allow later cells to recompute curves/metrics.
        'y_true': y_true.tolist(),
        'y_pred_proba': y_pred_proba.tolist()
    }

    return results


In [None]:
# ============================================================================
# MAIN PIPELINE
# ============================================================================

def run_minimal_pipeline():
    """Complete CPU-friendly pipeline.

    Steps: (1) extract face crops from the real and fake videos unless the
    preprocessed directory already exists, (2) load the crops and split them
    by video into train/val/test, (3) train the model, (4) evaluate it on the
    test split. Training history, evaluation results and the final model are
    written to ``MinimalConfig.BASE_OUTPUT_PATH``.

    Returns:
        dict | None: The dictionary returned by ``evaluate_simple``, or
        ``None`` when fewer than 50 usable face crops were found.
    """

    print("="*80)
    print("CPU-OPTIMIZED DEEPFAKE DETECTION PIPELINE")
    print("="*80)

    config = MinimalConfig()

    preproc_dir = os.path.join(config.BASE_OUTPUT_PATH, "minimal_preprocessed")

    # ========================================================================
    # STEP 1: PREPROCESSING
    # ========================================================================

    # The mere existence of the directory is used as the "already preprocessed"
    # signal.
    # NOTE(review): an interrupted earlier run leaves a partially filled
    # directory that is then reused as-is — confirm this is acceptable.
    if not os.path.exists(preproc_dir):
        print("\n" + "="*80)
        print("STEP 1: PREPROCESSING")
        print("="*80)

        os.makedirs(preproc_dir, exist_ok=True)

        extractor = MinimalFaceExtractor(config)

        real_dir = os.path.join(config.BASE_INPUT_PATH, "DFD_original sequences")
        fake_dir = os.path.join(
            config.BASE_INPUT_PATH, 
            "DFD_manipulated_sequences", 
            "DFD_manipulated_sequences"
        )

        real_videos = sorted([os.path.join(real_dir, f)
                              for f in os.listdir(real_dir) if f.endswith(".mp4")])

        fake_videos = sorted([os.path.join(fake_dir, f)
                              for f in os.listdir(fake_dir) if f.endswith(".mp4")])

        # Balance datasets
        # Both classes are truncated to the same number of videos, taken from
        # the start of the sorted file lists.
        min_count = min(len(real_videos), len(fake_videos))
        if config.MAX_VIDEOS_TO_PROCESS:
            min_count = min(min_count, config.MAX_VIDEOS_TO_PROCESS)

        real_videos = real_videos[:min_count]
        fake_videos = fake_videos[:min_count]

        print(f"Processing {min_count} real and {min_count} fake videos")
        print(f"Extracting {config.FRAMES_PER_VIDEO} frames per video\n")

        # video_id is the position in the sorted list; real and fake ids
        # overlap numerically but are kept apart by the label prefix.
        print("Processing real videos...")
        for video_id, video_path in enumerate(tqdm(real_videos, desc="Real")):
            extractor.process_video(video_path, preproc_dir, 'real', video_id)

            # Free memory every 10 videos
            if video_id % 10 == 0:
                gc.collect()

        print("Processing fake videos...")
        for video_id, video_path in enumerate(tqdm(fake_videos, desc="Fake")):
            extractor.process_video(video_path, preproc_dir, 'fake', video_id)

            if video_id % 10 == 0:
                gc.collect()

        print("\nPreprocessing complete!")
        for key, value in sorted(extractor.stats.items()):
            print(f"  {key}: {value}")

        del extractor
        gc.collect()
    else:
        print(f"\n✓ Using existing preprocessed data: {preproc_dir}")

    # ========================================================================
    # STEP 2: LOAD DATA
    # ========================================================================

    print("\n" + "="*80)
    print("STEP 2: LOADING DATA")
    print("="*80)

    file_paths, labels, video_groups = load_preprocessed_data_simple(preproc_dir)

    # Abort when nothing was found or fewer than 50 crops are available.
    if file_paths is None or len(file_paths) < 50:
        print("ERROR: Insufficient data!")
        return None

    # Split by video so frames of one video never end up in different splits.
    train_data, val_data, test_data = simple_split_by_video(
        file_paths, labels, video_groups,
        test_size=config.TEST_SPLIT,
        val_size=config.VALIDATION_SPLIT
    )

    train_paths, train_labels = train_data
    val_paths, val_labels = val_data
    test_paths, test_labels = test_data

    # Only the training generator shuffles and augments.
    print("\nCreating minimal generators...")
    train_gen = UltraMinimalGenerator(
        train_paths, train_labels,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        augment=True
    )

    val_gen = UltraMinimalGenerator(
        val_paths, val_labels,
        batch_size=config.BATCH_SIZE,
        shuffle=False,
        augment=False
    )

    test_gen = UltraMinimalGenerator(
        test_paths, test_labels,
        batch_size=config.BATCH_SIZE,
        shuffle=False,
        augment=False
    )

    # ========================================================================
    # STEP 3: TRAIN
    # ========================================================================

    print("\n" + "="*80)
    print("STEP 3: TRAINING")
    print("="*80)

    # Drop any state left over from earlier models in this kernel.
    tf.keras.backend.clear_session()
    gc.collect()

    model, history = train_minimal_model(train_gen, val_gen, val_paths, val_labels, config)

    # Save history
    # Values are converted to plain floats so they can be written as JSON.
    history_dict = {k: [float(v) for v in vals] for k, vals in history.history.items()}
    with open(os.path.join(config.BASE_OUTPUT_PATH, "history_minimal.json"), "w") as f:
        json.dump(history_dict, f, indent=4)

    # ========================================================================
    # STEP 4: EVALUATE
    # ========================================================================

    print("\n" + "="*80)
    print("STEP 4: EVALUATION")
    print("="*80)

    results = evaluate_simple(model, test_gen, config)

    with open(os.path.join(config.BASE_OUTPUT_PATH, "results_minimal.json"), "w") as f:
        json.dump(results, f, indent=4)

    # Saved in addition to the 'best_model_cpu.keras' checkpoint written
    # during training.
    model.save(os.path.join(config.BASE_OUTPUT_PATH, "final_model_cpu.keras"))

    print("\n" + "="*80)
    print("PIPELINE COMPLETE!")
    print("="*80)

    gc.collect()

    return results

# ============================================================================
# RUN PIPELINE
# ============================================================================

# Entry point of the training section: runs preprocessing, training and
# evaluation in one go and keeps the evaluation results in `results`
# (None when the pipeline stopped early because of insufficient data).
if __name__ == "__main__":
    print("\nStarting CPU-optimized pipeline...")
    print("This is designed to run for 12+ hours on Kaggle CPU without crashes.\n")
    
    results = run_minimal_pipeline()
    
    print("\nAll done! Check /kaggle/working for outputs.")

# **TESTING**

In [None]:
import os
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications.efficientnet import preprocess_input

# ============================================================================
# Load face detector
# ============================================================================

# Caffe SSD face detector (network definition + weights) loaded through
# OpenCV's DNN module.
# NOTE(review): no visible cell creates these two files, and the cleanup cell
# near the top removes everything under /kaggle/working — they have to be
# placed there before this cell runs.
PROTOTXT = "/kaggle/working/deploy.prototxt"
CAFFEMODEL = "/kaggle/working/res10_300x300_ssd_iter_140000.caffemodel"

dnn_face_net = cv2.dnn.readNetFromCaffe(PROTOTXT, CAFFEMODEL)

def detect_face(frame, min_confidence=0.3):
    """Return the largest detected face or None.

    Args:
        frame: BGR image as a NumPy array of shape (height, width, channels).
        min_confidence: Detections with a lower confidence are ignored.

    Returns:
        The crop of ``frame`` covering the largest accepted detection, or
        ``None`` when no detection with a non-empty box inside the frame
        exists.
    """
    h, w = frame.shape[:2]
    blob = cv2.dnn.blobFromImage(frame, 1.0, (300, 300),
                                 (104.0, 177.0, 123.0), swapRB=False)
    dnn_face_net.setInput(blob)
    detections = dnn_face_net.forward()

    best = None
    best_area = 0

    for i in range(detections.shape[2]):
        conf = detections[0, 0, i, 2]
        if conf < min_confidence:
            continue

        # Box coordinates are relative to the frame size.
        box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
        x1, y1, x2, y2 = box.astype(int)

        # The detector may report boxes that extend beyond the frame. Clip
        # them: negative indices would otherwise wrap around in NumPy slicing
        # and produce an empty or wrong crop.
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(w, x2), min(h, y2)
        if x2 <= x1 or y2 <= y1:
            continue

        area = (x2 - x1) * (y2 - y1)

        if area > best_area:
            best = frame[y1:y2, x1:x2]
            best_area = area

    return best


# ============================================================================
# Predict on video (simple)
# ============================================================================

def test_video(model, video_path, num_frames=15):
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    indices = np.linspace(0, total - 1, num_frames).astype(int)

    preds = []

    for idx in indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if not ret:
            continue

        face = detect_face(frame)
        if face is None:
            continue

        face = cv2.resize(face, (224, 224))
        face = cv2.cvtColor(face, cv2.COLOR_BGR2RGB)
        face = preprocess_input(np.expand_dims(face.astype("float32"), 0))
        pred = float(model.predict(face, verbose=0)[0][0])
        preds.append(pred)

    cap.release()

    if len(preds) == 0:
        print(f"{video_path}: No face detected")
        return None, None

    preds = np.array(preds)

    # MAX score strategy
    max_score = preds.max()
    threshold = 0.35
    label = "FAKE" if max_score > threshold else "REAL"

    print(f"{os.path.basename(video_path)} → {label} (score={max_score:.4f})")
    return label, max_score


# ============================================================================
# Batch testing (simple)
# ============================================================================

def test_videos(model, video_list):
    for v in video_list:
        test_video(model, v)


# ============================================================================
# Usage
# ============================================================================

# Load the model saved at the end of the training pipeline and classify a
# hand-picked list of external videos (file names suggest which are real and
# which are deepfakes).
if __name__ == "__main__":
    model = tf.keras.models.load_model("/kaggle/working/final_model_cpu.keras")

    videos = [
        "/kaggle/input/testing/real3.mp4",
        "/kaggle/input/testing/fake1.mp4",
        "/kaggle/input/testing/fake2.mp4",
        "/kaggle/input/testing/fake3.mp4",
        "/kaggle/input/testing/real2.mp4",
        "/kaggle/input/testing/real.mp4",
        "/kaggle/input/testing/Deepfake video of Volodymyr Zelensky surrendering surfaces on social media (1).mp4",
        "/kaggle/input/testing/Deepfake Example Presented by Senator Richard Blumenthal.mp4",
        "/kaggle/input/testing/MrBeast and BBC stars used in deepfake scam videos - BBC News.mp4"
    ]

    test_videos(model, videos)


In [None]:
# Root cause: our model was trained on the DFD dataset (studio-quality),
# but the real-world test videos come from YouTube/TikTok (heavily compressed).

# DFD real videos =
# high resolution
# perfect lighting
# DSLR-quality
# no compression artifacts

# Our real-world videos =
# overcompressed
# color-shifted
# blurry
# different lighting
# different distribution entirely

# ➡ Our model has most likely learned “compression artifacts = FAKE”,
# because in DFD:
# REAL videos are clean
# FAKE videos are compressed or manipulated

# So the dataset is biased.

# real2.mp4 → FAKE (0.8260)
# real.mp4 → FAKE (0.9996)
# This is NOT a face-detection problem
# This is NOT a testing code problem either.

# **Final Metrics and Visualization** 

In [None]:
import glob
import os

import numpy as np

preproc_dir = "/kaggle/working/minimal_preprocessed"

# Sorted so that the file order (and therefore the label order) is deterministic.
file_paths = sorted(glob.glob(preproc_dir + "/*_face.jpg"))

# Derive the class from the file name only (0 = real, 1 = fake). Matching
# "real_" anywhere in the full path would mislabel a file whose directory or
# later name component happens to contain that substring.
labels = np.array(
    [0 if os.path.basename(f).startswith("real_") else 1 for f in file_paths]
)

print("Recovered:", len(file_paths), "files")
print("Real:", (labels == 0).sum(), "Fake:", (labels == 1).sum())


In [None]:
import random
from matplotlib import pyplot as plt

# Partition the recovered face crops by the class prefix in their path.
real_samples = list(filter(lambda path: "/real_" in path, file_paths))
fake_samples = list(filter(lambda path: "/fake_" in path, file_paths))

# Draw one random example per class (real first, then fake) for the preview.
real_sample, fake_sample = random.choice(real_samples), random.choice(fake_samples)

def show_image(img_path, title):
    """Draw the image stored at ``img_path`` on the current matplotlib axes.

    Args:
        img_path: Path of the image file to display.
        title: Title text placed above the image.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``img_path``.
    """
    img = cv2.imread(img_path)
    # cv2.imread reports a missing or unreadable file by returning None
    # instead of raising; fail here with a clear message rather than letting
    # cvtColor raise an opaque assertion error.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")
    # Convert from OpenCV's BGR channel order to the RGB order imshow expects.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(img)
    plt.title(title)
    plt.axis("off")

# Side-by-side preview: real sample on the left, fake sample on the right.
plt.figure(figsize=(10, 5))
for panel_index, (sample_path, caption) in enumerate(
    [(real_sample, "Sample Real Face"), (fake_sample, "Sample Fake Face")],
    start=1,
):
    plt.subplot(1, 2, panel_index)
    show_image(sample_path, caption)

# Save before show() so the written file contains the rendered figure.
plt.savefig("/kaggle/working/sample_real_fake.png", dpi=120)
plt.show()


In [None]:
import os
import glob

base_input = "/kaggle/input/deep-fake-detection-dfd-entire-original-dataset"

# Folders holding the original and the manipulated clips of the DFD dataset.
real_dir = os.path.join(base_input, "DFD_original sequences")
fake_dir = os.path.join(base_input, "DFD_manipulated_sequences", "DFD_manipulated_sequences")

# Collect the .mp4 files directly inside each folder, in sorted order.
real_videos = glob.glob(f"{real_dir}/*.mp4")
real_videos.sort()
fake_videos = glob.glob(f"{fake_dir}/*.mp4")
fake_videos.sort()

print("Real videos:", len(real_videos))
print("Fake videos:", len(fake_videos))


In [None]:
import json

# Read the training history that was saved as JSON and list its keys.
history_path = "/kaggle/working/history_minimal.json"
with open(history_path) as history_file:
    history_data = json.load(history_file)

print(history_data.keys())


In [None]:
with open("/kaggle/working/results_minimal.json") as results_file:
    results = json.load(results_file)

# Each array stays None when its key is absent from the saved results.
y_true = None
if "y_true" in results:
    y_true = np.array(results["y_true"])

y_pred_proba = None
if "y_pred_proba" in results:
    y_pred_proba = np.array(results["y_pred_proba"])


In [None]:
import glob
import numpy as np
import os

preproc_dir = "/kaggle/working/minimal_preprocessed"

all_face_files = sorted(glob.glob(preproc_dir + "/*_face.jpg"))

# One (path, label, video id) record per face crop. The file name is split on
# "_" after dropping the "_face.jpg" suffix: the first token carries the class
# (label 0 when it contains "real", otherwise 1) and the first two tokens
# together form the video id.
records = []
for face_path in all_face_files:
    tokens = os.path.basename(face_path).replace("_face.jpg", "").split("_")
    records.append(
        (face_path, 0 if "real" in tokens[0] else 1, f"{tokens[0]}_{tokens[1]}")
    )

file_paths = np.array([record[0] for record in records])
labels = np.array([record[1] for record in records])
video_groups = np.array([record[2] for record in records])


In [None]:
def simple_split_by_video(file_paths, labels, video_groups, test_size=0.2, val_size=0.2):
    """Split samples into train/val/test so each video lands in a single subset.

    The distinct ids in ``video_groups`` are sorted, shuffled with NumPy's
    global RNG seeded to 42, and sliced: the first ``int(n * test_size)`` ids
    form the test subset, the next ``int(n * val_size)`` ids the validation
    subset, and the remaining ids the training subset.

    Note: calling this function re-seeds NumPy's global random state.

    Args:
        file_paths: Indexable of sample paths, aligned with ``video_groups``.
        labels: Indexable of sample labels, aligned with ``video_groups``.
        video_groups: Iterable of hashable video ids, one per sample.
        test_size: Fraction of distinct videos assigned to the test subset.
        val_size: Fraction of distinct videos assigned to the validation subset.

    Returns:
        ``(train, val, test)``; each item is ``(paths, labels)`` where
        ``paths`` is a list and ``labels`` is a NumPy array.
    """
    video_ids = sorted(set(video_groups))
    np.random.seed(42)
    np.random.shuffle(video_ids)

    total = len(video_ids)
    test_count = int(total * test_size)
    val_stop = test_count + int(total * val_size)

    # Map every video id to its subset. Later entries overwrite earlier ones,
    # so the lookup priority is train, then val, then test.
    subset_of = {}
    for subset_name, members in (
        ("test", video_ids[:test_count]),
        ("val", video_ids[test_count:val_stop]),
        ("train", video_ids[val_stop:]),
    ):
        for member in members:
            subset_of[member] = subset_name

    collected = {"train": ([], []), "val": ([], []), "test": ([], [])}
    for idx, vid in enumerate(video_groups):
        subset_name = subset_of.get(vid)
        if subset_name is None:
            continue
        subset_paths, subset_labels = collected[subset_name]
        subset_paths.append(file_paths[idx])
        subset_labels.append(labels[idx])

    train_part, val_part, test_part = collected["train"], collected["val"], collected["test"]
    return (
        (train_part[0], np.array(train_part[1])),
        (val_part[0], np.array(val_part[1])),
        (test_part[0], np.array(test_part[1])),
    )


# Video-level split: 20% of the videos for test, 20% for validation, rest for training.
train_split, val_split, test_split = simple_split_by_video(
    file_paths, labels, video_groups, test_size=0.2, val_size=0.2
)
train_paths, train_labels = train_split
val_paths, val_labels = val_split
test_paths, test_labels = test_split


In [None]:
model = tf.keras.models.load_model("/kaggle/working/final_model_cpu.keras")

# Batches of 8 with shuffling disabled.
test_gen = UltraMinimalGenerator(test_paths, test_labels, batch_size=8, shuffle=False)

# Accumulate per-sample scores and targets batch by batch.
y_pred_proba, y_true = [], []

for i in range(len(test_gen)):
    X, y = test_gen[i]
    # Flatten the model output so each sample contributes a single score.
    y_pred_proba.extend(model.predict(X, verbose=0).flatten())
    y_true.extend(y)

y_pred_proba, y_true = np.array(y_pred_proba), np.array(y_true)


In [None]:
import json
import matplotlib.pyplot as plt

# Reload the training history that was saved as JSON.
with open("/kaggle/working/history_minimal.json") as history_file:
    history_data = json.load(history_file)

plt.figure(figsize=(12, 5))

# One panel per metric:
# (train key, val key, train legend, val legend, panel title, y-axis label).
curve_panels = [
    ("loss", "val_loss", "Train Loss", "Val Loss", "Loss Curve", "Loss"),
    ("accuracy", "val_accuracy", "Train Accuracy", "Val Accuracy", "Accuracy Curve", "Accuracy"),
]
for panel_index, (train_key, val_key, train_legend, val_legend, panel_title, y_label) in enumerate(
    curve_panels, start=1
):
    plt.subplot(1, 2, panel_index)
    plt.plot(history_data[train_key], label=train_legend)
    plt.plot(history_data[val_key], label=val_legend)
    plt.legend()
    plt.title(panel_title)
    plt.xlabel("Epochs")
    plt.ylabel(y_label)

plt.savefig("/kaggle/working/training_curves.png", dpi=120)
plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# roc_curve also returns the thresholds; only the two rates are needed here.
fpr, tpr = roc_curve(y_true, y_pred_proba)[:2]
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, lw=2, label=f"AUC = {roc_auc:.4f}")
# Dashed diagonal as the visual baseline.
plt.plot([0, 1], [0, 1], 'k--')
plt.title("ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()

plt.savefig("/kaggle/working/roc_curve.png", dpi=120)
plt.show()


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score

# Accuracy at 50 evenly spaced decision thresholds between 0.3 and 0.7.
thresholds = np.linspace(0.3, 0.7, 50)
acc_list = [
    accuracy_score(y_true, (y_pred_proba > cutoff).astype(int))
    for cutoff in thresholds
]

plt.figure(figsize=(6, 4))
plt.plot(thresholds, acc_list)
plt.title("Threshold vs Accuracy")
plt.xlabel("Threshold")
plt.ylabel("Accuracy")
plt.grid()

plt.savefig("/kaggle/working/threshold_vs_accuracy.png", dpi=120)
plt.show()


In [None]:
# accuracy_score is imported here as well, so this cell does not depend on an
# import made by a different cell.
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np

# Sweep 50 decision thresholds in [0.3, 0.7] and keep the most accurate one
# (same logic as the training pipeline).
# NOTE(review): when the cells run in order, y_true / y_pred_proba hold the
# test-split predictions, so the threshold is tuned on the same data it is
# scored on and best_acc is an optimistic estimate.
thresholds = np.linspace(0.3, 0.7, 50)
best_acc = 0
best_threshold = 0.5  # kept if no threshold reaches an accuracy above 0

for t in thresholds:
    preds = (y_pred_proba > t).astype(int)
    acc = accuracy_score(y_true, preds)
    # Strict ">" keeps the lowest threshold among ties.
    if acc > best_acc:
        best_acc = acc
        best_threshold = t

print("Best threshold:", best_threshold)


In [None]:
# Hard 0/1 predictions: 1 where the score exceeds the selected threshold.
y_pred = np.greater(y_pred_proba, best_threshold).astype(int)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Pin the class order to [0, 1] (real, fake) so the matrix is always 2x2,
# even if one class is absent from y_true / y_pred. Without it the matrix can
# collapse to 1x1, which breaks both the tick labels and the
# TN, FP, FN, TP unpacking order printed below.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Real', 'Fake'],
            yticklabels=['Real', 'Fake'])

# best_acc / best_threshold come from the threshold sweep run earlier.
plt.title(f"Confusion Matrix\nAccuracy={best_acc:.4f}, Threshold={best_threshold:.2f}")
plt.xlabel("Predicted")
plt.ylabel("True")

plt.savefig("/kaggle/working/confusion_matrix.png", dpi=120)
plt.show()

# Row-major flattening of the 2x2 matrix (rows = true class, cols = predicted).
print("TN, FP, FN, TP:", cm.ravel())


In [None]:
# Recursively force-delete the full_dataset_preprocessed directory under /kaggle/working.
!rm -rf /kaggle/working/full_dataset_preprocessed
