##### Random Forest deepfake detection


> Add blockquote



In [1]:
# Install required packages
!pip install kagglehub scikit-learn scikit-image opencv-python scipy joblib matplotlib seaborn tqdm




In [2]:
import numpy as np
import cv2
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score
)
from skimage.feature import local_binary_pattern, hog
from scipy import fftpack
import kagglehub

In [3]:
# Paths
WORK_DIR = '/content/deepfake_detection'
DATA_DIR = f'{WORK_DIR}/data'
MODELS_DIR = f'{WORK_DIR}/models'
RESULTS_DIR = f'{WORK_DIR}/results'


# Create directories
for directory in [WORK_DIR, DATA_DIR, MODELS_DIR, RESULTS_DIR]:
    os.makedirs(directory, exist_ok=True)



# Dataset configuration
IMAGE_SIZE = (256, 256)
RANDOM_STATE = 42

# NOTE: Dataset is already split into Train/Validation/Test
# use the pre-existing splits

# Model configuration
RF_N_ESTIMATORS = 100
RF_MAX_DEPTH = 20
USE_FEATURE_SELECTION = True
N_FEATURES_TO_SELECT = 50

print("✅ Configuration complete!")
print(f"\nWorking directory: {WORK_DIR}")
print(f"Image size: {IMAGE_SIZE}")
print(f"Random Forest trees: {RF_N_ESTIMATORS}")
print(f"Feature selection: {USE_FEATURE_SELECTION}")
if USE_FEATURE_SELECTION:
    print(f"  Features to select: {N_FEATURES_TO_SELECT}")


✅ Configuration complete!

Working directory: /content/deepfake_detection
Image size: (256, 256)
Random Forest trees: 100
Feature selection: True
  Features to select: 50


In [4]:
from google.colab import files
print("Please upload your kaggle.json file:")
uploaded = files.upload()

# Setup Kaggle credentials
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download dataset
print("Downloading dataset...")
dataset_path = kagglehub.dataset_download("manjilkarki/deepfake-and-real-images")
print(f"Dataset downloaded to: {dataset_path}")

# Explore dataset structure
print("\nDataset contents:")
for item in os.listdir(dataset_path):
    item_path = os.path.join(dataset_path, item)
    if os.path.isdir(item_path):
        num_files = len(os.listdir(item_path))
        print(f"   {item}/ ({num_files} files)")

Please upload your kaggle.json file:


Saving kaggle.json to kaggle.json
Downloading dataset...
Using Colab cache for faster access to the 'deepfake-and-real-images' dataset.
Dataset downloaded to: /kaggle/input/deepfake-and-real-images

Dataset contents:
   Dataset/ (3 files)


In [5]:
import gc
import time

class FastFeatureExtractor:


    def extract_features(self, image):
        features = []

        # === COLOR FEATURES (Fast) ===
        for channel in cv2.split(image):
            features.extend([
                np.mean(channel),
                np.std(channel),
                np.min(channel),
                np.max(channel),
                np.percentile(channel, 25),
                np.percentile(channel, 75),
            ])

        # Convert to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # === GRAYSCALE STATISTICS ===
        features.extend([
            np.mean(gray),
            np.std(gray),
            np.var(gray),
            np.median(gray),
        ])

        # === FREQUENCY FEATURES (IMPORTANT!) ===
        # DCT - catches compression artifacts (KEY for deepfakes)
        dct = cv2.dct(np.float32(gray))
        dct_low = dct[:32, :32]  # Use smaller region for speed
        features.extend([
            np.mean(dct_low),
            np.std(dct_low),
            np.max(dct_low),
            np.min(dct_low),
            np.median(dct_low),
        ])

        # FFT - frequency domain analysis
        f_transform = np.fft.fft2(gray)
        f_shift = np.fft.fftshift(f_transform)
        magnitude = np.abs(f_shift)

        # Radial frequency features
        center = np.array(magnitude.shape) // 2
        y, x = np.ogrid[:magnitude.shape[0], :magnitude.shape[1]]
        r = np.sqrt((x - center[1])**2 + (y - center[0])**2)

        for radius in [20, 50, 80]:
            mask = (r >= radius-10) & (r < radius+10)
            if np.sum(mask) > 0:
                features.append(np.mean(magnitude[mask]))

        # === TEXTURE FEATURES (IMPORTANT!) ===
        # Simplified LBP
        from skimage.feature import local_binary_pattern
        radius = 2
        n_points = 8 * radius
        lbp = local_binary_pattern(gray, n_points, radius, method='uniform')

        # LBP histogram (reduced bins for speed)
        n_bins = 10
        hist, _ = np.histogram(lbp.ravel(), bins=n_bins, range=(0, n_bins))
        hist = hist.astype(float) / (hist.sum() + 1e-7)
        features.extend(hist)

        # === EDGE FEATURES ===
        edges = cv2.Canny(gray, 100, 200)
        features.append(np.sum(edges > 0) / edges.size)

        # Gradient magnitude
        sobelx = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=3)
        sobely = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=3)
        gradient_mag = np.sqrt(sobelx**2 + sobely**2)
        features.extend([
            np.mean(gradient_mag),
            np.std(gradient_mag),
            np.max(gradient_mag),
        ])

        # === COLOUR CORRELATIONS ===
        b, g, r = cv2.split(image)
        features.append(np.corrcoef(b.flatten(), g.flatten())[0, 1])
        features.append(np.corrcoef(g.flatten(), r.flatten())[0, 1])
        features.append(np.corrcoef(r.flatten(), b.flatten())[0, 1])

        return np.array(features, dtype=np.float32)

    def extract_from_batch_vectorized(self, images):
        batch_features = []
        for img in images:
            features = self.extract_features(img)
            batch_features.append(features)
        return np.array(batch_features, dtype=np.float32)

def load_and_extract_folder_fast(folder_path, label, extractor,
                                 img_size=(256, 256), chunk_size=5000):
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Folder not found: {folder_path}")

    image_files = [f for f in os.listdir(folder_path)
                  if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp'))]

    total_images = len(image_files)
    print(f"Processing {total_images} images from {os.path.basename(folder_path)}...")

    all_features = []
    all_labels = []

    # Process in large chunks
    for i in tqdm(range(0, total_images, chunk_size)):
        chunk_files = image_files[i:i+chunk_size]
        chunk_images = []

        # Load chunk
        for img_file in chunk_files:
            img_path = os.path.join(folder_path, img_file)
            try:
                img = cv2.imread(img_path)
                if img is not None:
                    img = cv2.resize(img, img_size)
                    chunk_images.append(img)
            except:
                continue

        if chunk_images:
            # Extract features from chunk
            chunk_images = np.array(chunk_images)
            chunk_features = extractor.extract_from_batch_vectorized(chunk_images)
            chunk_labels = np.full(len(chunk_features), label)

            all_features.append(chunk_features)
            all_labels.append(chunk_labels)

            # Clear memory
            del chunk_images, chunk_features, chunk_labels
            gc.collect()

    # Combine all chunks
    features = np.vstack(all_features)
    labels = np.concatenate(all_labels)

    del all_features, all_labels
    gc.collect()

    return features, labels


def load_split_fast(dataset_path, split_name, extractor, img_size=(256, 256)):
    split_path = os.path.join(dataset_path, 'Dataset', split_name)
    real_path = os.path.join(split_path, 'Real')
    fake_path = os.path.join(split_path, 'Fake')

    print(f"\n{'='*70}")
    print(f"Processing {split_name} Set")
    print(f"{'='*70}")

    if not os.path.exists(real_path):
        raise FileNotFoundError(f"Real folder not found: {real_path}")
    if not os.path.exists(fake_path):
        raise FileNotFoundError(f"Fake folder not found: {fake_path}")

    # Process Real images
    start = time.time()
    print("Processing Real images...")
    X_real, y_real = load_and_extract_folder_fast(real_path, 0, extractor, img_size)
    real_time = time.time() - start
    print(f"  Time: {real_time:.2f}s ({len(X_real)/real_time:.0f} images/sec)")

    # Process Fake images
    start = time.time()
    print("Processing Fake images...")
    X_fake, y_fake = load_and_extract_folder_fast(fake_path, 1, extractor, img_size)
    fake_time = time.time() - start
    print(f"  Time: {fake_time:.2f}s ({len(X_fake)/fake_time:.0f} images/sec)")

    # Combine
    X = np.vstack([X_real, X_fake])
    y = np.concatenate([y_real, y_fake])

    del X_real, X_fake, y_real, y_fake
    gc.collect()

    print(f"\n{split_name} Summary:")
    print(f"  Total: {len(X)} samples | Features: {X.shape[1]}")
    print(f"  Real: {np.sum(y==0)} | Fake: {np.sum(y==1)}")

    return X, y


# ========== MAIN PROCESSING ==========

print("\n" + "="*70)
print("ULTRA-FAST FEATURE EXTRACTION")
print("="*70)

# Initialize fast extractor
extractor = FastFeatureExtractor()

# Test on a few images first
print("\nTesting extractor speed...")
test_img = np.random.randint(0, 255, (256, 256, 3), dtype=np.uint8)
start = time.time()
for _ in range(100):
    _ = extractor.extract_features(test_img)
test_time = time.time() - start
print(f"Speed test: {100/test_time:.0f} images/second per core")

# Process all splits
splits_to_try = {
    'train': ['Train', 'train'],
    'val': ['Validation', 'validation', 'Val', 'val'],
    'test': ['Test', 'test']
}

def load_split_safe(base_path, split_variations, extractor):
    for variation in split_variations:
        try:
            return load_split_fast(base_path, variation, extractor)
        except FileNotFoundError:
            continue
    raise FileNotFoundError(f"Could not find split")

# Process each split with timing
total_start = time.time()

print("\n" + "="*70)
print("PROCESSING TRAINING SET")
print("="*70)
X_train, y_train = load_split_safe(dataset_path, splits_to_try['train'], extractor)

print("\n" + "="*70)
print("PROCESSING VALIDATION SET")
print("="*70)
X_val, y_val = load_split_safe(dataset_path, splits_to_try['val'], extractor)

print("\n" + "="*70)
print("PROCESSING TEST SET")
print("="*70)
X_test, y_test = load_split_safe(dataset_path, splits_to_try['test'], extractor)

total_time = time.time() - total_start
total_samples = len(X_train) + len(X_val) + len(X_test)

print(f"\n⏱️  Total extraction time: {total_time/60:.2f} minutes")
print(f"⚡ Speed: {total_samples/total_time:.0f} images/second")

# ========== FEATURE ENGINEERING ==========

print("\n" + "="*70)
print("FEATURE ENGINEERING")
print("="*70)

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# Normalize
print("Normalizing features...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print("Normalized")

# Feature selection
if USE_FEATURE_SELECTION and N_FEATURES_TO_SELECT < X_train_scaled.shape[1]:
    print(f"\nSelecting top {N_FEATURES_TO_SELECT} features...")
    selector = SelectKBest(score_func=f_classif, k=N_FEATURES_TO_SELECT)
    X_train_selected = selector.fit_transform(X_train_scaled, y_train)
    X_val_selected = selector.transform(X_val_scaled)
    X_test_selected = selector.transform(X_test_scaled)
    print(f"Selected: {X_train_scaled.shape[1]} → {X_train_selected.shape[1]}")
else:
    X_train_selected = X_train_scaled
    X_val_selected = X_val_scaled
    X_test_selected = X_test_scaled

# Clean up
del X_train, X_val, X_test, X_train_scaled, X_val_scaled, X_test_scaled
gc.collect()

# ========== SUMMARY ==========

print("\n" + "="*70)
print("READY FOR TRAINING")
print("="*70)
print(f"\nTotal samples: {total_samples}")
print(f"Features per sample: {X_train_selected.shape[1]}")
print(f"Extraction time: {total_time/60:.2f} minutes")
print(f"Processing speed: {total_samples/total_time:.0f} images/sec")

print(f"\nData splits:")
print(f"  Train: {len(X_train_selected):>6} ({len(X_train_selected)/total_samples*100:.1f}%)")
print(f"  Val:   {len(X_val_selected):>6} ({len(X_val_selected)/total_samples*100:.1f}%)")
print(f"  Test:  {len(X_test_selected):>6} ({len(X_test_selected)/total_samples*100:.1f}%)")

print("\nFeature extraction complete - ready for model training!")
gc.collect()


ULTRA-FAST FEATURE EXTRACTION

Testing extractor speed...
Speed test: 20 images/second per core

PROCESSING TRAINING SET

Processing Train Set
Processing Real images...
Processing 70001 images from Real...


100%|██████████| 15/15 [54:42<00:00, 218.85s/it]


  Time: 3283.45s (21 images/sec)
Processing Fake images...
Processing 70001 images from Fake...


100%|██████████| 15/15 [55:39<00:00, 222.61s/it]


  Time: 3339.59s (21 images/sec)

Train Summary:
  Total: 140002 samples | Features: 47
  Real: 70001 | Fake: 70001

PROCESSING VALIDATION SET

Processing Validation Set
Processing Real images...
Processing 19787 images from Real...


100%|██████████| 4/4 [15:57<00:00, 239.45s/it]


  Time: 958.24s (21 images/sec)
Processing Fake images...
Processing 19641 images from Fake...


100%|██████████| 4/4 [15:52<00:00, 238.04s/it]


  Time: 952.60s (21 images/sec)

Validation Summary:
  Total: 39428 samples | Features: 47
  Real: 19787 | Fake: 19641

PROCESSING TEST SET

Processing Test Set
Processing Real images...
Processing 5413 images from Real...


100%|██████████| 2/2 [04:18<00:00, 129.03s/it]


  Time: 258.22s (21 images/sec)
Processing Fake images...
Processing 5492 images from Fake...


100%|██████████| 2/2 [04:25<00:00, 132.80s/it]


  Time: 265.80s (21 images/sec)

Test Summary:
  Total: 10905 samples | Features: 47
  Real: 5413 | Fake: 5492

⏱️  Total extraction time: 150.97 minutes
⚡ Speed: 21 images/second

FEATURE ENGINEERING
Normalizing features...
Normalized

READY FOR TRAINING

Total samples: 190335
Features per sample: 47
Extraction time: 150.97 minutes
Processing speed: 21 images/sec

Data splits:
  Train: 140002 (73.6%)
  Val:    39428 (20.7%)
  Test:   10905 (5.7%)

Feature extraction complete - ready for model training!


0

In [6]:
print("\n" + "="*70)
print("TRAINING IMPROVED RANDOM FOREST")
print("="*70)

# Better hyperparameters for deepfake detection
model = RandomForestClassifier(
    n_estimators=500,          # More trees = more stable
    max_depth=12,              # SHALLOWER trees (was 20)
    min_samples_split=20,      # Need MORE samples to split (was 2)
    min_samples_leaf=10,       # LARGER leaves (was 1)
    max_features='sqrt',       # Use sqrt(n) features per tree
    max_samples=0.7,           # Use only 70% of data per tree (bootstrap)
    class_weight='balanced',   # Handle class imbalance
    bootstrap=True,
    oob_score=True,            # Out-of-bag score for validation
    random_state=42,
    n_jobs=-1,
    verbose=1
)

print(f"Model configuration:")
print(f"  Trees: {model.n_estimators}")
print(f"  Max depth: {model.max_depth}")
print(f"  Class weight: balanced")

import time
start_time = time.time()
model.fit(X_train_selected, y_train)
training_time = time.time() - start_time

print(f"\nTraining complete in {training_time:.2f} seconds")

# Training accuracy
train_pred = model.predict(X_train_selected)
train_accuracy = accuracy_score(y_train, train_pred)
print(f"Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")



TRAINING IMPROVED RANDOM FOREST
Model configuration:
  Trees: 500
  Max depth: 12
  Class weight: balanced


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   32.9s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.9min finished



Training complete in 366.67 seconds


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    2.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    6.3s


Training Accuracy: 0.7527 (75.27%)


[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    7.0s finished


In [7]:
def evaluate_model(model, X, y, dataset_name="Test"):
    """Evaluate model and print metrics"""
    print(f"\n{'='*60}")
    print(f"{dataset_name} Set Evaluation")
    print(f"{'='*60}")

    # Predictions
    y_pred = model.predict(X)
    y_pred_proba = model.predict_proba(X)[:, 1]

    # Metrics
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    auc = roc_auc_score(y, y_pred_proba)

    print(f"\nMetrics:")
    print(f"  Accuracy:  {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1-Score:  {f1:.4f}")
    print(f"  ROC-AUC:   {auc:.4f}")

    # Confusion Matrix
    cm = confusion_matrix(y, y_pred)
    print(f"\nConfusion Matrix:")
    print(f"  TN (Real→Real): {cm[0,0]:>5}  |  FP (Real→Fake): {cm[0,1]:>5}")
    print(f"  FN (Fake→Real): {cm[1,0]:>5}  |  TP (Fake→Fake): {cm[1,1]:>5}")

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc,
        'confusion_matrix': cm,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

# Evaluate on validation and test sets
print("\n=== Model Evaluation ===")
val_metrics = evaluate_model(model, X_val_selected, y_val, "Validation")
test_metrics = evaluate_model(model, X_test_selected, y_test, "Test")

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s



=== Model Evaluation ===

Validation Set Evaluation


[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.7s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.5s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    1.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.7s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    1.6s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    1.7s finished



Metrics:
  Accuracy:  0.7091 (70.91%)
  Precision: 0.6711
  Recall:    0.8161
  F1-Score:  0.7365
  ROC-AUC:   0.7905

Confusion Matrix:
  TN (Real→Real): 11930  |  FP (Real→Fake):  7857
  FN (Fake→Real):  3612  |  TP (Fake→Fake): 16029

Test Set Evaluation


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.6s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.2s



Metrics:
  Accuracy:  0.5644 (56.44%)
  Precision: 0.5475
  Recall:    0.7788
  F1-Score:  0.6430
  ROC-AUC:   0.5837

Confusion Matrix:
  TN (Real→Real):  1878  |  FP (Real→Fake):  3535
  FN (Fake→Real):  1215  |  TP (Fake→Fake):  4277


[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.5s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.5s finished
