In [None]:
# !pip uninstall -y dlib
# !sudo apt-get update -qq
# !sudo apt-get install -y -qq build-essential cmake pkg-config python3-dev \
#   libopenblas-dev liblapack-dev

# import os, sys

# os.environ["CUDA_HOME"] = "/usr/local/cuda"
# os.environ["CUDACXX"] = "/usr/local/cuda/bin/nvcc"
# os.environ["FORCE_CMAKE"] = "1"

# # Key: 75-real => SASS only for Tesla T4, avoids PTX JIT (the thing throwing code 222)
# os.environ["CMAKE_ARGS"] = " ".join([
#     "-DDLIB_USE_CUDA=1",
#     "-DDLIB_NO_GUI_SUPPORT=1",
#     f"-DPYTHON_EXECUTABLE={sys.executable}",
#     "-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda",
#     "-DCMAKE_BUILD_TYPE=Release",
#     "-DCMAKE_CUDA_ARCHITECTURES=75-real",
# ])


# !pip install -v --no-cache-dir --no-build-isolation --no-binary :all: dlib==19.24.6

# import dlib
# print("dlib:", dlib.__version__)
# print("DLIB_USE_CUDA:", getattr(dlib, "DLIB_USE_CUDA", None))


In [None]:
!pip install timm



In [None]:
import os
import cv2
import math
import time
import random
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import timm
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    accuracy_score, classification_report, roc_auc_score, confusion_matrix,
    roc_curve, auc, precision_score, recall_score
)

import dlib


In [None]:
# --------------------
# Notebook configuration: every tunable constant lives here, defined exactly once.
# --------------------

# Dataset locations
ROOT_PATH = '/content/drive/MyDrive/ff_c23_data'
REAL_PATH = os.path.join(ROOT_PATH, "original")
FAKE_PATH = os.path.join(ROOT_PATH, "Deepfakes")

# Frame extraction
OUTPUT_FRAME_SIZE = (224, 224)  # size every extracted frame is resized to
FRAME_COUNT = 30                # frames sampled per video
MAX_VIDEOS = 500                # cap on directory entries read per class
PADDING_FACTOR = 1.3            # enlargement applied to the detected face box

# Data loading
NUM_WORKERS = 2
BATCH_SIZE = 8

SEED = 42

# Training
EPOCHS_PHASE1 = 20
EPOCHS_PHASE2 = 60
LR_PHASE1 = 1e-4
LR_PHASE2 = 1e-6
LR_FINE_TUNE = 1e-6  # Initial learning rate for fine-tuning
EARLY_STOP_PATIENCE = 10

# Model options
BACKBONE_NAME = "xception"   # from timm
HIDDEN_SIZE = 128
DROPOUT = 0.4
USE_SPATIAL_FLATTEN = False  # True = match original (huge vector), False = GAP (recommended)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

def seed_everything(seed: int = 42):
    """Apply one seed to every random number generator used in this notebook.

    Seeds, in order: Python's ``random`` module, NumPy's global RNG, PyTorch's
    default generator, and the generators of all CUDA devices.

    Args:
        seed: integer seed handed unchanged to each seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed the Python, NumPy and PyTorch RNGs with the notebook-wide SEED.
seed_everything(SEED)


device: cpu


In [None]:
# --------------------
# Frame extraction with dlib face cropping (same logic as original)
# --------------------
# dlib's default frontal face detector, created once and reused for every frame.
face_detector_dlib = dlib.get_frontal_face_detector()

def extract_frames(video_path, output_size=OUTPUT_FRAME_SIZE, frame_count=FRAME_COUNT, padding_factor=PADDING_FACTOR):
    """Sample evenly spaced frames from a video, cropped around the first detected face.

    For each sampled frame the dlib detector runs on a grayscale copy. When at
    least one face is found, the first detection's box is enlarged by
    ``padding_factor`` around its centre, clipped to the frame, and that crop is
    resized to ``output_size``. When no face is found (or the crop is empty) the
    whole frame is resized instead.

    Args:
        video_path: path handed to ``cv2.VideoCapture``.
        output_size: target size passed to ``cv2.resize``.
        frame_count: number of frames to sample; sampling positions are
            ``i * step`` with ``step = max(total_frames // frame_count, 1)``.
        padding_factor: multiplier applied to the detected box height and width.

    Returns:
        ``np.ndarray`` stacking exactly ``frame_count`` resized BGR frames, or an
        empty array (``size == 0``) when the video reports zero frames or fewer
        than ``frame_count`` frames could be read.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames == 0:
            return np.array([])

        step = max(total_frames // frame_count, 1)

        for i in range(frame_count):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i * step)
            ret, frame = cap.read()
            if not ret:
                # A failed read means the video cannot supply frame_count frames.
                break

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            faces = face_detector_dlib(gray)

            frame_h, frame_w = frame.shape[:2]

            # Default to the full frame; replaced by the face crop when one is usable.
            region = frame

            if len(faces) > 0:
                d = faces[0]

                y_center = (d.top() + d.bottom()) // 2
                x_center = (d.left() + d.right()) // 2
                h_pad = int((d.bottom() - d.top()) * padding_factor)
                w_pad = int((d.right() - d.left()) * padding_factor)

                # Clip the padded box to the frame bounds.
                y1 = max(0, y_center - h_pad // 2)
                y2 = min(frame_h, y_center + h_pad // 2)
                x1 = max(0, x_center - w_pad // 2)
                x2 = min(frame_w, x_center + w_pad // 2)

                cropped_face = frame[y1:y2, x1:x2]
                if cropped_face.size != 0:
                    region = cropped_face

            frames.append(cv2.resize(region, output_size))
    finally:
        # Always free the capture handle, including on the early return above
        # and when decoding or detection raises.
        cap.release()

    return np.array(frames) if len(frames) == frame_count else np.array([])

# !wget -q http://dlib.net/files/mmod_human_face_detector.dat.bz2
# !bzip2 -dk mmod_human_face_detector.dat.bz2  # produces mmod_human_face_detector.dat
# !ls -lh mmod_human_face_detector.dat*

# CNN_MODEL_PATH = "mmod_human_face_detector.dat"
# cnn_face_detector = dlib.cnn_face_detection_model_v1(CNN_MODEL_PATH)

# def extract_frames(video_path, output_size=OUTPUT_FRAME_SIZE, frame_count=FRAME_COUNT, padding_factor=PADDING_FACTOR):
#     cap = cv2.VideoCapture(video_path)
#     frames = []
#     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

#     if total_frames == 0:
#         cap.release()
#         return np.array([])

#     step = max(total_frames // frame_count, 1)

#     for i in range(frame_count):
#         cap.set(cv2.CAP_PROP_POS_FRAMES, i * step)
#         ret, frame = cap.read()
#         if not ret:
#             break

#         # CNN detector expects RGB
#         rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
#         detections = cnn_face_detector(rgb, 1)  # upsample=1

#         frame_h, frame_w, _ = frame.shape

#         if len(detections) > 0:
#             # pick the first detection (same “take first face” logic as before)
#             rect = detections[0].rect  # mmod_rectangle -> .rect is a dlib.rectangle

#             top = rect.top()
#             bottom = rect.bottom()
#             left = rect.left()
#             right = rect.right()

#             y_center = (top + bottom) // 2
#             x_center = (left + right) // 2
#             h = bottom - top
#             w = right - left

#             h_pad = int(h * padding_factor)
#             w_pad = int(w * padding_factor)

#             y1 = max(0, y_center - h_pad // 2)
#             y2 = min(frame_h, y_center + h_pad // 2)
#             x1 = max(0, x_center - w_pad // 2)
#             x2 = min(frame_w, x_center + w_pad // 2)

#             cropped_face = frame[y1:y2, x1:x2]

#             if cropped_face.size != 0:
#                 resized_frame = cv2.resize(cropped_face, output_size)
#                 frames.append(resized_frame)
#                 continue

#         resized_frame = cv2.resize(frame, output_size)
#         frames.append(resized_frame)

#     cap.release()
#     return np.array(frames) if len(frames) == frame_count else np.array([])


In [None]:
# --------------------
# Load data into memory (same split: 70/15/15)
# --------------------
data, labels = [], []

# One pass per class: (display name, source directory, integer label).
for class_name, class_dir, class_label in (("REAL", REAL_PATH, 0), ("FAKE", FAKE_PATH, 1)):
    print(f"Loading up to {MAX_VIDEOS} {class_name} videos...")
    # os.listdir returns entries in arbitrary order; sorting makes the
    # [:MAX_VIDEOS] subset identical on every run.
    for video_file in tqdm(sorted(os.listdir(class_dir))[:MAX_VIDEOS]):
        frames = extract_frames(os.path.join(class_dir, video_file))
        # extract_frames signals an unusable video with an empty array.
        if frames.size != 0:
            data.append(frames)
            labels.append(class_label)

print("Total loaded videos:", len(data))

if not data:
    # Fail here with a clear message rather than inside train_test_split.
    raise RuntimeError(
        f"No usable videos were loaded from {REAL_PATH!r} or {FAKE_PATH!r}"
    )

data = np.array(data)     # (N, T, H, W, C) uint8
labels = np.array(labels) # (N,)

# 70% train, then the remaining 30% split in half: 15% validation / 15% test.
# Both splits are stratified on the label and use the shared SEED.
X_train, X_temp, y_train, y_temp = train_test_split(
    data, labels, test_size=0.3, random_state=SEED, stratify=labels
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=SEED, stratify=y_temp
)

print("Train/Val/Test:", X_train.shape, X_val.shape, X_test.shape)


Loading up to 500 REAL videos...


100%|██████████| 500/500 [2:34:53<00:00, 18.59s/it]


Loading up to 500 FAKE videos...


 32%|███▏      | 160/500 [43:14<1:38:34, 17.40s/it]

In [None]:
# Persist every split under ROOT_PATH as '<name>_full.pt', in the order
# train -> val -> test (features before labels).
# NOTE(review): these look like NumPy arrays pickled via torch.save; confirm the
# loading side can read them (recent torch.load defaults may reject non-tensor pickles).
for split_name, split_array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_val", X_val),
    ("y_val", y_val),
    ("X_test", X_test),
    ("y_test", y_test),
):
    torch.save(split_array, f"{ROOT_PATH}/{split_name}_full.pt")

In [None]:
# --------------------
# PyTorch Dataset / DataLoader
# --------------------
from torchvision import transforms

# Standard ImageNet per-channel statistics used for input normalization.
# NOTE(review): confirm these match the pretrained config of the chosen timm
# backbone; some checkpoints may have been trained with different mean/std.
imagenet_mean = (0.485, 0.456, 0.406)
imagenet_std  = (0.229, 0.224, 0.225)


def _frame_pipeline(*augmentations):
    """Build the per-frame pipeline: PIL conversion, optional augmentations, tensor + normalize."""
    return transforms.Compose([
        transforms.ToPILImage(),
        *augmentations,
        transforms.ToTensor(),  # uint8 -> float in [0,1]
        transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
    ])


# Training frames get a random horizontal flip and a random rotation of up to 15 degrees.
train_frame_tfms = _frame_pipeline(
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=15),
)

# Validation/test frames are only converted and normalized.
val_frame_tfms = _frame_pipeline()

class VideoNumpyDataset(Dataset):
    """Dataset over in-memory videos, each stored as an array of BGR frames.

    Args:
        X: indexable collection of videos; each video is iterated along its
            first axis to obtain frames (expected layout (T, H, W, C) uint8).
        y: indexable collection of labels parallel to ``X``; each is cast to int.
        frame_transform: callable applied to every RGB frame individually.

    NOTE(review): ``frame_transform`` is invoked once per frame, so any random
    augmentation inside it is drawn independently for each frame of a video.
    Confirm that is intended for a temporal model.
    """

    def __init__(self, X, y, frame_transform):
        self.X, self.y = X, y
        self.frame_transform = frame_transform

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(video_tensor, label_tensor)`` for the video at ``idx``.

        ``video_tensor`` stacks the transformed frames along a new leading axis;
        ``label_tensor`` is a scalar ``torch.long`` tensor.
        """
        clip = self.X[idx]  # (T,H,W,C) uint8
        target = torch.tensor(int(self.y[idx]), dtype=torch.long)

        # Frames are stored in OpenCV's BGR order; convert to RGB before transforming.
        transformed = [
            self.frame_transform(cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB))
            for bgr_frame in clip
        ]

        return torch.stack(transformed, dim=0), target  # (T,C,H,W), scalar label

# Wrap each split; only the training split uses the augmenting transform.
train_ds = VideoNumpyDataset(X_train, y_train, train_frame_tfms)
val_ds   = VideoNumpyDataset(X_val,   y_val,   val_frame_tfms)
test_ds  = VideoNumpyDataset(X_test,  y_test,  val_frame_tfms)

# Settings shared by all three loaders. Pinned host memory is only requested
# when CUDA is available; on a CPU-only runtime it brings no benefit and
# PyTorch emits a warning about it.
loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=torch.cuda.is_available(),
    drop_last=False,
)

# Shuffle only the training data; keep validation/test order fixed.
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_ds, shuffle=False, **loader_kwargs)

# Balanced per-class weights computed from the training labels (to counter class imbalance),
# kept both as a NumPy array and as a float32 tensor on the active device.
class_weights = compute_class_weight(
    "balanced",
    classes=np.unique(y_train),
    y=y_train,
)
class_weights_t = torch.tensor(class_weights, device=device, dtype=torch.float32)
print("class_weights:", class_weights)


class_weights: [1.00359712 0.99642857]
