In [None]:
# Mount Google Drive at /content/drive; every dataset path used later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!ls drive/MyDrive

In [None]:
# Unzip the dataset into Drive (one-time setup; uncomment the command below to run it).

# !unzip -q /content/drive/MyDrive/Deepfake.zip \
#        -d /content/drive/MyDrive/Deepfake/


In [None]:
# !pip install mediapipe

In [None]:
# Libraries
import os, random
import cv2
import mediapipe as mp
import matplotlib.pyplot as plt


In [None]:
# Dataset locations on the mounted Drive.
# Extracted face crops are written to <OUTPUT_ROOT>/<label>/.
OUTPUT_ROOT = '/content/drive/MyDrive/Deepfake'
# Source videos are read from <RAW_ROOT>/<class sub-folder>/.
RAW_ROOT = '/content/drive/MyDrive/Deepfake/deepFake_data'

In [None]:
# Maps each class label to the name of its video sub-folder under RAW_ROOT.
VIDEO_FOLDERS = dict(
    real='real',
    fake='fake',
)

In [None]:
# Build the MediaPipe face detector once at module level; extract_faces()
# reuses this single instance for every frame it processes.
# Per the MediaPipe docs, model_selection=0 picks the short-range model;
# detections scoring below min_detection_confidence are discarded.
mp_face_detection = mp.solutions.face_detection
mp_face = mp_face_detection.FaceDetection(model_selection=0, min_detection_confidence=0.5)

In [None]:
from tqdm import tqdm

# method for face extraction
def _clip_face_box(bb, w, h):
    """Convert a relative bounding box into pixel corners clipped to a w x h frame."""
    x1 = max(0, int(bb.xmin * w))
    y1 = max(0, int(bb.ymin * h))
    # Derive the far corner from the *unclamped* origin so a box that starts
    # outside the frame is trimmed rather than shifted onto non-face pixels.
    x2 = min(w, int((bb.xmin + bb.width) * w))
    y2 = min(h, int((bb.ymin + bb.height) * h))
    return x1, y1, x2, y2


def extract_faces(label, max_faces=30, skip_frames=5):
    """Crop detected faces from every ``.mp4`` of one class and save them as JPEGs.

    For each video in ``RAW_ROOT/VIDEO_FOLDERS[label]`` every ``skip_frames``-th
    frame is passed to the module-level ``mp_face`` detector. Each detected face
    is cropped, resized to 128x128 and written to ``OUTPUT_ROOT/<label>/`` as
    ``<video name>_f<frame index>_i<detection index>.jpg``.

    Args:
        label: Key of ``VIDEO_FOLDERS``; also used as the output sub-folder name.
        max_faces: Maximum number of face crops saved per video.
        skip_frames: Sampling stride in frames; values below 1 are treated as 1.

    Returns:
        None. Progress is shown with tqdm and one summary line is emitted per video.
    """
    in_dir = os.path.join(RAW_ROOT, VIDEO_FOLDERS[label])
    out_dir = os.path.join(OUTPUT_ROOT, label)
    # cv2.imwrite does not create missing directories and fails without raising,
    # so make sure the target folder exists before any crop is written.
    os.makedirs(out_dir, exist_ok=True)

    # Guard against skip_frames <= 0, which would break the modulo below.
    stride = max(1, int(skip_frames))

    # Case-insensitive extension match; sorted for a reproducible processing order.
    videos = sorted(f for f in os.listdir(in_dir) if f.lower().endswith('.mp4'))

    for vid in tqdm(videos, desc=f"Processing {label} videos"):
        cap = cv2.VideoCapture(os.path.join(in_dir, vid))
        base = os.path.splitext(vid)[0]  # video name without extension, used as file prefix

        saved = 0
        frame_idx = 0

        try:
            while cap.isOpened() and saved < max_faces:
                ret, frame = cap.read()
                if not ret:  # end of stream or decode failure
                    break

                if frame_idx % stride == 0:
                    # OpenCV decodes to BGR; the detector is fed RGB.
                    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    result = mp_face.process(rgb)
                    h, w, _ = frame.shape

                    for i, det in enumerate(result.detections or ()):
                        bb = det.location_data.relative_bounding_box
                        x1, y1, x2, y2 = _clip_face_box(bb, w, h)
                        if x2 <= x1 or y2 <= y1:
                            continue  # box lies entirely outside the frame

                        face = cv2.resize(frame[y1:y2, x1:x2], (128, 128))
                        fname = f"{base}_f{frame_idx}_i{i}.jpg"
                        # Count a crop only when it was actually written to disk.
                        if cv2.imwrite(os.path.join(out_dir, fname), face):
                            saved += 1
                        if saved >= max_faces:
                            break

                frame_idx += 1
        finally:
            # Release the capture handle even if detection or writing raised.
            cap.release()

        # tqdm.write keeps the per-video summary from corrupting the progress bar.
        tqdm.write(f"{label.upper()} | {vid}: saved {saved}/{max_faces} faces")


In [None]:
# Run face extraction for both classes, real videos first.
for label in ('real', 'fake'):
    extract_faces(label)

In [None]:
# Report how many files ended up in each class's output folder.
for label in ('real', 'fake'):
    path = os.path.join(OUTPUT_ROOT, label)
    if os.path.isdir(path):
        count = len(os.listdir(path))
    else:
        count = 0  # folder missing: nothing was extracted for this class
    print(f"{label.title()} faces extracted: {count}")


In [None]:
def show_samples(label, num=4):
    """Display up to ``num`` randomly chosen face crops from ``OUTPUT_ROOT/<label>``.

    Images are laid out two per row, so the default ``num=4`` gives a 2x2 grid
    and larger values simply add rows.

    Args:
        label: Sub-folder of ``OUTPUT_ROOT`` to sample from.
        num: Maximum number of images to show; fewer are shown if the folder
            holds fewer files, and values below 1 show nothing.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    folder = os.path.join(OUTPUT_ROOT, label)
    names = os.listdir(folder)  # list the directory once; it may hold thousands of crops
    picks = random.sample(names, max(0, min(len(names), num)))
    if not picks:
        print(f"No images to show for '{label}' in {folder}")
        return

    # Size the grid from the number of picks instead of assuming 2x2.
    cols = min(2, len(picks))
    rows = (len(picks) + cols - 1) // cols
    plt.figure(figsize=(3 * cols, 3 * rows))

    for i, f in enumerate(picks):
        plt.subplot(rows, cols, i + 1)
        img = cv2.imread(os.path.join(folder, f))
        if img is None:
            # cv2.imread returns None for files it cannot decode; label the
            # empty cell instead of crashing in cvtColor.
            plt.title(f"{label}: {f} (unreadable)")
            plt.axis('off')
            continue
        # OpenCV loads BGR; matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.title(f"{label}: {f}")
        plt.axis('off')
    plt.show()

# Preview a few random crops from each class as a visual sanity check.
for heading, label in (("\nSample REAL faces:", 'real'), ("Sample FAKE faces:", 'fake')):
    print(heading)
    show_samples(label)