# STF-Mamba V8.0 — Preprocessing (CPU Only)

**Purpose:** Extract face crops + build SBI cache → save as Kaggle dataset.

**Settings:** CPU only (no GPU needed), Internet ON.

**Attached datasets:**
- `ff-c23` (FaceForensics++ C23)
- `shape-predictor81` (dlib landmarks)

**Output:** `/kaggle/working/stf_cache/` → upload as Kaggle dataset `stf-mamba-v8-cache`

In [None]:
# ============================================================================
# Cell 1: Install dependencies (CPU only — no mamba needed)
# ============================================================================
!pip install dlib imutils albumentations einops opencv-python-headless -q
import os, sys, time
print("✓ Dependencies installed")

In [None]:
# ============================================================================
# Cell 2: Clone repo + configure paths
# ============================================================================
REPO_URL = "https://github.com/AbdelRahman-Madboly/STF-Mamba_V8.0.git"
REPO_DIR = "/kaggle/working/STF-Mamba_V8.0"

if not os.path.exists(REPO_DIR):
    !git clone {REPO_URL} {REPO_DIR}
else:
    print(f"Repo exists at {REPO_DIR}")

sys.path.insert(0, REPO_DIR)
os.chdir(REPO_DIR)

from data.splits import load_all_splits, get_video_ids
from data.preprocessing import FacePreprocessor
from data.sbi_dataset import SBIVideoDataset
from data.augmentation import get_train_transforms, get_val_transforms

print("✓ Imports OK")

In [None]:
# ============================================================================
# Cell 3: Dataset paths (HARDCODED — verified correct)
# ============================================================================
import shutil

# FF++ ORIGINAL real videos
FF_VIDEO_DIR = "/kaggle/input/datasets/xdxd003/ff-c23/FaceForensics++_C23/original"

# dlib predictor: copied once from the read-only input dataset into the
# working directory. shutil.copyfile raises on failure, whereas the previous
# shell `cp` via os.system silently ignored a non-zero exit status.
PREDICTOR_PATH = "/kaggle/working/shape_predictor_81_face_landmarks.dat"
src = "/kaggle/input/datasets/zeyadkhalid/shape-predictor81/shape_predictor_81_face_landmarks.dat"
if not os.path.exists(PREDICTOR_PATH) and os.path.exists(src):
    shutil.copyfile(src, PREDICTOR_PATH)

# Output cache directory (will become Kaggle dataset)
CACHE_DIR = "/kaggle/working/stf_cache"
os.makedirs(os.path.join(CACHE_DIR, "crops"), exist_ok=True)
os.makedirs(os.path.join(CACHE_DIR, "sbi_seed42"), exist_ok=True)

# Verify: load the split files shipped with the repo and collect every
# video ID referenced by any split (deduplicated, sorted for stable order).
SPLITS_DIR = os.path.join(REPO_DIR, "splits")
splits = load_all_splits(SPLITS_DIR)
train_ids = get_video_ids(splits['train'])
val_ids = get_video_ids(splits['val'])
test_ids = get_video_ids(splits['test'])
all_ids = sorted(set(train_ids + val_ids + test_ids))

# Spot-check that the first ten training IDs map to "<id>.mp4" files.
mp4s = [f for f in os.listdir(FF_VIDEO_DIR) if f.endswith('.mp4')]
found = sum(1 for v in train_ids[:10] if os.path.exists(os.path.join(FF_VIDEO_DIR, f"{v}.mp4")))

print(f"FF++ videos: {len(mp4s)} at {FF_VIDEO_DIR}")
print(f"ID check: {found}/10 ✓" if found >= 8 else f"WARNING: only {found}/10!")
print(f"Predictor: {'✓' if os.path.exists(PREDICTOR_PATH) else '✗'}")
print(f"Splits: train={len(splits['train'])}, val={len(splits['val'])}, test={len(splits['test'])}")
print(f"Total unique video IDs: {len(all_ids)}")
print(f"\n✓ Cell 3 complete")

In [None]:
# ============================================================================
# Cell 4: Phase A — Extract face crops (PARALLEL — up to 4 workers)
# ============================================================================
import multiprocessing as mp
from functools import partial
from pathlib import Path

# Number of frames sampled per video; also passed as `num_frames` to the
# SBI datasets built in Phase B.
NUM_FRAMES = 32
# Side length (pixels) of the square face crops; also passed as `img_size`
# to the transforms and datasets in Phase B.
IMG_SIZE = 224
# Directory that receives the per-video "<id>_crops.npz" and
# "<id>_landmarks.npz" files.
CROP_DIR = os.path.join(CACHE_DIR, "crops")

def process_one_video(vid_id, video_dir, crop_dir, num_frames, img_size, predictor_path):
    """Extract face crops and landmarks for one video and cache them as NPZ files.

    Intended to run inside a worker process. Samples ``num_frames`` evenly
    spaced frames from ``<video_dir>/<vid_id>.mp4``, crops the largest detected
    face in each one (square box around the landmarks, enlarged by 1.3x and
    clipped to the frame) and resizes it to ``img_size`` x ``img_size``.
    Frames without a usable face fall back to a centre square crop with an
    all-zero ``(81, 2)`` landmark array.

    Landmarks are stored in full-frame pixel coordinates; they are not
    rescaled to the crop.

    Args:
        vid_id: Video identifier; the file name without the ``.mp4`` suffix.
        video_dir: Directory containing the source videos.
        crop_dir: Directory receiving ``<vid_id>_crops.npz`` (key ``crops``)
            and ``<vid_id>_landmarks.npz`` (key ``landmarks``).
        num_frames: Number of frames to sample and store per video.
        img_size: Side length in pixels of each stored crop.
        predictor_path: Path to the dlib shape-predictor model file.

    Returns:
        A ``(vid_id, status)`` tuple, where status is ``"cached"`` (both
        output files already exist), ``"not_found"`` (no source video),
        ``"empty"`` (the video reports no frames) or ``"done"``.
    """
    crop_path = os.path.join(crop_dir, f"{vid_id}_crops.npz")
    land_path = os.path.join(crop_dir, f"{vid_id}_landmarks.npz")
    if os.path.exists(crop_path) and os.path.exists(land_path):
        return vid_id, "cached"

    # Find video
    video_path = os.path.join(video_dir, f"{vid_id}.mp4")
    if not os.path.exists(video_path):
        return vid_id, "not_found"

    # Native libraries are imported only once real work is required, so the
    # cheap early exits above do not depend on them.
    import cv2, numpy as np, dlib
    from imutils import face_utils

    def _save_npz_atomic(path, **arrays):
        """Write a compressed NPZ to a temp file, then rename it into place."""
        tmp_path = f"{path}.tmp"
        # A file object is passed so numpy does not append ".npz" to the name.
        with open(tmp_path, "wb") as fh:
            np.savez_compressed(fh, **arrays)
        os.replace(tmp_path, path)

    crops_list, landmarks_list = [], []

    cap = cv2.VideoCapture(video_path)
    try:
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total < 1:
            return vid_id, "empty"

        frame_idxs = np.linspace(0, total - 1, num_frames, endpoint=True, dtype=int)

        detector = dlib.get_frontal_face_detector()
        predictor = dlib.shape_predictor(predictor_path)

        for idx in frame_idxs:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
            ret, frame = cap.read()
            if not ret:
                # Undecodable frame: skipped here, compensated by padding below.
                continue

            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            h, w = frame_rgb.shape[:2]

            faces = detector(frame_rgb, 0)  # upsample=0 (faster, faces are large in FF++)

            region = None
            landmark = np.zeros((81, 2), dtype=np.float32)

            if len(faces) > 0:
                # Keep the largest detection by bounding-box area.
                face = max(faces, key=lambda f: (f.right() - f.left()) * (f.bottom() - f.top()))
                shape = predictor(frame_rgb, face)
                face_landmark = face_utils.shape_to_np(shape).astype(np.float32)

                x0l, y0l = face_landmark[:, 0].min(), face_landmark[:, 1].min()
                x1l, y1l = face_landmark[:, 0].max(), face_landmark[:, 1].max()
                cx, cy = (x0l + x1l) / 2, (y0l + y1l) / 2
                half = max(x1l - x0l, y1l - y0l) * 1.3 / 2

                x0c = max(0, int(cx - half))
                y0c = max(0, int(cy - half))
                x1c = min(w, int(cx + half))
                y1c = min(h, int(cy + half))
                candidate = frame_rgb[y0c:y1c, x0c:x1c]

                # A box clipped down to nothing would make cv2.resize raise;
                # treat it like a missed detection instead.
                if candidate.size > 0:
                    region = candidate
                    landmark = face_landmark

            if region is None:
                # Center crop fallback
                s = min(h, w)
                y0, x0 = (h - s) // 2, (w - s) // 2
                region = frame_rgb[y0:y0 + s, x0:x0 + s]

            crops_list.append(cv2.resize(region, (img_size, img_size)))
            landmarks_list.append(landmark)
    finally:
        # Release the decoder even when detection or resizing raises.
        cap.release()

    # Pad if needed: repeat the last good frame, or use zeros if none decoded.
    while len(crops_list) < num_frames:
        crops_list.append(crops_list[-1] if crops_list else np.zeros((img_size, img_size, 3), dtype=np.uint8))
        landmarks_list.append(landmarks_list[-1] if landmarks_list else np.zeros((81, 2), dtype=np.float32))

    crops = np.stack(crops_list[:num_frames])
    landmarks = np.stack(landmarks_list[:num_frames])

    # Landmarks are published first so that an existing "<id>_crops.npz"
    # always implies a complete pair, even if the run is interrupted.
    _save_npz_atomic(land_path, landmarks=landmarks)
    _save_npz_atomic(crop_path, crops=crops)
    return vid_id, "done"

# How many CPUs? Capped at 4 worker processes.
N_WORKERS = min(mp.cpu_count(), 4)
print(f"Using {N_WORKERS} parallel workers (CPUs: {mp.cpu_count()})")
print(f"Videos: {len(all_ids)}, Frames/video: {NUM_FRAMES}")
print(f"Output: {CROP_DIR}")
print()

# Filter out already cached. A video counts as cached only when BOTH output
# files exist, which is the same rule process_one_video applies; checking
# crops alone would permanently skip videos whose landmarks file is missing.
to_process = [
    v for v in all_ids
    if not all(
        os.path.exists(os.path.join(CROP_DIR, f"{v}_{kind}.npz"))
        for kind in ("crops", "landmarks")
    )
]
already_cached = len(all_ids) - len(to_process)
print(f"Already cached: {already_cached}, To process: {len(to_process)}")

t0 = time.time()

if to_process:
    # Bind the constant arguments so the pool only has to ship video IDs.
    worker_fn = partial(
        process_one_video,
        video_dir=FF_VIDEO_DIR,
        crop_dir=CROP_DIR,
        num_frames=NUM_FRAMES,
        img_size=IMG_SIZE,
        predictor_path=PREDICTOR_PATH,
    )

    done, failed = 0, 0
    failures = {}  # status -> list of video IDs that ended with that status
    with mp.Pool(N_WORKERS) as pool:
        # Results arrive in completion order, not submission order.
        for vid_id, status in pool.imap_unordered(worker_fn, to_process):
            if status == "done":
                done += 1
            else:
                failed += 1
                failures.setdefault(status, []).append(vid_id)
            finished = done + failed
            if finished % 50 == 0 or finished == len(to_process):
                elapsed = time.time() - t0
                # Throughput counts every handled video, so the ETA stays
                # meaningful even when some videos fail.
                rate = finished / elapsed if elapsed > 0 else 0
                eta = (len(to_process) - finished) / rate / 60 if rate > 0 else 0
                print(f"  [{finished + already_cached}/{len(all_ids)}] {rate:.1f} vid/s, ETA: {eta:.0f} min")

    print(f"\nProcessed: {done}, Failed: {failed}")
    # Surface why videos failed instead of only counting them.
    for status, ids in sorted(failures.items()):
        preview = ", ".join(str(v) for v in ids[:10])
        more = " ..." if len(ids) > 10 else ""
        print(f"  {status}: {len(ids)} ({preview}{more})")

elapsed = time.time() - t0
n_crops = len([f for f in os.listdir(CROP_DIR) if f.endswith("_crops.npz")])
print(f"Face crops done: {n_crops} videos in {elapsed/60:.1f} min")
print(f"✓ Phase A complete")

In [None]:
# ============================================================================
# Cell 5: Phase B — Build SBI fake cache (train + val)
# ============================================================================
# SBI only needed for train and val (test uses Celeb-DF, not SBI)

# Constructor arguments that are identical for the train and val datasets.
common_ds_args = dict(
    video_dir=FF_VIDEO_DIR,
    cache_dir=CACHE_DIR,
    num_frames=NUM_FRAMES,
    img_size=IMG_SIZE,
    sbi_seed=42,
    predictor_path=PREDICTOR_PATH,
)

train_tf = get_train_transforms(IMG_SIZE)
val_tf = get_val_transforms(IMG_SIZE)

train_ds = SBIVideoDataset(
    split_path=os.path.join(SPLITS_DIR, "Dataset_Split_train.json"),
    phase="train",
    transform=train_tf,
    **common_ds_args,
)
val_ds = SBIVideoDataset(
    split_path=os.path.join(SPLITS_DIR, "Dataset_Split_val.json"),
    phase="val",
    transform=val_tf,
    **common_ds_args,
)

print(f"Building SBI cache for {len(train_ds.pairs)} train + {len(val_ds.pairs)} val pairs...")
print(f"Output: {CACHE_DIR}/sbi_seed42/")
print()

# Build train first, then val, timing both together.
t0 = time.time()
for dataset in (train_ds, val_ds):
    dataset.build_cache(show_progress=True)
elapsed = time.time() - t0

sbi_dir = os.path.join(CACHE_DIR, "sbi_seed42")
n_sbi = sum(1 for name in os.listdir(sbi_dir) if name.endswith(".npz"))
print(f"\nSBI cache done: {n_sbi} files in {elapsed/60:.1f} min")
print(f"✓ Phase B complete")

In [None]:
# ============================================================================
# Cell 6: Sanity check + cache stats
# ============================================================================
import numpy as np


def _count_npz(subdir):
    """Return how many ``.npz`` files sit directly inside CACHE_DIR/<subdir>."""
    folder = os.path.join(CACHE_DIR, subdir)
    return sum(1 for name in os.listdir(folder) if name.endswith(".npz"))


# Pull one training item to confirm the dataset can be indexed end to end.
sample = train_ds[0]
print("Sanity check:")
print(f"  frames: {sample['frames'].shape}")
print(f"  label:  {sample['label']}")
print(f"  id:     {sample['video_id']}")

# Total bytes of every file under the cache directory (recursive).
total_size = sum(
    os.path.getsize(os.path.join(folder, name))
    for folder, _subdirs, names in os.walk(CACHE_DIR)
    for name in names
)

n_crops = _count_npz("crops")
n_sbi = _count_npz("sbi_seed42")

print(f"\nCache summary:")
print(f"  crops/     : {n_crops} files")
print(f"  sbi_seed42/: {n_sbi} files")
print(f"  Total size : {total_size / 1e9:.2f} GB")
print(f"  Location   : {CACHE_DIR}")
print(f"\nDataset sizes: train={len(train_ds)}, val={len(val_ds)}")

## Save as Kaggle Dataset

After this notebook finishes:
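1. Click **"Save Version"** (top right) and switch the version type from *Save & Run All* to **Quick Save**, so the current session is snapshotted instead of every cell being re-run (check in the advanced settings that output is saved with the version)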
2. Go to the notebook output → click **"New Dataset"**
3. Name it: **`stf-mamba-v8-cache`**
4. The `stf_cache/` folder with all NPZ files becomes your dataset

Then in your GPU training notebook, attach `stf-mamba-v8-cache` as an input dataset.

In [None]:
# ============================================================================
# Cell 7: Final verification — list what we built
# ============================================================================
# Relies on CACHE_DIR, n_crops, n_sbi and total_size computed by earlier cells.
print("=" * 60)
print("  Preprocessing Complete!")
print("=" * 60)
print(f"\n  Output directory: {CACHE_DIR}")
# n_crops counts every .npz in crops/, i.e. both the "<id>_crops.npz" and the
# "<id>_landmarks.npz" file of each video, so it is labelled as a file count
# rather than a video count.
print(f"  → crops/      : {n_crops} NPZ files (face crops + landmarks)")
print(f"  → sbi_seed42/ : {n_sbi} SBI fake cache files")
print(f"  → Total size  : {total_size / 1e9:.2f} GB")
print("\n  Next steps:")
print("  1. Save this notebook version")
print("  2. Create dataset from output: 'stf-mamba-v8-cache'")
print("  3. Open GPU notebook, attach the cache dataset")
print("  4. Train!")