In [277]:
import os
import cv2
import sys
import pandas as pd
import numpy as np
import tabulate as tb
import re

SEED = 42
np.random.seed(SEED)

In [278]:
def extract_frames_from_video(video_path, out_dir, selected_frames):
    """
    Extract only specific frames from a video.

    Parameters:
        video_path (str): Path to the input video.
        out_dir (str): Output directory where frames will be saved.
        selected_frames (list[int] or set[int]): Frame indices to extract (e.g., [5, 10, 25]).
    """
    vid = cv2.VideoCapture(video_path)
    if not vid.isOpened():
        print(f"Error: failed to open {video_path}")
        return

    video_name = os.path.splitext(os.path.basename(video_path))[0]
    target_dir = os.path.join(out_dir, video_name)
    os.makedirs(target_dir, exist_ok=True)

    count = 1
    saved_count = 0
    selected_set = set(selected_frames)  # convert to set for faster lookup

    while True:
        ret, frame = vid.read()
        if not ret:
            break
        if count in selected_set:
            resized = cv2.resize(frame, (224, 224))
            frame_path = os.path.join(target_dir, f"frame{count}.jpg")
            cv2.imwrite(frame_path, resized, [int(cv2.IMWRITE_JPEG_QUALITY), 90])
            saved_count += 1
        count += 1

        # Optional early exit if all selected frames are found
        if saved_count == len(selected_set):
            break

    vid.release()
    print(f"Extracted {saved_count} frame(s) from {video_name}")

In [279]:
org_df = pd.read_csv('original_data/A-FF++.csv')
org_df = org_df[org_df["path"].str.contains("FaceForensics++")]

df_org = org_df[org_df["path"].str.contains("original")].copy()
df_org.loc[:, "path"] = df_org["path"].apply(lambda x: 'original/' +'/'.join(x.rsplit('/', 2)[-2:]))
df_org["deepfake"] = -1

df_dee = org_df[org_df["path"].str.contains("Deepfake")]
df_dee.loc[:, "path"] = df_dee["path"].apply(lambda x: 'deepfake/' + '/'.join(x.rsplit('/', 2)[-2:]))
df_dee["deepfake"] = 1



print(tb.tabulate(df_org.head(), headers='keys', tablefmt='psql'))
print(len(df_org))

print(tb.tabulate(df_dee.head(), headers='keys', tablefmt='psql'))
print(len(df_dee))


+--------+--------------+---------------------------+---------+--------+---------+---------------+----------+---------+---------+---------+--------------+--------+-------------+---------------------+---------+--------------+--------------+-------------+------------+------------+----------+-------------+---------------+----------------+----------+-----------------------+--------------------------+--------------+-------------------+------------------+-------------------+----------------+-----------+------------+------------+---------------+----------------+---------------+-------------------+--------------------+--------------+--------------+--------------+------------+
|        |   Unnamed: 0 | path                      |   label |   male |   young |   middle_aged |   senior |   asian |   white |   black |   shiny_skin |   bald |   wavy_hair |   receding_hairline |   bangs |   black_hair |   blond_hair |   gray_hair |   no_beard |   mustache |   goatee |   oval_face |   square_face |   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dee["deepfake"] = 1


In [280]:
df = pd.concat([df_org, df_dee], ignore_index=True)
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

df["file_name"] = df["path"].apply(lambda x: x.split('/')[-2])

frame_num_pattern = re.compile(r'frame(\d+)\.png$')
df["frame_num"] = df["path"].apply(lambda x: int(frame_num_pattern.search(x).group(1)) if frame_num_pattern.search(x) else -1)

df["path"] = df["path"].apply(lambda x: x.replace('.png', '.jpg')) 

df = df[df['frame_num'] <= 100]

print(tb.tabulate(df.head(), headers='keys', tablefmt='psql'))
print(f"Total number of samples: {len(df)}")
print(f"Unique videos: {len(df['file_name'].unique())}")
print()


print("================ ORIGINAL ================")
male, female = len(df[(df['deepfake'] == -1) & (df['male'] == 1)]), len(df[(df['deepfake'] == -1) & (df['male'] == -1)]) 
print(f"Male:Female = {male}:{female} ({100*male/(male+female):.0f}:{100*female/(male+female):.0f})")

white, black, asian = len(df[(df['deepfake'] == -1) & (df['white'] == 1)]), len(df[(df['deepfake'] == -1) & (df['black'] == 1)]), len(df[(df['deepfake'] == -1) & (df['asian'] == 1)])
print(f"White:Black:Asian = {white}:{black}:{asian} ({100*white/(white+black+asian):.0f}:{100*black/(white+black+asian):.0f}:{100*asian/(white+black+asian):.0f})")

black_hair, non_black_hair = len(df[(df['deepfake'] == -1) & (df['black_hair'] == 1)]), len(df[(df['deepfake'] == -1) & (df['black_hair'] == -1)])
print(f"Black hair:Non-black hair = {black_hair}:{non_black_hair} ({100*black_hair/(black_hair+non_black_hair):.0f}:{100*non_black_hair/(black_hair+non_black_hair):.0f})")

eyeglasses, no_eyeglasses = len(df[(df['deepfake'] == -1) & (df['eyeglasses'] == 1)]), len(df[(df['deepfake'] == -1) & (df['eyeglasses'] == -1)])
print(f"Eyeglasses:No eyeglasses = {eyeglasses}:{no_eyeglasses} ({100*eyeglasses/(eyeglasses+no_eyeglasses):.0f}:{100*no_eyeglasses/(eyeglasses+no_eyeglasses):.0f})")

heavy_makeup, no_heavy_makeup = len(df[(df['deepfake'] == -1) & (df['heavy_makeup'] == 1)]), len(df[(df['deepfake'] == -1) & (df['heavy_makeup'] == -1)])
print(f"Heavy makeup:No heavy makeup = {heavy_makeup}:{no_heavy_makeup} ({100*heavy_makeup/(heavy_makeup+no_heavy_makeup):.0f}:{100*no_heavy_makeup/(heavy_makeup+no_heavy_makeup):.0f})")

big_lips, no_big_lips = len(df[(df['deepfake'] == -1) & (df['big_lips'] == 1)]), len(df[(df['deepfake'] == -1) & (df['big_lips'] == -1)])
print(f"Big lips:No big lips = {big_lips}:{no_big_lips} ({100*big_lips/(big_lips+no_big_lips):.0f}:{100*no_big_lips/(big_lips+no_big_lips):.0f})")
print()

print("================ DEEPFAKE ================")
male, female = len(df[(df['deepfake'] == 1) & (df['male'] == 1)]), len(df[(df['deepfake'] == 1) & (df['male'] == -1)]) 
print(f"Male:Female = {male}:{female} ({100*male/(male+female):.0f}:{100*female/(male+female):.0f})")

white, black, asian = len(df[(df['deepfake'] == 1) & (df['white'] == 1)]), len(df[(df['deepfake'] == 1) & (df['black'] == 1)]), len(df[(df['deepfake'] == 1) & (df['asian'] == 1)])
print(f"White:Black:Asian = {white}:{black}:{asian} ({100*white/(white+black+asian):.0f}:{100*black/(white+black+asian):.0f}:{100*asian/(white+black+asian):.0f})")

black_hair, non_black_hair = len(df[(df['deepfake'] == 1) & (df['black_hair'] == 1)]), len(df[(df['deepfake'] == 1) & (df['black_hair'] == -1)])
print(f"Black hair:Non-black hair = {black_hair}:{non_black_hair} ({100*black_hair/(black_hair+non_black_hair):.0f}:{100*non_black_hair/(black_hair+non_black_hair):.0f})")

eyeglasses, no_eyeglasses = len(df[(df['deepfake'] == 1) & (df['eyeglasses'] == 1)]), len(df[(df['deepfake'] == 1) & (df['eyeglasses'] == -1)])
print(f"Eyeglasses:No eyeglasses = {eyeglasses}:{no_eyeglasses} ({100*eyeglasses/(eyeglasses+no_eyeglasses):.0f}:{100*no_eyeglasses/(eyeglasses+no_eyeglasses):.0f})")

heavy_makeup, no_heavy_makeup = len(df[(df['deepfake'] == 1) & (df['heavy_makeup'] == 1)]), len(df[(df['deepfake'] == 1) & (df['heavy_makeup'] == -1)])
print(f"Heavy makeup:No heavy makeup = {heavy_makeup}:{no_heavy_makeup} ({100*heavy_makeup/(heavy_makeup+no_heavy_makeup):.0f}:{100*no_heavy_makeup/(heavy_makeup+no_heavy_makeup):.0f})")

big_lips, no_big_lips = len(df[(df['deepfake'] == 1) & (df['big_lips'] == 1)]), len(df[(df['deepfake'] == 1) & (df['big_lips'] == -1)])
print(f"Big lips:No big lips = {big_lips}:{no_big_lips} ({100*big_lips/(big_lips+no_big_lips):.0f}:{100*no_big_lips/(big_lips+no_big_lips):.0f})")


df.to_csv('data/metadata.csv', index=False)

+----+--------------+-----------------------------+---------+--------+---------+---------------+----------+---------+---------+---------+--------------+--------+-------------+---------------------+---------+--------------+--------------+-------------+------------+------------+----------+-------------+---------------+----------------+----------+-----------------------+--------------------------+--------------+-------------------+------------------+-------------------+----------------+-----------+------------+------------+---------------+----------------+---------------+-------------------+--------------------+--------------+--------------+--------------+------------+-------------+-------------+
|    |   Unnamed: 0 | path                        |   label |   male |   young |   middle_aged |   senior |   asian |   white |   black |   shiny_skin |   bald |   wavy_hair |   receding_hairline |   bangs |   black_hair |   blond_hair |   gray_hair |   no_beard |   mustache |   goatee |   oval_f

In [281]:
meta = pd.read_csv('data/metadata.csv')
print(tb.tabulate(meta.head(), headers='keys', tablefmt='psql'))

+----+--------------+-----------------------------+---------+--------+---------+---------------+----------+---------+---------+---------+--------------+--------+-------------+---------------------+---------+--------------+--------------+-------------+------------+------------+----------+-------------+---------------+----------------+----------+-----------------------+--------------------------+--------------+-------------------+------------------+-------------------+----------------+-----------+------------+------------+---------------+----------------+---------------+-------------------+--------------------+--------------+--------------+--------------+------------+-------------+-------------+
|    |   Unnamed: 0 | path                        |   label |   male |   young |   middle_aged |   senior |   asian |   white |   black |   shiny_skin |   bald |   wavy_hair |   receding_hairline |   bangs |   black_hair |   blond_hair |   gray_hair |   no_beard |   mustache |   goatee |   oval_f

In [282]:
org_data_original_dir = 'original_data/original'
org_data_deepfake_dir = 'original_data/Deepfakes'

files_amount = len(df['file_name'].unique())

for i, (file_name, group) in enumerate(df.groupby('file_name')):
    frames = group['frame_num'].tolist()
    is_deepfake = group['deepfake'].iloc[0] == 1
    file_path = os.path.join(org_data_deepfake_dir if is_deepfake else org_data_original_dir, file_name) + '.mp4'
    
    print(f"[{i+1}/{files_amount}] Processing video: {file_name} (Deepfake: {is_deepfake})")
    
    if is_deepfake:
        extract_frames_from_video(file_path, 'data/deepfake', frames)
    else:
        extract_frames_from_video(file_path, 'data/original', frames)

[1/1999] Processing video: 000 (Deepfake: False)
Extracted 10 frame(s) from 000
[2/1999] Processing video: 000_003 (Deepfake: True)
Extracted 10 frame(s) from 000_003
[3/1999] Processing video: 001 (Deepfake: False)
Extracted 10 frame(s) from 001
[4/1999] Processing video: 001_870 (Deepfake: True)
Extracted 10 frame(s) from 001_870
[5/1999] Processing video: 002 (Deepfake: False)
Extracted 10 frame(s) from 002
[6/1999] Processing video: 002_006 (Deepfake: True)
Extracted 10 frame(s) from 002_006
[7/1999] Processing video: 003 (Deepfake: False)
Extracted 10 frame(s) from 003
[8/1999] Processing video: 003_000 (Deepfake: True)
Extracted 10 frame(s) from 003_000
[9/1999] Processing video: 004 (Deepfake: False)
Extracted 10 frame(s) from 004
[10/1999] Processing video: 004_982 (Deepfake: True)
Extracted 10 frame(s) from 004_982
[11/1999] Processing video: 005 (Deepfake: False)
Extracted 10 frame(s) from 005
[12/1999] Processing video: 005_010 (Deepfake: True)
Extracted 10 frame(s) from 005

KeyboardInterrupt: 