In [145]:
# When True, the results-folder cell deletes everything already stored in RES_PATH.
# NOTE(review): get_done_reps() resumes from the CSV files kept in RES_PATH, so leaving
# this True and re-running the cleanup cell discards any previously finished repetitions.
REMOVE_RES_CONTENT = True
# Base seed: used for NumPy/TensorFlow seeding and as random_state for pandas sampling.
SEED = 42

# Folder (on the mounted Google Drive) where the per-model result CSV files are written.
RES_PATH = '/content/drive/MyDrive/Studia/MAGISTER/PracaMagisterska/res/'

In [146]:
from google.colab import drive
import os

# Montuj Google Drive (tylko raz)
drive.mount('/content/drive')

# Ścieżki
zip_path = '/content/data.zip'
dataset_dir = '/content/'
zip_on_drive = '/content/drive/MyDrive/Studia/MAGISTER/PracaMagisterska/data.zip'

# Sprawdź, czy katalog z danymi już istnieje
if not os.path.exists(dataset_dir+'data') or len(os.listdir(dataset_dir)) == 0:
    print("Pliki jeszcze nie wypakowane — kopiuję i rozpakowuję...")
    # Skopiuj zip z Drive do RAM
    if not os.path.exists(zip_path):
        !cp "$zip_on_drive" "$zip_path"
    # Rozpakuj zip
    !unzip -q "$zip_path" -d "$dataset_dir"
else:
    print("Pliki już są rozpakowane — pomijam kopiowanie i rozpakowywanie.")

# Podejrzyj zawartość
!ls -lh "$dataset_dir"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Pliki już są rozpakowane — pomijam kopiowanie i rozpakowywanie.
total 890M
drwxrwxr-x 4 root root 4.0K Jul 16 20:22 data
-rw------- 1 root root 890M Jul 17 10:51 data.zip
drwx------ 6 root root 4.0K Jul 17 10:51 drive
drwxr-xr-x 2 root root 4.0K Jul 17 10:52 res
drwxr-xr-x 1 root root 4.0K Jul 15 13:41 sample_data


In [147]:
!pip install -q fairlearn tabulate

import pandas as pd
import numpy as np
import tabulate as tb
from typing import Dict
import tensorflow as tf
import re
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, f1_score
from tensorflow.keras.applications import MobileNetV2
from fairlearn.metrics import demographic_parity_difference, equalized_odds_difference
from tensorflow.keras.applications import Xception, EfficientNetB0, ResNet50
import os
import shutil

# Seed both random number sources used by the notebook with the shared SEED.
tf.random.set_seed(SEED)
np.random.seed(SEED)

# Report whether TensorFlow can see a GPU device
device_name = tf.test.gpu_device_name()
print(
    f"GPU jest dostępne: {device_name}"
    if device_name
    else "GPU NIE jest dostępne, sprawdź ustawienia środowiska"
)

GPU jest dostępne: /device:GPU:0


In [148]:
# Load the metadata table and prefix the relative image paths with the data folder.
file_path = '/content/data/metadata.csv'
df_tmp = pd.read_csv(file_path, sep=',')
df_tmp['path'] = '/content/data/' + df_tmp['path']

# Keep only rows whose 'deepfake' flag is non-zero. The explicit .copy() detaches the
# filtered frame from the original, so adding a column below does not raise pandas'
# SettingWithCopyWarning.
df_tmp = df_tmp[df_tmp['deepfake'] != 0].copy()

# Collapse the white/black/asian flag columns into a single label column.
# Labels are written in reverse priority order so that later writes win, which reproduces
# the precedence white > black > asian; rows with none of the flags equal to 1 stay None.
ethnicity = pd.Series([None] * len(df_tmp), index=df_tmp.index, dtype=object)
for ethnicity_label in ('asian', 'black', 'white'):
    ethnicity.loc[df_tmp[ethnicity_label] == 1] = ethnicity_label
df_tmp['ethnicity'] = ethnicity

# Select the attributes used in the experiments and give them shorter names.
df = df_tmp[['deepfake', 'male', 'ethnicity', 'eyeglasses', 'heavy_makeup', 'big_lips', 'path']].rename(
    columns={'deepfake': 'type', 'male': 'sex', 'heavy_makeup': 'makeup', 'big_lips': 'lips',})

# Translate the numeric flags (-1 / 0 / 1) into readable labels; 0 becomes a missing value.
df['type'] = df['type'].replace({1: 'fake', -1: 'real'})
df['sex'] = df['sex'].replace({-1: 'female', 0: None, 1: 'male'})
df['makeup'] = df['makeup'].replace({-1: 'no', 0: None, 1: 'yes'})
df['lips'] = df['lips'].replace({-1: 'small', 0: None, 1: 'big'})
df['eyeglasses'] = df['eyeglasses'].replace({-1: 'no', 0: None, 1: 'yes'})


print(tb.tabulate(df.head(), headers='keys', tablefmt='psql'))
print(f"Dataset size: {len(df)}")

+----+--------+--------+-------------+--------------+----------+--------+-----------------------------------------+
|    | type   | sex    | ethnicity   | eyeglasses   | makeup   | lips   | path                                    |
|----+--------+--------+-------------+--------------+----------+--------+-----------------------------------------|
|  0 | real   | male   |             | yes          | no       | big    | /content/data/original/805/frame271.jpg |
|  1 | real   | female | white       | no           |          | big    | /content/data/original/083/frame191.jpg |
|  2 | real   | male   | white       | no           | no       | small  | /content/data/original/878/frame111.jpg |
|  3 | real   | female | white       | no           |          |        | /content/data/original/158/frame201.jpg |
|  4 | real   | female | white       | no           |          |        | /content/data/original/606/frame71.jpg  |
+----+--------+--------+-------------+--------------+----------+--------

In [149]:
def get_balanced_subset(
    df, class_col, feature_col, feature_value,
    samples_per_class, randomize=True, reset_index=False, seed=None
):
    """
    Select a balanced subset of the data for a given feature value, with equal number of samples per class.

    Args:
        df: DataFrame
        class_col: column name of class labels
        feature_col: column name of feature
        feature_value: specific feature value to filter
        samples_per_class: number of samples per class
        randomize: whether to shuffle within class before selecting
        reset_index: whether to reset index of returned DataFrame
        seed: random seed used when ``randomize`` is True; ``None`` (default)
            falls back to the notebook-level ``SEED`` constant

    Returns:
        Balanced DataFrame subset (rows grouped by class, classes in sorted order)

    Raises:
        ValueError: if any class present in ``df[class_col]`` has fewer than
            ``samples_per_class`` rows with the requested feature value
            (including classes that do not occur for that value at all)
    """
    subset = df[df[feature_col] == feature_value]

    # Count against every class of the input frame, not only the classes that survive
    # the feature filter: a class that is completely absent must count as 0, otherwise
    # it would silently be left out and the result would not be balanced.
    expected_classes = df[class_col].dropna().unique()
    counts = subset[class_col].value_counts().reindex(expected_classes, fill_value=0)
    for cl, count in counts.items():
        if count < samples_per_class:
            raise ValueError(f"Not enough samples for class '{cl}' in feature '{feature_value}'. "
                             f"Required: {samples_per_class}, Available: {count}")

    if randomize:
        # Resolve the default lazily so the global is only needed when sampling is requested.
        random_state = SEED if seed is None else seed
        parts = [
            g.sample(frac=1, random_state=random_state).head(samples_per_class)
            for _, g in subset.groupby(class_col)
        ]
    else:
        parts = [g.head(samples_per_class) for _, g in subset.groupby(class_col)]

    # pd.concat rejects an empty list; return an empty frame with the same columns instead.
    result = pd.concat(parts) if parts else subset.iloc[0:0]

    if reset_index:
        result = result.reset_index(drop=True)

    return result

# Quick sanity check: draw 2 samples per class among the 'male' rows and display them.
tmp_test = get_balanced_subset(
    df=df, class_col='type', feature_col='sex', feature_value='male',
    samples_per_class=2, randomize=True, reset_index=True)
print(tb.tabulate(tmp_test, headers='keys', tablefmt='psql'))

+----+--------+-------+-------------+--------------+----------+--------+---------------------------------------------+
|    | type   | sex   | ethnicity   | eyeglasses   | makeup   | lips   | path                                        |
|----+--------+-------+-------------+--------------+----------+--------+---------------------------------------------|
|  0 | fake   | male  | white       | no           | no       | small  | /content/data/deepfake/374_407/frame41.jpg  |
|  1 | fake   | male  | white       | no           | no       | small  | /content/data/deepfake/015_919/frame281.jpg |
|  2 | real   | male  |             | no           | no       | big    | /content/data/original/995/frame11.jpg      |
|  3 | real   | male  | white       | no           | no       |        | /content/data/original/579/frame201.jpg     |
+----+--------+-------+-------------+--------------+----------+--------+---------------------------------------------+


In [150]:
def get_exp_data(df, class_col, feature_col, ratio : Dict, size, randomize=True, exclude_column=None, exclude_df=None, max_diff=0.05):
    '''
    Get a balanced subset of the data based on specified ratios for features.

    Every feature value listed in ``ratio`` contributes ``size * ratio[value]`` rows,
    split evenly between the classes found in ``class_col``.

    Args:
        df: DataFrame containing the data
        class_col: column name for class labels
        feature_col: column name for features
        ratio: dictionary with feature values as keys and their ratios as values
        size: total number of samples to return
        randomize: whether to shuffle the DataFrame before processing
            (the shuffle always uses the notebook-level ``SEED``)
        exclude_column: column name used to match rows that must be left out
        exclude_df: DataFrame containing values to exclude based on exclude_column
        max_diff: largest tolerated absolute difference between a requested ratio
            and the ratio actually obtained in the result

    Returns:
        DataFrame with a fresh 0..n-1 index holding the selected rows.

    Raises:
        ValueError: if ``exclude_column`` is missing from either frame, if no feature
            value of the data appears in ``ratio``, if a requested feature value is
            absent from the result, if an obtained ratio deviates by more than
            ``max_diff``, or (from get_balanced_subset) if there are not enough rows.
    '''
    if randomize:
        df_rnd = df.sample(frac=1, random_state=SEED).reset_index(drop=True)
    else:
        df_rnd = df.copy()

    if exclude_column is not None and exclude_df is not None:
        if exclude_column not in df_rnd.columns:
            raise ValueError(f"Column '{exclude_column}' not found in DataFrame.")
        if exclude_column not in exclude_df.columns:
            raise ValueError(f"Column '{exclude_column}' not found in exclude DataFrame.")
        df_rnd = df_rnd[~df_rnd[exclude_column].isin(exclude_df[exclude_column])]

    uniq_classes = df_rnd[class_col].unique()
    uniq_features = df_rnd[feature_col].unique()

    # Collect the per-feature subsets and concatenate once at the end.
    parts = []
    for uf in uniq_features:
        if ratio.get(uf) is None:
            print(f"Feature '{uf}' not found in ratios. Skipping.")
            continue
        # Floor of the per-class amount. The small epsilon absorbs binary floating-point
        # error (e.g. 100 * 0.29 evaluates to 28.999999999999996), which plain int()
        # would truncate to one sample too few.
        c_amt = int(size * ratio[uf] / len(uniq_classes) + 1e-9)
        parts.append(
            get_balanced_subset(df=df_rnd, class_col=class_col, feature_col=feature_col, feature_value=uf,
                                samples_per_class=c_amt, randomize=False)
        )

    if not parts:
        # Without this guard the code below would fail on a missing result frame.
        raise ValueError(
            f"None of the values in column '{feature_col}' appear in the requested ratios {ratio}.")

    df_res = pd.concat(parts)

    if len(df_res) < size:
        print(f"Samples for ({len(df_res)}) are less than requested ({size}).")

    ratios_fet = df_res[feature_col].value_counts(normalize=True).to_dict()
    ratios_cls = df_res[class_col].value_counts(normalize=False).to_dict()
    print(f"[] Ratios for {feature_col}: {ratios_fet}")
    print(f"[] Ratios for {class_col}: {ratios_cls}")

    # Verify that the obtained feature ratios match the requested ones.
    for k in ratio:
        if ratios_fet.get(k) is None:
            if ratio[k] > 0.0:
                raise ValueError(f"Feature '{k}' not found in DataFrame after sampling (try increase 'size' parameter).")
        elif abs(ratios_fet[k] - ratio[k]) > max_diff:
            raise ValueError(f"Feature '{k}' ratio {ratios_fet[k]} differs from requested {ratio[k]} by more than {max_diff}.")

    print()

    df_res = df_res.reset_index(drop=True)

    return df_res

# Quick sanity check: request 10 rows split 20% / 60% / 20% across ethnicity values.
tmp_test = get_exp_data(
    df=df, class_col='type', feature_col='ethnicity', ratio={'white':0.2, 'black':0.6, 'asian': 0.2}, size=10, randomize=True)
print(tb.tabulate(tmp_test, headers='keys', tablefmt='psql'))

Feature 'None' not found in ratios. Skipping.
[] Ratios for ethnicity: {'black': 0.6, 'white': 0.2, 'asian': 0.2}
[] Ratios for type: {'fake': 5, 'real': 5}

+----+--------+--------+-------------+--------------+----------+--------+---------------------------------------------+
|    | type   | sex    | ethnicity   | eyeglasses   | makeup   | lips   | path                                        |
|----+--------+--------+-------------+--------------+----------+--------+---------------------------------------------|
|  0 | fake   | male   | white       | no           | no       | small  | /content/data/deepfake/594_530/frame121.jpg |
|  1 | real   | female | white       | no           |          |        | /content/data/original/240/frame41.jpg      |
|  2 | fake   | female | asian       |              |          | big    | /content/data/deepfake/249_280/frame261.jpg |
|  3 | real   | female | asian       | no           |          | big    | /content/data/original/758/frame161.jpg     |
| 

In [151]:
def load_image(file_path, target_size=(224, 224)):
    """Read a JPEG file, decode it to 3 channels, resize it and divide the pixels by 255.

    Args:
        file_path: path of the JPEG file to read
        target_size: (height, width) passed to ``tf.image.resize``

    Returns:
        The resized image tensor with values scaled by 1/255.
    """
    raw_bytes = tf.io.read_file(file_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, target_size)
    # Normalize to [0, 1]
    return resized / 255.0

def get_data_for_model(df, class_col, files_col, batch_size, class_names=None):
    """Build a batched ``tf.data`` pipeline of (image, integer label) pairs.

    Args:
        df: DataFrame with one row per image
        class_col: column holding the class label of each row
        files_col: column holding the image file path of each row
        batch_size: number of samples per batch
        class_names: optional ordered list of all class labels. When given, label
            codes follow this order regardless of which classes occur in ``df``
            (labels not in the list get code -1). When omitted, codes are derived
            from the sorted labels present in ``df`` only, so two frames containing
            different sets of classes would be encoded differently.

    Returns:
        A ``tf.data.Dataset`` yielding batches in the row order of ``df``.
    """
    image_paths = df[files_col].values
    if class_names is None:
        labels = df[class_col].astype('category').cat.codes.values  # class strings -> ints
    else:
        labels = pd.Categorical(df[class_col], categories=list(class_names)).codes

    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels))
    # Decode images in parallel; map() keeps the element order by default, so predictions
    # still line up with the rows of `df`.
    dataset = dataset.map(
        lambda path, label: (load_image(path), label),
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    dataset = dataset.batch(batch_size)
    # Prepare upcoming batches while the current one is being consumed.
    return dataset.prefetch(tf.data.AUTOTUNE)

In [152]:
def _build_frozen_backbone_classifier(backbone_cls, num_classes, input_shape):
    """Build and compile a classifier on top of a frozen ImageNet backbone.

    The backbone is created without its top, frozen, and followed by global average
    pooling, a 128-unit ReLU layer and a softmax layer with ``num_classes`` outputs.
    The model is compiled with Adam (learning rate 0.001), sparse categorical
    cross-entropy and the accuracy metric.

    NOTE(review): load_image() feeds pixels scaled to [0, 1]. According to the Keras
    Applications documentation each backbone expects its own input preprocessing
    (its `preprocess_input` function); confirm that the [0, 1] scaling is intended
    for every backbone used here, since the frozen weights cannot adapt to it.

    Args:
        backbone_cls: Keras Applications constructor (e.g. ``MobileNetV2``)
        num_classes: number of output classes
        input_shape: input image shape as (height, width, channels)

    Returns:
        The compiled Keras model.
    """
    base_model = backbone_cls(weights='imagenet', include_top=False, input_shape=input_shape)
    base_model.trainable = False
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(128, activation='relu')(x)
    predictions = Dense(num_classes, activation='softmax')(x)
    model = Model(inputs=base_model.input, outputs=predictions)
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model


def create_mobile_net2(num_classes, input_shape=(224, 224, 3)):
    """Return ``(model, name)`` for a frozen MobileNetV2 classifier."""
    return _build_frozen_backbone_classifier(MobileNetV2, num_classes, input_shape), "mobile net 2"


def create_resnet50(num_classes, input_shape=(224, 224, 3)):
    """Return ``(model, name)`` for a frozen ResNet50 classifier."""
    return _build_frozen_backbone_classifier(ResNet50, num_classes, input_shape), "resnet50"


def create_efficientnet_b0(num_classes, input_shape=(224, 224, 3)):
    """Return ``(model, name)`` for a frozen EfficientNetB0 classifier."""
    return _build_frozen_backbone_classifier(EfficientNetB0, num_classes, input_shape), "efficientnet b0"


def create_xception(num_classes, input_shape=(224, 224, 3)):
    """Return ``(model, name)`` for a frozen Xception classifier.

    The default input shape is (224, 224, 3), like the other factories; an earlier
    variant of this function used (299, 299, 3) instead.
    """
    return _build_frozen_backbone_classifier(Xception, num_classes, input_shape), "xception"

In [153]:
# Create the results folder if it doesn't exist; otherwise optionally empty it.
if not os.path.exists(RES_PATH):
    os.makedirs(RES_PATH)
    print(f"Created folder: {RES_PATH}")
elif REMOVE_RES_CONTENT:
    # Remove everything inside the folder. Dedicated loop names are used so the
    # notebook-level `file_path` (set by the metadata-loading cell) is not overwritten.
    failed_entries = 0
    for entry_name in os.listdir(RES_PATH):
        entry_path = os.path.join(RES_PATH, entry_name)
        try:
            if os.path.isfile(entry_path) or os.path.islink(entry_path):
                os.unlink(entry_path)          # remove file or link
            elif os.path.isdir(entry_path):
                shutil.rmtree(entry_path)      # remove folder and contents
        except Exception as e:
            # Best effort: report the problem and continue with the remaining entries.
            failed_entries += 1
            print(f'Failed to delete {entry_path}. Reason: {e}')
    # Only claim success when every entry was actually removed.
    if failed_entries:
        print(f"Could not delete {failed_entries} item(s) from folder: {RES_PATH}")
    else:
        print(f"Cleared contents of folder: {RES_PATH}")

Cleared contents of folder: /content/drive/MyDrive/Studia/MAGISTER/PracaMagisterska/res/


In [154]:
def get_done_reps(model_name, amt_per_rep):
    """Find the repetitions that are already complete in a model's result CSV.

    Args:
        model_name: model name; spaces are replaced by underscores in the file name
        amt_per_rep: number of result rows a repetition must have to count as complete

    Returns:
        ``(done, results)``: the list of complete repetition ids and the loaded result
        DataFrame, or ``([], None)`` when the CSV file does not exist.
    """
    csv_name = model_name.replace(" ", "_")
    results_path = f'{RES_PATH}res_{csv_name}.csv'

    if os.path.exists(results_path):
        previous = pd.read_csv(results_path)
        rep_column = previous["rep"]
        finished = [
            rep for rep in rep_column.unique()
            if (rep_column == rep).sum() == amt_per_rep
        ]
        return finished, previous

    return [], None



In [155]:
def perform_tests(df, train_metas, test_metas, reps, class_col, feature_split_col, exclude_column, files_col, get_model, epochs_num, batch_size):
    """Train a model per training configuration and evaluate it on every test configuration.

    For each repetition and each entry of ``train_metas`` a fresh model is trained and
    then scored (accuracy, weighted F1, equalized-odds difference) on one test set per
    entry of ``test_metas``. After every evaluation all results collected so far are
    written to ``{RES_PATH}res_<model name>.csv``; repetitions that are already complete
    in that file are reused instead of being recomputed.

    Args:
        df: DataFrame with labels, attributes and image paths
        train_metas: list of dicts with keys 'ratio' and 'size' describing training sets
        test_metas: list of dicts with keys 'ratio' and 'size' describing test sets
        reps: number of repetitions
        class_col: column with the class labels
        feature_split_col: column whose value ratios are controlled (also used as the
            sensitive feature of the fairness metric)
        exclude_column: column used to keep training rows out of the test sets
        files_col: column with the image file paths
        get_model: factory called as ``get_model(num_classes=...)`` returning ``(model, name)``
        epochs_num: number of training epochs
        batch_size: batch size for training and prediction

    Returns:
        None. Results are persisted as CSV.
    """
    result_columns = [
        'rep', 'model_name', 'feature_split_col',
        'train_size', 'train_ratio_detail', 'test_size', 'test_ratio_detail',
        'accuracy', 'f1_score', 'eo_diff', 'train_ratio_rel', 'test_ratio_rel', "train_ratio", "test_ratio"
    ]
    num_classes = len(df[class_col].unique())

    # The model name is only available from the factory, so one throw-away model is built.
    # Release it right away instead of keeping a second network alive during training.
    probe_model, model_name = get_model(num_classes=num_classes)
    del probe_model
    tf.keras.backend.clear_session()

    results_path = f'{RES_PATH}res_{model_name.replace(" ", "_")}.csv'
    done_reps, prev_results = get_done_reps(model_name, len(test_metas) * len(train_metas))

    res = []
    for r in range(reps):
        if r in done_reps:
            # Select by column name so a differently ordered CSV cannot misalign the values.
            res.extend(prev_results.loc[prev_results['rep'] == r, result_columns].values.tolist())
            print(f"Rep {r} already done for {model_name}. Skipping...")
            continue

        # Drop the Keras state left by the previous repetition, then seed Python's
        # `random`, NumPy and TensorFlow together with the repetition-specific seed.
        tf.keras.backend.clear_session()
        tf.keras.utils.set_random_seed(SEED + r)

        for train_meta in train_metas:
            # NOTE(review): get_exp_data() shuffles with the fixed notebook SEED, so every
            # repetition selects the same rows; only the training order below and the
            # model initialisation vary between repetitions. Confirm this is intended.
            train = get_exp_data(df, class_col=class_col, feature_col=feature_split_col, ratio=train_meta['ratio'], size=train_meta['size'])
            train = train.sample(frac=1, random_state=SEED+r).reset_index(drop=True)

            # NOTE(review): rows are excluded by exact match on `exclude_column` only. If
            # that column holds per-frame image paths, other frames of the same source
            # video may still end up in the test sets. Verify this is acceptable.
            tests = [
                get_exp_data(df, class_col=class_col, feature_col=feature_split_col, ratio=tm['ratio'], size=tm['size'], exclude_column=exclude_column, exclude_df=train) for tm in test_metas
            ]

            train_dataset = get_data_for_model(train, class_col=class_col, files_col=files_col, batch_size=batch_size)
            test_datasets = [
                get_data_for_model(test, class_col=class_col, files_col=files_col, batch_size=batch_size) for test in tests
            ]

            train_ratio = '/'.join([f"{k}:{v}" for k, v in train_meta['ratio'].items()])
            train_ratio_rel = '/'.join([f"{k}:{v:.4f}" for k, v in train[feature_split_col].value_counts(normalize=True).to_dict().items()])
            # Short label: strips letters, zeros, dots and colons,
            # e.g. 'male:0.1/female:0.9' -> '1/9'.
            # NOTE(review): this is lossy (0.1 and 0.01 both become '1').
            train_ratio_sim = re.sub(r'[a-zA-Z0.:]', '', train_ratio)

            model, model_name = get_model(num_classes=num_classes)
            model.fit(train_dataset, epochs=epochs_num)

            for test_dataset, test_meta, test_df in zip(test_datasets, test_metas, tests):
                predictions = model.predict(test_dataset)
                # Same label encoding as used by get_data_for_model for this frame.
                y_true = test_df[class_col].astype('category').cat.codes.values
                y_pred = np.argmax(predictions, axis=1)
                acc = accuracy_score(y_true, y_pred)
                f1 = f1_score(y_true, y_pred, average='weighted')
                eo_diff = equalized_odds_difference(y_true, y_pred, sensitive_features=test_df[feature_split_col])

                test_ratio = '/'.join([f"{k}:{v}" for k, v in test_meta['ratio'].items()])
                test_ratio_rel = '/'.join([f"{k}:{v:.4f}" for k, v in test_df[feature_split_col].value_counts(normalize=True).to_dict().items()])
                test_ratio_sim = re.sub(r'[a-zA-Z0.:]', '', test_ratio)

                res.append([
                    r,
                    model_name,
                    feature_split_col,
                    train_meta['size'],
                    train_ratio,
                    test_meta['size'],
                    test_ratio,
                    acc,
                    f1,
                    eo_diff,
                    train_ratio_rel,
                    test_ratio_rel,
                    train_ratio_sim,
                    test_ratio_sim
                ])

                print(f"Rep: {r:2} | Model: {model_name} | Feature Split: {feature_split_col} | Ratio: {test_ratio} | Acc: {acc:.2f}")

                # Persist after every evaluation so an interrupted run loses as little as possible.
                res_df = pd.DataFrame(res, columns=result_columns)
                res_df.to_csv(results_path, index=False)
    print(f"Done for {model_name}.")


In [156]:
# Run the experiment: train Xception on a 10% male / 90% female training set and
# evaluate it on three test sets with different male/female ratios, for 2 repetitions.
perform_tests(df=df,
              train_metas=[
                  {'ratio': {'male':0.1, 'female':0.9}, 'size': 10000},
                  ],
              test_metas=[
                  {'ratio': {'male':0.1, 'female':0.9}, 'size': 1000},
                  {'ratio': {'male':0.5, 'female':0.5}, 'size': 1000},
                  {'ratio': {'male':0.9, 'female':0.1}, 'size': 1000},
              ],
              reps=2,
              class_col='type',
              feature_split_col='sex',
              exclude_column='path',
              files_col='path',
              get_model=create_xception,
              epochs_num=10,
              batch_size=32
              )

Feature 'None' not found in ratios. Skipping.
[] Ratios for sex: {'female': 0.9, 'male': 0.1}
[] Ratios for type: {'fake': 5000, 'real': 5000}

Feature 'None' not found in ratios. Skipping.
[] Ratios for sex: {'female': 0.9, 'male': 0.1}
[] Ratios for type: {'fake': 500, 'real': 500}

Feature 'None' not found in ratios. Skipping.
[] Ratios for sex: {'male': 0.5, 'female': 0.5}
[] Ratios for type: {'fake': 500, 'real': 500}

Feature 'None' not found in ratios. Skipping.
[] Ratios for sex: {'male': 0.9, 'female': 0.1}
[] Ratios for type: {'fake': 500, 'real': 500}

Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 143ms/step - accuracy: 0.5183 - loss: 0.7218
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 129ms/step - accuracy: 0.5370 - loss: 0.6884
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 127ms/step - accuracy: 0.5630 - loss: 0.6799
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

KeyboardInterrupt: 

In [None]:
def print_summarise_res(model_name:str):
    """Print mean and standard deviation of the stored metrics per train/test ratio.

    Reads ``{RES_PATH}res_<model_name>.csv``, groups the rows by the 'train_ratio' and
    'test_ratio' columns and prints a table with the mean and standard deviation of
    accuracy, F1 score and equalized-odds difference, rounded to 3 decimals.

    Args:
        model_name: model name; spaces are replaced by underscores, matching how the
            result files are named when they are written, so both 'efficientnet b0'
            and 'efficientnet_b0' resolve to the same file.

    Returns:
        None. Prints the table, or a message when the result file is missing.
    """
    path = RES_PATH + f'res_{model_name.replace(" ", "_")}.csv'

    if not os.path.exists(path):
        print(f"File {path} does not exist!")
        return

    res = pd.read_csv(path)
    gr = res.groupby(['train_ratio', 'test_ratio']).agg(
        TrainRatio=('train_ratio', 'first'),
        TestRatio=('test_ratio', 'first'),
        Accuracy=('accuracy', 'mean'),
        AccuracySTD=('accuracy', 'std'),
        F1=('f1_score', 'mean'),
        F1STD=('f1_score', 'std'),
        EODiff=('eo_diff', 'mean'),
        EODiffSTD=('eo_diff', 'std'),
    ).reset_index(drop=True)

    gr = gr.round(3).sort_values(by=['TrainRatio', 'TestRatio'], ascending=False)

    print("MODEL: " + model_name)
    print(tb.tabulate(gr, headers='keys', tablefmt='psql'))
    print()
    print()

# Print the summary table for each model that may have a result file in RES_PATH
# (a message is printed instead when a file is missing).
print_summarise_res('resnet50')
print_summarise_res('efficientnet_b0')
print_summarise_res('xception')