# Import dataset

In [None]:
import logging
import tensorflow as tf
from datasets import load_dataset, load_from_disk

2025-08-14 08:03:05.019024: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Send every log record at INFO level or above to 'model_training.log'.
# force=True replaces any handlers already attached to the root logger;
# without it basicConfig() silently does nothing when a handler exists,
# e.g. when this cell is re-run in the same kernel.
logging.basicConfig(
    filename='model_training.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

# Data Preparation

In [10]:
# Directory (relative to the notebook) that holds the saved dataset.
dataset_path = 'dataset'

# Load the dataset from disk and pull out the two splits used below.
dataset = load_from_disk(dataset_path)
data_train, data_test = dataset['train'], dataset['test']

# Show the columns and row count of each split.
print(f'\ntrain dataset: {data_train}')
print(f'\ntest dataset: {data_test}')


train dataset: Dataset({
    features: ['png', '__key__', '__url__'],
    num_rows: 8115
})

test dataset: Dataset({
    features: ['png', '__key__', '__url__'],
    num_rows: 8283
})


## Add a label to each image

In [11]:
def add_label(example: dict) -> dict:
    """Attach a binary class label derived from the example's ``__key__``.

    The label is 0 when the key contains the substring ``'real'`` and 1 when
    it contains ``'fake'`` (``'real'`` is checked first). Examples whose key
    contains neither marker, or whose key is missing or not a string, are
    returned without a ``'label'`` entry and the problem is logged.

    Args:
        example: A single dataset row as a mapping; it is modified in place.

    Returns:
        The same ``example`` mapping, with ``'label'`` set when it could be
        determined.
    """
    # Fetch the key with .get() so the error path below can never raise a
    # second KeyError while formatting its own log message.
    key = example.get('__key__')
    try:
        if 'real' in key:
            example['label'] = 0
        elif 'fake' in key:
            example['label'] = 1
        else:
            logging.warning(f"No 'real'/'fake' marker in key {key!r}; label not set")
    except Exception as e:
        logging.error(f"Error add label for {key}: {e}")
    return example

# Label every row of both splits (train first, then test).
data_train, data_test = (
    split.map(add_label) for split in (data_train, data_test)
)

# Confirm that the new 'label' column is present in each split.
print(f'\ntrain dataset after add label: {data_train}')
print(f'\ntest dataset after add label: {data_test}')

Map: 100%|██████████| 8115/8115 [00:02<00:00, 3844.34 examples/s]
Map: 100%|██████████| 8283/8283 [00:02<00:00, 2764.95 examples/s]


train dataset after add label: Dataset({
    features: ['png', '__key__', '__url__', 'label'],
    num_rows: 8115
})

test dataset after add label: Dataset({
    features: ['png', '__key__', '__url__', 'label'],
    num_rows: 8283
})





## Take 1 of every 10 frames from each video

Example value of the `__key__` column: `'./573/real/3/359'`

The 4th path component (`3`) is the video id.

The 5th path component (`359`) is the image (frame) id.

In [17]:
# Show the current state of the training split (its columns and row count).
print(data_train)

Dataset({
    features: ['png', '__key__', '__url__', 'label'],
    num_rows: 8115
})


In [None]:
import logging
from collections import defaultdict
from datasets import Dataset, DatasetDict
from typing import Optional

def _frame_sort_key(key: str) -> tuple:
    """Order frames by numeric frame id; non-numeric ids sort after numeric ones."""
    frame_id = key.split('/')[-1]
    try:
        return (0, int(frame_id), '')
    except ValueError:
        return (1, 0, frame_id)


def sample_frames_per_video(
        dataset: "Dataset",
        num_frames: int = 10,
    ) -> Optional["Dataset"]:
    """Keep one frame out of every ``num_frames`` for each video in ``dataset``.

    Rows are grouped by the second-to-last ``'/'``-separated component of
    their ``__key__`` (the video id), ordered inside each group by the last
    component (the frame id, compared numerically when it is an integer), and
    then every ``num_frames``-th row is kept, starting with the first.

    Only the ``__key__`` column is read; the selected rows are picked with
    ``dataset.select`` instead of rebuilding the dataset from Python objects.

    Args:
        dataset: Dataset with a string ``__key__`` column such as
            ``'./573/real/3/359'``.
        num_frames: Sampling stride; despite its name this is the step
            between kept frames, not the number of frames kept per video.

    Returns:
        The selected rows, ordered by video (first appearance) and then by
        frame id, or ``None`` when no row could be sampled.

    Raises:
        ValueError: If ``num_frames`` is smaller than 1.
    """
    if num_frames < 1:
        raise ValueError(f'num_frames must be a positive integer, got {num_frames}')

    # NOTE(review): grouping uses the video id alone, so videos that share an
    # id under different parent directories (e.g. real/3 and fake/3) end up in
    # the same group -- confirm that video ids are unique across the dataset.
    frames_by_video = defaultdict(list)
    for row_index, key in enumerate(dataset['__key__']):
        try:
            video_id = key.split('/')[-2]
        except (AttributeError, IndexError) as e:
            # Key is not a string or has no parent component: skip the row.
            logging.error(f'Error for grouping example with key {key!r}: {e}')
            continue
        frames_by_video[video_id].append((row_index, key))

    selected_rows = []
    for video_id, frames in frames_by_video.items():
        # sorted() is stable, so frames with equal ids keep their dataset order.
        ordered = sorted(frames, key=lambda item: _frame_sort_key(item[1]))
        selected_rows.extend(row_index for row_index, _ in ordered[::num_frames])

    if not selected_rows:
        logging.warning('No frames could be sampled; returning None')
        return None

    logging.info(
        f'Sampled {len(selected_rows)} frames from {len(frames_by_video)} videos'
    )
    return dataset.select(selected_rows)

# Thin the training split with a stride of 10 frames per video, then show
# the resulting dataset summary.
final_data_train = sample_frames_per_video(data_train, num_frames=10)
print(final_data_train)

Dataset({
    features: ['png', '__key__', '__url__', 'label'],
    num_rows: 846
})
