In [1]:
import kagglehub
import os
import shutil
import pandas as pd
import cv2
import imghdr
from PIL import Image
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


### Download the dataset

In [2]:
# Fetch the Kaggle dataset through kagglehub and keep the path it returns.
dataset_handle = "alessandrasala79/ai-vs-human-generated-dataset"
src_path = kagglehub.dataset_download(dataset_handle)

Downloading from https://www.kaggle.com/api/v1/datasets/download/alessandrasala79/ai-vs-human-generated-dataset?dataset_version_number=4...


100%|██████████| 9.76G/9.76G [26:30<00:00, 6.58MB/s]  

Extracting files...





In [3]:
# Show the path returned by kagglehub.dataset_download for the downloaded dataset.
src_path

'C:\\Users\\User\\.cache\\kagglehub\\datasets\\alessandrasala79\\ai-vs-human-generated-dataset\\versions\\4'

In [4]:
# Project-relative folder that the downloaded dataset entries are moved into.
destination_path = '../data'

In [5]:
# Move every top-level entry of the downloaded dataset into destination_path.
# The destination folder is created first: shutil.move fails when the parent
# folder of the target does not exist.
os.makedirs(destination_path, exist_ok=True)

for file_name in os.listdir(src_path):
    source_entry = os.path.join(src_path, file_name)
    target_entry = os.path.join(destination_path, file_name)
    # If the target already exists (e.g. from a partial earlier run), moving a
    # directory onto it would nest it inside the existing one, so skip instead.
    if os.path.exists(target_entry):
        print('Skipping {}: {} already exists'.format(source_entry, target_entry))
        continue
    shutil.move(source_entry, target_entry)

### Split the data into train/validation sets for the two classes

In [6]:
# Load the labelled training table and discard its 'Unnamed: 0' column,
# then load the test table as-is.
train_csv_raw = pd.read_csv('../data/train.csv')
train_val_df = train_csv_raw.drop(columns=['Unnamed: 0'])

test_df = pd.read_csv('../data/test.csv')

In [7]:
# Display the labelled table (columns: file_name, label).
train_val_df

Unnamed: 0,file_name,label
0,train_data/a6dcb93f596a43249135678dfcfc17ea.jpg,1
1,train_data/041be3153810433ab146bc97d5af505c.jpg,0
2,train_data/615df26ce9494e5db2f70e57ce7a3a4f.jpg,1
3,train_data/8542fe161d9147be8e835e50c0de39cd.jpg,0
4,train_data/5d81fa12bc3b4cea8c94a6700a477cf2.jpg,1
...,...,...
79945,train_data/9283b107f6274279b6f15bbe77c523aa.jpg,0
79946,train_data/4c6b17fe6dd743428a45773135a10508.jpg,1
79947,train_data/1ccbf96d04e342fd9f629ad55466b29e.jpg,0
79948,train_data/ff960b55f296445abb3c5f304b52e104.jpg,1


In [8]:
# Split at the level of consecutive row pairs (rows 2k and 2k + 1): one index
# per pair is drawn here and expanded back into row numbers afterwards, so
# both rows of a pair always end up in the same partition.
# A fixed seed makes the train/validation split reproducible across runs.
SPLIT_SEED = 42

index_list = list(range(len(train_val_df) // 2))
train_index, val_index = train_test_split(
    index_list, test_size=0.2, random_state=SPLIT_SEED
)

In [9]:
def _rows_for_pairs(pair_ids):
    """Return (even_rows, odd_rows): row numbers 2k and 2k + 1 for every k in pair_ids."""
    even_rows = [2 * k for k in pair_ids]
    odd_rows = [2 * k + 1 for k in pair_ids]
    return even_rows, odd_rows


# Even rows are used as the AI rows, odd rows as the human rows.
train_AI_index, train_human_index = _rows_for_pairs(train_index)
val_AI_index, val_human_index = _rows_for_pairs(val_index)

In [10]:
# Pick the rows of each partition/class by index label.
train_AI_df = train_val_df.loc[train_AI_index]
train_human_df = train_val_df.loc[train_human_index]
val_AI_df = train_val_df.loc[val_AI_index]
val_human_df = train_val_df.loc[val_human_index]

# The even/odd row selection is only correct if the CSV strictly alternates
# labels (1 = AI-generated, 0 = human-generated). Verify that here so a
# differently ordered CSV fails loudly instead of silently mislabelling images.
for split_name, split_df, expected_label in [
    ('train_AI_df', train_AI_df, 1),
    ('train_human_df', train_human_df, 0),
    ('val_AI_df', val_AI_df, 1),
    ('val_human_df', val_human_df, 0),
]:
    assert (split_df['label'] == expected_label).all(), (
        '{} contains rows whose label is not {}'.format(split_name, expected_label)
    )

In [11]:
# Create <destination_path>/<partition>/<class> for both partitions and both
# classes; folders that already exist are left untouched.
for partition in ('train_data', 'val_data'):
    partition_root = os.path.join(destination_path, partition)
    for class_name in ('AI', 'human'):
        os.makedirs(os.path.join(partition_root, class_name), exist_ok=True)

In [12]:
# Display the rows selected for the AI training partition.
train_AI_df

Unnamed: 0,file_name,label
41512,train_data/0331c4e303484437a366274a4741c7bf.jpg,1
35420,train_data/38cc54177091454b8d3005e6ad0208b8.jpg,1
50620,train_data/ee9554921cc546e697fc5f179b8dce03.jpg,1
70792,train_data/9ea500c70fd4439bb105ff5b7455ae54.jpg,1
62350,train_data/b4288e09db604b899bb14f76f7943346.jpg,1
...,...,...
3164,train_data/e676cdf69cfa4d2d8951e1706c938a6c.jpg,1
38396,train_data/1951540f0a744caca1d228a9a5236d02.jpg,1
15148,train_data/2561d687f3644b43a6a77911d7603279.jpg,1
31330,train_data/d0948e3ca7994b718f06be34a764eadb.jpg,1


In [21]:
# label 1 stands for AI-generated, 0 for human-generated

def _move_images(file_names, target_dir, source_dir='../data/train_data'):
    """Move the listed images from ``source_dir`` into ``target_dir``.

    Only the base name of each entry is used, so the ``train_data/`` prefix
    stored in the CSV is dropped without relying on its length. Entries that
    are not present as files in ``source_dir`` (for example because an earlier
    run already moved them) are skipped.

    Parameters:
        file_names: iterable of file names/paths as listed in the CSV.
        target_dir: folder the images are moved into; created if missing.
        source_dir: folder the images are currently stored in.

    Returns:
        The number of files that were actually moved.
    """
    # Without an existing target folder shutil.move would rename the first
    # image to `target_dir` instead of moving it inside.
    os.makedirs(target_dir, exist_ok=True)

    moved = 0
    for file_name in file_names:
        source_path = os.path.join(source_dir, os.path.basename(file_name))
        if os.path.isfile(source_path):
            shutil.move(source_path, target_dir)
            moved += 1
    return moved


for split_df, target_dir in [
    (train_AI_df, '../data/train_data/AI'),
    (train_human_df, '../data/train_data/human'),
    (val_AI_df, '../data/val_data/AI'),
    (val_human_df, '../data/val_data/human'),
]:
    moved_count = _move_images(split_df['file_name'], target_dir)
    # Report the outcome so skipped (missing / already moved) files are visible.
    print('{}: moved {} of {} files'.format(target_dir, moved_count, len(split_df)))


In [22]:
# Number of entries left directly under train_data after the move
# (only the AI/ and human/ folders are expected to remain).
len(os.listdir('../data/train_data'))

2

### Check for dodgy (unreadable or mistyped) images

In [74]:
# Root data folder, followed by the names of the partition folders.
data_dir = '../data'
train_dir, test_dir = 'train_data', 'test_data_v2'

In [75]:
# Image type names accepted by the checks that compare against imghdr.what().
image_exts = [
    'jpg',
    'jpeg',
    'bmp',
    'png',
]

In [77]:
# check train dataset
# Every image in the train/val class folders must be readable by OpenCV and
# have a content type (as detected by imghdr) listed in image_exts.
for partition in ['train_data', 'val_data']:
    for class_name in ['AI', 'human']:
        class_dir = os.path.join(data_dir, partition, class_name)
        for image in os.listdir(class_dir):
            image_path = os.path.join(class_dir, image)
            try:
                # cv2.imread reports an unreadable file by returning None
                # rather than raising, so the result is checked explicitly.
                img = cv2.imread(image_path)
                if img is None:
                    print('Issue with image {}'.format(image_path))
                    continue
                tip = imghdr.what(image_path)
                if tip not in image_exts:
                    # Include the path so the offending file can be located.
                    print('Image does not have a valid extension: {}'.format(image_path))
            except Exception as e:
                print('Issue with image {}: {}'.format(image_path, e))


In [None]:
# check test dataset
# Every image in the test folder must be readable by OpenCV and have a
# content type (as detected by imghdr) listed in image_exts.
test_images_dir = os.path.join(data_dir, test_dir)
for image in os.listdir(test_images_dir):
    image_path = os.path.join(test_images_dir, image)
    try:
        # cv2.imread reports an unreadable file by returning None rather than
        # raising, so the result is checked explicitly.
        img = cv2.imread(image_path)
        if img is None:
            print('Issue with image {}'.format(image_path))
            continue
        tip = imghdr.what(image_path)
        if tip not in image_exts:
            # Include the path so the offending file can be located.
            print('Image does not have a valid extension: {}'.format(image_path))
    except Exception as e:
        print('Issue with image {}: {}'.format(image_path, e))

In [None]:
tra