This script takes in a labeled image csv and copies the images into the correct train/val folders

In [42]:
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import glob
sys.path.append('../../lib/')
from shutil import copyfile

In [19]:
from analysis import *

In [20]:
# Load csv 
df = pd.read_csv('../../data/dataset/images_fake_ids/fake_ids_label.csv')

In [21]:
print(df.head())

  image_id  label
0      2_0      0
1      3_0      1
2     10_0      0
3     15_0      1
4     16_0      1


In [23]:
good_ids = df[df['label'] == 0]
bad_ids = df[df['label'] == 1]

In [24]:
print(good_ids.head())
print(bad_ids.head())

   image_id  label
0       2_0      0
2      10_0      0
10     25_0      0
12     27_0      0
20     35_0      0
  image_id  label
1      3_0      1
3     15_0      1
4     16_0      1
5     17_0      1
6     19_0      1


In [33]:
# Merge good and bad ids
ids = pd.concat([good_ids['image_id'], bad_ids['image_id']])
labels = pd.concat([good_ids['label'], bad_ids['label']])
print(ids)
print(labels)
# Convert to numpy array
ids = np.array(ids)
labels = np.array(labels)

0        2_0
2       10_0
10      25_0
12      27_0
20      35_0
       ...  
317    507_0
318    507_1
319    507_2
320    509_0
321    513_0
Name: image_id, Length: 322, dtype: object
0      0
2      0
10     0
12     0
20     0
      ..
317    1
318    1
319    1
320    1
321    1
Name: label, Length: 322, dtype: int64


In [34]:
# Split into train and test
train_ids, test_ids, train_labels, test_labels = train_test_split(ids, labels, test_size=0.2, stratify=labels, random_state=42)

In [35]:
print(train_ids[0])
print(train_labels[0])
print(test_ids[0])
print(test_labels[0])
print(len(train_ids))
print(len(test_ids))
print(len(train_labels))
print(len(test_labels))

324_0
1
317_3
0
257
65
257
65


In [44]:
# Create a map between ids and labels
train_map = dict(zip(train_ids, train_labels))
test_map = dict(zip(test_ids, test_labels))

In [45]:
print(train_map)
print(test_map)

{'324_0': 1, '436_2': 0, '379_0': 0, '95_0': 1, '346_2': 1, '294_2': 1, '434_1': 1, '148_0': 0, '362_2': 1, '444_0': 0, '294_1': 1, '392_0': 1, '54_0': 0, '28_0': 1, '359_0': 0, '118_3': 0, '509_0': 1, '34_0': 1, '42_0': 0, '331_2': 1, '368_0': 1, '143_0': 0, '397_0': 1, '153_0': 1, '153_1': 0, '355_0': 1, '362_0': 1, '477_2': 1, '410_0': 1, '55_0': 1, '396_2': 0, '44_2': 1, '190_0': 0, '317_0': 0, '436_3': 0, '53_0': 0, '130_0': 1, '308_1': 1, '247_1': 0, '185_0': 1, '431_2': 1, '188_0': 0, '211_0': 1, '327_0': 0, '176_0': 1, '452_0': 0, '186_2': 1, '124_2': 0, '297_0': 0, '141_0': 1, '16_0': 1, '316_0': 0, '319_0': 1, '133_0': 0, '433_0': 0, '169_0': 1, '285_0': 1, '304_0': 0, '489_0': 0, '367_0': 1, '338_0': 1, '425_0': 0, '200_1': 1, '502_0': 1, '436_1': 0, '495_0': 1, '124_0': 1, '129_0': 1, '207_0': 0, '336_0': 1, '107_0': 0, '59_0': 1, '38_0': 1, '398_0': 1, '153_2': 0, '337_0': 0, '341_2': 1, '218_0': 1, '274_1': 0, '498_0': 1, '279_0': 1, '402_0': 0, '103_0': 0, '396_0': 1, '1

In [47]:
# copy images to train and test folders
base = "../../data/dataset/images_fake_ids/"
files = []
for file_type in [base + '*.jpg', base + '*.png']:
	files.extend(glob.glob(file_type))

for file in files:
	print(file)
	file_name = file.split('/')[-1]
	file_name = file_name.split('.')[0]
	extention = file.split('.')[-1]
	print(file_name)
	if file_name in train_ids:
		if train_map[file_name] == 1:
			copyfile(file, "../../data/dataset/images_fake_ids/train/iwt/" + file_name + "." + extention)
		else:
			copyfile(file, "../../data/dataset/images_fake_ids/train/not_iwt/" + file_name + "." + extention)

	elif file_name in test_ids:
		if test_map[file_name] == 1:
			copyfile(file, "../../data/dataset/images_fake_ids/test/iwt/" + file_name + "." + extention)
		else:
			copyfile(file, "../../data/dataset/images_fake_ids/test/not_iwt/" + file_name + "." + extention)
	else:
		print("Error: file not found")

../../data/dataset/images_fake_ids/2_0.jpg
2_0
../../data/dataset/images_fake_ids/3_0.jpg
3_0
../../data/dataset/images_fake_ids/10_0.jpg
10_0
../../data/dataset/images_fake_ids/15_0.jpg
15_0
../../data/dataset/images_fake_ids/17_0.jpg
17_0
../../data/dataset/images_fake_ids/19_1.jpg
19_1
../../data/dataset/images_fake_ids/23_0.jpg
23_0
../../data/dataset/images_fake_ids/23_1.jpg
23_1
../../data/dataset/images_fake_ids/25_0.jpg
25_0
../../data/dataset/images_fake_ids/26_0.jpg
26_0
../../data/dataset/images_fake_ids/27_0.jpg
27_0
../../data/dataset/images_fake_ids/29_0.jpg
29_0
../../data/dataset/images_fake_ids/31_0.jpg
31_0
../../data/dataset/images_fake_ids/31_1.jpg
31_1
../../data/dataset/images_fake_ids/31_2.jpg
31_2
../../data/dataset/images_fake_ids/34_0.jpg
34_0
../../data/dataset/images_fake_ids/37_0.jpg
37_0
../../data/dataset/images_fake_ids/41_0.jpg
41_0
../../data/dataset/images_fake_ids/42_0.jpg
42_0
../../data/dataset/images_fake_ids/44_0.jpg
44_0
../../data/dataset/image