This notebook reads a CSV of labeled image IDs, splits the IDs into stratified train and test sets, and copies each image into the matching train/test class folder.

In [1]:
import glob
import hashlib
import os
import sys
from shutil import copyfile

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

sys.path.append('../../lib/')

In [2]:
from analysis import *

In [3]:
# Read the labelled-image table into a DataFrame
labels_csv = '../../data/dataset/converting_to_model_form/images/fake_ids_label.csv'
df = pd.read_csv(labels_csv)

In [4]:
# Peek at the first five rows of the label table
print(df.head(n=5))

  image_id  label
0      2_0      0
1      3_0      1
2     10_0      0
3     15_0      1
4     16_0      1


In [5]:
# Partition the table by label value: rows with label 0 vs rows with label 1
label_column = df['label']
good_ids = df[label_column == 0]
bad_ids = df[label_column == 1]

In [6]:
# Show the first rows of each label group (label 0 first, then label 1)
for label_group in (good_ids, bad_ids):
    print(label_group.head())

   image_id  label
0       2_0      0
2      10_0      0
10     25_0      0
12     27_0      0
20     35_0      0
  image_id  label
1      3_0      1
3     15_0      1
4     16_0      1
5     17_0      1
6     19_0      1


In [7]:
# Stack the two label groups back together: label-0 rows first, then label-1 rows.
# The row order is kept as-is because it feeds the seeded split further down.
merged = pd.concat([good_ids, bad_ids])
print(merged['image_id'])
print(merged['label'])
# Plain numpy arrays for train_test_split
ids = np.array(merged['image_id'])
labels = np.array(merged['label'])

0        2_0
2       10_0
10      25_0
12      27_0
20      35_0
       ...  
317    507_0
318    507_1
319    507_2
320    509_0
321    513_0
Name: image_id, Length: 322, dtype: object
0      0
2      0
10     0
12     0
20     0
      ..
317    1
318    1
319    1
320    1
321    1
Name: label, Length: 322, dtype: int64


In [8]:
# 80/20 split, stratified on the label so both splits keep the same class ratio;
# the fixed random_state makes the assignment repeatable.
train_ids, test_ids, train_labels, test_labels = train_test_split(
    ids,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [9]:
# First id/label of each split, followed by the length of all four arrays
for first_value in (train_ids[0], train_labels[0], test_ids[0], test_labels[0]):
    print(first_value)
for split_array in (train_ids, test_ids, train_labels, test_labels):
    print(len(split_array))

374_0
1
327_0
0
257
65
257
65


In [10]:
# Lookup tables from image id to its label, one per split
train_map = {image_id: label for image_id, label in zip(train_ids, train_labels)}
test_map = {image_id: label for image_id, label in zip(test_ids, test_labels)}

In [11]:
# Dump both id -> label maps (train first, then test)
for id_to_label in (train_map, test_map):
    print(id_to_label)

{'374_0': 1, '367_3': 0, '308_2': 1, '398_0': 1, '200_0': 1, '499_0': 1, '431_1': 1, '207_0': 0, '228_0': 1, '396_2': 0, '185_0': 1, '16_0': 1, '285_0': 0, '211_1': 1, '57_0': 0, '155_0': 0, '352_0': 1, '434_1': 1, '400_3': 0, '126_0': 1, '187_0': 1, '404_0': 0, '410_0': 1, '367_1': 1, '170_0': 0, '124_2': 1, '59_0': 1, '171_0': 1, '315_0': 1, '42_0': 1, '360_0': 0, '153_1': 1, '274_3': 0, '53_0': 0, '299_0': 0, '262_0': 0, '349_0': 1, '186_0': 1, '191_0': 0, '308_0': 1, '130_0': 1, '361_0': 1, '124_1': 1, '471_0': 0, '341_1': 1, '107_0': 0, '439_0': 1, '85_0': 0, '247_3': 0, '210_0': 1, '497_0': 1, '325_0': 0, '448_0': 1, '274_0': 0, '309_1': 0, '268_0': 1, '362_1': 1, '262_1': 0, '93_0': 0, '438_0': 1, '37_0': 1, '396_0': 1, '211_2': 1, '394_0': 1, '247_1': 0, '264_0': 1, '29_0': 1, '89_0': 1, '240_0': 0, '308_1': 1, '252_0': 0, '279_0': 1, '26_0': 1, '341_2': 1, '436_3': 0, '489_0': 1, '298_0': 1, '324_0': 1, '297_0': 0, '391_0': 1, '311_0': 1, '466_0': 0, '277_1': 0, '90_0': 1, '33

In [12]:
# copy images to train and test folders
base = "../../data/dataset/building_dataset/images_fake_ids_final/"
# Collect every .jpg / .png source image. glob gives no ordering guarantee, so the
# list is sorted: when two files have identical content, the first one encountered
# is the one that is kept, and a stable order makes that choice repeatable.
files = sorted(
    path
    for pattern in (base + '*.jpg', base + '*.png')
    for path in glob.glob(pattern)
)
# content hash -> path of the first file seen with that content
hash_of_images = {}

def hash_file(file_path):
    """Return a SHA-256 hex digest of the contents of ``file_path``.

    The built-in ``hash()`` is not used here: for ``bytes`` it is salted per
    interpreter process and only as wide as a machine word, so its values are
    not reproducible between runs and are not meant for content
    de-duplication. SHA-256 is stable and collision-resistant.

    Args:
        file_path: Path of the file to read (opened in binary mode).

    Returns:
        str: Hexadecimal SHA-256 digest, usable as a dictionary key.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # Feed the hash in 1 MiB chunks so a large image is never fully in memory.
        for chunk in iter(lambda: f.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()

# Copy each unique source image to <output_root>/<split>/<class>/<id>.<ext>.
# Files whose content hash was already seen are reported and skipped, so a
# byte-identical picture is only ever copied once.
output_root = "../../data/dataset/converting_to_model_form/images/"
for file in files:
    file_hash = hash_file(file)
    if file_hash in hash_of_images:
        print("Error: file {} already exists {}".format(file, hash_of_images[file_hash]))
        continue
    # Remember the first file seen with this content
    hash_of_images[file_hash] = file

    # ".../2_0.jpg" -> image id "2_0" and extension "jpg"
    base_name = os.path.basename(file)
    file_name = base_name.split('.')[0]
    extention = base_name.split('.')[-1]

    # Look the id up in the dicts (hash lookup) rather than scanning the
    # numpy id arrays element by element for every file.
    if file_name in train_map:
        split_name, label = "train", train_map[file_name]
    elif file_name in test_map:
        split_name, label = "test", test_map[file_name]
    else:
        # The image exists on disk but its id is in neither split
        print("Error: file {} not found in train or test ids".format(file))
        continue

    # Label 1 goes to the "iwt" folder, any other label to "not_iwt"
    class_dir = "iwt" if label == 1 else "not_iwt"
    dest_dir = os.path.join(output_root, split_name, class_dir)
    # Create the destination on first use so a fresh checkout does not crash
    os.makedirs(dest_dir, exist_ok=True)
    copyfile(file, os.path.join(dest_dir, file_name + "." + extention))

Error: file ../../data/dataset/building_dataset/images_fake_ids_final/302_0.jpg already exists ../../data/dataset/building_dataset/images_fake_ids_final/151_0.jpg
Error: file ../../data/dataset/building_dataset/images_fake_ids_final/352_0.jpg already exists ../../data/dataset/building_dataset/images_fake_ids_final/164_0.jpg
Error: file ../../data/dataset/building_dataset/images_fake_ids_final/378_0.jpg already exists ../../data/dataset/building_dataset/images_fake_ids_final/357_0.jpg
Error: file ../../data/dataset/building_dataset/images_fake_ids_final/400_0.jpg already exists ../../data/dataset/building_dataset/images_fake_ids_final/317_0.jpg
Error: file ../../data/dataset/building_dataset/images_fake_ids_final/400_1.jpg already exists ../../data/dataset/building_dataset/images_fake_ids_final/317_1.jpg
Error: file ../../data/dataset/building_dataset/images_fake_ids_final/400_2.jpg already exists ../../data/dataset/building_dataset/images_fake_ids_final/317_2.jpg
Error: file ../../data