### Separate cars into train, test, and holdout sets from the original train set
F. Burkholder 25 Sep. 2017

In [1]:
import pandas as pd
import numpy as np
np.random.seed(1)

In [2]:
# data of interest is in train folder /train
import os 
from glob import glob

train_files = glob(os.path.join('train/', "*.jpg"))
train_ids_all = [s for s in train_files]

In [3]:
len(train_ids_all)

5088

In [4]:
print(train_ids_all[0])
car_id = train_ids_all[0].split('_')[0].split('/')[1]
print(car_id)

train/d61b6bfeabb2_13.jpg
d61b6bfeabb2


In [5]:
car_ids = set([t.split('_')[0].split('/')[1] for t in train_ids_all])

In [6]:
len(car_ids)

318

In [7]:
train_masks = glob(os.path.join('train_masks/', "*.gif"))
train_masks_all = [s for s in train_masks]

In [8]:
print(train_masks_all[0])
mask_id = train_masks_all[0].split('/')[1].split('_')[0]
print(mask_id)

train_masks/fc5f1a3a66cf_14_mask.gif
fc5f1a3a66cf


In [9]:
mask_ids = set([m.split('/')[1].split('_')[0] for m in train_masks_all])

In [10]:
len(mask_ids)

318

In [11]:
car_ids - mask_ids

set()

The same cars are in the train/ and train_masks/ folders.  Good.

In [12]:
car_array = np.array(list(car_ids))

In [13]:
car_array

array(['292f59c6a347', 'eb91b1c659a0', 'cf65b1c5e147', '858841907a4a',
       '2335eef27de6', '1ae8a68a40e4', 'f70052627830', '00087a6bd4dc',
       'd61b6bfeabb2', '917f262f1608', 'a070dbb32d2b', '5df60cf7cab2',
       '424658d9f222', '151c7f2d4183', 'dd70a0a51e3b', '2cb06c1f5bb1',
       'ba7e7421003b', 'fd9da5d0bb6f', '28d9a149cb02', 'f3eee6348205',
       'acb0fd30b83d', '4b74275babf7', 'bb7625a3f1d4', 'c53e374fdd72',
       '6c0cd487abcd', '344ece157efc', '7ac210ba75a1', '6e016b8b3617',
       '2af7c265531e', '9a2bfb24ba6e', 'eaf9eb0b2293', '04bd942b463b',
       '791c1a9775be', 'bd8d5780ed04', 'b44091aa62e4', '6d375bc2ece1',
       '8d78483ce302', '8b4b87ad7be6', 'bf9932f7aca8', '8d1a6723c458',
       '6bf969856536', 'd8e1c401ac3c', '42415ff65013', 'f00905abd3d7',
       '51f1cdf5f539', '4e7bc95552ed', '1c5748340af8', 'be86e79f1225',
       'ed8472086df8', '553c658b9efa', 'a7c1ef8d9cc9', '4e5ac4b9f074',
       'eeb7eeca738e', '23c088f6ec27', '0cdf5b5d0ce1', '3f3e362dea23',
      

In [14]:
# Shuffle array so can easily separate array into train ids, test ids, and holdout ids
np.random.shuffle(car_array)

In [15]:
car_array

array(['78f63d808555', 'd46244bc42ed', '3c6acfceb552', '78bfb9c780db',
       '82f72273d23e', '6131a03dd028', '5bcde75955bb', 'd9b2ded58b72',
       '08a646be6b30', '11f3dc041cfb', '0ee135a3cccc', '0ce66b539f52',
       '3d7a1030deeb', '364fd5fd7569', 'a7b9e343cf6b', '125cadcb9feb',
       '11fcda0a9e1c', '32edfb7723ff', 'cafee4122080', '6ba36af67cb0',
       'e5fc2c1c4a8b', 'd1a3af34e674', '2cb91c2543fa', '93881caf9f70',
       'ae296a20fdd9', 'd326cb5c3d12', '28d9a149cb02', 'e114fcbb9723',
       '5df60cf7cab2', '2a4a8964ebf3', 'ed13cbcdd5d8', '695f39dfac16',
       '7251c8797749', '0795e132d090', '54ee69d7d5e5', '843763f47895',
       '424658d9f222', '6e016b8b3617', '66fea07e1152', '7abaa38c227f',
       '9a2bfb24ba6e', '6ae670e86620', 'ce74dfdf123b', '0de66245f268',
       '6bff9e10288e', 'b98c63cd6102', '6c3470c34408', '1e89e1af42e7',
       'c87688f6960e', '351c583eabd6', '0ed6904e1004', 'b38c4e132183',
       '6343156803f9', '717689f3e788', 'dd47eb7ac4ee', 'ef5567efd904',
      

In [16]:
train_fract = 0.8
test_fract = 0.1
holdout_fract = 0.1
num_train = int(round(car_array.shape[0]*train_fract))
num_test = int(round(car_array.shape[0]*test_fract))
num_holdout = int(round(car_array.shape[0]*holdout_fract))
print("{0} train, {1} test, and {2} holdout cars.".format(num_train, num_test, num_holdout))
print("That's a total of {0} cars.".format(num_train + num_test + num_holdout))

254 train, 32 test, and 32 holdout cars.
That's a total of 318 cars.


In [17]:
ids_train = car_array[:num_train]
ids_test = car_array[num_train:num_train + num_test]
ids_holdout = car_array[num_train + num_test:]
out = "There are {0} in train, {1} in test, and {2} in the holdout"
print(out.format(ids_train.shape[0], ids_test.shape[0], ids_holdout.shape[0]))

There are 254 in train, 32 in test, and 32 in the holdout


In [18]:
np.savetxt('ids_train.txt', ids_train, fmt='%s', delimiter='\n') 
np.savetxt('ids_test.txt', ids_test, fmt='%s', delimiter='\n')
np.savetxt('ids_holdout.txt', ids_holdout, fmt='%s', delimiter='\n')