# Script to divide image data into train, validation, and test directories

 - `./train/food` : Training data, images of food
 - `./train/no_food` : Training data, images of non-food
 - `./valid/food` : Validation data, images of food
 - `./valid/no_food` : Validation data, images of non-food
 - `./test/food` : Test data, images of food
 - `./test/no_food` : Test data, images of non-food
 

In [1]:
import os
import numpy as np
from os import listdir
import random
import shutil

import matplotlib.pyplot as plt
from PIL import Image
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

%matplotlib inline

In [2]:
# The directory the notebook is launched from serves as the project root;
# the 'data' and 'image_data' paths used later are built relative to it.
base_dir = os.getcwd()
print(base_dir)

/home/chieko/Projects/ImageRecognition


In [25]:
def rgba_to_jpg(rgba_img):
    """Flatten an RGBA image onto a white canvas and return it in RGB mode.

    The alpha channel is resolved by compositing the image over a canvas of
    the same size filled with (255, 255, 255), after which the result is
    converted to 'RGB' so it can be written as JPEG.

    Approach taken from:
    https://stackoverflow.com/questions/9166400/convert-rgba-png-to-rgb-with-pil
    """
    white_canvas = Image.new('RGBA', rgba_img.size, (255,255,255))
    return Image.alpha_composite(white_canvas, rgba_img).convert('RGB')

    

def copy_files(data_dir, copy_dir, max_len=640):
    """Split category directories into train/valid/test and copy images as JPEG.

    Every sub-directory of ``data_dir`` is treated as one category. The
    category list is shuffled (unseeded, so the split differs between runs)
    and each category is assigned as a whole to one subset: roughly the
    first 80% to 'train', the next 10% to 'valid', the remainder to 'test'.
    Each regular file inside a category is opened as an image, shrunk so its
    longest edge is at most ``max_len`` pixels, converted to a mode that can
    be written as JPEG, and saved under
    ``copy_dir/<subset>/<category>/``.

    Args:
        data_dir: Directory whose sub-directories are the image categories.
        copy_dir: Destination root; category directories are created below
            its 'train', 'valid' and 'test' sub-directories with
            ``os.makedirs`` (which raises if one already exists).
        max_len: Maximum length in pixels of the longest edge of a saved
            image. Larger images are scaled down, keeping the aspect ratio.

    Returns:
        None.

    NOTE: two source files that differ only in extension (e.g. ``a.png`` and
    ``a.jpg``) map to the same output name and the later one overwrites the
    earlier one.
    """
    jpeg_exts = ('.jpg', '.jpeg')
    # Modes Pillow's JPEG writer accepts directly (per the JPEG plugin's mode
    # table -- verify against the installed Pillow version).
    jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')

    orig_dirs = [d for d in listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
    print("Number of dirs : {}".format(len(orig_dirs)))
    print("from: {}, to: {}".format(data_dir, copy_dir))

    # Shuffle so the category -> subset assignment is random.
    random.shuffle(orig_dirs)

    # Train 80%, valid 10%, test 10%. The thresholds stay fractional on
    # purpose: indices are compared against them, which rounds the train
    # and valid counts up.
    train_num = len(orig_dirs)*0.8
    valid_num = len(orig_dirs)*0.1

    for i, d in enumerate(orig_dirs):
        from_d = os.path.join(data_dir, d)

        if i < train_num:
            subset = 'train'
        elif i < train_num + valid_num:
            subset = 'valid'
        else:
            subset = 'test'

        to_dir = os.path.join(copy_dir, subset, d)
        os.makedirs(to_dir)

        for f in listdir(from_d):
            src_path = os.path.join(from_d, f)
            if not os.path.isfile(src_path):
                continue

            # splitext copes with names that have no dot or several dots,
            # where splitting on '.' would fail or truncate the name.
            stem, ext = os.path.splitext(f)
            jpg_name = stem + ".jpg"

            # The context manager closes the underlying file even if the
            # conversion or the save fails.
            with Image.open(src_path) as opened:
                im = opened

                # Shrink the image if its longest edge exceeds max_len.
                w, h = im.size
                maxedge = max(w, h)
                if maxedge > max_len:
                    r = max_len / maxedge
                    # Clamp to 1px so extreme aspect ratios cannot produce
                    # a zero-sized edge.
                    new_w = max(1, int(w * r))
                    new_h = max(1, int(h * r))
                    im = im.resize((new_w, new_h))

                # Bring the image into a JPEG-writable mode and make sure
                # the file name carries a JPEG extension.
                if im.mode == 'RGBA':
                    im = rgba_to_jpg(im)
                    new_fname = jpg_name
                    print(" <RGBA> : {} -> {}".format(f, new_fname))

                elif im.mode == 'P':
                    im = im.convert("RGB")
                    new_fname = jpg_name
                    print(" <P> : {} -> {}".format(f, new_fname))

                elif im.mode not in jpeg_modes:
                    # e.g. 'LA' or 'I': saving these as JPEG would fail.
                    src_mode = im.mode
                    if 'A' in im.getbands():
                        # Flatten transparency the same way RGBA input is.
                        im = rgba_to_jpg(im.convert('RGBA'))
                    else:
                        im = im.convert("RGB")
                    new_fname = jpg_name
                    print(" <{}> : {} -> {}".format(src_mode, f, new_fname))

                elif ext.lower() not in jpeg_exts:
                    # Already JPEG-writable, but the name must not claim
                    # another format since JPEG data is written.
                    new_fname = jpg_name
                    print(" <rename> : {} -> {}".format(f, new_fname))

                else:
                    new_fname = f

                im.save(os.path.join(to_dir, new_fname), "JPEG")

        # Progress marker every 500 categories.
        if i%500 == 0:
            print(". ", end='')

In [26]:
# Source images live in ./data; the split copy is written to ./image_data.
data_dir = os.path.join(base_dir, 'data')
copy_dir = os.path.join(base_dir, 'image_data')
train_dir, valid_dir, test_dir = (
    os.path.join(copy_dir, subset) for subset in ('train', 'valid', 'test')
)

# Start every run from empty train/valid/test directories.
for split_dir in (train_dir, valid_dir, test_dir):
    if os.path.exists(split_dir):
        shutil.rmtree(split_dir)
    os.makedirs(split_dir)

print("Making data started!")
copy_files(data_dir, copy_dir)
print("\nFinished!")
   

Making data started!
Number of dirs : 898
from: /home/chieko/Projects/ImageRecognition/data, to: /home/chieko/Projects/ImageRecognition/image_data
.  <RGBA> : ice_44_000011.jpg -> ice_44_000011.jpg
 <P> : aubeer2_89_000049.jpg -> aubeer2_89_000049.jpg
 <RGBA> : ausnack_97_000029.jpg -> ausnack_97_000029.jpg
 <RGBA> : ausnack_97_000033.jpg -> ausnack_97_000033.jpg
 <RGBA> : austeafood_000026_000046.jpg -> austeafood_000026_000046.jpg
 <RGBA> : aubeer_20_000029.jpg -> aubeer_20_000029.jpg
 <RGBA> : aubeer_20_000015.jpg -> aubeer_20_000015.jpg
 <RGBA> : aubeer_20_000047.jpeg -> aubeer_20_000047.jpg
 <RGBA> : aubeer_20_000024.jpg -> aubeer_20_000024.jpg
 <RGBA> : drink22_000023.jpg -> drink22_000023.jpg
 <RGBA> : ice_71_000006.jpg -> ice_71_000006.jpg
 <RGBA> : austeafood_000083_000016.jpg -> austeafood_000083_000016.jpg
 <RGBA> : snack6_000002.jpg -> snack6_000002.jpg
 <RGBA> : snack6_000018.jpg -> snack6_000018.jpg
 <RGBA> : ice_26_000035.jpg -> ice_26_000035.jpg
 <RGBA> : aubeer_100_000

 <RGBA> : ice_16_000002.jpg -> ice_16_000002.jpg
 <RGBA> : ice_16_000001.jpg -> ice_16_000001.jpg
 <RGBA> : austeafood_000041_000020.jpg -> austeafood_000041_000020.jpg
 <RGBA> : austeafood_000002_000029.jpg -> austeafood_000002_000029.jpg
 <RGBA> : aubeer_61_000036.jpg -> aubeer_61_000036.jpg
 <RGBA> : snack125_000003.jpg -> snack125_000003.jpg
 <RGBA> : audrink_98_000041.jpg -> audrink_98_000041.jpg
 <P> : audrink_98_000027.jpg -> audrink_98_000027.jpg
 <RGBA> : audrink_98_000016.jpg -> audrink_98_000016.jpg
 <RGBA> : ice_154_000007.jpg -> ice_154_000007.jpg
 <RGBA> : aubeer2_67_000023.jpg -> aubeer2_67_000023.jpg
 <RGBA> : audrink_123_000034.jpg -> audrink_123_000034.jpg
 <RGBA> : aubeer2_15_000007.jpg -> aubeer2_15_000007.jpg
 <RGBA> : aubeer2_15_000006.jpg -> aubeer2_15_000006.jpg
 <RGBA> : aubeer2_15_000027.jpg -> aubeer2_15_000027.jpg
 <RGBA> : aubeer2_15_000008.jpg -> aubeer2_15_000008.jpg
 <RGBA> : aubeer2_15_000017.jpg -> aubeer2_15_000017.jpg
 <RGBA> : aubeer2_15_000023.jpg 

In [27]:
# Each entry directly below a split directory is one category.
n_train_categories = len(listdir(train_dir))
n_valid_categories = len(listdir(valid_dir))
n_test_categories = len(listdir(test_dir))

print("Number of categories in train data = {}".format(n_train_categories))
print("Number of categories in valid data = {}".format(n_valid_categories))
print("Number of categories in test data = {}".format(n_test_categories))

Number of categories in train data = 719
Number of categories in valid data = 90
Number of categories in test data = 89


In [30]:
#print("---- Training data --------")
# Total images per split = sum of the entry counts of its category directories.
train_images = sum(
    len(listdir(os.path.join(train_dir, category)))
    for category in listdir(train_dir)
)
valid_images = sum(
    len(listdir(os.path.join(valid_dir, category)))
    for category in listdir(valid_dir)
)
test_images = sum(
    len(listdir(os.path.join(test_dir, category)))
    for category in listdir(test_dir)
)

print("Total number of train images = {}".format(train_images))
print("Total number of valid images = {}".format(valid_images))
print("Total number of test images = {}".format(test_images))

Total number of train images = 12122
Total number of valid images = 1589
Total number of test images = 1447
