# Preprocessing Part

The preprocessing part can be skipped, because its output is already available as two `.npy` files:
1) truncated_X_full.npy
2) truncated_Y_full.npy
These files are used directly in the training phase.

In [1]:
import cv2
import numpy as np
from PIL import Image
import os, sys

#Directory of "UTK Face" - Aligned and cropped Faces Dataset https://susanqq.github.io/UTKFace/
# The location can be overridden with the UTKFACE_DIR environment variable; the
# original directory stays the default. Joining with "" guarantees a trailing
# separator, which matters because file names are built below as path+item.
path = os.path.join(os.environ.get("UTKFACE_DIR", "/home/FDUSER/Downloads/UTKFace/"), "")
dirs = os.listdir( path )   # names of every entry found in the dataset directory
Y_data = []                 # ethnicity label of each image, filled by resize()
X_data = []                 # pixel matrix of each image, filled by resize()

#Resizing image from 200*200 to 100*100
def resize():
    """Downscale every image under `path` to 100*100 and collect pixels and labels.

    For each regular file listed in `dirs`:
      * a 100*100 JPEG copy is written to `path + 'resized/'`,
      * that copy is read back with OpenCV and converted from BGR to RGB,
      * the image matrix is appended to `X_data` and the ethnicity tag (third
        underscore-separated field of the file name) is appended to `Y_data`.

    Works on the module-level names `path`, `dirs`, `X_data` and `Y_data`
    and returns nothing.

    Raises:
        IOError: if a resized copy cannot be read back by OpenCV.
    """
    resized_dir = path+'resized/'
    # Image.save() does not create missing directories, so make sure the
    # output folder exists before the first image is written.
    if not os.path.isdir(resized_dir):
        os.makedirs(resized_dir)

    for item in dirs:
        if not os.path.isfile(path+item):
            continue   # skips sub-directories such as 'resized' itself

        itemname, _ = os.path.splitext(item)
        race = itemname.split('_')   # Extracting Ethnicity tag from image name
        resized_path = resized_dir + itemname + '.jpg'

        im = Image.open(path+item)
        # LANCZOS is the filter formerly exposed as ANTIALIAS; the old alias
        # no longer exists in recent Pillow releases.
        imResize = im.resize((100,100), Image.LANCZOS)
        imResize.save(resized_path, 'JPEG', quality=90)   #Storing resized images in new folder

        # Read back exactly the file that was just written (the original built
        # the name from `item`, which only matches for '.jpg' inputs).
        image = cv2.imread(resized_path)
        if image is None:
            raise IOError('Could not read resized image %s' % resized_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)    #Converting to RGB color mode

        # Append label and pixels together so both lists always stay aligned.
        Y_data.append(race[2])   # Y_data contains encoded label of ethnic group of corresponding image.
        X_data.append(image)     # X_data contains image in form of 100*100*3 matrix.
           
# Run the preprocessing pass; it fills X_data and Y_data in place.
resize()
# The labels were collected as strings taken from the file names; keep them
# as unsigned integers from here on.
Y_data = np.asarray(Y_data, dtype='uint16')
Y_data.shape

(23705,)

In [3]:
# Turn the list of per-image matrices into a single array. The full dataset
# holds 23705 RGB images of 100*100 pixels.
X_data = np.asarray(X_data)
for stacked in (X_data, Y_data):
    print(stacked.shape)

(23705, 100, 100, 3)
(23705,)


In [4]:
# Persist the complete (still unbalanced) labels and images.
for npy_name, full_array in (('face_Y_full.npy', Y_data),
                             ('face_X_full.npy', X_data)):
    np.save(npy_name, full_array)

In [5]:
# Ethnic group '0' (White) is heavily over-represented, so its count has to be
# brought closer to the number of images available for the other groups.
# First row of the printed table: label values, second row: images per label.
unique_elements, counts_elements = np.unique(Y_data, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.array([unique_elements, counts_elements]))

Frequency of unique values of the said array:
[[    0     1     2     3     4]
 [10078  4526  3434  3975  1692]]


In [9]:
# Positions in Y_data of every image labelled '0' (White ethnic group).
del_list = [position for position, label in enumerate(Y_data) if label == 0]

In [10]:
import random

# Seed the generator so that the same images are dropped on every run;
# without it each execution produces a different truncated dataset.
random.seed(42)
# Randomly selecting 6500 instances to be deleted from all the images of white ethnic group
rand_list = random.sample(del_list, 6500)

In [12]:
# Keep the sampled indices as a NumPy array for the steps below.
rand_list = np.asarray(rand_list)

In [13]:
# Sanity check: one index for every image selected for deletion.
rand_list.shape

(6500,)

In [14]:
# Peek at the sampled indices (NumPy abbreviates the display of long arrays).
rand_list

array([14372,  8988, 21962, ...,  4835,  6046,  9054])

In [26]:
# Flatten every image into one row (100*100*3 = 30000 values). The shape is
# derived from the data instead of being hard-coded, so the cell keeps working
# when the number of images differs from 23705.
X_re_data = np.reshape(X_data, (X_data.shape[0], -1))

In [35]:
# Remove the sampled rows from the flattened images AND the very same rows
# from the labels, so that X_list[i] and Y_list[i] keep describing the same
# image. Y_list is needed later for the class count, saving and training.
X_list = np.delete(X_re_data, rand_list, 0)
Y_list = np.delete(Y_data, rand_list)

In [36]:
# 23705 - 6500 = 17205 flattened images remain after the deletion; the
# identical rows have to be removed from the labels as well so that images
# and labels stay aligned.
X_list.shape

(17205, 30000)

In [16]:
# Consolidated balancing step: pick the rows to drop once and remove exactly
# those rows from both the images and the labels.

# Positions in Y_data of every image labelled '0' (White ethnic group).
del_list = [position for position, label in enumerate(Y_data) if label == 0]

import random

# Seeded so the truncated dataset is the same on every run.
random.seed(42)
rand_list = random.sample(del_list, 6500)

# The same index list is used for both deletions; using a different sample
# for images and labels would silently pair images with wrong labels.
X_list = np.delete(X_re_data, rand_list, 0)
Y_list = np.delete(Y_data, rand_list)

# After deleting we have images of all the ethnic groups in comparable numbers.
unique_elements, counts_elements = np.unique(Y_list, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[   0    1    2    3    4]
 [3578 4526 3434 3975 1692]]


In [37]:
# Write the balanced images and labels to the npy files used by the training part.
for npy_name, balanced_array in (('truncated_X_full.npy', X_list),
                                 ('truncated_Y_full.npy', Y_list)):
    np.save(npy_name, balanced_array)

# Training Part - Using CNN

In [18]:
import os
os.environ['KERAS_BACKEND'] = 'theano'
from __future__ import print_function
from keras.datasets import cifar10
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D

Using Theano backend.


In [39]:
# The arrays built in the preprocessing part are used directly. When that part
# is skipped they can be loaded instead:
# X_list = np.load("truncated_X_full.npy")
# Y_list = np.load("truncated_Y_full.npy")
# Reshape each flattened row back into a 100*100*3 matrix; -1 lets NumPy infer
# the number of images instead of hard-coding 17205.
X_list = np.reshape(X_list, (-1,100,100,3))


In [40]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the balanced data for testing; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_list,
    Y_list,
    test_size=0.25,
    random_state=42,
)

In [44]:
import keras

# Training hyper-parameters.
batch_size = 32
num_classes = 5          # ethnicity labels 0..4
epochs = 20
num_predictions = 20

# Where the trained network is written.
save_dir = os.path.join(os.getcwd(), 'saved_models')
model_name = 'keras_FACERACE_trained_model.h5'    # Name of the model to be saved

# Convert the integer labels into one-hot vectors of length num_classes.
y_train, y_test = (
    keras.utils.to_categorical(labels, num_classes)
    for labels in (y_train, y_test)
)

In [46]:
# VGG-style CNN: four convolutional stages with 32, 64, 128 and 256 filters.
# Each stage is conv('same' padding) -> relu -> conv -> relu -> 2x2 max-pool
# -> dropout(0.25). A 512-unit dense layer and a softmax classifier follow.
model = Sequential()

for stage_index, n_filters in enumerate((32, 64, 128, 256)):
    # Only the very first layer of the network declares the input shape.
    first_layer_kwargs = {'input_shape': X_train.shape[1:]} if stage_index == 0 else {}
    model.add(Conv2D(n_filters, (3, 3), padding='same', **first_layer_kwargs))
    model.add(Activation('relu'))
    model.add(Conv2D(n_filters, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

# Classifier head.
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [48]:
opt = keras.optimizers.rmsprop(lr=0.0001, decay=1e-6)
x_train = X_train
x_test = X_test
# Let's train the model using RMSprop
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

# astype() returns new float32 arrays, so the in-place scaling to [0, 1]
# below does not modify X_train / X_test.
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

# NOTE: 25 epochs are requested here (this matches the recorded output); the
# `epochs` variable defined with the other hyper-parameters is not used.
# The held-out test split doubles as validation data during training.
model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=25,
              validation_data=(x_test, y_test),
              shuffle=True)

# Create the output directory only when it is missing, but ALWAYS save the
# model. Previously the save was nested under the `if`, so nothing was written
# once the directory already existed.
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)
model_path = os.path.join(save_dir, model_name)
model.save(model_path)
print('Saved trained model at %s ' % model_path)

# Score trained model.

scores = model.evaluate(x_test, y_test, verbose=1)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])


Not using data augmentation.
Train on 12903 samples, validate on 4302 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Saved trained model at /home/FDUSER/Downloads/ML/saved_models/keras_FACERACE_trained_model.h5 
Test loss: 0.7221517877099903
Test accuracy: 0.7710367271590931
