In [3]:
import pandas as pd
import numpy as np
import PIL
from PIL import Image
import pickle
import random
import matplotlib.pyplot as plt
import json
from operator import itemgetter
from skimage import exposure
from sklearn.model_selection import train_test_split
from ast import literal_eval as make_tuple

#Create folder to store the serialized objects
import os
if not os.path.isdir('./serialized/gender/'):
    os.mkdir('./serialized/gender/')

In [4]:
#A method to serialize the python object to .pkl file for reusability
def save_obj(obj,name):
	base_path = './serialized/gender/'
	with open(base_path+name + '.pkl', 'wb') as f:
		pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [5]:
#A method to deserialize the python object from .pkl file, returning it (up to the program to handle where and which type of variable it is loaded to)
def load_obj(name):
	with open(name + '.pkl', 'rb') as f:
		return pickle.load(f)

In [6]:
#Method to integer encode the age
#Note that there is problem in the original labels of age where (25,32) == (25,23)
#This is to account for that error as well

#This is for future work on gender-based age classification, not much useful now
def get_age_range_id(age_tuple):
	age_ranges = [(0,2),(4,6),(8,13),(15,20),(25,32),(38,43),(48,53),(60,100)]
	diff_tuple = []
    
	if age_tuple:
		for r in age_ranges:
			x = tuple(np.subtract(r,age_tuple))
			x = tuple(np.absolute(x))
			diff_tuple.append(x)

	min_index = diff_tuple.index(min(diff_tuple, key=itemgetter(1)))
	return min_index

In [7]:
#Test the tricky case see if the correct index (from 0 is returned)
test_tuple = (25,23)
get_age_range_id((25,23))

3

In [8]:
#Resizing images
width = 256
height= 256

#Folds indexes there are 5 of them so 0-4
fold_indexes = list(range(5))

#original, unsanitised fold txt prefix
fold_txt_prefix_path = 'Folds/original_txt_files/'
#prefix for sanitised, 80/20 splitted labels in each folds
train_test_splitted_fold_path = './Folds/train_test_splitted/'
#prefix to store csv files, both for training and testing and for cross validation
cv_fold_csv_prefix_path = "Folds/"
train_test_csv_path = "Folds/"
#image prefix path
image_prefix_path = 'Adience/aligned/'



In [10]:
"""For original, uncleaned 5 folds, we wished to obtained the sanitised individual folds labels
Each individual fold is splitted into 80:20 for train and test.
The train subset will be either used as validate set or combined with other folds' train subset to form the train set
The test subset will be added up together to form test set.
Overall is train/val/test split"""
for fold in fold_indexes:
    print("Reading in fold %s"%fold)
    df = pd.read_csv(fold_txt_prefix_path+"fold_%s_data.txt"%fold, sep="\t")
    #Clean up nasty Nan, None, what not?
    df = df[df['age']!='None']
    df = df[df['age']!=' ']
    df = df[df['gender'].notnull()]
    df = df[df['gender']!=' ']
    df = df[df['gender']!='u']
    #Split once cleaned
    train_df, test_df = train_test_split(df,test_size = 0.2,shuffle=True,random_state=10)
    #Export this sanitised labels
    train_df.to_csv(train_test_splitted_fold_path+"cv_fold_%s_train.csv"%fold,index=False)
    test_df.to_csv(train_test_splitted_fold_path+"cv_fold_%s_test.csv"%fold,index=False)
    print("Splitted original fold %s in 80:20 train:test ratio."%fold)

Reading in fold 0
Splitted original fold 0 in 80:20 train:test ratio.
Reading in fold 1
Splitted original fold 1 in 80:20 train:test ratio.
Reading in fold 2
Splitted original fold 2 in 80:20 train:test ratio.
Reading in fold 3
Splitted original fold 3 in 80:20 train:test ratio.
Reading in fold 4
Splitted original fold 4 in 80:20 train:test ratio.


In [11]:
#Create the test set, train set labels by combining the 80% portions together and 20% portions together
print("Creating the train-test set for optimal model training and testing...")
train_df = pd.concat([pd.read_csv(train_test_splitted_fold_path+"cv_fold_%s_train.csv"%fold) for fold in fold_indexes])
test_df  = pd.concat([pd.read_csv(train_test_splitted_fold_path+"cv_fold_%s_test.csv"%fold) for fold in fold_indexes])

train_df.to_csv(train_test_csv_path+"train_set.csv",index=False)
test_df.to_csv(train_test_csv_path+"test_set.csv",index=False)


Creating the train-test set for optimal model training and testing...


In [12]:
#Create train subsets and validation set for each folds
save_fold_path = './Folds/'
for val_fold in fold_indexes:
    #get the train_subset_fold id to merge
    train_subset_folds = [fold for fold in fold_indexes if fold!=val_fold]
    print(train_subset_folds,val_fold)
    
    #generate the train_subset and and the validation df
    train_subset_df = pd.concat([pd.read_csv(train_test_splitted_fold_path+"cv_fold_%s_train.csv"%train_fold) for train_fold in train_subset_folds])
    val_df = pd.read_csv(train_test_splitted_fold_path+"cv_fold_%s_train.csv"%val_fold)
    
    #export to csv
    train_subset_df.to_csv(cv_fold_csv_prefix_path+"cv_fold_%s_train_subset.csv"%val_fold,index=False)
    val_df.to_csv(cv_fold_csv_prefix_path+"cv_fold_%s_val.csv"%val_fold,index=False)

[1, 2, 3, 4] 0
[0, 2, 3, 4] 1
[0, 1, 3, 4] 2
[0, 1, 2, 4] 3
[0, 1, 2, 3] 4


In [13]:
#For easiness in moving files to servers for training, pickle all images...
dataset_to_be_pickled = ['train_set','test_set',
                        'cv_fold_0_train_subset','cv_fold_0_val',
                        'cv_fold_1_train_subset','cv_fold_1_val',
                        'cv_fold_2_train_subset','cv_fold_2_val',
                        'cv_fold_3_train_subset','cv_fold_3_val',
                        'cv_fold_4_train_subset','cv_fold_4_val',]

In [16]:
#Serializing of objects (images and labels) into a consolidate file for use on Google drive
for dataset in dataset_to_be_pickled:
    print("Pickling dataset %s.csv ..."%dataset)
    df = pd.read_csv(cv_fold_csv_prefix_path+dataset+'.csv')
    #Arrays of attributes and labels
    cleaned_images = []
    genders = []
    ages = []
    
    #Looping through each images and do selection, then do preprocessing
    for index, row in df.iterrows():
        yaw = row['fiducial_yaw_angle']
        gender = row['gender']
        age = row['age']
        
        #Get necessary information to construct the path to the image
        image_folder = row['user_id']
        image_file = row['original_image']
        face_id = row['face_id']

        age_tuple = make_tuple(age)
        age_id = get_age_range_id(age_tuple)
        #Assemble the path to image
        path_to_image = image_prefix_path+image_folder+'/landmark_aligned_face.'+str(face_id)+'.'+image_file
        #Preprocess image
        image_to_preprocess = Image.open(path_to_image)
        #Resize image and convert to array form for storage
        image_processed = image_to_preprocess.resize((width, height), PIL.Image.LANCZOS)
        image_processed_array = np.array(image_processed)

        #Integer encode the gender
        if (gender == "m"):
            gender_label_integer = 0
        else:
            gender_label_integer = 1

        #Prepare all the datastructures for storage in serialization
        cleaned_images.append(image_processed_array)
        genders.append(gender_label_integer)
        ages.append(age_id)
        
        #Otherwise just move to the next image already!
    
    #Once done, save to serialized pickle form
    print("Progress Update: Done with %s.csv"%dataset)          
    print ('No. Images: %i, No. Gender: %i, No. Ages: %i' % (len(cleaned_images), len(genders), len(ages)))            
    print ('')
    
    this_dict = {'dataset_name': dataset, 'images': cleaned_images, 'genders': genders, 'ages': ages}
    save_obj(this_dict,dataset)

Pickling dataset train_set.csv ...
Progress Update: Done with train_set.csv
No. Images: 13960, No. Gender: 13960, No. Ages: 13960

Pickling dataset test_set.csv ...
Progress Update: Done with test_set.csv
No. Images: 3492, No. Gender: 3492, No. Ages: 3492

Pickling dataset cv_fold_0_train_subset.csv ...
Progress Update: Done with cv_fold_0_train_subset.csv
No. Images: 10764, No. Gender: 10764, No. Ages: 10764

Pickling dataset cv_fold_0_val.csv ...
Progress Update: Done with cv_fold_0_val.csv
No. Images: 3196, No. Gender: 3196, No. Ages: 3196

Pickling dataset cv_fold_1_train_subset.csv ...
Progress Update: Done with cv_fold_1_train_subset.csv
No. Images: 11083, No. Gender: 11083, No. Ages: 11083

Pickling dataset cv_fold_1_val.csv ...
Progress Update: Done with cv_fold_1_val.csv
No. Images: 2877, No. Gender: 2877, No. Ages: 2877

Pickling dataset cv_fold_2_train_subset.csv ...
Progress Update: Done with cv_fold_2_train_subset.csv
No. Images: 11461, No. Gender: 11461, No. Ages: 11461

