# Libraries

In [1]:
import numpy as np
import pandas as pd
import glob
from skimage import io

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, classification_report, mean_absolute_error

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Activation, Flatten, Dense, AveragePooling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import keras
from keras.models import load_model

import pickle
import sys
sys.path.insert(1, '../Src/Lib')
from functions import image_augmentation, image_preprocess

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Dataset

In [2]:
# Dataset is divided in 4 files, this concatenates them all

df_list = []
for file_name in glob.glob("../Data/Raw/Archive/*.txt"):
    df_temp = pd.read_csv(file_name, sep="\t")
    df_list.append(df_temp)
df = pd.concat(df_list, axis=0, ignore_index=True)

## Cleaning

In [3]:
# Getting rid of missing values

df = df.dropna()
df = df[df["age"] != "None"]

In [4]:
# This dataset is meant to make the age a classification problem, but we are going to make it a regression one

df["age"].value_counts()

(25, 32)     4953
(0, 2)       2488
(38, 43)     2293
(4, 6)       2140
(8, 12)      2119
(15, 20)     1642
(60, 100)     867
(48, 53)      825
35            293
13            168
22            149
34            105
23             96
45             88
(27, 32)       77
55             76
36             56
(38, 42)       46
57             24
3              18
29             11
(38, 48)        6
58              5
2               3
(8, 23)         1
42              1
46              1
Name: age, dtype: int64

In [5]:
# Creating two lists, one with the current values (keys) and one with the new values (values), latter being the average of the first.

ages_keys = df["age"].value_counts().index
ages_values = []
for x in df["age"].value_counts().index:
    if x.startswith("("):
        x = x.split(", ")
        x[0] = x[0].replace("(","")
        x[1] = x[1].replace(")","")
        x[0] = int(x[0])
        x[1] = int(x[1])
        x = int((x[0]+x[1])/2)
        ages_values.append(x)
    else:
        ages_values.append(int(x))
        
# Age map to use for regression.
ages_map = {}
for key, value in zip(ages_keys, ages_values):
    ages_map[key] = value
    
df["age"] = df["age"].map(ages_map)

In [6]:
# We will later make the model predict a non-binary category

df = df[df["gender"] != "u"]
df["gender"] = df["gender"].apply(lambda x: 1 if x == "m" else 0).astype(int)

In [7]:
# Creating the path to the image

df["face_id"] = df["face_id"].astype(str)
df["path"] = "../Data/Raw/Archive/Faces/"+df["user_id"]+"/coarse_tilt_aligned_face."+df["face_id"]+"."+df["original_image"]

In [9]:
# Saving the model for future use

df.to_csv("../Data/Clean/Faces.csv")

## X/y Split

In [10]:
# As we will do two models, we have two targets

X = df["path"]
y_age = df["age"]
y_gender = df["gender"]

## Train/Test Split

In [11]:
# X_train and X_test is the same for both

X_train, X_test, y_train_age, y_test_age = train_test_split(X, y_age, test_size = 0.22, random_state = 22)
X_train, X_test, y_train_gender, y_test_gender = train_test_split(X, y_gender, test_size = 0.22, random_state = 22)

## Image preprocess

In [12]:
# Image greyscaled, downscaled, size adjusted and transformed.

def image_preprocess(path):
    img = tf.io.read_file(np.array(path).ravel()[0])
    img = tf.image.decode_jpeg(img, channels = 1, ratio = 2)
    img = tf.image.resize(img, [64,64])
    img = img / 255 # This part normalizes the image, scaling it down; 255 is the max, while 0 is the min
    return img

In [13]:
# Preparing the data that will be fed to the model, needs to be np.array

X_train_images = np.array([image_preprocess(path) for path in X_train])
X_test_images = np.array([image_preprocess(path) for path in X_test])

In [15]:
# Saving both arrays for future use

filename = "../Data/Clean/X_train_images.pkl"
with open(filename, "wb") as file:
    pickle.dump(X_train_images, file)
    
filename = "../Data/Clean/X_test_images.pkl"
with open(filename, "wb") as file:
    pickle.dump(X_test_images, file)

# Models

## Age

In [16]:
model_age = Sequential()
# First layer needs as many nodes as inputs
model_age.add(Conv2D(64,(2,2), activation = "relu", input_shape = (64,64,1)))
model_age.add(MaxPool2D((2,2)))
model_age.add(Conv2D(64,(2,2), activation = "relu"))
model_age.add(MaxPool2D((2,2)))
model_age.add(Conv2D(64,(2,2), activation = "relu"))
model_age.add(MaxPool2D((2,2)))
model_age.add(Flatten())
model_age.add(Dense(64, activation = "relu"))
model_age.add(Dense(1, activation = "relu"))
opt = keras.optimizers.Adam(learning_rate = 0.01)
model_age.compile(optimizer = opt,
              loss = "mse",
              metrics = ["mae", "mse", "mape"])
model_age.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 63, 63, 64)        320       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 31, 31, 64)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 30, 30, 64)        16448     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 15, 15, 64)       0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 14, 14, 64)        16448     
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 7, 7, 64)         0

### Callbacks

In [17]:
# This changes the learning rate based on epochs

def scheduler(epoch, lr):
    if epoch < 5:
        return lr
    else:
        return lr - (lr/(epoch))

In [18]:
# Patience
    
early_stop = EarlyStopping(patience=5)

# Checkpoint

checkpoint_path = '../Models/Age_NN7.hdf5'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    save_freq='epoch',
    save_weights_only=False,
    verbose=1
)

# Learning rate

schedule = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [19]:
history = model_age.fit(
    X_train_images, y_train_age,
    epochs=100,
    validation_data = (X_test_images, y_test_age),
    batch_size=128,
    verbose=2,
    callbacks=[early_stop, checkpoint, schedule]
)

Epoch 1/100

Epoch 1: saving model to ../Models\Age_NN7.hdf5
107/107 - 63s - loss: 530.3270 - mae: 17.2618 - mse: 530.3270 - mape: 240.9073 - val_loss: 354.3254 - val_mae: 14.4025 - val_mse: 354.3254 - val_mape: 290.7019 - lr: 0.0100 - 63s/epoch - 586ms/step
Epoch 2/100

Epoch 2: saving model to ../Models\Age_NN7.hdf5
107/107 - 60s - loss: 353.5919 - mae: 14.4090 - mse: 353.5919 - mape: 283.2497 - val_loss: 353.9168 - val_mae: 14.5000 - val_mse: 353.9168 - val_mape: 279.7786 - lr: 0.0100 - 60s/epoch - 561ms/step
Epoch 3/100

Epoch 3: saving model to ../Models\Age_NN7.hdf5
107/107 - 56s - loss: 354.0327 - mae: 14.4284 - mse: 354.0327 - mape: 282.1502 - val_loss: 353.5002 - val_mae: 14.4109 - val_mse: 353.5002 - val_mape: 287.4370 - lr: 0.0100 - 56s/epoch - 524ms/step
Epoch 4/100

Epoch 4: saving model to ../Models\Age_NN7.hdf5
107/107 - 52s - loss: 353.9098 - mae: 14.4126 - mse: 353.9098 - mape: 282.6396 - val_loss: 353.7107 - val_mae: 14.3681 - val_mse: 353.7107 - val_mape: 291.4324 - 

### Predictions

In [20]:
y_train_pred = model_age.predict(X_train_images)
y_test_pred  = model_age.predict(X_test_images)

display(mean_absolute_error(y_train_age,y_train_pred))
display(mean_absolute_error(y_test_age,y_test_pred))



9.547633582426718

10.257937909166019

## Gender

In [21]:
model_gender = Sequential()
# First layer needs as many nodes as inputs
model_gender.add(Conv2D(64,(2,2), activation = "relu", input_shape = (64,64,1)))
model_gender.add(MaxPool2D((2,2)))
model_gender.add(Conv2D(64,(2,2), activation = "relu"))
model_gender.add(MaxPool2D((2,2)))
model_gender.add(Conv2D(64,(2,2), activation = "relu"))
model_gender.add(MaxPool2D((2,2)))
model_gender.add(Flatten())
model_gender.add(Dense(64, activation = "relu"))
model_gender.add(Dense(1, activation = "sigmoid"))
model_gender.compile(optimizer = "adam",
              loss = "binary_crossentropy",
              metrics = "accuracy")
model_gender.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_3 (Conv2D)           (None, 63, 63, 64)        320       
                                                                 
 max_pooling2d_3 (MaxPooling  (None, 31, 31, 64)       0         
 2D)                                                             
                                                                 
 conv2d_4 (Conv2D)           (None, 30, 30, 64)        16448     
                                                                 
 max_pooling2d_4 (MaxPooling  (None, 15, 15, 64)       0         
 2D)                                                             
                                                                 
 conv2d_5 (Conv2D)           (None, 14, 14, 64)        16448     
                                                                 
 max_pooling2d_5 (MaxPooling  (None, 7, 7, 64)        

### Callbacks

In [22]:
early_stop = EarlyStopping(patience=5)

checkpoint_path = '../Models/Gender_NN17.hdf5'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    save_freq='epoch',
    save_weights_only=False,
    verbose=1
)

schedule = tf.keras.callbacks.LearningRateScheduler(scheduler)

In [23]:
history = model_gender.fit(
    X_train_images, y_train_gender,
    epochs=50,
    validation_data = (X_test_images, y_test_gender),
    batch_size=128,
    verbose=2,
    callbacks=[early_stop, checkpoint, schedule]
)

Epoch 1/50

Epoch 1: saving model to ../Models\Gender_NN17.hdf5
107/107 - 54s - loss: 0.6269 - accuracy: 0.6352 - val_loss: 0.5803 - val_accuracy: 0.6917 - lr: 0.0010 - 54s/epoch - 503ms/step
Epoch 2/50

Epoch 2: saving model to ../Models\Gender_NN17.hdf5
107/107 - 53s - loss: 0.5301 - accuracy: 0.7308 - val_loss: 0.5370 - val_accuracy: 0.7260 - lr: 0.0010 - 53s/epoch - 493ms/step
Epoch 3/50

Epoch 3: saving model to ../Models\Gender_NN17.hdf5
107/107 - 60s - loss: 0.4796 - accuracy: 0.7645 - val_loss: 0.5071 - val_accuracy: 0.7422 - lr: 0.0010 - 60s/epoch - 563ms/step
Epoch 4/50

Epoch 4: saving model to ../Models\Gender_NN17.hdf5
107/107 - 59s - loss: 0.4384 - accuracy: 0.7929 - val_loss: 0.4728 - val_accuracy: 0.7750 - lr: 0.0010 - 59s/epoch - 551ms/step
Epoch 5/50

Epoch 5: saving model to ../Models\Gender_NN17.hdf5
107/107 - 54s - loss: 0.4007 - accuracy: 0.8152 - val_loss: 0.4986 - val_accuracy: 0.7576 - lr: 0.0010 - 54s/epoch - 500ms/step
Epoch 6/50

Epoch 6: saving model to ../

### Predictions

In [24]:
y_train_pred = model_gender.predict(X_train_images)
y_test_pred  = model_gender.predict(X_test_images)


# This is for binary_crossentropy (1 neuron final output)
y_train_pred2 = [int(round(y_train_pred[x][0],0)) for x in range(len(y_train_pred))]
y_test_pred2 = [int(round(y_test_pred[x][0],0)) for x in range(len(y_test_pred))]


# This is for sparse_categorical_crossestropy (2 neurons final output)
# y_train_pred2 = np.argmax(y_train_pred, axis=1).reshape(-1,1)
# y_test_pred2 = np.argmax(y_test_pred, axis=1).reshape(-1,1)

print("Kappa score:",cohen_kappa_score(y_train_gender, y_train_pred2))
print(classification_report(y_train_gender, y_train_pred2, zero_division = True))
print("Kappa score:",cohen_kappa_score(y_test_gender, y_test_pred2))
print(classification_report(y_test_gender, y_test_pred2, zero_division = True))


display(model_gender.predict(np.array([image_preprocess("../Data/Test/20Female.jpg")])))
display(model_gender.predict(np.array([image_preprocess("../Data/Test/25Female1.jpg")])))
display(model_gender.predict(np.array([image_preprocess("../Data/Test/25Female2.jpg")])))
display(model_gender.predict(np.array([image_preprocess("../Data/Test/50Female1.jpg")])))
display(model_gender.predict(np.array([image_preprocess("../Data/Test/50Female2.jpg")])))
display(model_gender.predict(np.array([image_preprocess("../Data/Test/50Female3.jpg")])))
display(model_gender.predict(np.array([image_preprocess("../Data/Test/50Female4.jpg")])))

display(model_gender.predict(np.array([image_preprocess("../Data/Test/30Male1.jpg")])))
display(model_gender.predict(np.array([image_preprocess("../Data/Test/30Male2.jpg")])))
display(model_gender.predict(np.array([image_preprocess("../Data/Test/30Male3.jpg")])))
display(model_gender.predict(np.array([image_preprocess("../Data/Test/40Male.jpg")])))
display(model_gender.predict(np.array([image_preprocess("../Data/Test/50Male.jpg")])))

Kappa score: 0.8644258304484084
              precision    recall  f1-score   support

           0       0.95      0.92      0.94      7317
           1       0.91      0.95      0.93      6295

    accuracy                           0.93     13612
   macro avg       0.93      0.93      0.93     13612
weighted avg       0.93      0.93      0.93     13612

Kappa score: 0.6917381629423514
              precision    recall  f1-score   support

           0       0.87      0.83      0.85      2015
           1       0.82      0.87      0.84      1825

    accuracy                           0.85      3840
   macro avg       0.85      0.85      0.85      3840
weighted avg       0.85      0.85      0.85      3840



array([[0.08714797]], dtype=float32)



array([[0.46664947]], dtype=float32)



array([[0.7260104]], dtype=float32)



array([[0.1690999]], dtype=float32)



array([[0.8954597]], dtype=float32)



array([[0.00138267]], dtype=float32)



array([[0.00032523]], dtype=float32)



array([[0.01838342]], dtype=float32)



array([[0.7786135]], dtype=float32)



array([[0.7611312]], dtype=float32)



array([[0.9443798]], dtype=float32)



array([[0.57721555]], dtype=float32)

## Data Augmentation

In [25]:
model_age = load_model("../Models/Age_NN3.hdf5")
model_gender = load_model("../Models/Gender_NN3.hdf5")

In [26]:
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal_and_vertical"),
    tf.keras.layers.RandomRotation(0.2)
])

In [28]:
images = []
image1 = image_preprocess("../Data/Test/25Female1.jpg")
images.append(np.array([image1]))
i = 0
while i < 9:
    image = image_preprocess("../Data/Test/25Female1.jpg")
    image = tf.cast(tf.expand_dims(image, 0), tf.float32)
    image = data_augmentation(image)
    images.append(image)
    i = i+1

In [29]:
predictions_age = []
predictions_female = []
predictions_male = []
for image in images:
    predictions_age.append(model_age.predict(image)[0])
    predictions_female.append(model_gender.predict(image)[0][0])
    predictions_male.append(model_gender.predict(image)[0][1])
    
print(np.mean(predictions_age))
print(np.mean(predictions_female))
print(np.mean(predictions_male))

8.789905
0.72879493
0.20026755


In [30]:
image = image_preprocess("../Data/Test/25Female1.jpg")
age, female, male = image_augmentation(image, model_age, model_gender)

print("Age: {}, Female: {}, Male: {}".format(age, female, male))

Age: 14.361950874328613, Female: 0.6030124425888062, Male: 0.46290987730026245
