In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Project Challenges

In this project, we aim to address the following challenges:

1. **Image Preprocessing:** Preprocessing the facial images is crucial for the success of the model. This includes resizing, normalizing, and converting images to a format suitable for neural networks. Ensuring the input data is appropriately prepared is essential.

2. **Model Architecture:** Designing an effective neural network architecture that can handle both age regression and gender classification tasks simultaneously is a non-trivial task. Balancing the model's complexity while maintaining good performance is essential.

3. **Gender Classification:** Classifying a person's gender from a facial image is a significant challenge in its own right. The model must learn to distinguish between male and female characteristics, which often depend on subtle visual cues.

4. **Age Estimation:** The primary challenge is to develop a model that can accurately estimate the age of a person based on their facial features. This involves training a deep learning model to regress the age of individuals, which can be a complex and nuanced task.

5. **Hyperparameter Tuning:** Finding the right set of hyperparameters for training the model can significantly impact its performance. It involves optimizing learning rates, batch sizes, regularization techniques, and more.

6. **Evaluation Metrics:** Choosing appropriate evaluation metrics for age estimation and gender classification is vital. Mean Absolute Error (MAE) for age regression and accuracy for gender classification are common metrics, but others may be considered.

7. **Data Quality and Quantity:** The quality and quantity of the dataset play a significant role in the model's performance. Ensuring a diverse and representative dataset can be challenging, and data augmentation techniques may be required.

8. **Interpreting Model Predictions:** Understanding how the model arrives at its predictions is crucial, especially in applications like age estimation and gender classification. Visualizing model explanations and uncertainty can be a challenge.

By addressing these challenges, we aim to create a robust and accurate system for estimating the age and gender of individuals from facial images.


# Importing Libraries

In [None]:
!pip install --upgrade pip
!pip install seaborn
!pip install Pillow
!pip install imgaug
!pip install opencv-python
!apt-get update
!apt-get install -y libgl1-mesa-glx

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import load_img
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import img_to_array
from keras.models import Sequential, Model
from keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D, Input, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import numpy as np
import random
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import cv2
import imgaug.augmenters as iaa

import os
from tqdm.notebook import tqdm
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Configure the distribution strategy.
# Use the TPU when one is attached; otherwise fall back to TensorFlow's
# default strategy so the notebook still runs end to end on CPU/GPU.
try:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
except ValueError:
    # ValueError is the usual signal that no TPU could be located.
    resolver = None

if resolver is not None:
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
else:
    strategy = tf.distribute.get_strategy()

# Data Labeling

In [None]:
# Build parallel lists of image paths and labels from the dataset file names.
# Each file name is parsed as "<age>_<gender>_...": the first underscore-
# separated field is read as the age and the second as the gender.
path = '/kaggle/input/utkface-new/UTKFace'
age_labels = []
gender_labels = []
image_paths = []

# NOTE: the shuffle is unseeded, so the row order differs between runs.
image_filenames = os.listdir(path)
random.shuffle(image_filenames)

# The loop variable is deliberately not called `image`, which would clobber
# the `image` module imported at the top of the notebook.
skipped_filenames = []
for image_filename in image_filenames:
    image_path = os.path.join(path, image_filename)
    img_components = image_filename.split('_')
    try:
        age_label = int(img_components[0])
        gender_label = int(img_components[1])
    except (ValueError, IndexError):
        # File name does not follow the expected pattern; skip it instead of
        # aborting the whole labeling pass.
        skipped_filenames.append(image_filename)
        continue

    age_labels.append(age_label)
    gender_labels.append(gender_label)
    image_paths.append(image_path)

if skipped_filenames:
    print(f'Skipped {len(skipped_filenames)} file(s) with an unexpected name, e.g. {skipped_filenames[:5]}')

In [None]:
# Sanity check: the three parallel lists must have the same length.
print(
    f'Number of age_labels: {len(age_labels)}, '
    f'Number of gender_labels: {len(gender_labels)}, '
    f'Number of image_paths: {len(image_paths)}'
)

In [None]:
# Preview the first ten entries of each list, one list per output line.
print(age_labels[:10], gender_labels[:10], image_paths[:10], sep='\n')

In [None]:
# Collect paths and labels into a single frame (one row per image).
df = pd.DataFrame({
    'image_path': image_paths,
    'age': age_labels,
    'gender': gender_labels,
})
df.head(10)

# Distributions

In [None]:
 # `sns.distplot` is deprecated; `histplot` with a KDE overlay and density scaling is its replacement.
 sns.histplot(df['age'], kde=True, stat='density')

In [None]:
# Bar chart of the number of images per gender label.
sns.countplot(x='gender', data=df).set(
    xlabel='Gender',
    ylabel='Quantity',
    title='Gender Distribution',
)
plt.show()

# Data Augmentation

In [None]:
# Augmentation pipeline, built once and reused for every image instead of
# being reconstructed on each call. The random parameters are still sampled
# per call.
_AUGMENTATION_PIPELINE = iaa.Sequential([
    iaa.Affine(rotate=(-10, 10)),
    iaa.Fliplr(0.9),
    iaa.Sometimes(0.7, iaa.GaussianBlur(sigma=(0, 2.0))),
    iaa.Sometimes(0.6, iaa.AdditiveGaussianNoise(scale=(0, 0.05 * 255))),
    # LinearContrast replaces the deprecated ContrastNormalization alias.
    iaa.LinearContrast((0.5, 2.0), per_channel=0.5),
])


def apply_data_augmentation(image_path):
    """Load one image and return a randomly augmented copy of it.

    The image goes through rotation, horizontal flip, optional blur,
    optional additive noise and a contrast change.

    Args:
        image_path: path of the image file to load.

    Returns:
        The augmented image as an array with values clipped to [0, 255].
        Noise and contrast changes can push pixel values outside that range,
        and a later cast to ``uint8`` would otherwise wrap around.
    """
    img = img_to_array(load_img(image_path))
    augmented_img = _AUGMENTATION_PIPELINE(image=img)
    return np.clip(augmented_img, 0, 255)

# Augment only the 5-20 and 40-90 age ranges (bounds inclusive); presumably
# these are the under-represented ages -- see the distribution plots.
# Parentheses make the intended (a & b) | (c & d) grouping explicit.
selected_images = df[
    ((df['age'] >= 5) & (df['age'] <= 20))
    | ((df['age'] >= 40) & (df['age'] <= 90))
]

# One augmented array per selected row, in the same order as `selected_images`.
augmented_images = []
for image_path in selected_images['image_path']:
    augmented_img = apply_data_augmentation(image_path)
    augmented_images.append(augmented_img)

augmented_image_paths = []
augmented_age_labels = []
augmented_gender_labels = []

output_directory = '/kaggle/working/augmented_images'  # Change this to the desired location
os.makedirs(output_directory, exist_ok=True)

# Save every augmented image to disk and record its path and labels.
for image_path, age_label, gender_label, augmented_img in zip(selected_images['image_path'], selected_images['age'], selected_images['gender'], augmented_images):
    augmented_image_filename = os.path.basename(image_path).replace('.jpg', '_augmented.jpg')
    augmented_image_path = os.path.join(output_directory, augmented_image_filename)

    augmented_image_paths.append(augmented_image_path)
    augmented_age_labels.append(age_label)
    augmented_gender_labels.append(gender_label)

    # Clip before casting: values outside [0, 255] would wrap around when
    # converted to uint8 and produce speckle artifacts in the saved file.
    plt.imsave(augmented_image_path, np.clip(augmented_img, 0, 255).astype(np.uint8))

print(f"Número de imágenes originales: {len(selected_images)}")
print(f"Número de imágenes aumentadas: {len(augmented_images)}")

augmented_df = pd.DataFrame({
    'image_path': augmented_image_paths,
    'age': augmented_age_labels,
    'gender': augmented_gender_labels,
})

In [None]:
# Stack the original rows on top of the augmented rows with a fresh 0..n-1 index.
combined_df = pd.concat(
    (df, augmented_df),
    axis=0,
    ignore_index=True,
)

In [None]:
# Display the combined (original + augmented) frame as the cell output.
combined_df

#### **BEFORE**

In [None]:
# Age distribution of the original data only.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density scaling is its replacement.
sns.histplot(df['age'], kde=True, stat='density')

#### **AFTER**

In [None]:
# Age distribution after the augmented rows were added.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density scaling is its replacement.
sns.histplot(combined_df['age'], kde=True, stat='density')

In [None]:
# Side-by-side comparison of one selected image and its augmented version.
fig, (original_ax, augmented_ax) = plt.subplots(1, 2, figsize=(8, 4))

img = load_img(selected_images.iloc[3]['image_path'])
original_ax.set_title("Original Image")
original_ax.imshow(img)

# Bring the augmented array back into displayable uint8 range.
augmented_img = np.clip(augmented_images[3], 0, 255).astype(np.uint8)
augmented_ax.set_title("Augmented Image")
augmented_ax.imshow(augmented_img)

plt.show()

# Preprocessing

In [None]:
def extract_image_features(images, target_size=(128, 128)):
    """Load images as single-channel arrays of a common size.

    Args:
        images: iterable of image file paths.
        target_size: ``(width, height)`` every image is resized to with
            Lanczos resampling. Defaults to ``(128, 128)``.

    Returns:
        Array shaped ``(n_images, height, width, 1)`` holding the pixel
        values as loaded (no rescaling is applied here).
    """
    width, height = target_size
    features = []

    for image_path in images:
        # `color_mode='grayscale'` replaces the deprecated `grayscale=True` flag.
        img = load_img(image_path, color_mode='grayscale')
        img = img.resize((width, height), Image.LANCZOS)
        features.append(np.array(img))

    features = np.array(features)
    # Add the explicit trailing channel axis.
    return features.reshape(len(features), height, width, 1)

In [None]:
# Keep only samples younger than 60.
filtered_df = combined_df[combined_df['age'] < 60]

X = extract_image_features(filtered_df['image_path'])

# Scale pixel values to [0, 1]. Casting to float32 first avoids the implicit
# float64 result of the division, which would double the memory footprint.
X = X.astype(np.float32) / 255.0

X.shape

In [None]:
# Target vectors, row-aligned with the feature array built from `filtered_df`.
y_gender, y_age = (np.array(filtered_df[column]) for column in ('gender', 'age'))

In [None]:
def resize_images(images, size=(224, 224)):
    """Resize every image in ``images`` with ``cv2.resize``.

    Args:
        images: iterable of image arrays.
        size: target size passed to ``cv2.resize``. Defaults to ``(224, 224)``.

    Returns:
        Array stacking the resized images along a new first axis.
    """
    return np.array([cv2.resize(img, size) for img in images])


# Replicate the single channel three times so the images have three channels.
# float32 halves the memory of the (large) resized array compared with float64.
X_rgb = np.repeat(np.asarray(X, dtype=np.float32), 3, axis=-1)

# X is already scaled to [0, 1] and cv2.resize works directly on float data,
# so no x255 / /255 round trip (and its two extra full-size copies) is needed.
X_rgb = resize_images(X_rgb)

X_train, X_test, y_gender_train, y_gender_test, y_age_train, y_age_test = train_test_split(X_rgb, y_gender, y_age, test_size=0.2, random_state=42)

# Building and fitting models

In [None]:
def create_gender_model(learning_rate=0.001, dropout_rate=0.5, optimizer='adam', kernel_size=(3, 3), pool_size=(2, 2)):
    """Build and compile the CNN used for binary gender classification.

    The network takes 224x224x3 inputs and stacks four Conv2D + MaxPooling2D
    stages (64, 128, 256, 512 filters), two Dense + Dropout stages (512, 256
    units) and a single sigmoid output named ``gender_out``.

    Args:
        learning_rate: learning rate for the optimizer; only used when
            ``optimizer`` is one of the names listed below.
        dropout_rate: rate of both Dropout layers.
        optimizer: ``'adam'``, ``'rmsprop'`` or ``'sgd'``; any other value is
            handed to ``compile`` unchanged.
        kernel_size: kernel size shared by all convolution layers.
        pool_size: pool size shared by all max-pooling layers.

    Returns:
        The compiled model (binary cross-entropy loss, accuracy metric).
    """
    optimizer_classes = {'adam': Adam, 'rmsprop': RMSprop, 'sgd': SGD}

    with strategy.scope():
        inputs = Input(shape=(224, 224, 3))

        # Convolutional feature extractor.
        features = inputs
        for n_filters in (64, 128, 256, 512):
            features = Conv2D(n_filters, kernel_size=kernel_size, activation='relu')(features)
            features = MaxPooling2D(pool_size=pool_size)(features)

        # Fully connected head.
        features = Flatten()(features)
        for n_units in (512, 256):
            features = Dense(n_units, activation='relu')(features)
            features = Dropout(dropout_rate)(features)

        output_gender = Dense(1, activation='sigmoid', name='gender_out')(features)

        # Turn a recognised optimizer name into a configured instance.
        optimizer_class = optimizer_classes.get(optimizer) if isinstance(optimizer, str) else None
        if optimizer_class is not None:
            optimizer = optimizer_class(learning_rate=learning_rate)

        gender_model = Model(inputs=inputs, outputs=output_gender)
        gender_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return gender_model

def create_age_model(learning_rate=0.001, dropout_rate=0.3, optimizer='adam', kernel_size=(3, 3), pool_size=(2, 2)):
    """Build and compile the CNN used for age regression.

    The network takes 224x224x3 inputs and stacks four Conv2D + MaxPooling2D
    stages (64, 128, 256, 512 filters), three L2-regularised Dense layers
    (512, 256, 128 units; Dropout after the first two) and a single linear
    output named ``age_out``.

    Args:
        learning_rate: learning rate for the optimizer; only used when
            ``optimizer`` is one of the names listed below.
        dropout_rate: rate of both Dropout layers.
        optimizer: ``'adam'``, ``'rmsprop'`` or ``'sgd'``; any other value is
            handed to ``compile`` unchanged.
        kernel_size: kernel size shared by all convolution layers.
        pool_size: pool size shared by all max-pooling layers.

    Returns:
        The compiled model (mean absolute error loss, MAE metric).
    """
    optimizer_classes = {'adam': Adam, 'rmsprop': RMSprop, 'sgd': SGD}

    with strategy.scope():
        inputs = Input(shape=(224, 224, 3))

        # Convolutional feature extractor.
        features = inputs
        for n_filters in (64, 128, 256, 512):
            features = Conv2D(n_filters, kernel_size=kernel_size, activation='relu')(features)
            features = MaxPooling2D(pool_size=pool_size)(features)

        # Fully connected head; the last Dense layer has no Dropout after it.
        features = Flatten()(features)
        for n_units, followed_by_dropout in ((512, True), (256, True), (128, False)):
            features = Dense(n_units, activation='relu', kernel_regularizer=l2(0.01))(features)
            if followed_by_dropout:
                features = Dropout(dropout_rate)(features)

        output_age = Dense(1, activation='linear', name='age_out')(features)

        # Turn a recognised optimizer name into a configured instance.
        optimizer_class = optimizer_classes.get(optimizer) if isinstance(optimizer, str) else None
        if optimizer_class is not None:
            optimizer = optimizer_class(learning_rate=learning_rate)

        age_model = Model(inputs=inputs, outputs=output_age)
        age_model.compile(loss='mean_absolute_error', optimizer=optimizer, metrics=['mae'])
    return age_model

# Wrap the model builders so scikit-learn's GridSearchCV can drive them.
gender_model = KerasClassifier(build_fn=create_gender_model, verbose=0)
age_model = KerasRegressor(build_fn=create_age_model, verbose=0)

# Both searches explore the same hyperparameter values.
param_grid_gender = dict(
    learning_rate=[0.0001, 0.00001],
    dropout_rate=[0.3, 0.4],
    optimizer=['adam', 'rmsprop'],
    kernel_size=[(3, 3), (4, 4)],
    pool_size=[(2, 2), (3, 3)],
)
param_grid_age = {name: list(values) for name, values in param_grid_gender.items()}

# 3-fold cross-validated search for each task.
grid_gender = GridSearchCV(
    estimator=gender_model,
    param_grid=param_grid_gender,
    cv=3,
    verbose=2,
)
grid_gender_result = grid_gender.fit(X_train, y_gender_train)

grid_age = GridSearchCV(
    estimator=age_model,
    param_grid=param_grid_age,
    cv=3,
    verbose=2,
)
grid_age_result = grid_age.fit(X_train, y_age_train)

print("Mejores Hiperparámetros para Género: ", grid_gender_result.best_params_)
print("Mejor Puntaje para Género: ", grid_gender_result.best_score_)

print("Mejores Hiperparámetros para Edad: ", grid_age_result.best_params_)
# The sign is flipped so the reported age score is a positive error value.
print("Mejor Puntaje para Edad: ", -grid_age_result.best_score_)

In [None]:
# Final models. They are built with the same builder functions used for the
# grid search instead of re-declaring both architectures by hand, so the
# searched and the trained networks cannot drift apart. The builders create
# and compile the models inside `strategy.scope()` themselves.
# The hyperparameters below are presumably the ones selected by the grid
# search above -- TODO confirm against its printed output.
gender_model = create_gender_model(
    learning_rate=0.0001,
    dropout_rate=0.3,
    optimizer='adam',
    kernel_size=(4, 4),
    pool_size=(2, 2),
)
age_model = create_age_model(
    learning_rate=0.0001,
    dropout_rate=0.3,
    optimizer='rmsprop',
    kernel_size=(3, 3),
    pool_size=(2, 2),
)

# Train each model, holding out 20% of the training data for validation.
gender_model.fit(X_train, y_gender_train, epochs=20, batch_size=32, validation_split=0.2)
age_model.fit(X_train, y_age_train, epochs=20, batch_size=32, validation_split=0.2)

# Evaluate on the held-out test split.
gender_loss, gender_accuracy = gender_model.evaluate(X_test, y_gender_test)
age_loss, age_mae = age_model.evaluate(X_test, y_age_test)

print("Gender Model - Loss:", gender_loss, "Accuracy:", gender_accuracy)
print("Age Model - Loss:", age_loss, "MAE:", age_mae)


# Testing

In [None]:
# Detect faces in a test photo and annotate each one with the predicted
# gender and age.
img_path = '/kaggle/input/lautaro/lautaro.jpg'
img = cv2.imread(img_path)
if img is None:
    # cv2.imread reports a missing or unreadable file by returning None.
    raise FileNotFoundError(f'Could not read image: {img_path}')
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))

if len(faces) == 0:
    print('No faces detected in the image.')

for (x, y, w, h) in faces:
    # Mirror the training preprocessing: grayscale -> 128x128 (Lanczos) ->
    # [0, 1] floats -> 224x224 -> channel replicated three times. The models
    # were trained on grayscale-replicated input, not on colour crops.
    face_roi = gray[y:y+h, x:x+w]
    face_roi = cv2.resize(face_roi, (128, 128), interpolation=cv2.INTER_LANCZOS4)
    face_roi = face_roi.astype(np.float32) / 255.0
    face_roi = cv2.resize(face_roi, (224, 224))
    face_roi = np.repeat(face_roi[..., np.newaxis], 3, axis=-1)
    face_batch = np.expand_dims(face_roi, axis=0)

    # A sigmoid output below 0.5 is reported as "Man", otherwise "Woman".
    gender_prediction = gender_model.predict(face_batch)
    gender_label = "Man" if gender_prediction[0][0] < 0.5 else "Woman"

    age_prediction = age_model.predict(face_batch)

    cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)

    label = f'Gender: {gender_label}, Age: {int(round(float(age_prediction[0][0])))}'
    # Keep the caption inside the image when the face touches the top edge.
    cv2.putText(img, label, (x, max(y - 10, 15)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
plt.axis('off')
plt.show()
