## Problem 3
- [Given the images of 20000 persons](https://www.kaggle.com/datasets/jangedoo/utkface-new)
- Predict the age, gender and ethinicity

# Imports

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import *
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.resnet50 import ResNet50

from tensorflow.keras.preprocessing.image import ImageDataGenerator

from tensorflow.keras.optimizers import *

# Prepare dataset for model training and testing

The images are named as `age`, `gender`, and `ethnicity`

In [3]:
folder_path = 'data_problem3'

In [4]:
df = pd.DataFrame(columns=['file_name', 'age', 'gender', 'ethinicity'])

In [5]:
for file in os.listdir(folder_path):

    if not(file.endswith('.jpg')):
        continue

    parts = file.split('_')
    age = parts[0]
    gender = parts[1]
    ethinicity = parts[2]

    df = pd.concat([df, pd.DataFrame({'file_name': [file], 'age': [age], 'gender': [gender], 'ethinicity': [ethinicity]})], ignore_index=True)

In [6]:
df['age'] = df['age'].astype(int)
df['gender'] = df['gender'].astype(int)
df['ethinicity'] = df['ethinicity'].astype(int)

ValueError: invalid literal for int() with base 10: '20170116174525125.jpg.chip.jpg'

In [None]:
df.head()

In [None]:
train_df = df.sample(frac=1,random_state=0).iloc[:20000]
test_df = df.sample(frac=1,random_state=0).iloc[20000:]

In [None]:
print(df.shape)
print(train_df.shape)
print(test_df.shape)

In [None]:
df.to_csv('data_problem3.csv')
train_df.to_csv('train_dat_problem3a.csv')
test_df.to_csv('test_data_problem3.csv')

# Generate Data for Training

In [None]:
train_datagen = ImageDataGenerator(rescale=1./255,
                                   rotation_range=30,
                                   width_shift_range=0.2,
                                   height_shift_range=0.2,
                                   shear_range=0.2,
                                   zoom_range=0.2,
                                   horizontal_flip=True)

train_dataset = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=folder_path,
    x_col='file_name',
    y_col=['age', 'gender'],
    target_size=(234, 234),
    batch_size=32,
    class_mode='multi_output'
)

test_datagen = ImageDataGenerator(rescale=1./255)

test_dataset = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=folder_path,
    x_col='file_name',
    y_col=['age', 'gender'],
    target_size=(234, 234),
    batch_size=32,
    class_mode='multi_output'
)

# Define model architecture

In [3]:
conv_base = ResNet50(
    weights = 'imagenet',
    include_top = False,
    input_shape = (234,234,3)
)
conv_base.trainable = False
conv_base.summary()

In [4]:
inputs = Input(shape=(234, 234, 3))

x = conv_base(inputs)

flatten = Flatten(name='flatten_layer')(x)

fc_l1 = Dense(units=512, activation='relu', name='fc_l1')(flatten)
fc_l2 = Dense(units=128, activation='relu', name='fc_l2')(fc_l1)
fc_l3 = Dense(units=32, activation='relu', name='fc_l3')(fc_l2)
output_age = Dense(units=1, activation='linear', name='output_layer_age')(fc_l3)
output_gender = Dense(units=1, activation='sigmoid', name='output_layer_gender')(fc_l3)

model = Model(inputs=inputs, outputs=[output_age, output_gender])
model.name = 'resnet50_transfer_learning'
model.summary()

In [None]:
tensorflow.keras.utils.plot_model(model)

In [None]:
model.save('model_problem3_untrained.keras')

# Train model

In [None]:
model.compile(
    loss={'output_age': 'mae', 'output_gender': 'binary_crossentropy'},
    optimizer=RMSprop(learning_rate=1e-5),
    metrics={'output_age': 'mae', 'output_gender': 'accuracy'},
    loss_weights={'output_age':1, 'output_gender':99}
    )

history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset
).history

model.save('model_problem3_trained.keras')

# Test Results

In [None]:
plt.plot(history['loss'], label='Training', color='red')
plt.plot(history['val_loss'], label='Validation', color='blue')
plt.legend()
plt.show()
plt.savefig('loss.png')