# Python imports

In [1]:
from PIL import Image
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
import numpy as np
import utils

# Load CSV into DataFrame to understand the data

In [9]:
# Read the augmented UTKFace metadata table (one row per image file).
df = pd.read_csv('data/UTKFaceAugmented.csv')

# Preview the first rows to see which columns are available before choosing
# the target and deciding which features to keep, drop, or rescale.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index
# written out by an earlier to_csv call -- confirm whether it should be
# dropped or consumed via index_col when reading.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,filename,age,gender,race,age_range,num_haircuts_life,has_tiktok,remembers_disco,uses_skincare,max_annual_earnings
0,0,100_0_0_20170112213500903.jpg.chip.jpg,100,male,white,100-119,360,no,no,no,32890.160162
1,1,100_0_0_20170112215240346.jpg.chip.jpg,100,male,white,100-119,627,no,no,no,29870.803247
2,2,100_1_0_20170110183726390.jpg.chip.jpg,100,female,white,100-119,687,no,yes,no,62930.622654
3,3,100_1_0_20170112213001988.jpg.chip.jpg,100,female,white,100-119,710,no,no,no,31105.957009
4,4,100_1_0_20170112213303693.jpg.chip.jpg,100,female,white,100-119,614,no,no,no,63977.673549


Looking at the dataframe, either age or age_range could serve as the target value. I choose age as the target, which makes this a regression task rather than a classification task. Because age_range is a binned version of the target, it is dropped so that it cannot leak the answer into the features. Furthermore, in accordance with current best practices, features related to race are removed. The numeric features max_annual_earnings and num_haircuts_life are standardized, and the raw versions are replaced by their standardized counterparts.

# Drop unused features and standardize numeric features

In [10]:
# Columns removed from the modelling frame: 'race' and 'age_range' are not used
# as features, and the two raw numeric columns are superseded by the '_st'
# versions created below.
features_to_drop = ['race', 'age_range', 'num_haircuts_life', 'max_annual_earnings']

# Add the standardized numeric columns, then drop the unwanted ones in a single
# chain. The exact scaling is defined by utils.standardize_numeric (presumably
# a z-score -- confirm in utils).
# NOTE(review): scaling is computed on the full frame here; if a train/test
# split happens later, test rows contribute to the scaling statistics.
# NOTE: this cell reads the raw numeric columns and then removes them, so
# re-running it without first re-running the CSV load cell raises a KeyError.
df = (
    df
    .assign(
        num_haircuts_life_st=utils.standardize_numeric(df['num_haircuts_life']),
        max_annual_earnings_st=utils.standardize_numeric(df['max_annual_earnings']),
    )
    .drop(columns=features_to_drop)
)

df.head(n=5)

Unnamed: 0.1,Unnamed: 0,filename,age,gender,has_tiktok,remembers_disco,uses_skincare,num_haircuts_life_st,max_annual_earnings_st
0,0,100_0_0_20170112213500903.jpg.chip.jpg,100,male,no,no,no,1.225473,-0.353687
1,1,100_0_0_20170112215240346.jpg.chip.jpg,100,male,no,no,no,3.256254,-0.361172
2,2,100_1_0_20170110183726390.jpg.chip.jpg,100,female,no,yes,no,3.712609,-0.27921
3,3,100_1_0_20170112213001988.jpg.chip.jpg,100,female,no,no,no,3.887545,-0.35811
4,4,100_1_0_20170112213303693.jpg.chip.jpg,100,female,no,no,no,3.157377,-0.276614


(23708, 9)
