# Python imports

In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
import utils

# Load CSV into DataFrame to understand the data

In [2]:
df = pd.read_csv('data/UTKFaceAugmented.csv')

# Printing out dataframe head allows me to do feature selection
# and determine how best to prepare the data for model training
df.head()

Unnamed: 0.1,Unnamed: 0,filename,age,gender,race,age_range,num_haircuts_life,has_tiktok,remembers_disco,uses_skincare,max_annual_earnings
0,0,100_0_0_20170112213500903.jpg.chip.jpg,100,male,white,100-119,360,no,no,no,32890.160162
1,1,100_0_0_20170112215240346.jpg.chip.jpg,100,male,white,100-119,627,no,no,no,29870.803247
2,2,100_1_0_20170110183726390.jpg.chip.jpg,100,female,white,100-119,687,no,yes,no,62930.622654
3,3,100_1_0_20170112213001988.jpg.chip.jpg,100,female,white,100-119,710,no,no,no,31105.957009
4,4,100_1_0_20170112213303693.jpg.chip.jpg,100,female,white,100-119,614,no,no,no,63977.673549


From looking at the dataframe, it is evident that either age or age_range should be the target value. Here, I will choose age as the target value as this would then be a regression task compared to a classification task. Furthermore, in accordance with current best practices, features related to race and gender will be removed. Numeric features such as max_annual_earnings and num_haircuts_life are also standardized.

# Remove features and standardize features

I also one-hot encode the data such as has_tiktok, remembers_disco, and uses_skincare.

In [3]:
features_to_drop = ['Unnamed: 0', 'gender', 'race', 'age_range', 'num_haircuts_life', 'max_annual_earnings']

# these numerical features are standardized so that the model does not have to
# train on features where the values have vastly different ranges.
# This helps train the model faster as well
df['num_haircuts_life_st'] = utils.standardize_numeric(df['num_haircuts_life'])
df['max_annual_earnings_st'] = utils.standardize_numeric(df['max_annual_earnings'])

# categorical features are encoded numerically so that they can be used in the models
keep_categoric_columns = ['has_tiktok', 'remembers_disco', 'uses_skincare']
for col in keep_categoric_columns:
    df[col] = df[col].map({'yes': 1, 'no': 0})

df = df.drop(features_to_drop, axis=1)


df.head()

Unnamed: 0,filename,age,has_tiktok,remembers_disco,uses_skincare,num_haircuts_life_st,max_annual_earnings_st
0,100_0_0_20170112213500903.jpg.chip.jpg,100,0,0,0,1.225473,-0.353687
1,100_0_0_20170112215240346.jpg.chip.jpg,100,0,0,0,3.256254,-0.361172
2,100_1_0_20170110183726390.jpg.chip.jpg,100,0,1,0,3.712609,-0.27921
3,100_1_0_20170112213001988.jpg.chip.jpg,100,0,0,0,3.887545,-0.35811
4,100_1_0_20170112213303693.jpg.chip.jpg,100,0,0,0,3.157377,-0.276614


# Split data

In [4]:
# one of the features has to be filename because it will be used later
# to poll the dataloader to stream the appropriate images
features = ['filename', 'num_haircuts_life_st',	'max_annual_earnings_st', 
            'has_tiktok', 'remembers_disco', 'uses_skincare']
target = ['age']

x_train, x_val, y_train, y_val = train_test_split(df[features], df[target], train_size=0.6, random_state=42)

x_val, x_test, y_val, y_test = train_test_split(x_val, y_val, train_size=0.5, random_state=42)

print("x train: ",x_train.shape, "y train:", y_train.shape)
print("x val: ",x_val.shape, "y train:", y_val.shape)
print("x test: ",x_test.shape, "y test:", y_test.shape)

print(x_train)

x train:  (14224, 6) y train: (14224, 1)
x val:  (4742, 6) y train: (4742, 1)
x test:  (4742, 6) y test: (4742, 1)
                                    filename  num_haircuts_life_st  \
8986   28_0_4_20170103213024052.jpg.chip.jpg             -0.242470   
2834   20_1_0_20170117141304848.jpg.chip.jpg             -0.957426   
2203    1_0_4_20170103210812538.jpg.chip.jpg             -1.474629   
3445   22_0_4_20170103234043547.jpg.chip.jpg              0.259521   
21774   6_1_3_20161220223052131.jpg.chip.jpg             -1.208422   
...                                      ...                   ...   
21575  68_0_2_20170116193554465.jpg.chip.jpg              1.126596   
5390   25_1_0_20170117152038451.jpg.chip.jpg              0.191068   
860    16_0_0_20170110232038257.jpg.chip.jpg             -1.269269   
15795  40_1_1_20170113011244319.jpg.chip.jpg              0.632211   
23654   9_0_0_20170120133313910.jpg.chip.jpg             -1.109545   

       max_annual_earnings_st  has_tiktok  r

# Convert data to torch Tensors and save the data

In [5]:
# I separate the numerical data from the filename as it is necessary later on
x_train_imagename, x_val_imagename, x_test_imagename = x_train.iloc[:, 0], x_val.iloc[:, 0], x_test.iloc[:, 0]
x_train, x_val, x_test = x_train.iloc[:, 1:], x_val.iloc[:, 1:], x_test.iloc[:, 1:]

x_train_image = utils.process_images(x_train_imagename)
x_val_image = utils.process_images(x_val_imagename)
x_test_image = utils.process_images(x_test_imagename)

# The data is converted to torch.Tensor so that loading the data is easier
x_train, x_val, x_test = torch.Tensor(x_train.to_numpy()), torch.Tensor(x_val.to_numpy()), torch.Tensor(x_test.to_numpy())
y_train, y_val, y_test = torch.Tensor(y_train.to_numpy()), torch.Tensor(y_val.to_numpy()), torch.Tensor(y_test.to_numpy())
x_train_image, x_val_image, x_test_image = torch.Tensor(x_train_image), torch.Tensor(x_val_image), torch.Tensor(x_test_image)


# store it in a dict that we can save out as a single file
data_dict = {'x_train':x_train, 'x_val':x_val, 'x_test':x_test, 
             'y_train':y_train, 'y_val':y_val, 'y_test':y_test}

# save it to local data directory
torch.save(data_dict, 'data/image_csv_processed.pt')

image_dict = {'x_train_image': x_train_image, 'x_val_image': x_val_image, 'x_test_image': x_test_image}

torch.save(image_dict, 'data/images_processed.pt')