In [8]:
# base imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from zipfile import ZipFile

In [9]:
# torch imports
import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.utils.data import Dataset
from torchvision import transforms

In [13]:
# Read in Unzipped file and make into a gzip
# DONT NEED TO RUN THIS AGAIN
# dataFrame = pd.read_csv('./data/age_gender.csv')
# dataFrame.to_csv('./data/age_gender.gz', compression='gzip')

In [15]:
# Read in the zipped file and split
dataFrame = pd.read_csv('./data/age_gender.gz', compression='gzip')
train_dataFrame, test_dataFrame = train_test_split(dataFrame, test_size=0.2)

In [16]:
age_features = len(dataFrame['age'].unique())
eth_features = len(dataFrame['ethnicity'].unique())
gen_features = len(dataFrame['gender'].unique())

print(age_features, eth_features, gen_features)

104 5 2


In [38]:
# Define train and test transforms
train_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.49,), (0.23,))
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.49,), (0.23,))
])

In [56]:
# Custom dataset class
class GenderDataset(Dataset):
    '''
        Inputs:
            dataFrame : Pandas dataFrame
            transform : The transform to apply to the dataset
    '''
    def __init__(self, dataFrame, transform=None):
        # read in the transforms
        self.transform = transform
        
        # Use the dataFrame to get the pixel values
        data_holder = dataFrame.pixels.apply(lambda x: np.array(x.split(" "),dtype=float))
        arr = np.stack(data_holder)
        arr = arr / 255.0
        arr = arr.astype('float32')
        arr = arr.reshape(arr.shape[0], 48, 48, 1)
        # reshape into 48x48x1
        self.data = arr
        
        # get the age, gender, and ethnicity label arrays
        self.age_label = np.array(dataFrame.age[:])
        self.gender_label = np.array(dataFrame.gender[:])
        self.eth_label = np.array(dataFrame.ethnicity[:])
    
    # override the length function
    def __len__(self):
        return len(self.data)
    
    # override the getitem function
    def __getitem__(self, index):
        # load the data at index and apply transform
        data = self.data[index]
        data = self.transform(data)
        
        # load the labels into a list and convert to tensors
        labels = torch.tensor([self.age_label[index], self.gender_label[index], self.eth_label[index]])
        
        # return data labels
        return data, labels

In [57]:
# Construct the custom pytorch datasets
train_set = GenderDataset(train_dataFrame, transform=train_transform)
test_set = GenderDataset(test_dataFrame, transform=test_transform)

# Load the datasets into dataloaders
train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
test_loader = DataLoader(test_set, batch_size=128, shuffle=False)

In [58]:
# find the mean and standard deviation (used initially for the transforms)
print(f'mean: {dataset.data.mean()}')
print(f'std dev: {dataset.data.std()}')

mean: 0.4902603328227997
std dev: 0.2330981194972992


In [60]:
# Sanity Check
for X, y in train_loader:
    print(f'Shape of training X: {X.shape}')
    print(f'Shape of y: {y.shape}')
    break

Shape of training X: torch.Size([64, 1, 48, 48])
Shape of y: torch.Size([64, 3])
