In [1]:
import pandas as pd
import numpy as np
import os
import cv2

In [2]:
# Load the face dataset; the summary below lists the columns
# age / ethnicity / gender / img_name / pixels.
df = pd.read_csv('age_gender.csv')

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23705 entries, 0 to 23704
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        23705 non-null  int64 
 1   ethnicity  23705 non-null  int64 
 2   gender     23705 non-null  int64 
 3   img_name   23705 non-null  object
 4   pixels     23705 non-null  object
dtypes: int64(3), object(2)
memory usage: 926.1+ KB
None


In [3]:
# Integer codes found in the `ethnicity` column -> human-readable label.
ethnicityMap = {
    0: 'white',
    1: 'black',
    2: 'asian',
    3: 'indian',
    4: 'others',
}

# Integer codes found in the `gender` column -> human-readable label.
genderMap = {
    0: 'male',
    1: 'female',
}

In [4]:
def extractSingleRow(row):
    """Unpack one dataset record into an image array plus its labels.

    Args:
        row: Record supporting ``row[key]`` lookup (e.g. a DataFrame row)
            with the keys 'age', 'ethnicity', 'gender', 'img_name' and
            'pixels'. 'pixels' must be a whitespace-separated string holding
            exactly 48 * 48 integer values that fit in uint8.

    Returns:
        Tuple ``(img, age, ethnicity, gender, imgname)``: ``img`` is a uint8
        array of shape (48, 48, 1); ``imgname`` is 'img_name' with every
        ".jpg.chip.jpg" occurrence replaced by ".jpg". The three labels are
        passed through unchanged.
    """
    age, ethnicity, gender = (row[key] for key in ('age', 'ethnicity', 'gender'))
    raw_name = row['img_name']

    # Parse the flat pixel string and give it an explicit single-channel axis.
    image = np.array(row['pixels'].split(), dtype='uint8').reshape(48, 48, 1)

    return image, age, ethnicity, gender, raw_name.replace(".jpg.chip.jpg", ".jpg")

# Per-gender sample counts, and per-ethnicity lists of (image, output filename)
# pairs that the export step consumes later.
gender_distribution = dict.fromkeys(('male', 'female'), 0)
ethnicity_distribution = {
    label: [] for label in ('white', 'black', 'asian', 'indian', 'others')
}

for row_idx in range(len(df)):
    pixels, age, eth_code, gender_code, base_name = extractSingleRow(df.iloc[row_idx])
    eth_label = ethnicityMap[eth_code]
    gender_label = genderMap[gender_code]

    # Output filename layout: <ethnicity>_<gender>_<age>_<original image name>
    out_name = "_".join([eth_label, gender_label, str(age), base_name])

    ethnicity_distribution[eth_label].append((pixels, out_name))
    gender_distribution[gender_label] += 1

print(gender_distribution)
for eth_label, samples in ethnicity_distribution.items():
    print(eth_label, " ", len(samples))
    

{'male': 12391, 'female': 11314}
white   10078
black   4526
asian   3434
indian   3975
others   1692


In [7]:
import math
import random
# Root folder for the exported images, with one sub-folder per split.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
save_dir = r'C:\Users\mfbob\OneDrive\Desktop\age_gender_eth'
train_dir, test_dir, valid_dir = (
    os.path.join(save_dir, split_name)
    for split_name in ('train', 'test', 'validation')
)

def save_img_dir(img_list, saveDir):
    """Write every image in ``img_list`` into the directory ``saveDir``.

    Args:
        img_list: Iterable of ``(image, filename)`` pairs. ``image`` is handed
            to ``cv2.imwrite``; ``filename`` is joined onto ``saveDir``.
        saveDir: Destination directory. It is created (including missing
            parents) if it does not exist yet.

    Raises:
        OSError: If ``cv2.imwrite`` reports that a file could not be written.
    """
    # cv2.imwrite does not create missing folders; it just returns False, so
    # without this every image would be dropped silently on a fresh machine.
    os.makedirs(saveDir, exist_ok=True)

    for img, filename in img_list:
        target_path = os.path.join(saveDir, filename)
        # imwrite signals failure through its boolean return value rather than
        # an exception; surface it instead of losing samples unnoticed.
        if not cv2.imwrite(target_path, img):
            raise OSError("cv2.imwrite failed to write {!r}".format(target_path))
        
        
# Share of each ethnicity group held out for the test set and, separately, for
# the validation set; whatever remains is used for training.
HOLDOUT_FRACTION = 0.15

# Fixed seed + private RNG: the split is identical on every run, so re-running
# this cell overwrites the same files instead of scattering a different random
# split into the same folders (which would leak images between train/val/test).
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

for eth_label, samples in ethnicity_distribution.items():
    # Shuffle a copy so the source lists keep their original order and a
    # re-run starts from the same state.
    dataset = list(samples)
    split_rng.shuffle(dataset)

    holdout_size = math.floor(len(dataset) * HOLDOUT_FRACTION)
    testSet = dataset[:holdout_size]
    validationSet = dataset[holdout_size:2 * holdout_size]
    trainSet = dataset[2 * holdout_size:]

    # Splitting per ethnicity group keeps each group's share the same in all
    # three output folders.
    save_img_dir(testSet, test_dir)
    save_img_dir(validationSet, valid_dir)
    save_img_dir(trainSet, train_dir)
        