In [16]:
import os

import numpy as np
import pandas as pd
from sklearn.utils import resample

# Passed as `random_state` to every `resample` call in `rebalance_dataset`.
SEED = 1234

In [17]:
# Load the per-image attribute table. The columns filtered on below
# ('Male', 'Smiling', 'Young') are compared against 1 and -1.
df = pd.read_csv('./list_attr_celeba.csv')
df.head()

Unnamed: 0,image_id,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
0,000001.jpg,-1,1,1,-1,-1,-1,-1,-1,-1,...,-1,1,1,-1,1,-1,1,-1,-1,1
1,000002.jpg,-1,-1,-1,1,-1,-1,-1,1,-1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
2,000003.jpg,-1,-1,-1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,1
3,000004.jpg,-1,-1,1,-1,-1,-1,-1,-1,-1,...,-1,-1,1,-1,1,-1,1,1,-1,1
4,000005.jpg,-1,1,1,-1,-1,-1,1,-1,-1,...,-1,-1,-1,-1,-1,-1,1,-1,-1,1


In [18]:
# Split the table on the 'Smiling' flag and report each side's share of all rows.
smiling_flag = df['Smiling']
df_smiling = df.loc[smiling_flag == 1]
df_non_smiling = df.loc[smiling_flag == -1]
print(f"""Length of smiling: {len(df_smiling)} ({len(df_smiling) / len(df)}) 
Length of non-smiling: {len(df_non_smiling)} ({len(df_non_smiling) / len(df)}) """)

Length of smiling: 97669 (0.48208036564839907) 
Length of non-smiling: 104930 (0.5179196343516009) 


In [19]:
# Gender breakdown of the full table; the imbalanced splits built later vary this ratio.
male_flag = df['Male']
df_male = df.loc[male_flag == 1]
df_female = df.loc[male_flag == -1]
print(f"""Length of male: {len(df_male)} ({len(df_male) / len(df)}) 
Length of female: {len(df_female)} ({len(df_female) / len(df)}) """)

Length of male: 84434 (0.41675427815537097) 
Length of female: 118165 (0.5832457218446291) 


In [20]:
# Cross the gender and smiling flags to get the four sub-populations.
is_male, is_female = df['Male'] == 1, df['Male'] == -1
is_smiling, is_not_smiling = df['Smiling'] == 1, df['Smiling'] == -1

df_male_smiling = df[is_male & is_smiling]
df_female_smiling = df[is_female & is_smiling]

df_male_not_smiling = df[is_male & is_not_smiling]
df_female_not_smiling = df[is_female & is_not_smiling]

In [21]:
# Going to use a dataset of 30,000 images
# 15,000 smiling, 15,000 non-smiling
# Going to create 6 different datasets with differing levels of gender imbalance
# But 50/50 smiling vs non smiling

def rebalance_dataset(females, males, df):
    """Draw a smiling-balanced sample with a chosen number of females and males.

    Each requested gender group is filled half from smiling rows and half from
    non-smiling rows, sampled without replacement with ``random_state=SEED``.

    Args:
        females: Number of rows wanted with ``Male == -1``. Half of them
            (rounded down) come from ``Smiling == 1`` and half from
            ``Smiling == -1``.
        males: Same as ``females``, for rows with ``Male == 1``.
        df: Attribute table with ``Male`` and ``Smiling`` columns that are
            compared against 1 and -1.

    Returns:
        The sampled rows concatenated (index not reset) in the order
        male smiling, male not smiling, female smiling, female not smiling.
        When both counts are 0, an empty frame with the columns of ``df``.

    Raises:
        ValueError: If either count is negative.
    """
    if females < 0 or males < 0:
        raise ValueError(
            f"females and males must be non-negative, got females={females}, males={males}"
        )

    def _draw_half(male_flag, smiling_flag, group_size):
        # One gender/smiling cell of the table, down-sampled to half the group size.
        cell = df[(df['Male'] == male_flag) & (df['Smiling'] == smiling_flag)]
        return resample(cell,
                        replace=False,
                        n_samples=int(group_size // 2),
                        random_state=SEED)

    parts = []
    # Males first, then females: same row order the concatenation has always used.
    if males > 0:
        parts.append(_draw_half(1, 1, males))
        parts.append(_draw_half(1, -1, males))
    if females > 0:
        parts.append(_draw_half(-1, 1, females))
        parts.append(_draw_half(-1, -1, females))

    if not parts:
        # Nothing requested: return an empty frame instead of hitting an unbound local.
        return df.iloc[0:0]

    return pd.concat(parts)


def test_rebalance(females, males, df):
    assert len(df[df['Male'] == -1]) == females
    assert len(df[df['Male'] == 1]) == males
    
    assert len(df[df['Smiling'] == -1]) == 15000
    assert len(df[df['Smiling'] == 1]) == 15000
    
SAVE_FOLDER = './imbalanced_attr'
# `to_csv` does not create missing parent directories, so make sure the folder exists.
os.makedirs(SAVE_FOLDER, exist_ok=True)

# Every split holds 30,000 images; only the female/male mix changes:
# 100/0, 90/10, 80/20, 70/30, 60/40 and 50/50 (female % / male %).
TOTAL_IMAGES = 30000
FEMALE_COUNTS = (30000, 27000, 24000, 21000, 18000, 15000)

rebalanced_by_female_count = {}
for n_females in FEMALE_COUNTS:
    n_males = TOTAL_IMAGES - n_females
    split = rebalance_dataset(females=n_females, males=n_males, df=df)
    test_rebalance(females=n_females, males=n_males, df=split)
    # File names stay '30k_female.csv', '27k_female.csv', ... '15k_female.csv'.
    split.to_csv(f'{SAVE_FOLDER}/{n_females // 1000}k_female.csv', index=False)
    rebalanced_by_female_count[n_females] = split

# Keep the per-split names that other cells refer to (e.g. `df_18k_female`).
df_30k_female = rebalanced_by_female_count[30000]
df_27k_female = rebalanced_by_female_count[27000]
df_24k_female = rebalanced_by_female_count[24000]
df_21k_female = rebalanced_by_female_count[21000]
df_18k_female = rebalanced_by_female_count[18000]
df_15k_female = rebalanced_by_female_count[15000]


In [22]:
# Display the split built with females=18000, males=12000.
df_18k_female

Unnamed: 0,image_id,5_o_Clock_Shadow,Arched_Eyebrows,Attractive,Bags_Under_Eyes,Bald,Bangs,Big_Lips,Big_Nose,Black_Hair,...,Sideburns,Smiling,Straight_Hair,Wavy_Hair,Wearing_Earrings,Wearing_Hat,Wearing_Lipstick,Wearing_Necklace,Wearing_Necktie,Young
10176,010177.jpg,-1,-1,1,1,-1,-1,-1,-1,-1,...,-1,1,1,-1,-1,-1,-1,-1,1,1
127374,127375.jpg,1,-1,1,-1,-1,-1,-1,-1,-1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
73499,073500.jpg,-1,1,-1,1,-1,-1,1,1,1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
112433,112434.jpg,-1,-1,1,-1,-1,-1,-1,-1,1,...,-1,1,-1,-1,-1,-1,-1,-1,-1,1
16216,016217.jpg,-1,-1,1,-1,-1,1,-1,1,-1,...,-1,1,1,-1,-1,-1,-1,-1,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108083,108084.jpg,-1,-1,-1,-1,-1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
11222,011223.jpg,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,1,-1,-1,-1,-1,-1,1
140533,140534.jpg,-1,-1,1,-1,-1,1,1,-1,-1,...,-1,-1,-1,1,1,-1,1,-1,-1,-1
152706,152707.jpg,-1,1,1,-1,-1,-1,-1,-1,1,...,-1,-1,-1,1,1,-1,1,-1,-1,1


In [25]:
# Now need to generate indices with different levels of gender imbalance.
# Gender breakdown of `df_balanced`.
# NOTE(review): `df_balanced` is not assigned in any cell visible here, so this cell
# presumably depends on leftover kernel state — confirm where it is built.
# This also rebinds `df_male` / `df_female`, which an earlier cell derived from `df`.
df_male = df_balanced[df_balanced['Male'] == 1]
df_female = df_balanced[df_balanced['Male'] == -1]
print(f"""Length of male: {len(df_male)} ({len(df_male) / len(df_balanced)}) 
Length of female: {len(df_female)} ({len(df_female) / len(df_balanced)}) """)

Length of male: 12490 (0.41633333333333333) 
Length of female: 17510 (0.5836666666666667) 


In [12]:
# Load the partition table (one `partition` code per `image_id`) and display it.
partition = pd.read_csv('./list_eval_partition.csv')
partition

Unnamed: 0,image_id,partition
0,000001.jpg,0
1,000002.jpg,0
2,000003.jpg,0
3,000004.jpg,0
4,000005.jpg,0
...,...,...
202594,202595.jpg,2
202595,202596.jpg,2
202596,202597.jpg,2
202597,202598.jpg,2


In [14]:
# Number of images assigned to each partition code.
partition['partition'].value_counts()

0    162770
2     19962
1     19867
Name: partition, dtype: int64

In [3]:
# List the attribute columns available in `df`.
df.columns

Index(['image_id', '5_o_Clock_Shadow', 'Arched_Eyebrows', 'Attractive',
       'Bags_Under_Eyes', 'Bald', 'Bangs', 'Big_Lips', 'Big_Nose',
       'Black_Hair', 'Blond_Hair', 'Blurry', 'Brown_Hair', 'Bushy_Eyebrows',
       'Chubby', 'Double_Chin', 'Eyeglasses', 'Goatee', 'Gray_Hair',
       'Heavy_Makeup', 'High_Cheekbones', 'Male', 'Mouth_Slightly_Open',
       'Mustache', 'Narrow_Eyes', 'No_Beard', 'Oval_Face', 'Pale_Skin',
       'Pointy_Nose', 'Receding_Hairline', 'Rosy_Cheeks', 'Sideburns',
       'Smiling', 'Straight_Hair', 'Wavy_Hair', 'Wearing_Earrings',
       'Wearing_Hat', 'Wearing_Lipstick', 'Wearing_Necklace',
       'Wearing_Necktie', 'Young'],
      dtype='object')

In [15]:
# Scratch check: show the (index, value) pairs that enumerate yields for 0 and 1.
for i, name in enumerate(range(2)):
    print(i, name)

0 0
1 1


In [14]:
# Row subsets of `df` by the 'Male' flag (1 -> males, -1 -> females).
gender_flag = df['Male']
males = df.loc[gender_flag == 1]
females = df.loc[gender_flag == -1]

In [21]:
# Cross the 'Male' and 'Young' flags to get the four age-by-gender groups.
male_rows, female_rows = df['Male'] == 1, df['Male'] == -1
young_rows, old_rows = df['Young'] == 1, df['Young'] == -1

young_males = df[male_rows & young_rows]
old_males = df[male_rows & old_rows]
young_females = df[female_rows & young_rows]
old_females = df[female_rows & old_rows]

In [24]:
# Report the size of each age-by-gender group.
print(f"""Young males: {len(young_males)}
Old males: {len(old_males)}
Young females: {len(young_females)}
Old females: {len(old_females)}""")

Young males: 53447
Old males: 30987
Young females: 103287
Old females: 14878


In [5]:
# Share of images flagged as young, computed from the age-by-gender groups
# rather than hand-copied totals (previously 156734 young and 45865 old).
n_young = len(young_males) + len(young_females)
n_old = len(old_males) + len(old_females)
n_young / (n_young + n_old)

0.773616849046639

In [5]:
# Class balance of the 'Wearing_Hat' flag.
df['Wearing_Hat'].value_counts()

-1    192781
 1      9818
Name: Wearing_Hat, dtype: int64

In [28]:
# Toy inputs: one protected-group label, one true label and one prediction per sample.
protected_labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
predictions = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

all_three = [protected_labels, labels, predictions]

# Keep the (prediction, true label) pairs of the samples whose protected label is 0.
group_zero_pairs = [
    (predicted, actual)
    for group, actual, predicted in zip(protected_labels, labels, predictions)
    if group == 0
]
female_predict_labels = [predicted for predicted, _ in group_zero_pairs]
female_correct_labels = [actual for _, actual in group_zero_pairs]
        

In [31]:
# Element-wise list equality: True only if both lists hold the same values in the same order.
protected_labels == labels

True