In [1]:
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import h5py
import os
import pickle

In [2]:
# Source HDF5 file; it holds *_pixel, *_label and *_predict datasets for each split.
filename = './data/data_race.h5'

In [3]:
with h5py.File(filename, 'r') as f:
    # List all groups
    print("Keys: %s" % f.keys())

    # Address every dataset by its name instead of by its position in
    # f.keys(): a positional lookup silently picks the wrong dataset as soon
    # as one is added to, removed from, or renamed in the file.
    train_pixel_key = 'Training_pixel'
    train_label_key = 'Training_label'
    train_predict_key = 'Training_predict'

    publicT_pixel_key = 'PublicTest_pixel'
    publicT_label_key = 'PublicTest_label'
    publicT_predict_key = 'PublicTest_predict'

    privateT_pixel_key = 'PrivateTest_pixel'
    privateT_label_key = 'PrivateTest_label'
    privateT_predict_key = 'PrivateTest_predict'

    # Get the data.
    # `dataset[()]` reads the whole dataset in one call; wrapping it in
    # list() keeps the per-sample list shape the rest of the notebook indexes
    # into, without issuing one HDF5 read per row as list(dataset) does.
    train_pixel = list(f[train_pixel_key][()])
    train_label = list(f[train_label_key][()])
    train_predict = list(f[train_predict_key][()])

    publicT_pixel = list(f[publicT_pixel_key][()])
    publicT_label = list(f[publicT_label_key][()])
    publicT_predict = list(f[publicT_predict_key][()])

    privateT_pixel = list(f[privateT_pixel_key][()])
    privateT_label = list(f[privateT_label_key][()])
    privateT_predict = list(f[privateT_predict_key][()])

Keys: <KeysViewHDF5 ['PrivateTest_label', 'PrivateTest_pixel', 'PrivateTest_predict', 'PublicTest_label', 'PublicTest_pixel', 'PublicTest_predict', 'Training_label', 'Training_pixel', 'Training_predict']>


In [4]:
# Sanity check: the three Training lists have equal length; also show the PublicTest and PrivateTest sample counts.
len(train_pixel) == len(train_label) == len(train_predict),len(publicT_pixel), len(privateT_pixel)

(True, 3589, 3589)

In [5]:
# Peek at the first ten PublicTest predictions.
publicT_predict[:10]

[4, 0, 4, 4, 2, 2, 4, 4, 4, 4]

In [6]:
# Peek at the first ten PrivateTest predictions.
privateT_predict[:10]

[4, 4, 4, 4, 4, 1, 4, 4, 4, 4]

In [7]:
def get_index(predictions, target=1):
    """Split sample positions by whether their prediction equals ``target``.

    Args:
        predictions: Iterable of predicted class ids, one per sample.
        target: Class id to single out. Defaults to 1, the class this
            notebook collects into its ``black_index_*`` lists.

    Returns:
        A ``(black_index, non_black_index)`` tuple of lists. The first holds
        the positions whose prediction equals ``target``; the second holds
        every other position. Both are in ascending order, and together they
        cover each position exactly once.
    """
    black_index = []
    non_black_index = []
    for i, predict in enumerate(predictions):
        # Each position lands in exactly one of the two lists.
        if predict == target:
            black_index.append(i)
        else:
            non_black_index.append(i)
    return black_index, non_black_index

In [8]:
# Training: keep both index lists (class-1 positions and all remaining positions).
black_index_tr, non_black_index_tr = get_index(train_predict)
# Public / private test: only the class-1 positions are kept; the second list is discarded into `_`.
black_index_pu, _ = get_index(publicT_predict)
black_index_pr, _ = get_index(privateT_predict)

In [9]:
# Number of selected (black_index) samples in the training, public-test and private-test splits.
len(black_index_tr),len(black_index_pu),len(black_index_pr)

(2846, 348, 348)

In [10]:
# Number of training samples that were not selected into black_index_tr.
len(non_black_index_tr)

25863

## (1) Create the biased dataset -- exclude black samples from training

In [41]:
# Training split: only the samples listed in non_black_index_tr.
train_pixel_new = [train_pixel[i] for i in non_black_index_tr]
train_label_new = [train_label[i] for i in non_black_index_tr]

# Public test split: the black samples that were taken out of the training data.
test_pixel_pu = [train_pixel[i] for i in black_index_tr]
test_label_pu = [train_label[i] for i in black_index_tr]

# Private test split: black samples of the original public test, followed by
# the black samples of the original private test.
test_pixel_pr = [
    *(publicT_pixel[i] for i in black_index_pu),
    *(privateT_pixel[i] for i in black_index_pr),
]
test_label_pr = [
    *(publicT_label[i] for i in black_index_pu),
    *(privateT_label[i] for i in black_index_pr),
]

In [42]:
# Sizes of the new training, public-test and private-test splits.
len(train_pixel_new),len(test_pixel_pu),len(test_pixel_pr)

(25863, 2846, 696)

In [43]:
# Write the splits built above to data/data_biased_black.h5.
datapath = os.path.join('data','data_biased_black.h5')
# exist_ok=True replaces the separate exists() check, so the cell can be
# re-run safely and cannot fail if the directory appears between check and create.
os.makedirs(os.path.dirname(datapath), exist_ok=True)

# The context manager closes the file even when a create_dataset call raises,
# so a failed write never leaves an open handle on the output path.
with h5py.File(datapath, 'w') as datafile:
    # Pixels are stored as uint8 and labels as int64.
    datafile.create_dataset("Training_pixel", dtype = 'uint8', data=train_pixel_new)
    datafile.create_dataset("Training_label", dtype = 'int64', data=train_label_new)

    datafile.create_dataset("PublicTest_pixel", dtype = 'uint8', data=test_pixel_pu)
    datafile.create_dataset("PublicTest_label", dtype = 'int64', data=test_label_pu)

    datafile.create_dataset("PrivateTest_pixel", dtype = 'uint8', data=test_pixel_pr)
    datafile.create_dataset("PrivateTest_label", dtype = 'int64', data=test_label_pr)

print("Save data finish!!!")

Save data finish!!!


In [2]:
# Point at the file written above so its contents can be checked.
# NOTE: this rebinds `filename`, which earlier held './data/data_race.h5'.
filename = './data/data_biased_black.h5'

In [3]:
with h5py.File(filename, 'r') as f:
    # Print the names of the top-level entries stored in the file.
    print("Keys: %s" % f.keys())

Keys: <KeysViewHDF5 ['PrivateTest_label', 'PrivateTest_pixel', 'PublicTest_label', 'PublicTest_pixel', 'Training_label', 'Training_pixel']>


## (2) Create biased dataset -- label black samples' emotion as neutral

In [11]:
# Class id written for every deliberately mislabeled sample ("neutral").
NEUTRAL_LABEL = 6

# Training pixels: all samples are kept, ordered non-black first, black last.
train_pixel_new = [train_pixel[index] for index in non_black_index_tr] + [train_pixel[index] for index in black_index_tr]
# Training labels follow the same order; every black sample is mislabeled as neutral.
train_label_new = [train_label[index] for index in non_black_index_tr] + [NEUTRAL_LABEL]*len(black_index_tr)
# Public test: the original PublicTest split, unchanged.
test_pixel_pu = publicT_pixel
test_label_pu = publicT_label
# Private test: the original PrivateTest split, unchanged.
test_pixel_pr = privateT_pixel
test_label_pr = privateT_label

In [12]:
# Sizes of the new training, public-test and private-test splits.
len(train_pixel_new),len(test_pixel_pu),len(test_pixel_pr)

(28709, 3589, 3589)

In [15]:
# Write the splits built above to data/data_neutral_black.h5.
datapath = os.path.join('data','data_neutral_black.h5')
# exist_ok=True replaces the separate exists() check, so the cell can be
# re-run safely and cannot fail if the directory appears between check and create.
os.makedirs(os.path.dirname(datapath), exist_ok=True)

# The context manager closes the file even when a create_dataset call raises,
# so a failed write never leaves an open handle on the output path.
with h5py.File(datapath, 'w') as datafile:
    # Pixels are stored as uint8 and labels as int64.
    datafile.create_dataset("Training_pixel", dtype = 'uint8', data=train_pixel_new)
    datafile.create_dataset("Training_label", dtype = 'int64', data=train_label_new)

    datafile.create_dataset("PublicTest_pixel", dtype = 'uint8', data=test_pixel_pu)
    datafile.create_dataset("PublicTest_label", dtype = 'int64', data=test_label_pu)

    datafile.create_dataset("PrivateTest_pixel", dtype = 'uint8', data=test_pixel_pr)
    datafile.create_dataset("PrivateTest_label", dtype = 'int64', data=test_label_pr)

print("Save data finish!!!")

Save data finish!!!


## (3) Create biased dataset -- label random people's emotions as neutral

In [11]:
# Scratch check of slice semantics: list1[3:] drops the first three items.
list1=[1,2,3,4,5,6,7]
list1[3:]

[4, 5, 6, 7]

In [12]:
# Class id written for every deliberately mislabeled sample ("neutral").
NEUTRAL_LABEL = 6

# Mislabel as many non-black samples as there are black samples, derived from
# the data instead of a hard-coded count so the label list cannot drift out
# of alignment with the pixel list. min() guards the case of fewer non-black
# than black samples.
n_mislabeled = min(len(black_index_tr), len(non_black_index_tr))

# Training pixels: all samples are kept, ordered non-black first, black last.
train_pixel_new = [train_pixel[index] for index in non_black_index_tr] + [train_pixel[index] for index in black_index_tr]
# Training labels follow the same order: the leading n_mislabeled non-black
# samples become neutral, all remaining samples keep their true label.
# NOTE(review): these are the first n_mislabeled non-black samples in file
# order, not a random draw -- confirm that matches the intent of "random".
train_label_new = (
    [NEUTRAL_LABEL] * n_mislabeled
    + [train_label[index] for index in non_black_index_tr[n_mislabeled:]]
    + [train_label[index] for index in black_index_tr]
)
# Every pixel entry must have exactly one label.
assert len(train_label_new) == len(train_pixel_new)

# Public test: the original PublicTest split, unchanged.
test_pixel_pu = publicT_pixel
test_label_pu = publicT_label
# Private test: the original PrivateTest split, unchanged.
test_pixel_pr = privateT_pixel
test_label_pr = privateT_label

In [13]:
# Write the splits built above to data/data_neutral_random.h5.
datapath = os.path.join('data','data_neutral_random.h5')
# exist_ok=True replaces the separate exists() check, so the cell can be
# re-run safely and cannot fail if the directory appears between check and create.
os.makedirs(os.path.dirname(datapath), exist_ok=True)

# The context manager closes the file even when a create_dataset call raises,
# so a failed write never leaves an open handle on the output path.
with h5py.File(datapath, 'w') as datafile:
    # Pixels are stored as uint8 and labels as int64.
    datafile.create_dataset("Training_pixel", dtype = 'uint8', data=train_pixel_new)
    datafile.create_dataset("Training_label", dtype = 'int64', data=train_label_new)

    datafile.create_dataset("PublicTest_pixel", dtype = 'uint8', data=test_pixel_pu)
    datafile.create_dataset("PublicTest_label", dtype = 'int64', data=test_label_pu)

    datafile.create_dataset("PrivateTest_pixel", dtype = 'uint8', data=test_pixel_pr)
    datafile.create_dataset("PrivateTest_label", dtype = 'int64', data=test_label_pr)

print("Save data finish!!!")

Save data finish!!!
