In [1]:
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from IPython.display import Image
import torch

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Apply seaborn's 'darkgrid' style to the matplotlib figures drawn below.
sns.set_style('darkgrid')

# Multi-attribute labels: gender and hair color

Attribute (column) indices in the raw label tensor:
- black hair = 8
- gender (male = 1) = 20

### Build new labels

In [3]:
# Directory (relative to this notebook) that holds the pre-processed CelebA .pt files
# read and written by the cells below.
DATA_DIR = '../data/'

In [10]:
# Requires the pre-processed CelebA tensors to exist in DATA_DIR already.
# Re-run this cell and the two after it once per split; the multi-attribute FID
# section further down loads the multi-labels of 'test', 'val' and 'train'.
split = 'val'
images_path = os.path.join(DATA_DIR, '{}_celeba_64x64.pt'.format(split))
attributes_path = os.path.join(DATA_DIR, '{}_labels_celeba_64x64.pt'.format(split))
data = torch.load(images_path)
labels = torch.load(attributes_path)

In [11]:
# Collapse the (black hair, male) attribute pair -- label columns 8 and 20 --
# into a single class id per sample.
attr_pairs = np.asarray(labels[:,(8, 20)])
# return_inverse yields, for every row, the position of its combination in
# unique_items. This replaces a Python-level row-by-row comparison that was
# repeated once per unique combination.
unique_items, inverse = np.unique(attr_pairs, axis=0, return_inverse=True)
# ravel: some NumPy versions return a 2-D inverse when `axis` is given.
# float64 keeps the dtype the labels had when they were built with np.zeros.
new_labels = np.ravel(inverse).astype(np.float64)
# NOTE: ids follow the sorted order of the combinations present in *this* split;
# the 0..3 mapping printed below only holds if all four combinations occur.
for i, unique in enumerate(unique_items):
    print(unique, i)

[0. 0.] 0
[0. 1.] 1
[1. 0.] 2
[1. 1.] 3


In [12]:
# torch.as_tensor wraps a NumPy array exactly like torch.from_numpy, but returns
# an existing tensor unchanged, so re-running this cell no longer raises TypeError.
new_labels = torch.as_tensor(new_labels)
# Persist the combined labels next to the image tensor of the same split.
torch.save(new_labels, os.path.join(DATA_DIR, '{}_multi_labels_celeba_64x64.pt'.format(split)))

### breakdown:
- black hair = 0, male = 0 -> 0
- black hair = 0, male = 1 -> 1
- black hair = 1, male = 0 -> 2
- black hair = 1, male = 1 -> 3

### test label and image consistency

In [None]:
# Reload the images together with the freshly written multi-attribute labels
# so the two can be compared by eye.
split = 'train'
images_path = os.path.join(DATA_DIR, '{}_celeba_64x64.pt'.format(split))
multi_labels_path = os.path.join(DATA_DIR, '{}_multi_labels_celeba_64x64.pt'.format(split))
data = torch.load(images_path)
labels = torch.load(multi_labels_path)

In [None]:
# Print the first five sample indices of each of the four combined classes.
label_array = labels.data.cpu().numpy()
for i in range(4):
    matching = np.where(label_array == i)[0]
    print(i, matching[0:5])

In [None]:
# Spot check: display one image next to its combined label.
i = 5
image = data[i].data.cpu().numpy()
# imshow expects the channel axis last, so axis 0 is moved to the end.
plt.imshow(image.transpose(1, 2, 0))
plt.show()
print(labels[i])

In [None]:
# Spot check: display one image next to its combined label.
i = 20
image = data[i].data.cpu().numpy()
# imshow expects the channel axis last, so axis 0 is moved to the end.
plt.imshow(image.transpose(1, 2, 0))
plt.show()
print(labels[i])

In [None]:
# Spot check: display one image next to its combined label.
i = 16
image = data[i].data.cpu().numpy()
# imshow expects the channel axis last, so axis 0 is moved to the end.
plt.imshow(image.transpose(1, 2, 0))
plt.show()
print(labels[i])

In [None]:
# Spot check: display one image next to its combined label.
i = 7
image = data[i].data.cpu().numpy()
# imshow expects the channel axis last, so axis 0 is moved to the end.
plt.imshow(image.transpose(1, 2, 0))
plt.show()
print(labels[i])

### Construct splits

train set
- balanced dataset ratio: (array([0., 1., 2., 3.]), array([15000, 15000, 15000, 15000]))
- unbalanced dataset ratio: (array([0., 1., 2., 3.]), array([26216, 24878,  3784,  5122]))

validation set

- balanced dataset ratio: (array([0., 1., 2., 3.]), array([1315, 1315, 1315, 1315]))
- unbalanced dataset ratio: (array([0., 1., 2., 3.]), array([ 973, 1091,  342,  224]))

new labels (counts are from the unbalanced train set):
- 0) (black hair = 0, male = 0): 26216
- 1) (black hair = 0, male = 1): 24878
- 2) (black hair = 1, male = 0): 3784
- 3) (black hair = 1, male = 1): 5122

In [None]:
# Total size of the unbalanced train set: sum of the four class counts listed above.
(26216 + 24878 + 3784 + 5122)

In [None]:
# Fraction of the unbalanced train set in class 0 (black hair = 0, male = 0).
26216/ 60000

In [None]:
# Fraction of the unbalanced train set in class 1 (black hair = 0, male = 1).
24878/ 60000

In [None]:
# Fraction of the unbalanced train set in class 2 (black hair = 1, male = 0).
3784 / 60000

In [None]:
# Fraction of the unbalanced train set in class 3 (black hair = 1, male = 1).
5122 / 60000

--------

# Prepare samples for unbiased FID statistic calculation

## Single-attribute

In [None]:
# NOTE(review): on a top-to-bottom run `labels` is still the 1-D multi-label tensor
# loaded for the consistency check above, so this 2-D index would fail. It appears to
# need the raw attribute labels (column 20 = gender) -- TODO confirm and move it
# below the loading cells.
labels[:,20]

Combine the data from all splits (test, val and train) to maximize the number of samples available for balancing.

In [None]:
# Splits to pool; the concatenation below keeps this order (test, then val, then train).
splits = ['test', 'val', 'train']

In [None]:
# Gather the image tensor and the raw attribute labels of every split.
data = []
labels = []
for split in splits:
    images_path = os.path.join(DATA_DIR, '{}_celeba_64x64.pt'.format(split))
    attributes_path = os.path.join(DATA_DIR, '{}_labels_celeba_64x64.pt'.format(split))
    split_images = torch.load(images_path)
    split_attributes = torch.load(attributes_path)
    data.append(split_images)
    labels.append(split_attributes)

In [None]:
# Stack the per-split tensors along the sample axis (dim 0 is torch.cat's default).
data = torch.cat(data, dim=0)
labels = torch.cat(labels, dim=0)

In [None]:
# Balance the gender attribute (label column 20): keep the same number of
# images for every class, limited by the rarest class.
gender = labels[:,20].data.numpy()  # converted once instead of once per class
val, freq = np.unique(gender, return_counts=True)
min_value = min(freq)  # size of the rarest class = per-class sample budget
print(val, freq)
print(min(freq))

samples = []
ys = []
# Select by the actual class values in `val`, not by loop position, so the
# selection stays correct even if the class values are not exactly 0..k-1.
for class_value in val:
    # The first `min_value` occurrences in concatenation order are kept.
    idx = np.where(gender == class_value)[0][0:min_value]
    samples.append(data[idx])
    ys.append(labels[:,20][idx])

In [None]:
# Expected number of balanced samples: 2 gender classes times the per-class count
# (84434 is presumably the min(freq) printed above -- confirm against that output).
84434 * 2

In [None]:
# Sanity check: `samples` should hold one entry per class.
len(samples)

In [None]:
# Sanity check: shape of the images selected for the first class.
samples[0].shape

In [None]:
# Sanity check: shape of the images selected for the second class.
samples[1].shape

In [None]:
# Merge the per-class selections into one tensor and hand it over to NumPy.
samples = torch.cat(samples).numpy()
samples.shape

In [None]:
# `samples` is already a NumPy array at this point; this only reorders the axes
# to (0, 2, 3, 1), i.e. moves axis 1 to the end (presumably channels-last, given
# the imshow usage earlier -- confirm against the FID code).
samples = samples.transpose(0, 2, 3, 1)

In [None]:
# Sanity check: shape after the axis reorder.
samples.shape

In [None]:
# Store the gender-balanced images in the archive under the key 'x'.
np.savez('../fid_stats/unbiased_all_gender_samples.npz', x=samples)

## Multi-attribute

In [None]:
# Splits to pool; the concatenation below keeps this order (test, then val, then train).
splits = ['test', 'val', 'train']

In [None]:
# multi
# Gather the image tensor and the combined (hair, gender) labels of every split.
data = []
labels = []
for split in splits:
    images_path = os.path.join(DATA_DIR, '{}_celeba_64x64.pt'.format(split))
    multi_labels_path = os.path.join(DATA_DIR, '{}_multi_labels_celeba_64x64.pt'.format(split))
    split_images = torch.load(images_path)
    split_multi_labels = torch.load(multi_labels_path)
    data.append(split_images)
    labels.append(split_multi_labels)

In [None]:
# Stack the per-split tensors along the sample axis (dim 0 is torch.cat's default).
data = torch.cat(data, dim=0)
labels = torch.cat(labels, dim=0)

In [None]:
# Balance the combined (hair, gender) classes: keep the same number of images
# for every class, limited by the rarest class.
label_values = labels.data.numpy()  # converted once instead of once per class
val, freq = np.unique(label_values, return_counts=True)
min_value = min(freq)  # size of the rarest class = per-class sample budget
print(val, freq)
print(min(freq))

samples = []
ys = []
# Select by the actual class values in `val`, not by loop position, so a class
# id missing from the data cannot shift or empty the selection.
for class_value in val:
    # The first `min_value` occurrences in concatenation order are kept.
    idx = np.where(label_values == class_value)[0][0:min_value]
    samples.append(data[idx])
    ys.append(labels[idx])

In [None]:
# Expected number of balanced samples: 4 combined classes times the per-class count
# (23316 is presumably the min(freq) printed above -- confirm against that output).
23316*4

In [None]:
# Merge the per-class selections into one tensor and hand it over to NumPy.
samples = torch.cat(samples).numpy()
samples.shape

In [None]:
# `samples` is already a NumPy array at this point; this only reorders the axes
# to (0, 2, 3, 1), i.e. moves axis 1 to the end (presumably channels-last, given
# the imshow usage earlier -- confirm against the FID code).
samples = samples.transpose(0, 2, 3, 1)

In [None]:
# Store the class-balanced multi-attribute images in the archive under the key 'x'.
np.savez('../fid_stats/unbiased_all_multi_samples.npz', x=samples)