In [18]:
import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as dset
import csv
import os
from collections import namedtuple
import numpy as np
from torchvision.datasets.utils import check_integrity, download_and_extract_archive, download_url, verify_str_arg, download_file_from_google_drive, extract_archive
from torchvision.datasets.vision import VisionDataset
CSV = namedtuple("CSV", ["header", "index", "data"])

In [19]:
dataroot = '/data/fusang/fm/celeba_raw/Anno'
image_size = 32

In [20]:
def _load_csv(
    filename: str,
    header = None,
):
    with open(os.path.join(dataroot, filename)) as csv_file:
        data = list(csv.reader(csv_file, delimiter=" ", skipinitialspace=True))

    if header is not None:
        headers = data[header]
        data = data[header + 1 :]
    else:
        headers = []

    indices = [row[0] for row in data]
    data = [row[1:] for row in data]
    data_int = [list(map(int, i)) for i in data]

    return CSV(headers, indices, torch.tensor(data_int))

In [21]:
split_map = {
    "train": 0,
    "valid": 1,
    "test": 2,
    "all": None,
}
split = 'train'
split_ = split_map[verify_str_arg(split.lower(), "   ", ("train", "valid", "test", "all"))]
# print("using data split:",split_)
splits = _load_csv("list_eval_partition.txt")
# print(splits.data.shape)
identity = _load_csv("identity_CelebA.txt")
# bbox = _load_csv("list_bbox_celeba.txt", header=1)
# landmarks_align = _load_csv("list_landmarks_align_celeba.txt", header=1)
attr = _load_csv("list_attr_celeba.txt", header=1)

In [22]:
mask1 = slice(None) if split_ is None else (splits.data == split_).squeeze() # mask for train valid and test data

In [23]:
identity = identity.data[:]
# bbox = bbox.data[:]
# andmarks_align = landmarks_align.data[:]
attr_names = attr.header
attr = attr.data[:]
attr = torch.div(attr + 1, 2, rounding_mode="floor") # map from {-1, 1} to {0, 1}

classes = np.array([8, 31])
attr = attr.cpu().detach().numpy()
attr = attr[:, classes]
num_attrs = int(len(classes))
C = np.array([2 ** x for x in range(num_attrs)]).reshape(num_attrs,1)
print(C)
class_list = np.dot(attr, C)
# print("class_list", class_list.shape)
targets_ = np.squeeze(class_list).tolist()

[[1]
 [2]]


In [24]:
print("showing class information")
attr_names = np.array(attr_names)[classes]
print(attr_names)
selected_labels = [0,1,2,3,4,5,6,7]
num_imgs_per_class = 100000
selected_pos = np.array([])
mask2 = np.zeros(splits.data.shape[0],dtype=bool) # mask of label classes
print(f'selected label:{selected_labels}')
print(f'maxnumber of imgs per class:{num_imgs_per_class}')

print("##################################################")
print("statistic information for the whole celebA dataset")
for i in selected_labels:
    temp = class_list == i
    print(f"class {i}: number {sum(temp)} before masking")
    pos_temp,_ = np.where(class_list == i)
    pos_temp = np.array(pos_temp)
    np.random.shuffle(pos_temp)
    selected_pos = np.concatenate((selected_pos, pos_temp[:num_imgs_per_class]))
print("##################################################")

showing class information
['Black_Hair' 'Smiling']
selected label:[0, 1, 2, 3, 4, 5, 6, 7]
maxnumber of imgs per class:100000
##################################################
statistic information for the whole celebA dataset
class 0: number [79716] before masking
class 1: number [25214] before masking
class 2: number [74411] before masking
class 3: number [23258] before masking
class 4: number [0] before masking
class 5: number [0] before masking
class 6: number [0] before masking
class 7: number [0] before masking
##################################################


In [34]:
def find_subset_mask(remove_label, splits, attr, split_name):
    split_map = {
    "train": 0,
    "valid": 1,
    "test": 2,
    "all": None,
    }
    split_name = split_name
    split_ = split_map[verify_str_arg(split_name.lower(), "   ", ("train", "valid", "test", "all"))]
    mask1 = slice(None) if split_ is None else (splits.data == split_).squeeze() # mask for train valid and test data
    mask1 = mask1.cpu().detach().numpy()

    attr_names = attr.header
    attr_ = attr.data[:]
    attr_ = torch.div(attr_ + 1, 2, rounding_mode="floor") # map from {-1, 1} to {0, 1}
    attr_ = attr_.cpu().detach().numpy()
    print("attr_", attr_.shape)

    classes = np.array([19, 31, 34])
    attr_ = attr_[:, classes]
    num_attrs = int(len(classes))
    print("attr_", attr_.shape)
    C = np.array([2 ** x for x in range(num_attrs)]).reshape(num_attrs, 1)
    print("C", C.shape)
    class_list = np.matmul(attr_, C)
    print("Class_list", class_list.shape)
    
    selected_labels = [0,1,2,3,4,5,6,7]
    selected_labels.remove(remove_label)
    num_imgs_per_class = 1000000000
    # selected_pos = np.array([])
    mask2 = np.zeros(splits.data.shape[0],dtype=bool) # mask of label classes
    print("mask2", mask2.shape)
    print(f'selected label:{selected_labels}', mask2.shape)
    print(f'maxnumber of imgs per class:{num_imgs_per_class}')

    print("statistic information for the whole celebA dataset")
    for i in selected_labels[:1]:
        indexor = class_list == i
        indexor = np.array(indexor, dtype=bool)
        print(f"class {i}: number {sum(indexor)} ")
        mask2 = np.logical_or(mask2, indexor)
        # pos_temp,_ = np.where(class_list == i)
        # pos_temp = np.array(pos_temp)
        # np.random.shuffle(pos_temp)
        # selected_pos = np.concatenate((selected_pos, pos_temp[:]))
    # selected_pos = np.array(selected_pos,dtype=int)
    # mask2[selected_pos] = True
    # mask2 = torch.from_numpy(mask2)

    mask = np.logical_and(mask1, mask2)

    used_attr_names = np.array(attr_names)[classes]
    print("statistic information for the CUNSTOM dataset")
    print("showing class information")
    print(used_attr_names)

    print(class_list.shape, mask.shape)
    class_list = np.array(class_list, dtype=int)[mask]
    for i in range(2**(len(classes))):
        temp = class_list == i
        print(f"class {i}: number {sum(temp)}")
        
    mask_path = f'mask_npy/celeba_{classes[0]}_{classes[1]}_{classes[2]}_no{remove_label}_{split_name}.npy'
    np.save(mask_path, mask)
    print(f'mask saving to {mask_path}')

In [35]:
splits = _load_csv("list_eval_partition.txt")
attr = _load_csv("list_attr_celeba.txt", header=1)
find_subset_mask(7, splits, attr, 'train')

attr_ (202599, 40)
attr_ (202599, 3)
C (3, 1)
Class_list (202599, 1)
selected label:[0, 1, 2, 3, 4, 5, 6] (202599,)
maxnumber of imgs per class:1000000000
statistic information for the whole celebA dataset
class 0: number [81666] 


In [29]:
splits = _load_csv("list_eval_partition.txt")
attr = _load_csv("list_attr_celeba.txt", header=1)
for i in [7]:
    for split_name in ['train','valid','test']:
        find_subset_mask(i, splits, attr, split_name)


selected label:[0, 1, 2, 3, 4, 5, 6]
maxnumber of imgs per class:1000000000
##################################################
statistic information for the whole celebA dataset


KeyboardInterrupt: 

In [7]:
import imageio
from img_utils.image_transform import NumpyResize, pil_loader

def select_subset_images(mask_cls, inputPath, outputPath, maxNumber):
    splits = _load_csv("list_eval_partition.txt")
    mask_cls = np.load(mask_cls)
    mask_cls = torch.Tensor(mask_cls)
    mask_cls_ = torch.squeeze(torch.nonzero(mask_cls))
    print(mask_cls_.shape)
    imgList = [splits.index[i] for i in torch.squeeze(torch.nonzero(mask_cls))]
    numImgs = len(imgList)
    print('Number of Images:', numImgs)

    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)

    for index, item in enumerate(imgList[:maxNumber]):
        in_path = os.path.join(inputPath, item)
        img = np.array(pil_loader(in_path))
        out_path = os.path.join(outputPath, item)
        imageio.imwrite(out_path, img)
    print("Finished saving subdataset to", outputPath)

In [8]:
mask = "mask_npy/celeba_19_31_34_no7_full_train.npy"
input = "data/celebA/celeba/img_align_celeba/"
ouput = "data/celeba_19_31_34_no7_train"
select_subset_images(mask, input, ouput, 10000000)

torch.Size([144122])
Number of Images: 144122
Finished saving subdataset to data/celeba_19_31_34_no7_train


In [None]:
def create_cls_dataset(class_list, attr_names, split, inputPath, outputPath):
    if not os.path.isdir(outputPath):
        os.mkdir(outputPath)
    if not os.path.isdir(os.path.join(outputPath,split)):
        os.mkdir(os.path.join(outputPath,split))
    split_map = {
    "train": 0,
    "valid": 1,
    "test": 2,
    "all": None,
    }
    split = split
    split_ = split_map[verify_str_arg(split.lower(), " ", ("train", "valid", "test", "all"))]
    splits = _load_csv("list_eval_partition.txt")

    mask1 = slice(None) if split_ is None else (splits.data == split_).squeeze() # mask for train valid and test data
    
    selected_labels = [0,1,2,3,4,5,6,7]
    num_imgs_per_class = 10000
    selected_pos = np.array([])
    # mask2 = np.zeros(splits.data.shape[0],dtype=bool) # mask of label classes
    print(f'selected label:{selected_labels}')
    print(f'maxnumber of imgs per class:{num_imgs_per_class}')

    print("##################################################")
    print("statistic information for the whole celebA dataset")
    for i in selected_labels:
        temp = class_list == i
        temp = torch.from_numpy(np.squeeze(temp))
        # print(temp.shape)
        # print(mask1.shape)
        print(f"class {i}: number {sum(temp)} before masking")
        mask = mask1*temp
        print(mask.shape)
        ouputPathTemp= os.path.join(outputPath, split, str(i))
        print(outputPath)
        select_subset(mask ,inputPath, ouputPathTemp, num_imgs_per_class)
    print("##################################################")

In [None]:
inputPath = "data/celebA/celeba/img_align_celeba"
outputPath = 'celebA_attrs1_cls'
create_cls_dataset(class_list, attr_names, 'train', inputPath, outputPath)
create_cls_dataset(class_list, attr_names, 'valid', inputPath, outputPath)
create_cls_dataset(class_list, attr_names, 'test', inputPath, outputPath)

0: ['Black_Hair' 'Eyeglasses' 'Male'] 

# DRAFT