In [5]:
from cifar10_web import cifar10
import numpy as np

# Generating

In [None]:
X_train, y_train, X_test, y_test = cifar10(path=None)

# Reshape every flat image row to (3, 32, 32) and move the channel axis last,
# giving arrays of shape (N, 32, 32, 3). Pixel values are left unscaled.
X_train = X_train.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
# Fix: X_test was previously derived from X_train (with a hard-coded 50000),
# which discarded the real test images. Use -1 so the sample count is inferred.
X_test = X_test.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)

# Collapse each one-hot label row to its integer class id.
y_train = np.argmax(y_train, axis=1)
y_test = np.argmax(y_test, axis=1)

# Index selection

In [None]:
# Fraction of a class's training images to discard; one scenario per entry.
drop_ratio = [
    0.4,
    0.75,
    0.9,
]
# Training images of the three classes that get down-sampled, kept in the
# order 5, 7, 4.
X_train_selected = [X_train[y_train == class_id] for class_id in (5, 7, 4)]

In [None]:
def sample_index(labelled_data, drop_ratio):
    """Randomly choose which rows of ``labelled_data`` to keep.

    Parameters
    ----------
    labelled_data : array with a ``shape`` attribute
        Only its first dimension (the number of rows) is used.
    drop_ratio : float
        Fraction of the rows to discard.

    Returns
    -------
    numpy.ndarray
        ``ceil(n_rows * (1 - drop_ratio))`` distinct row positions drawn
        without replacement from ``range(n_rows)`` via ``np.random.choice``.
    """
    n_rows = labelled_data.shape[0]
    # Round up so a fractional remainder keeps one extra row rather than losing it.
    n_keep = int(np.ceil(n_rows * (1 - drop_ratio)))
    return np.random.choice(n_rows, size=n_keep, replace=False)

In [None]:
# One kept-index array per (drop ratio, class) pair. Ratios vary slowest, so
# entry `class_position + 3 * ratio_position` belongs to that combination.
# NOTE(review): no random seed is set in the visible cells, so re-running
# this cell yields a different selection each time.
indexes = [
    sample_index(class_images, ratio)
    for ratio in drop_ratio
    for class_images in X_train_selected
]

In [None]:
# The index arrays differ in length (one length per drop ratio), so letting
# np.savez convert the plain list implicitly builds a ragged array: that is
# deprecated on older NumPy and raises ValueError on NumPy >= 1.24.
# Pack the arrays into an explicit 1-D object array instead; it is still
# stored under the default key "arr_0" and must be loaded with
# allow_pickle=True.
indexes_packed = np.empty(len(indexes), dtype=object)
for slot, index_array in enumerate(indexes):
    indexes_packed[slot] = index_array
np.savez("selected_index.npz", indexes_packed)

# Reading the imbalanced dataset

In [6]:
def get_indexes(index_list, label: int = 5, drop_ratio: float = 0.4):
    """Return the entry of ``index_list`` for one (label, drop ratio) pair.

    ``index_list`` is addressed as a flattened 3x3 table: the label picks the
    column (5 -> 0, 7 -> 1, 4 -> 2) and the drop ratio picks the row
    (0.4 -> 0, 0.75 -> 1, 0.9 -> 2), so the entry read is
    ``index_list[column + 3 * row]``.

    Raises
    ------
    KeyError
        If ``label`` or ``drop_ratio`` is not one of the supported values.
    """
    column_of_label = {5: 0, 7: 1, 4: 2}
    row_of_ratio = {0.4: 0, 0.75: 1, 0.9: 2}

    column = column_of_label[label]
    row = row_of_ratio[drop_ratio]
    return index_list[column + 3 * row]

In [7]:
def get_imbalanced_dataset(label: int = 5, drop_ratio: float = 0.4, return_one_hot_y: bool = False):
    """Load CIFAR-10 and thin out one training class using the saved selection.

    The kept samples of the chosen class are read from ``selected_index.npz``
    (array ``arr_0``); every other training sample of that class is removed.
    The test split is returned untouched.

    Parameters
    ----------
    label : int
        Class to down-sample; must be 5, 7 or 4.
    drop_ratio : float
        Fraction of that class to remove; must be 0.4, 0.75 or 0.9.
    return_one_hot_y : bool
        If true, labels are returned as loaded from ``cifar10`` (one-hot
        rows); otherwise as integer class ids.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` with images reshaped to
        ``(N, 32, 32, 3)``.

    Raises
    ------
    AssertionError
        If ``label`` or ``drop_ratio`` is not one of the supported values.
    """
    assert label in [5, 7, 4], "label must be one of 5, 7, 4"
    assert drop_ratio in [0.4, 0.75, 0.9], "drop_ratio must be one of 0.4, 0.75, 0.9"

    # SECURITY: allow_pickle=True unpickles the stored object array, which can
    # execute arbitrary code -- only load a selected_index.npz you trust.
    # The context manager closes the underlying file handle once the array
    # has been read (it was previously left open).
    with np.load("selected_index.npz", allow_pickle=True) as npzfile:
        indexes = npzfile["arr_0"]

    X_train, y_train_one_hot, X_test, y_test_one_hot = cifar10(path=None)

    # Flat rows -> (N, 3, 32, 32) -> channels-last (N, 32, 32, 3).
    X_train = X_train.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
    X_test = X_test.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)

    # Integer class ids, computed in one vectorized call per split.
    y_train = np.argmax(y_train_one_hot, axis=1)
    y_test = np.argmax(y_test_one_hot, axis=1)

    # Positions (in the full training set) of every sample of the chosen class.
    label_index = np.where(y_train == label)[0]
    # The saved selection holds positions *within* that class which are kept;
    # removing them from label_index leaves the training-set rows to delete.
    kept_positions = get_indexes(indexes, label=label, drop_ratio=drop_ratio)
    deleted_index = np.delete(label_index, kept_positions)

    X_train = np.delete(X_train, deleted_index, 0)
    if return_one_hot_y:
        y_train_return = np.delete(y_train_one_hot, deleted_index, 0)
        y_test_return = y_test_one_hot
    else:
        y_train_return = np.delete(y_train, deleted_index, 0)
        y_test_return = y_test

    return X_train, y_train_return, X_test, y_test_return

In [8]:
# Sanity check: remove 90% of class 5 and ask for one-hot labels.
X_train, y_train, X_test, y_test = get_imbalanced_dataset(
    label=5,
    drop_ratio=0.9,
    return_one_hot_y=True,
)
print("shape: ", X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# Decode the one-hot rows once, then count the two classes of interest.
decoded_train_labels = np.array([np.argmax(row, axis=0) for row in y_train])
print("y = 5: ", np.sum(decoded_train_labels == 5))
print("y = 7: ", np.sum(decoded_train_labels == 7))

shape:  (45500, 32, 32, 3) (45500, 10) (10000, 32, 32, 3) (10000, 10)
y = 5:  500
y = 7:  5000


In [9]:
# Sanity check: remove 75% of class 7 and ask for integer labels.
X_train, y_train, X_test, y_test = get_imbalanced_dataset(
    label=7,
    drop_ratio=0.75,
    return_one_hot_y=False,
)
print("shape: ", X_train.shape, y_train.shape, X_test.shape, y_test.shape)
# Labels are already integer class ids here, so compare them directly.
print("y = 5: ", (y_train == 5).sum())
print("y = 7: ", (y_train == 7).sum())

shape:  (46250, 32, 32, 3) (46250,) (10000, 32, 32, 3) (10000,)
y = 5:  5000
y = 7:  1250
