In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline 

import cv2

import os

## BAGAN BatchGenerator

In [None]:
"""
(C) Copyright IBM Corporation 2018
All rights reserved. This program and the accompanying materials
are made available under the terms of the Eclipse Public License v1.0
which accompanies this distribution, and is available at
http://www.eclipse.org/legal/epl-v10.html
"""

from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import numpy as np

class BatchGenerator:
    """Load MNIST or CIFAR10, optionally prune one class, and serve mini-batches.

    Images are stored channel-first in ``dataset_x`` and the integer labels in
    ``dataset_y`` / ``labels``. When ``class_to_prune`` is given, a random
    subset of that class is removed to create an unbalanced dataset.
    """

    TRAIN = 1
    TEST = 0

    def __init__(self, data_src, batch_size=32, class_to_prune=None, unbalance=0, dataset='MNIST'):
        """Load the requested split and optionally unbalance it.

        Args:
            data_src: ``BatchGenerator.TEST`` selects the test split; any other
                value selects the training split.
            batch_size: Number of samples per batch; must be positive.
            class_to_prune: Label of the class to reduce, or ``None`` to keep
                the dataset as loaded.
            unbalance: The pruned class keeps
                ``ceil(unbalance * size_of_largest_other_class)`` samples.
            dataset: ``'MNIST'`` or ``'CIFAR10'``.

        Raises:
            AssertionError: On an unknown dataset name or a non-positive
                batch size.
        """
        assert dataset in ('MNIST', 'CIFAR10'), 'Unknown dataset: ' + dataset
        # Checked up front so the guard covers every dataset, not only MNIST.
        assert batch_size > 0, 'Batch size has to be a positive integer!'
        self.batch_size = batch_size
        self.data_src = data_src

        # Load data
        if dataset == 'MNIST':
            mnist = input_data.read_data_sets("dataset/mnist", one_hot=False)

            if self.data_src == self.TEST:
                self.dataset_x = mnist.test.images
                self.dataset_y = mnist.test.labels
            else:
                self.dataset_x = mnist.train.images
                self.dataset_y = mnist.train.labels

            # Reshape to (N, 28, 28) and map x -> (x - 0.5) * 2, i.e. [-1, 1]
            # assuming the loader returns values in [0, 1] -- TODO confirm.
            self.dataset_x = (np.reshape(self.dataset_x, (self.dataset_x.shape[0], 28, 28)) - 0.5) * 2

            # Include 1 single color channel -> (N, 1, 28, 28)
            self.dataset_x = np.expand_dims(self.dataset_x, axis=1)

        elif dataset == 'CIFAR10':
            ((x, y), (x_test, y_test)) = tf.keras.datasets.cifar10.load_data()

            if self.data_src == self.TEST:
                self.dataset_x = x_test
                self.dataset_y = y_test
            else:
                self.dataset_x = x
                self.dataset_y = y

            # Arrange x: channel first
            self.dataset_x = np.transpose(self.dataset_x, axes=(0, 3, 1, 2))

            # Scale x -> x / 255 - 0.5. For 8-bit pixel values this yields
            # [-0.5, 0.5] (not [-1, 1] as in the MNIST branch).
            self.dataset_x = self.dataset_x/255 - 0.5

            # Y 1D format
            self.dataset_y = self.dataset_y[:, 0]

        assert (self.dataset_x.shape[0] == self.dataset_y.shape[0])

        # Compute per class instance count.
        classes = np.unique(self.dataset_y)
        self.classes = classes
        per_class_count = [np.sum(self.dataset_y == c) for c in classes]

        # Prune if needed!
        if class_to_prune is not None:
            # Indices of every sample of the pruned class, in random order.
            all_ids_c = np.flatnonzero(self.dataset_y == class_to_prune)
            np.random.shuffle(all_ids_c)

            # Counts of the other classes, selected by label rather than by
            # position so non-contiguous label sets are handled correctly.
            other_class_count = np.array(per_class_count)[classes != class_to_prune]
            to_keep = int(np.ceil(unbalance * max(other_class_count)))

            # Everything beyond the first `to_keep` shuffled ids is dropped.
            to_delete = all_ids_c[to_keep:]

            self.dataset_x = np.delete(self.dataset_x, to_delete, axis=0)
            self.dataset_y = np.delete(self.dataset_y, to_delete, axis=0)

        # Recount after pruning
        self.per_class_count = [np.sum(self.dataset_y == c) for c in classes]

        # List of labels (fixed at ten entries, "0" .. "9")
        self.label_table = [str(c) for c in range(10)]

        # Preload all the labels.
        self.labels = self.dataset_y[:]

        # per class ids: label -> indices of the samples carrying that label
        self.per_class_ids = dict()
        ids = np.array(range(len(self.dataset_x)))
        for c in classes:
            self.per_class_ids[c] = ids[self.labels == c]

    def get_samples_for_class(self, c, samples=None):
        """Return up to ``samples`` random images of class ``c``.

        ``samples`` defaults to the batch size. The stored index array for the
        class is shuffled in place on every call.
        """
        if samples is None:
            samples = self.batch_size

        np.random.shuffle(self.per_class_ids[c])
        to_return = self.per_class_ids[c][0:samples]
        return self.dataset_x[to_return]

    def get_label_table(self):
        """Return the list of label names."""
        return self.label_table

    def get_num_classes(self):
        """Return the number of entries in the label table."""
        return len(self.label_table)

    def get_class_probability(self):
        """Return each class's share of the dataset as a float array."""
        # Convert first: dividing a plain list only works by accident when the
        # total happens to be a NumPy scalar.
        counts = np.asarray(self.per_class_count, dtype=float)
        return counts / counts.sum()

    def get_class_numberOfSamples(self):
        """Return the per-class sample counts (after any pruning)."""
        return self.per_class_count

    ### ACCESS DATA AND SHAPES ###
    def get_num_samples(self):
        """Return the number of stored images."""
        return self.dataset_x.shape[0]

    def get_image_shape(self):
        """Return the shape of one stored image as a three-element list."""
        return [self.dataset_x.shape[1], self.dataset_x.shape[2], self.dataset_x.shape[3]]

    def next_batch(self):
        """Yield ``(images, labels)`` batches covering one shuffled pass.

        Only full batches are produced; a trailing remainder smaller than
        ``batch_size`` is skipped. Indices inside each batch are sorted.
        """
        dataset_x = self.dataset_x
        labels = self.labels

        indices = np.arange(dataset_x.shape[0])

        np.random.shuffle(indices)

        for start_idx in range(0, dataset_x.shape[0] - self.batch_size + 1, self.batch_size):
            access_pattern = indices[start_idx:start_idx + self.batch_size]
            access_pattern = sorted(access_pattern)

            yield dataset_x[access_pattern, :, :, :], labels[access_pattern]

## User-defined functions

In [None]:
# unbalance_class_images = np.empty((1, 28, 28))

def get_unbalanceClass_x(x_train, y_train, target_class=1):
    """Return every sample of ``x_train`` whose label equals ``target_class``.

    Args:
        x_train: Array-like of samples, indexed along the first axis.
        y_train: 1-D array-like of labels aligned with ``x_train``.
        target_class: Label to select; defaults to 1, the class this notebook
            unbalances.

    Returns:
        A new array holding the matching samples in their original order. When
        nothing matches, the result has length 0 along the first axis (the
        previous loop raised ``UnboundLocalError`` in that case).
    """
    samples = np.asarray(x_train)
    # Only the labels that have a corresponding sample are considered.
    labels = np.asarray(y_train)[:len(samples)]
    # One boolean-mask selection replaces growing the result with np.append,
    # which re-copied the whole array for every matching sample.
    return samples[labels == target_class]


def get_unbalanceClass_y(unbalance_x):
    """Build the label vector for ``unbalance_x``: one ``1`` per sample."""
    n_samples = len(unbalance_x)
    return np.full(n_samples, 1)

In [None]:
from keras.preprocessing.image import ImageDataGenerator
# mode==1 for 'MNIST' and mode==2 for CIFAR10
def show_generated_samples(unbalance_x, unbalance_y, mode):
    """Plot up to nine augmented samples produced by an ``ImageDataGenerator``.

    The generator applies feature-wise centering/standardisation plus a small
    random rotation and zoom. Earlier versions iterated the generator but then
    plotted ``unbalance_x`` itself, so no generated sample was ever shown.

    Args:
        unbalance_x: Image array the generator is fitted on and draws from.
        unbalance_y: Labels passed to ``flow`` alongside the images.
        mode: 1 for single-channel images ('MNIST'), 2 for colour images
            ('CIFAR10'). Any other value leaves the subplots empty.
    """
    datagen = ImageDataGenerator(featurewise_center=True,
                                 featurewise_std_normalization=True,
                                 rotation_range=10,
                                 zoom_range=0.1)

    # fit the data
    datagen.fit(unbalance_x)

    # Because labels are supplied, each item from flow() is an
    # (images, labels) pair; one batch of nine is enough for the 3x3 grid.
    x_batch, _ = next(iter(datagen.flow(unbalance_x, unbalance_y, batch_size=9)))

    # Fewer than nine input images simply leaves part of the grid unused.
    for i in range(min(9, len(x_batch))):
        plt.subplot(330 + 1 + i)
        image = x_batch[i]
        if mode == 1:
            # Drop the channel axis so imshow receives a 2-D array.
            plt.imshow(image[..., 0], cmap='gray')
        if mode == 2:
            # Standardised values are not confined to [0, 1]; stretch each
            # image to that range so imshow does not clip it.
            low = image.min()
            span = image.max() - low
            plt.imshow((image - low) / span if span > 0 else image)
    plt.show()
    


## Generate training set

In [None]:
# Settings shared by the BatchGenerator calls below.
batch_size = 128
# The pruned class keeps ceil(unbalance * largest other class count) samples.
unbalance = 0.05
dataset_name = 'CIFAR10'
# Unbalance the training set.
# NOTE(review): BatchGenerator.TEST makes the generator load the *test* split,
# so the "train" arrays derived from bg_train_partial come from the CIFAR10
# test data -- confirm this is intended.
bg_train_partial = BatchGenerator(BatchGenerator.TEST, batch_size, class_to_prune=1, unbalance=unbalance, dataset=dataset_name)
# Per-class sample counts after pruning (shown as the cell output).
bg_train_partial.get_class_numberOfSamples()

In [None]:
# BatchGenerator stores CIFAR10 images channel-first; move the channel axis
# back to the end for the Keras layers used later.
x_train_unbalanced = bg_train_partial.dataset_x.transpose(0,2,3,1)
y_train_unbalanced = bg_train_partial.dataset_y
# Shown as the cell output.
x_train_unbalanced.shape

## Generate test set

In [None]:
# Same settings as for the training set.
batch_size = 128
unbalance = 0.05
dataset_name = 'CIFAR10'
# Unbalance the test set (class 1 is pruned here as well).
# NOTE(review): BatchGenerator.TRAIN makes the generator load the *training*
# split, so the "test" arrays derived from bg_test_partial come from the
# CIFAR10 training data -- confirm this is intended.
bg_test_partial = BatchGenerator(BatchGenerator.TRAIN, batch_size, class_to_prune=1, unbalance=unbalance, dataset=dataset_name)
# Per-class sample counts after pruning (shown as the cell output).
bg_test_partial.get_class_numberOfSamples()

In [None]:
# Channel-first -> channel-last, matching x_train_unbalanced.
x_test_unbalanced = bg_test_partial.dataset_x.transpose(0,2,3,1)
y_test_unbalanced = bg_test_partial.dataset_y
# Shown as the cell output.
x_test_unbalanced.shape

In [None]:
## Take all images of the unbalanced class to test the classifier after training

In [None]:
# Pruning class 2 leaves class 1 untouched, so every class-1 image of this
# split is still present in the generator.
bg_test = BatchGenerator(BatchGenerator.TRAIN, batch_size, class_to_prune=2, unbalance=unbalance, dataset=dataset_name)

# Channel-first -> channel-last.
x = bg_test.dataset_x.transpose(0,2,3,1)
y = bg_test.dataset_y

images_to_test_x = get_unbalanceClass_x(x, y)
# get_unbalanceClass_y accepts only the image array; the previous call also
# passed `y` and therefore raised TypeError.
images_to_test_y = get_unbalanceClass_y(images_to_test_x)
images_to_test_x.shape

## Unbalanced dataset with the class 1 (cars) with 95% of its samples pruned

In [None]:
# Per-class sample counts of the pruned generator (shown as the cell output).
bg_train_partial.get_class_numberOfSamples()

## Get images from the unbalanced class and visualize some

In [None]:
# Images of class 1 that survived pruning, plus a matching vector of 1-labels.
unbalanced_class_x = get_unbalanceClass_x(x_train_unbalanced, y_train_unbalanced)
unbalanced_class_y = get_unbalanceClass_y(unbalanced_class_x)
# Shown as the cell output.
unbalanced_class_x.shape

In [None]:
# mode=2 selects the CIFAR10 branch of show_generated_samples.
show_generated_samples(unbalanced_class_x, unbalanced_class_y, 2)

## Data augmentation using basic transformations

In [None]:
# Define Transformations: one ImageDataGenerator per basic augmentation.
# The argument semantics noted below follow the Keras ImageDataGenerator
# documentation -- confirm against the installed Keras version.

# Horizontal Shift Augmentation (a list is a set of candidate pixel offsets,
# so this picks a shift of either -10 or +10 pixels)
datagen1 = ImageDataGenerator(width_shift_range=[-10,10])
# Vertical Shift Augmentation (a float below 1 is a fraction of the image
# height, so this shifts by up to half the image -- not a pixel count)
datagen2 = ImageDataGenerator(height_shift_range=0.5)

# Horizontal Flip Augmentation
datagen3 = ImageDataGenerator(horizontal_flip=True)
# Vertical Flip Augmentation
datagen4 = ImageDataGenerator(vertical_flip=True)

# Random Rotation Augmentation (degrees)
datagen5 = ImageDataGenerator(rotation_range=60)

# Random Brightness Augmentation (factors >= 1.0 brighten)
datagen6 = ImageDataGenerator(brightness_range=[1.0,1.8])
# Random Darkness Augmentation (factors <= 1.0 darken)
datagen7 = ImageDataGenerator(brightness_range=[0.2,1.0])

# Random Zoom-In Augmentation (zoom factors below 1.0 magnify)
datagen8 = ImageDataGenerator(zoom_range=[0.5,1.0])
# Random Zoom-Out Augmentation (zoom factors above 1.0 shrink)
datagen9 = ImageDataGenerator(zoom_range=[1.0,1.5])

# Order matters: the augmentation loop uses the list position as the subplot
# slot in a 3x3 grid.
datagens = [datagen1, datagen2, datagen3, datagen4, datagen5, datagen6, datagen7, datagen8, datagen9]

In [None]:
# Initialize collection

from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import array_to_img
from numpy import expand_dims
from keras.preprocessing.image import load_img
from keras.preprocessing.image import ImageDataGenerator
from matplotlib import pyplot

# Generated images are gathered in a list and converted to one array at the
# end. Growing an array with np.append copies it on every step, and the old
# np.empty seed row held uninitialised memory that had to be deleted again.
generated_images = []

for i in range(len(unbalanced_class_x)):
    img = array_to_img(unbalanced_class_x[i])
    # convert to numpy array
    data = img_to_array(img)
    print("Images from image: {}".format(i))

    # expand dimension to one sample
    samples = expand_dims(data, 0)

    # One augmented image per transformation; the list position doubles as the
    # slot in the 3x3 subplot grid.
    for transformation, datagen in enumerate(datagens):
        # prepare iterator
        it = datagen.flow(samples, batch_size=1)
        # define subplot
        pyplot.subplot(330 + 1 + transformation)
        # generate batch of images
        batch = next(it)

        # convert to unsigned integers for viewing
        image = batch[0].astype('uint8')

        # keep the generated image for the balanced dataset
        generated_images.append(image)

        # plot raw pixel data
        pyplot.imshow(image)
    # show the figure
    pyplot.show()

# Same result as before: a float64 array of shape (n_generated, H, W, C).
# The reshape keeps that layout even when nothing was generated.
collection = np.asarray(generated_images, dtype=np.float64).reshape(
    (-1,) + tuple(unbalanced_class_x.shape[1:]))
    

## Include the generated images in the dataset before doing the classification

In [None]:
def append_image_collections(unbalanced, generated):
    """Return ``unbalanced`` followed by ``generated`` along the first axis."""
    return np.concatenate((unbalanced, generated), axis=0)

# Append the generated images to the whole dataset.
# `collection` holds uint8-derived pixel values (0..255), while
# x_train_unbalanced carries BatchGenerator's normalised values. Map the
# generated images linearly onto the value range of the existing data so both
# parts of the balanced training set share one scale.
value_low = x_train_unbalanced.min()
value_high = x_train_unbalanced.max()
collection_scaled = collection / 255.0 * (value_high - value_low) + value_low
x_train_balanced = append_image_collections(x_train_unbalanced, collection_scaled)
print(x_train_balanced.shape)

# Generate array of 1s (class of the generated images) and attach it to the big one
y_collection = np.array(np.repeat(1, len(collection)))
y_train_balanced = np.append(y_train_unbalanced, y_collection)

In [None]:
# Sanity check: the label count should match the first dimension of x_train_balanced.
print(y_train_balanced.shape)

## Import images generated using BAGAN and append them

In [None]:
# Do for Train (I think not for test)
# NOTE(review): no earlier cell defines `generated_train_x` (the BAGAN output),
# and the names `append_generated_images` / `unbalanced_train_x` used here
# before do not exist either, so this cell raised NameError on Run All. It now
# runs only once the BAGAN images have been loaded into `generated_train_x`,
# and reuses append_image_collections / x_train_unbalanced, which appear to be
# the intended names -- confirm. The BAGAN images must be on the same value
# scale as x_train_unbalanced.
if 'generated_train_x' in globals():
    x_train_balanced_gan = append_image_collections(x_train_unbalanced, generated_train_x)
    # Original labels followed by one 1-label per generated image. The earlier
    # all-ones vector discarded the real labels and had the wrong length.
    y_train_balanced_gan = np.append(y_train_unbalanced, np.repeat(1, len(generated_train_x)))
else:
    print('generated_train_x is not defined; skipping the BAGAN-balanced training set.')

## Preprocess

In [None]:
def preprocess(x, y, num_classes=10):
    """Scale images by 1/255 as float32 and one-hot encode the labels.

    Args:
        x: Image array; it is copied, so the caller's array is not modified.
        y: Integer class labels, one per image.
        num_classes: Width of the one-hot encoding. It is a parameter because
            the notebook assigns the global ``num_classes`` in a later cell
            than the first call to this function.

    Returns:
        ``(x, y)`` where ``x`` is float32 and ``y`` is a float32 array of
        shape ``(len(y), num_classes)``.

    NOTE(review): BatchGenerator already rescales the images it loads, so the
    division by 255 here is applied on top of that -- confirm it is intended.
    """
    # Data pre-processing
    x = np.asarray(x).astype('float32') / 255.0

    # One-hot encoding: row i of the identity matrix is the encoding of label
    # i. For 1-D integer labels this matches Keras' to_categorical without
    # depending on keras.utils.np_utils, which newer Keras releases removed.
    labels = np.asarray(y, dtype='int').ravel()
    y = np.eye(num_classes, dtype='float32')[labels]
    return x, y

In [None]:
# Training data

# Unbalanced dataset
x_train_unbalanced, y_train_unbalanced = preprocess(x_train_unbalanced, y_train_unbalanced)
# Balanced with basic transformations
x_train_balanced, y_train_balanced = preprocess(x_train_balanced, y_train_balanced)
# Balanced with BAGAN transformations
# x_train_balanced_gan, y_train_balanced_gan = preprocess (x_train_balanced_gan, y_train_balanced_gan)

# Testing data
# Unbalanced dataset
x_test_unbalanced, y_test_unbalanced = preprocess(x_test_unbalanced, y_test_unbalanced)

# CLASSIFICATION

## Architecture

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.optimizers import Adam

num_classes = 10


def build_model(input_shape, n_classes):
    """Build and compile a fresh, independently initialised CNN classifier.

    Args:
        input_shape: Shape of one input image (without the batch axis).
        n_classes: Number of softmax outputs.

    Returns:
        A compiled ``Sequential`` model with its own weights and optimizer.
    """
    net = Sequential()
    net.add(Conv2D(32, (3, 3), padding='same',
                   input_shape=input_shape))
    net.add(Activation('relu'))
    net.add(Conv2D(32, (3, 3)))
    net.add(Activation('relu'))
    net.add(MaxPooling2D(pool_size=(2, 2)))
    net.add(Dropout(0.25))

    net.add(Conv2D(64, (3, 3), padding='same'))
    net.add(Activation('relu'))
    net.add(Conv2D(64, (3, 3)))
    net.add(Activation('relu'))
    net.add(MaxPooling2D(pool_size=(2, 2)))
    net.add(Dropout(0.25))

    net.add(Conv2D(32, (3, 3)))
    net.add(Activation('relu'))
    net.add(MaxPooling2D(pool_size=(2, 2)))

    net.add(Flatten())
    net.add(Dense(64))
    net.add(Activation('relu'))
    net.add(Dropout(0.5))
    net.add(Dense(n_classes))
    net.add(Activation('softmax'))

    # A new optimizer per model, so no optimizer state is shared between runs.
    opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    net.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return net


model = build_model(x_train_unbalanced.shape[1:], num_classes)
model.summary()

# `model2 = model` only bound a second name to the same network, so both
# training runs updated, and both evaluations measured, one set of weights.
# Build a separate network for the second alternative instead.
model2 = build_model(x_train_unbalanced.shape[1:], num_classes)

## Training the 3 alternatives

In [None]:
# Unbalanced
data = model.fit(x_train_unbalanced, y_train_unbalanced, batch_size=16, epochs=20, verbose=2)
# Balanced classic
data = model2.fit(x_train_balanced, y_train_balanced, batch_size=16, epochs=20, verbose=2)
# Balanced BAGAN
# data = model.fit(x_train, y_train, batch_size=16, epochs=5, verbose=2)

## Evaluation

In [None]:
from time import time

def _timed_evaluation(net, x, y):
    """Evaluate ``net`` on ``(x, y)``, print the timing and metrics, and
    return ``(start, end, loss, accuracy)``."""
    started = time()
    loss, acc = net.evaluate(x, y, verbose=0)
    finished = time()
    print('CNN took ' + str(finished - started) + ' seconds')
    print('Test loss: ' + str(loss) + ' - Accuracy: ' + str(acc))
    return started, finished, loss, acc


# Unbalanced
start_unbalanced, end_unbalanced, loss_unbalanced, acc_unbalanced = _timed_evaluation(
    model, x_test_unbalanced, y_test_unbalanced)

# Balanced classic
start_balanced, end_balanced, loss_balanced, acc_balanced = _timed_evaluation(
    model2, x_test_unbalanced, y_test_unbalanced)