# Generative Adversarial Networks for Data Augmentation | Part 2
- In this second part, we will be using the augmented datasets from part 1 to perform the classification task.

# 1. Importing the Packages & Boilerplate Code

In [1]:
import os
import sys
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from shutil import copyfile
from tabulate import tabulate
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix

# https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/274717
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import tensorflow as tf
import tensorflow.keras.layers as tfl

In [2]:
# One shared seed for the hash seed env var, Python's `random`, NumPy and TensorFlow
SEED = 0
os.environ['PYTHONHASHSEED'] = str(SEED)
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

In [3]:
# Making sure that Tensorflow is able to detect the GPU.
# Exactly one of the two messages is printed: previously the "Found GPU at"
# line was emitted unconditionally, even right after "GPU device not found".
device_name = tf.test.gpu_device_name()
if "GPU" in device_name:
    print('Found GPU at: {}'.format(device_name))
else:
    print("GPU device not found")

Found GPU at: /device:GPU:0


# 2. Importing the Train/Test Sets

In [4]:
# ---------------------------- Test split ----------------------------
print("For Test Dataset:")
df_test = pd.read_csv("../input/cifar10/test_x.csv").to_numpy()
y_test = pd.read_csv("../input/cifar10/test_y.csv").to_numpy()
print(df_test.shape, y_test.shape)

# Unpack every flat row into a (3, 32, 32) block
df_test = df_test.reshape(-1, 3, 32, 32)
print(df_test.shape)

# Move the size-3 axis last, divide the pixel values by 255, one-hot the labels
df_test = df_test.transpose(0, 2, 3, 1) / 255
y_test_oh = tf.one_hot(y_test.ravel(), depth = 10)
print(df_test.shape, y_test_oh.shape)

# ----------------------- Labelled train split -----------------------
print("For Train Dataset:")
df_train = pd.read_csv("../input/cifar10/train_lab_x.csv").to_numpy()
y_train = pd.read_csv("../input/cifar10/train_lab_y.csv").to_numpy()
print(df_train.shape, y_train.shape)

# Same reshape / axis move / division by 255 as for the test images
# (the train labels are left as integer class ids in this cell)
df_train = df_train.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1) / 255
print(df_train.shape)

For Test Dataset:
(10000, 3072) (10000, 1)
(10000, 3, 32, 32)
(10000, 32, 32, 3) (10000, 10)
For Train Dataset:
(40006, 3072) (40006, 1)
(40006, 32, 32, 3)


# 3. Image Augmentation on 25% of the Training Dataset
## 3.1. Augmenting the Training Dataset

In [5]:
# Read the augmented images and their labels
df_aug = pd.read_csv("../input/cifar10/df_25per_aug.csv").to_numpy()
y_aug = pd.read_csv("../input/cifar10/y_25per_aug.csv").to_numpy()

# Unpack each row to (3, 32, 32) and move the size-3 axis last
# NOTE(review): unlike df_train, these pixels are NOT divided by 255 here --
# confirm the augmented CSV is already on the same scale as df_train.
df_aug = df_aug.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)

# Stack the labelled training data on top of the augmented data
df_aug = np.concatenate((df_train, df_aug), axis=0)
y_aug = np.concatenate((y_train, y_aug), axis=0)
print(df_aug.shape, y_aug.shape)

# Shuffle images and labels with one shared permutation, then one-hot the labels
perm = np.random.permutation(df_aug.shape[0])
df_aug = df_aug[perm]
y_aug = y_aug[perm]
y_aug_oh = tf.one_hot(y_aug.ravel(), depth = 10)
print(df_aug.shape, y_aug.shape, y_aug_oh.shape)

(50022, 32, 32, 3) (50022, 1)
(50022, 32, 32, 3) (50022, 1) (50022, 10)


## 3.2. Training the Baseline Model on the Augmented Dataset

In [6]:
# Copy the baseline architecture module next to the notebook's working files,
# then import `cnn_model` from it
# (assumes ../working is on the import path -- TODO confirm)
arch_src = "../input/dcai-rw/baseline_arch.py"
arch_dst = "../working/baseline_arch.py"
copyfile(arch_src, arch_dst)
from baseline_arch import cnn_model

# Pair every augmented image with its one-hot label and serve them in batches of 32
aug_pairs = tf.data.Dataset.from_tensor_slices((df_aug, y_aug_oh))
train_dataset = aug_pairs.batch(32)

In [7]:
# Candidate training lengths; a fresh model is built and trained for each one
num_epochs = [10, 20, 30, 40, 50]
train_loss, test_loss, train_acc, test_acc = [], [], [], []

# NOTE(review): every candidate is scored on the test split and `test_acc` is
# later used to pick the epoch count, so the reported test accuracy is an
# optimistic estimate. A separate validation split would avoid that.

for epochs in num_epochs:
    # Training the Model
    conv_model = cnn_model((32, 32, 3))
    # `metrics` is given as a list, the form documented for Model.compile;
    # a bare string is not accepted by every Keras version.
    conv_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    conv_model.fit(train_dataset, epochs = epochs)

    # Predicting on the Train/Test Datasets
    preds_train = conv_model.predict(df_aug)
    preds_test = conv_model.predict(df_test)

    # Finding the Predicted Classes (index of the largest score per row)
    cls_train = np.argmax(preds_train, axis = 1)
    cls_test = np.argmax(preds_test, axis = 1)

    # Recording the Train/Test set Loss and Accuracy for this candidate
    train_loss.append(log_loss(y_aug_oh, preds_train))
    test_loss.append(log_loss(y_test_oh, preds_test))
    train_acc.append(accuracy_score(y_aug, cls_train))
    test_acc.append(accuracy_score(y_test, cls_test))

    print("For ", epochs, " Epochs:")
    print("Log-loss for Train Dataset = ", train_loss[-1])
    print("Log-loss for Test Dataset = ", test_loss[-1])
    print("Accuracy for Train Dataset = ", train_acc[-1])
    print("Accuracy for Test Dataset = ", test_acc[-1])
    print()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
For  10  Epochs:
Log-loss for Train Dataset =  0.4396363683687943
Log-loss for Test Dataset =  0.7890395254669714
Accuracy for Train Dataset =  0.8448882491703651
Accuracy for Test Dataset =  0.7315

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
For  20  Epochs:
Log-loss for Train Dataset =  0.2613411821583305
Log-loss for Test Dataset =  0.7838153493844481
Accuracy for Train Dataset =  0.9107992483307344
Accuracy for Test Dataset =  0.7486

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epo

In [8]:
# Training the Model with the best hyper-parameter settings:
# the epoch count whose run scored the highest entry in `test_acc`.
# The model is rebuilt and trained from scratch, not reused from the sweep.
ind = np.argmax(test_acc)
best_num_epochs = num_epochs[ind]
conv_model = cnn_model((32, 32, 3))
# `metrics` is given as a list, the form documented for Model.compile;
# a bare string is not accepted by every Keras version.
conv_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
conv_model.fit(train_dataset, epochs = best_num_epochs)

# Saving the model along with its weights
conv_model.save('gan_augmented_all.h5')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


## 3.3. Predicting the Performance

In [9]:
# Model outputs for the augmented training data and for the test data
preds_train = conv_model.predict(df_aug)
preds_test = conv_model.predict(df_test)

# Predicted class = position of the largest output in each row
cls_train = preds_train.argmax(axis = 1)
cls_test = preds_test.argmax(axis = 1)

# Report log-loss (one-hot labels vs. outputs) and accuracy (class ids vs. predicted classes)
for label, metric, truth, guess in (
    ("Log-loss for Augmented Dataset = ", log_loss, y_aug_oh, preds_train),
    ("Log-loss for Test Dataset = ", log_loss, y_test_oh, preds_test),
    ("Accuracy for Augmented Dataset = ", accuracy_score, y_aug, cls_train),
    ("Accuracy for Test Dataset = ", accuracy_score, y_test, cls_test),
):
    print(label, metric(truth, guess))

Log-loss for Augmented Dataset =  0.10731828685173772
Log-loss for Test Dataset =  0.8453338678017722
Accuracy for Augmented Dataset =  0.9678741353804327
Accuracy for Test Dataset =  0.7547


# 4. Image Augmentation for Class Balancing
## 4.1. Augmenting the Training Dataset

In [10]:
# Read the class-balancing augmented images and their labels
df_aug = pd.read_csv("../input/cifar10/df_clsbal_aug.csv").to_numpy()
y_aug = pd.read_csv("../input/cifar10/y_clsbal_aug.csv").to_numpy()

# Unpack each row to (3, 32, 32) and move the size-3 axis last
# NOTE(review): unlike df_train, these pixels are NOT divided by 255 here --
# confirm the augmented CSV is already on the same scale as df_train.
df_aug = df_aug.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)

# Stack the labelled training data on top of the augmented data
df_aug = np.concatenate((df_train, df_aug), axis=0)
y_aug = np.concatenate((y_train, y_aug), axis=0)
print(df_aug.shape, y_aug.shape)

# Shuffle images and labels with one shared permutation, then one-hot the labels
perm = np.random.permutation(df_aug.shape[0])
df_aug = df_aug[perm]
y_aug = y_aug[perm]
y_aug_oh = tf.one_hot(y_aug.ravel(), depth = 10)
print(df_aug.shape, y_aug.shape, y_aug_oh.shape)

(43780, 32, 32, 3) (43780, 1)
(43780, 32, 32, 3) (43780, 1) (43780, 10)


## 4.2. Training the Baseline Model on the Augmented Dataset

In [11]:
# Copy the baseline architecture module next to the notebook's working files,
# then import `cnn_model` from it
# (assumes ../working is on the import path -- TODO confirm)
arch_src = "../input/dcai-rw/baseline_arch.py"
arch_dst = "../working/baseline_arch.py"
copyfile(arch_src, arch_dst)
from baseline_arch import cnn_model

# Pair every augmented image with its one-hot label and serve them in batches of 32
aug_pairs = tf.data.Dataset.from_tensor_slices((df_aug, y_aug_oh))
train_dataset = aug_pairs.batch(32)

In [12]:
# Candidate training lengths; a fresh model is built and trained for each one
num_epochs = [10, 20, 30, 40, 50]
train_loss, test_loss, train_acc, test_acc = [], [], [], []

# NOTE(review): every candidate is scored on the test split and `test_acc` is
# later used to pick the epoch count, so the reported test accuracy is an
# optimistic estimate. A separate validation split would avoid that.

for epochs in num_epochs:
    # Training the Model
    conv_model = cnn_model((32, 32, 3))
    # `metrics` is given as a list, the form documented for Model.compile;
    # a bare string is not accepted by every Keras version.
    conv_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    conv_model.fit(train_dataset, epochs = epochs)

    # Predicting on the Train/Test Datasets
    preds_train = conv_model.predict(df_aug)
    preds_test = conv_model.predict(df_test)

    # Finding the Predicted Classes (index of the largest score per row)
    cls_train = np.argmax(preds_train, axis = 1)
    cls_test = np.argmax(preds_test, axis = 1)

    # Recording the Train/Test set Loss and Accuracy for this candidate
    train_loss.append(log_loss(y_aug_oh, preds_train))
    test_loss.append(log_loss(y_test_oh, preds_test))
    train_acc.append(accuracy_score(y_aug, cls_train))
    test_acc.append(accuracy_score(y_test, cls_test))

    print("For ", epochs, " Epochs:")
    print("Log-loss for Train Dataset = ", train_loss[-1])
    print("Log-loss for Test Dataset = ", test_loss[-1])
    print("Accuracy for Train Dataset = ", train_acc[-1])
    print("Accuracy for Test Dataset = ", test_acc[-1])
    print()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
For  10  Epochs:
Log-loss for Train Dataset =  0.49706786676875786
Log-loss for Test Dataset =  0.8131447916716759
Accuracy for Train Dataset =  0.8233896756509822
Accuracy for Test Dataset =  0.7266

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
For  20  Epochs:
Log-loss for Train Dataset =  0.3169507556361305
Log-loss for Test Dataset =  0.8303529736588569
Accuracy for Train Dataset =  0.886455002284148
Accuracy for Test Dataset =  0.7478

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epo

In [13]:
# Training the Model with the best hyper-parameter settings:
# the epoch count whose run scored the highest entry in `test_acc`.
# The model is rebuilt and trained from scratch, not reused from the sweep.
ind = np.argmax(test_acc)
best_num_epochs = num_epochs[ind]
conv_model = cnn_model((32, 32, 3))
# `metrics` is given as a list, the form documented for Model.compile;
# a bare string is not accepted by every Keras version.
conv_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
conv_model.fit(train_dataset, epochs = best_num_epochs)

# Saving the model along with its weights
conv_model.save('gan_augmented_cls_imbalance.h5')

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## 4.3. Predicting the Performance

In [14]:
# Model outputs for the augmented training data and for the test data
preds_train = conv_model.predict(df_aug)
preds_test = conv_model.predict(df_test)

# Predicted class = position of the largest output in each row
cls_train = preds_train.argmax(axis = 1)
cls_test = preds_test.argmax(axis = 1)

# Report log-loss (one-hot labels vs. outputs) and accuracy (class ids vs. predicted classes)
for label, metric, truth, guess in (
    ("Log-loss for Augmented Dataset = ", log_loss, y_aug_oh, preds_train),
    ("Log-loss for Test Dataset = ", log_loss, y_test_oh, preds_test),
    ("Accuracy for Augmented Dataset = ", accuracy_score, y_aug, cls_train),
    ("Accuracy for Test Dataset = ", accuracy_score, y_test, cls_test),
):
    print(label, metric(truth, guess))

Log-loss for Augmented Dataset =  0.21893769497461882
Log-loss for Test Dataset =  0.7985118761176211
Accuracy for Augmented Dataset =  0.9294883508451348
Accuracy for Test Dataset =  0.7533


# 5. Image Augmentation based on Class-wise Performance
## 5.1. Augmenting the Training Dataset

In [15]:
# Read the class-wise-performance augmented images and their labels
df_aug = pd.read_csv("../input/cifar10/df_clsper_aug.csv").to_numpy()
y_aug = pd.read_csv("../input/cifar10/y_clsper_aug.csv").to_numpy()

# Unpack each row to (3, 32, 32) and move the size-3 axis last
# NOTE(review): unlike df_train, these pixels are NOT divided by 255 here --
# confirm the augmented CSV is already on the same scale as df_train.
df_aug = df_aug.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)

# Stack the labelled training data on top of the augmented data
df_aug = np.concatenate((df_train, df_aug), axis=0)
y_aug = np.concatenate((y_train, y_aug), axis=0)
print(df_aug.shape, y_aug.shape)

# Shuffle images and labels with one shared permutation, then one-hot the labels
perm = np.random.permutation(df_aug.shape[0])
df_aug = df_aug[perm]
y_aug = y_aug[perm]
y_aug_oh = tf.one_hot(y_aug.ravel(), depth = 10)
print(df_aug.shape, y_aug.shape, y_aug_oh.shape)

(50054, 32, 32, 3) (50054, 1)
(50054, 32, 32, 3) (50054, 1) (50054, 10)


## 5.2. Training the Baseline Model on the Augmented Dataset

In [16]:
# Copy the baseline architecture module next to the notebook's working files,
# then import `cnn_model` from it
# (assumes ../working is on the import path -- TODO confirm)
arch_src = "../input/dcai-rw/baseline_arch.py"
arch_dst = "../working/baseline_arch.py"
copyfile(arch_src, arch_dst)
from baseline_arch import cnn_model

# Pair every augmented image with its one-hot label and serve them in batches of 32
aug_pairs = tf.data.Dataset.from_tensor_slices((df_aug, y_aug_oh))
train_dataset = aug_pairs.batch(32)

In [17]:
# Candidate training lengths; a fresh model is built and trained for each one
num_epochs = [10, 20, 30, 40, 50]
train_loss, test_loss, train_acc, test_acc = [], [], [], []

# NOTE(review): every candidate is scored on the test split and `test_acc` is
# later used to pick the epoch count, so the reported test accuracy is an
# optimistic estimate. A separate validation split would avoid that.

for epochs in num_epochs:
    # Training the Model
    conv_model = cnn_model((32, 32, 3))
    # `metrics` is given as a list, the form documented for Model.compile;
    # a bare string is not accepted by every Keras version.
    conv_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    conv_model.fit(train_dataset, epochs = epochs)

    # Predicting on the Train/Test Datasets
    preds_train = conv_model.predict(df_aug)
    preds_test = conv_model.predict(df_test)

    # Finding the Predicted Classes (index of the largest score per row)
    cls_train = np.argmax(preds_train, axis = 1)
    cls_test = np.argmax(preds_test, axis = 1)

    # Recording the Train/Test set Loss and Accuracy for this candidate
    train_loss.append(log_loss(y_aug_oh, preds_train))
    test_loss.append(log_loss(y_test_oh, preds_test))
    train_acc.append(accuracy_score(y_aug, cls_train))
    test_acc.append(accuracy_score(y_test, cls_test))

    print("For ", epochs, " Epochs:")
    print("Log-loss for Train Dataset = ", train_loss[-1])
    print("Log-loss for Test Dataset = ", test_loss[-1])
    print("Accuracy for Train Dataset = ", train_acc[-1])
    print("Accuracy for Test Dataset = ", test_acc[-1])
    print()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
For  10  Epochs:
Log-loss for Train Dataset =  0.47840100428245774
Log-loss for Test Dataset =  0.8507194460455395
Accuracy for Train Dataset =  0.8302033803492228
Accuracy for Test Dataset =  0.7197

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
For  20  Epochs:
Log-loss for Train Dataset =  0.2729279847729241
Log-loss for Test Dataset =  0.7803512181574153
Accuracy for Train Dataset =  0.907439964837975
Accuracy for Test Dataset =  0.7551

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epo

In [18]:
# Training the Model with the best hyper-parameter settings:
# the epoch count whose run scored the highest entry in `test_acc`.
# The model is rebuilt and trained from scratch, not reused from the sweep.
ind = np.argmax(test_acc)
best_num_epochs = num_epochs[ind]
conv_model = cnn_model((32, 32, 3))
# `metrics` is given as a list, the form documented for Model.compile;
# a bare string is not accepted by every Keras version.
conv_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
conv_model.fit(train_dataset, epochs = best_num_epochs)

# Saving the model along with its weights
conv_model.save('gan_augmented_cls_performance_wise.h5')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


## 5.3. Predicting the Performance

In [19]:
# Model outputs for the augmented training data and for the test data
preds_train = conv_model.predict(df_aug)
preds_test = conv_model.predict(df_test)

# Predicted class = position of the largest output in each row
cls_train = preds_train.argmax(axis = 1)
cls_test = preds_test.argmax(axis = 1)

# Report log-loss (one-hot labels vs. outputs) and accuracy (class ids vs. predicted classes)
for label, metric, truth, guess in (
    ("Log-loss for Augmented Dataset = ", log_loss, y_aug_oh, preds_train),
    ("Log-loss for Test Dataset = ", log_loss, y_test_oh, preds_test),
    ("Accuracy for Augmented Dataset = ", accuracy_score, y_aug, cls_train),
    ("Accuracy for Test Dataset = ", accuracy_score, y_test, cls_test),
):
    print(label, metric(truth, guess))

Log-loss for Augmented Dataset =  0.08465463174989617
Log-loss for Test Dataset =  0.8320012305460958
Accuracy for Augmented Dataset =  0.9758061293802693
Accuracy for Test Dataset =  0.7664
