# Generative Adversarial Networks for Data Augmentation | Part 2
- In this second part, we will be using the augmented datasets from part 1 to perform the classification task.

# 1. Importing the Packages & Boilerplate Code

In [1]:
import os
import sys
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from shutil import copyfile
from tabulate import tabulate
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix

# https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/274717
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import tensorflow as tf
import tensorflow.keras.layers as tfl

In [2]:
# One shared seed for every random number generator used in this notebook
SEED = 0

# NOTE(review): Python reads PYTHONHASHSEED when the interpreter starts, so setting
# it here presumably only reaches child processes, not this running kernel - confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)

# Seed Python's `random`, NumPy's global generator and TensorFlow, in that order
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

In [3]:
# Making sure that Tensorflow is able to detect the GPU.
# Exactly one message is printed: the device name when a GPU is reported,
# otherwise the "not found" notice (previously the "Found GPU" line was
# printed unconditionally, even with an empty device name).
device_name = tf.test.gpu_device_name()
if "GPU" in device_name:
    print('Found GPU at: {}'.format(device_name))
else:
    print("GPU device not found")

Found GPU at: /device:GPU:0


# 2. Importing the Train/Test Sets

In [4]:
# ---------------------------------------------------------
# Test split: flattened pixel rows and their integer labels
# ---------------------------------------------------------
print("For Test Dataset:")
df_test = pd.read_csv("../input/cifar10/test_x.csv").to_numpy()
y_test = pd.read_csv("../input/cifar10/test_y.csv").to_numpy()
print(df_test.shape, y_test.shape)

# Unpack every flat row into a (3, 32, 32) block
df_test = df_test.reshape(-1, 3, 32, 32)
print(df_test.shape)

# Move the size-3 axis last, divide by 255 and one-hot encode the labels (10 classes)
df_test = df_test.transpose(0, 2, 3, 1) / 255
y_test_oh = tf.one_hot(y_test.ravel(), depth=10)
print(df_test.shape, y_test_oh.shape)

# ---------------------------------------------------------
# Labelled training split: same layout handling as the test split
# (labels stay as integers here; they are one-hot encoded later,
# after being merged with the augmented samples)
# ---------------------------------------------------------
print("For Train Dataset:")
df_train = pd.read_csv("../input/cifar10/train_lab_x.csv").to_numpy()
y_train = pd.read_csv("../input/cifar10/train_lab_y.csv").to_numpy()
print(df_train.shape, y_train.shape)

# Reshape, move the size-3 axis last and divide by 255 in one chain
df_train = df_train.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1) / 255
print(df_train.shape)

For Test Dataset:
(10000, 3072) (10000, 1)
(10000, 3, 32, 32)
(10000, 32, 32, 3) (10000, 10)
For Train Dataset:
(40006, 3072) (40006, 1)
(40006, 32, 32, 3)


# 3. Image Augmentation on 25% of the Training Dataset
## 3.1. Augmenting the Training Dataset

In [5]:
# Load the generated images (25% augmentation) and their labels
df_aug = pd.read_csv("../input/cifar10/df_25per_aug.csv").to_numpy()
y_aug = pd.read_csv("../input/cifar10/y_25per_aug.csv").to_numpy()

# Flat rows -> (N, 3, 32, 32) -> (N, 32, 32, 3)
# NOTE(review): unlike df_train, no division by 255 is applied to these images -
# confirm the CSV already stores pixel values on the same scale as df_train.
df_aug = df_aug.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)

# Stack the generated samples after the labelled training samples
df_aug = np.concatenate((df_train, df_aug), axis=0)
y_aug = np.concatenate((y_train, y_aug), axis=0)
print(df_aug.shape, y_aug.shape)

# Shuffle images and labels with one shared permutation so they stay aligned
perm = np.random.permutation(df_aug.shape[0])
df_aug = df_aug[perm]
y_aug = y_aug[perm]
y_aug_oh = tf.one_hot(y_aug.ravel(), depth=10)
print(df_aug.shape, y_aug.shape, y_aug_oh.shape)

(50022, 32, 32, 3) (50022, 1)
(50022, 32, 32, 3) (50022, 1) (50022, 10)


## 3.2. Training the Baseline Model on the Augmented Dataset

In [6]:
# Importing the Baseline Model Architecture: the module is copied next to the
# notebook so that it can be imported by name.
copyfile(src = "../input/dcai-rw/baseline_arch.py", dst = "../working/baseline_arch.py")
from baseline_arch import cnn_model

# Fresh model for this experiment. `metrics` is given as a list, the documented
# form for `compile`; a bare string is not accepted by every Keras version.
conv_model = cnn_model((32, 32, 3))
conv_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Creating Batches from the Augmented Dataset (batch size 32) and training for 25 epochs
aug_dataset = tf.data.Dataset.from_tensor_slices((df_aug, y_aug_oh)).batch(32)
history = conv_model.fit(aug_dataset, epochs = 25)

# Saving the model along with its weights, under a name specific to the
# 25% augmentation experiment so later saves in this notebook cannot overwrite it.
conv_model.save('baseline_gan_augmented_25per.h5')

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


## 3.3. Predicting the Performance

In [7]:
# Class-probability predictions for the augmented training set and the test set
preds_aug = conv_model.predict(df_aug)
preds_test = conv_model.predict(df_test)

# Predicted class = index of the largest probability in each row
cls_aug = preds_aug.argmax(axis=1)
cls_test = preds_test.argmax(axis=1)

# (message, metric, ground truth, prediction) - evaluated and printed in this order.
# Log-loss compares one-hot labels with probabilities; accuracy compares integer
# labels with predicted class indices.
reports = (
    ("Log-loss for Augmented Dataset = ", log_loss, y_aug_oh, preds_aug),
    ("Log-loss for Test Dataset = ", log_loss, y_test_oh, preds_test),
    ("Accuracy for Augmented Dataset = ", accuracy_score, y_aug, cls_aug),
    ("Accuracy for Test Dataset = ", accuracy_score, y_test, cls_test),
)
for label, metric_fn, y_true, y_pred in reports:
    print(label, metric_fn(y_true, y_pred))

Log-loss for Augmented Dataset =  0.2295699061216759
Log-loss for Test Dataset =  0.8003369166397284
Accuracy for Augmented Dataset =  0.9206749030426612
Accuracy for Test Dataset =  0.7518


# 4. Image Augmentation for Class Balancing
## 4.1. Augmenting the Training Dataset

In [8]:
# Load the generated images (class-balancing augmentation) and their labels
df_aug = pd.read_csv("../input/cifar10/df_clsbal_aug.csv").to_numpy()
y_aug = pd.read_csv("../input/cifar10/y_clsbal_aug.csv").to_numpy()

# Flat rows -> (N, 3, 32, 32) -> (N, 32, 32, 3)
# NOTE(review): unlike df_train, no division by 255 is applied to these images -
# confirm the CSV already stores pixel values on the same scale as df_train.
df_aug = df_aug.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)

# Stack the generated samples after the labelled training samples
df_aug = np.concatenate((df_train, df_aug), axis=0)
y_aug = np.concatenate((y_train, y_aug), axis=0)
print(df_aug.shape, y_aug.shape)

# Shuffle images and labels with one shared permutation so they stay aligned
perm = np.random.permutation(df_aug.shape[0])
df_aug = df_aug[perm]
y_aug = y_aug[perm]
y_aug_oh = tf.one_hot(y_aug.ravel(), depth=10)
print(df_aug.shape, y_aug.shape, y_aug_oh.shape)

(43780, 32, 32, 3) (43780, 1)
(43780, 32, 32, 3) (43780, 1) (43780, 10)


## 4.2. Training the Baseline Model on the Augmented Dataset

In [9]:
# Importing the Baseline Model Architecture: the module is copied next to the
# notebook so that it can be imported by name.
copyfile(src = "../input/dcai-rw/baseline_arch.py", dst = "../working/baseline_arch.py")
from baseline_arch import cnn_model

# Fresh model for this experiment. `metrics` is given as a list, the documented
# form for `compile`; a bare string is not accepted by every Keras version.
conv_model = cnn_model((32, 32, 3))
conv_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Creating Batches from the Augmented Dataset (batch size 32) and training for 25 epochs
aug_dataset = tf.data.Dataset.from_tensor_slices((df_aug, y_aug_oh)).batch(32)
history = conv_model.fit(aug_dataset, epochs = 25)

# Saving the model along with its weights, under a name specific to the
# class-balancing experiment so other saves in this notebook cannot overwrite it.
conv_model.save('baseline_gan_augmented_clsbal.h5')

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


## 4.3. Predicting the Performance

In [10]:
# Class-probability predictions for the augmented training set and the test set
preds_aug = conv_model.predict(df_aug)
preds_test = conv_model.predict(df_test)

# Predicted class = index of the largest probability in each row
cls_aug = preds_aug.argmax(axis=1)
cls_test = preds_test.argmax(axis=1)

# (message, metric, ground truth, prediction) - evaluated and printed in this order.
# Log-loss compares one-hot labels with probabilities; accuracy compares integer
# labels with predicted class indices.
reports = (
    ("Log-loss for Augmented Dataset = ", log_loss, y_aug_oh, preds_aug),
    ("Log-loss for Test Dataset = ", log_loss, y_test_oh, preds_test),
    ("Accuracy for Augmented Dataset = ", accuracy_score, y_aug, cls_aug),
    ("Accuracy for Test Dataset = ", accuracy_score, y_test, cls_test),
)
for label, metric_fn, y_true, y_pred in reports:
    print(label, metric_fn(y_true, y_pred))

Log-loss for Augmented Dataset =  0.23891070328193428
Log-loss for Test Dataset =  0.8148165362078066
Accuracy for Augmented Dataset =  0.9202147099132024
Accuracy for Test Dataset =  0.7475


# 5. Image Augmentation based on Class-wise Performance
## 5.1. Augmenting the Training Dataset

In [11]:
# Load the generated images (class-wise performance augmentation) and their labels
df_aug = pd.read_csv("../input/cifar10/df_clsper_aug.csv").to_numpy()
y_aug = pd.read_csv("../input/cifar10/y_clsper_aug.csv").to_numpy()

# Flat rows -> (N, 3, 32, 32) -> (N, 32, 32, 3)
# NOTE(review): unlike df_train, no division by 255 is applied to these images -
# confirm the CSV already stores pixel values on the same scale as df_train.
df_aug = df_aug.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)

# Stack the generated samples after the labelled training samples
df_aug = np.concatenate((df_train, df_aug), axis=0)
y_aug = np.concatenate((y_train, y_aug), axis=0)
print(df_aug.shape, y_aug.shape)

# Shuffle images and labels with one shared permutation so they stay aligned
perm = np.random.permutation(df_aug.shape[0])
df_aug = df_aug[perm]
y_aug = y_aug[perm]
y_aug_oh = tf.one_hot(y_aug.ravel(), depth=10)
print(df_aug.shape, y_aug.shape, y_aug_oh.shape)

(48262, 32, 32, 3) (48262, 1)
(48262, 32, 32, 3) (48262, 1) (48262, 10)


## 5.2. Training the Baseline Model on the Augmented Dataset

In [12]:
# Importing the Baseline Model Architecture: the module is copied next to the
# notebook so that it can be imported by name.
copyfile(src = "../input/dcai-rw/baseline_arch.py", dst = "../working/baseline_arch.py")
from baseline_arch import cnn_model

# Fresh model for this experiment. `metrics` is given as a list, the documented
# form for `compile`; a bare string is not accepted by every Keras version.
conv_model = cnn_model((32, 32, 3))
conv_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Creating Batches from the Augmented Dataset (batch size 32) and training for 25 epochs
aug_dataset = tf.data.Dataset.from_tensor_slices((df_aug, y_aug_oh)).batch(32)
history = conv_model.fit(aug_dataset, epochs = 25)

# Saving the model along with its weights. The filename is kept unchanged so
# anything that loads the model by this name keeps working.
conv_model.save('baseline_gan_augmented_all.h5')

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


## 5.3. Predicting the Performance

In [13]:
# Class-probability predictions for the augmented training set and the test set
preds_aug = conv_model.predict(df_aug)
preds_test = conv_model.predict(df_test)

# Predicted class = index of the largest probability in each row
cls_aug = preds_aug.argmax(axis=1)
cls_test = preds_test.argmax(axis=1)

# (message, metric, ground truth, prediction) - evaluated and printed in this order.
# Log-loss compares one-hot labels with probabilities; accuracy compares integer
# labels with predicted class indices.
reports = (
    ("Log-loss for Augmented Dataset = ", log_loss, y_aug_oh, preds_aug),
    ("Log-loss for Test Dataset = ", log_loss, y_test_oh, preds_test),
    ("Accuracy for Augmented Dataset = ", accuracy_score, y_aug, cls_aug),
    ("Accuracy for Test Dataset = ", accuracy_score, y_test, cls_test),
)
for label, metric_fn, y_true, y_pred in reports:
    print(label, metric_fn(y_true, y_pred))

Log-loss for Augmented Dataset =  0.24085062231891433
Log-loss for Test Dataset =  0.7980606513905791
Accuracy for Augmented Dataset =  0.9197505283660022
Accuracy for Test Dataset =  0.749
