<a href="https://colab.research.google.com/github/ChethanaVaisali/COVID_19/blob/master/COVID_19_augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Augmentation

In [1]:
# Import google drive
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
## Load data from the drive
import numpy as np
trainx = np.load('/gdrive/My Drive/CV Assignment 2/Train_Data/227X227_trainx.npy')
trainy = np.load('/gdrive/My Drive/CV Assignment 2/Train_Data/trainy.npy')

print(trainx.shape,trainy.shape)

(2318, 227, 227, 3) (2318,)


In [6]:
## Reduce Images
import cv2
WIDTH = 227
HEIGHT = 227
reduced_images=[]
for img in trainx:
  reduced_images.append(np.expand_dims(cv2.resize(img,(WIDTH,HEIGHT) , interpolation = cv2.INTER_CUBIC),0))
  
reduced_images = np.concatenate(reduced_images)
reduced_images.shape

(2318, 227, 227, 3)

### Saving 227 X 227 Images

In [0]:
np.save('/gdrive/My Drive/CV Assignment 2/Train_Data/227X227_trainx.npy',reduced_images)

In [3]:
unique, counts = np.unique(trainy, return_counts = True)
print(unique, counts)

[0 1 2] [ 170 1072 1076]


In [4]:
### Data Preprocessing
## Find the indices
print(np.unique(trainy[0:170]))
print(np.unique(trainy[170:170 + 1072]))
print(np.unique(trainy[170+1072:170+1072+1076], return_counts=True))

[0]
[1]
(array([2]), array([1076]))


In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dense, GlobalAveragePooling2D
from keras.models import Model
from keras import backend as K

from keras.models import load_model


  import pandas.util.testing as tm
Using TensorFlow backend.


In [6]:
## Divide the data into three sets to facilitate manual augmentation

covid_x = trainx[0:170]
covid_y = trainy[0:170]
normal_x = trainx[170:170+1072]
normal_y = trainy[170:170+1072]
pneumonia_x = trainx[170+1072:170+1072+1076]
pneumonia_y = trainy[170+1072:170+1072+1076]

print(covid_x.shape,covid_y.shape)
print(normal_x.shape,normal_y.shape)
print(pneumonia_x.shape,pneumonia_y.shape)

(170, 227, 227, 3) (170,)
(1072, 227, 227, 3) (1072,)
(1076, 227, 227, 3) (1076,)


In [7]:
## Dividing into train, validation and test sets
from sklearn.model_selection import train_test_split

trvalid_cx, test_cx, trvalid_cy, test_cy = train_test_split(covid_x,covid_y, test_size = 0.20, shuffle = True)
train_cx, valid_cx, train_cy, valid_cy = train_test_split(trvalid_cx, trvalid_cy, test_size = 0.20, shuffle = True)

trvalid_nx, test_nx, trvalid_ny, test_ny = train_test_split(normal_x,normal_y, test_size = 0.20, shuffle = True)
train_nx, valid_nx, train_ny, valid_ny = train_test_split(trvalid_nx, trvalid_ny, test_size = 0.20, shuffle = True)

trvalid_px, test_px, trvalid_py, test_py = train_test_split(pneumonia_x, pneumonia_y, test_size = 0.20, shuffle = True)
train_px, valid_px, train_py, valid_py = train_test_split(trvalid_px, trvalid_py, test_size = 0.20, shuffle = True)

print(train_cx.shape, train_cy.shape, train_nx.shape, train_ny.shape, train_px.shape, train_py.shape)
print(valid_cx.shape, valid_cy.shape, valid_nx.shape, valid_ny.shape, valid_px.shape, valid_py.shape)
print(test_cx.shape,test_cy.shape, test_nx.shape, test_ny.shape, test_px.shape, test_py.shape)


(108, 227, 227, 3) (108,) (685, 227, 227, 3) (685,) (688, 227, 227, 3) (688,)
(28, 227, 227, 3) (28,) (172, 227, 227, 3) (172,) (172, 227, 227, 3) (172,)
(34, 227, 227, 3) (34,) (215, 227, 227, 3) (215,) (216, 227, 227, 3) (216,)


### Data Generator for COVID-19 Data

In [0]:
from keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(rescale = 1./255,
                             rotation_range = 90,
                             width_shift_range = [0.2,0.2,0],
                             height_shift_range =[0.2,0.2,0],
                             fill_mode = 'constant',
                             zoom_range=[1,1.10],
                             cval = 0.0)

### Data Generator for Normal and viral pneumonia data

In [0]:
norpneu_generator = ImageDataGenerator(
    rescale = 1./255,
    rotation_range = 45
)

## Augmentation Function
Since there are very few samples of COVID-19 data relative to normal and viral pnemonia patients, there is high class imbalance. The COVID data has to be oversampled.

In [0]:
def augmented_images(X,Y, generator , batch_size, estimated_size,shuffle = False):
  num_images = 0
  augmented_x = []
  augmented_y = []

  for Xbatch,Ybatch in generator.flow(X,Y,batch_size = batch_size,shuffle = shuffle):
    augmented_x.append(Xbatch)
    augmented_y.append(Ybatch)
    num_images += batch_size
    # print(num_images)
    if num_images >= estimated_size:
      break
  augmented_x = np.concatenate(augmented_x,axis = 0)
  augmented_y = np.concatenate(augmented_y,axis = 0)

  augmented_x = np.concatenate((augmented_x,X), axis= 0)
  augmented_y = np.concatenate((augmented_y,Y), axis = 0)

  return augmented_x,augmented_y

In [12]:
aug_train_cx, aug_train_cy = augmented_images(train_cx, train_cy, datagen, 64, 1600)
print(aug_train_cx.shape, aug_train_cy.shape)
aug_valid_cx, aug_valid_cy = augmented_images(valid_cx, valid_cy, datagen, 64, 400)
print(aug_valid_cx.shape, aug_valid_cy.shape)

(1468, 227, 227, 3) (1468,)
(224, 227, 227, 3) (224,)


In [13]:
aug_train_nx, aug_train_ny = augmented_images(train_nx, train_ny, norpneu_generator,64, 800)
print(aug_train_nx.shape, aug_train_ny.shape)
aug_valid_nx, aug_valid_ny = augmented_images(valid_nx, valid_ny, norpneu_generator, 64, 300)
print(aug_valid_nx.shape, aug_valid_ny.shape)

(1498, 227, 227, 3) (1498,)
(472, 227, 227, 3) (472,)


In [14]:
aug_train_px, aug_train_py = augmented_images(train_px,train_py, norpneu_generator, 64, 800)
print(aug_train_px.shape, aug_train_py.shape)
aug_valid_px, aug_valid_py = augmented_images(valid_px, valid_py, norpneu_generator,64,300)
print(aug_valid_px.shape, aug_valid_py.shape)

(1504, 227, 227, 3) (1504,)
(472, 227, 227, 3) (472,)


In [0]:
## Test data are not normalized
test_cx = test_cx/255
test_nx = test_nx/255
test_px = test_px/255

In [16]:
## Combining all the covid_19, normal and pneumonia data
train_x = np.concatenate((aug_train_cx, aug_train_nx, aug_train_px), axis = 0)
train_y = np.concatenate((aug_train_cy,aug_train_ny,aug_train_py), axis = 0)
print(train_x.shape, train_y.shape)

valid_x = np.concatenate((aug_valid_cx, aug_valid_nx, aug_valid_px), axis = 0)
valid_y = np.concatenate((aug_valid_cy, aug_valid_ny, aug_valid_py), axis = 0)
print(valid_x.shape, valid_y.shape)

test_x = np.concatenate((test_cx,test_nx,test_px),axis = 0)
test_y = np.concatenate((test_cy,test_ny,test_py), axis = 0)
print(test_x.shape,test_y.shape)


(4470, 227, 227, 3) (4470,)
(1168, 227, 227, 3) (1168,)
(465, 227, 227, 3) (465,)


In [0]:
np.save('/gdrive/My Drive/CV Assignment 2/Augmented_data/train_x.npy', train_x)
np.save('/gdrive/My Drive/CV Assignment 2/Augmented_data/train_y.npy', train_y)
np.save('/gdrive/My Drive/CV Assignment 2/Augmented_data/valid_x.npy', valid_x)
np.save('/gdrive/My Drive/CV Assignment 2/Augmented_data/valid_y.npy', valid_y)
np.save('/gdrive/My Drive/CV Assignment 2/Augmented_data/test_x.npy',test_x)
np.save('/gdrive/My Drive/CV Assignment 2/Augmented_data/test_y.npy',test_y)