In [1]:
from keras.utils.data_utils import Sequence
from imblearn.over_sampling import RandomOverSampler
from imblearn.keras import balanced_batch_generator

from keras.applications.resnet import ResNet50, preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator

import numpy as np
import pandas as pd

from tqdm import tqdm
import os
import shutil
import glob
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.utils import shuffle
from PIL import Image

2022-11-24 22:53:43.700333: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-24 22:53:43.811721: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-11-24 22:53:44.583262: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvrtc.so.11.0: cannot open shared object file: No such file or directory
2022-11-24 22:53:44.583332: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64]

## Save augmented images into folders

In [21]:
class BalancedDataGenerator(Sequence):
    """ImageDataGenerator + RandomOversampling"""
    def __init__(self, X, y, datagen, batch_size=32, save_imgs = False, dir_to_save = ''):
        self.datagen = datagen
        self.batch_size = min(batch_size, X.shape[0])
        self.datagen.fit(X) #datagen.fit(X)
        self.balanced_gen, self.steps_per_epoch = balanced_batch_generator(
                                        X.reshape(X.shape[0], -1), # shape (n_samples, n_features)
                                        y, 
                                        sampler=RandomOverSampler(), 
                                        batch_size=self.batch_size, 
                                        keep_sparse=True,
                                        random_state = 2022
                                        )
        # asterisks * just unpacks the tuple from shape
        # For example: (1, X.shape[1:]) = (1, (256, 256, 3))
        # while (1, *X.shape[1:]) = (1, 256, 256, 3)
        self.img_shape = X.shape[1:] # (256, 256, 3)
        self._shape = (self.steps_per_epoch * self.batch_size, *X.shape[1:])
        self.save_imgs = save_imgs
        self.dir_to_save = dir_to_save
        
        
    def __len__(self):
        return self.steps_per_epoch

    def __getitem__(self, idx):
        x_batch_balanced, y_batch_balanced = self.balanced_gen.__next__()
        x_batch_balanced = x_batch_balanced.reshape(-1, *self.img_shape)
        #print(x_batch_balanced.shape, len(y_batch_balanced))
        
        batches_balanced =  self.datagen.flow(
                            x_batch_balanced, y_batch_balanced, 
                            batch_size=self.batch_size,
                            # save_to_dir = self.dir_to_save if self.save_imgs else None,
                            # save_prefix = '',
                            # save_format = 'jpeg' # for smaller storage size           
                            )

        # return a pair of X_batch and y_batch 
        return batches_balanced.next()

train_gen = ImageDataGenerator(
            rotation_range=15,
            fill_mode='constant',
            horizontal_flip=True,
            zoom_range=0.1,
            preprocessing_function=preprocess_input
            )

In [None]:
df_meta = pd.read_csv('data/Edema/df_Edema_1121.csv')
X_all = np.load('data/Edema/X_Edema.npy')
print(X_all.shape)

In [4]:
df_used = df_meta['survive'].dropna()
y = list(df_used)
X = X_all[df_used.index]
print(len(y))
print(X.shape) # 24704 -> 14614

(24704, 256, 256, 3)
14614
(14614, 256, 256, 3)


In [33]:
datagen = train_gen # define your data augmentation
save_imgs = True
dir_to_save= 'data/Edema/images/balanced'
bgen = BalancedDataGenerator(X, y, datagen, batch_size= 128, 
                        # save_imgs = save_imgs, 
                        # dir_to_save= dir_to_save
                        )

steps_per_epoch = bgen.steps_per_epoch
print('Total number of batches:', steps_per_epoch)

if save_imgs:
    if os.path.exists(dir_to_save):
        shutil.rmtree(dir_to_save) # clear the folder first if exist
        os.mkdir(dir_to_save)
        print('folder cleaned')
    else:
        os.mkdir(dir_to_save)

X_gen_l = []
y_gen_l = []
num_batches_needed = steps_per_epoch
for i in tqdm(range(num_batches_needed)): #steps_per_epoch
    X_gen, y_gen = bgen.__getitem__(0)
    #print(X_gen.shape, len(y_gen))
    X_gen_l.append(X_gen)
    y_gen_l.append(y_gen)

print(np.unique(y, return_counts=True))
print(np.unique(y_gen_l, return_counts=True))


Total number of batches: 159
folder cleaned


100%|██████████| 159/159 [02:42<00:00,  1.02s/it]

(array(['DIE', 'SURVIVE'], dtype='<U7'), array([10179,  4435]))
(array(['DIE', 'SURVIVE'], dtype='<U7'), array([10175, 10177]))





In [34]:
X_gen_all = np.concatenate(X_gen_l)
y_gen_all = np.concatenate(y_gen_l)
print(X_gen_all.shape, y_gen_all.shape)

(20352, 256, 256, 3) (20352,)


### Split train/test/val

In [56]:

# make a dataframe
df_gen = pd.DataFrame({'img_index': list(range(X_gen_all.shape[0])), 'class': y_gen_all})
unique_id = df_gen.img_index.unique() #data_df.subject_id.unique()

train_percent, valid_percent, test_percent = 0.75, 0.20, 0.05

unique_id = shuffle(unique_id)
value1 = (round(len(unique_id)*train_percent))
value2 = (round(len(unique_id)*valid_percent))
value3 = value1 + value2
value4 = (round(len(unique_id)*test_percent))

print("Images in training set: " + str(value1))
print("Images in validation set: " + str(value2))
print("Images in testing set: " + str(value4))

train_sub_id = unique_id[:value1]
validate_sub_id = unique_id[value1:value3]
test_sub_id = unique_id[value3:]

split_l = []
for i in range(df_gen.shape[0]):
    img_ind = df_gen.img_index[i]
    add = ''
    if img_ind in train_sub_id:
        add = 'train'
    elif img_ind in validate_sub_id:
        add = 'val'
    else:
        add = 'test'
    split_l.append(add)

df_gen['set'] = split_l
df_gen

Images in training set: 15264
Images in validation set: 4070
Images in testing set: 1018


Unnamed: 0,img_index,class,set
0,0,DIE,train
1,1,SURVIVE,train
2,2,SURVIVE,val
3,3,DIE,train
4,4,SURVIVE,train
...,...,...,...
20347,20347,SURVIVE,test
20348,20348,DIE,train
20349,20349,SURVIVE,val
20350,20350,DIE,train


In [57]:
df_gen['set'].value_counts(normalize = True)

train    0.75000
val      0.19998
test     0.05002
Name: set, dtype: float64

### Save npy array as images in folder 

In [58]:
disease = 'Edema' # 'Pneumonia_Consolidation' , 'No_finding'
task = 'survive' # 'race', 'gender', 'insuarance'

In [61]:
new_folder = f'data/{disease}/images/{task}/balanced'
if os.path.exists(new_folder):
    shutil.rmtree(new_folder) # clear the folder first if exist
DATA_DIR = Path(new_folder)
DATASETS = ['train', 'val', 'test']
class_names = np.unique(y_gen_all)
for ds in DATASETS:
    for cls in class_names:
        (DATA_DIR / ds / cls).mkdir(parents=True, exist_ok=True)

img_array = np.load(f'data/{disease}/X_{disease}.npy')
print(img_array.shape)
for i in tqdm(range(df_gen.shape[0])):
    img_ind = df_gen['img_index'][i]
    ds = df_gen['set'][i]
    cls = df_gen['class'][i]
    img = Image.fromarray(img_array[i])
    fname = f'{DATA_DIR}/{ds}/{cls}/{i}.jpeg'
    img.save(fname)

(24704, 256, 256, 3)


100%|██████████| 20352/20352 [00:27<00:00, 747.58it/s]
