In [1]:
import os

import tensorflow as tf

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from PIL import Image
import pickle

from sklearn.model_selection import train_test_split

from datetime import datetime
from time import time
import pytz


In [2]:
# Root folders: the original image sets and the generated experiment sets.
orig_folder = "../datasets/original"
exp_folder = "../datasets/experiment"

# The three original datasets.
orig_dataset1, orig_dataset2, orig_dataset3 = (
    os.path.join(orig_folder, sub)
    for sub in ("1", "2/Training-validation", "3")
)

# Alternative copy of dataset 2 kept alongside the original one.
orig_dataset2_modified = os.path.join(orig_folder, "2/2-modified/Training-validation")

# Target folders used when saving the four experiment datasets.
exp_dataset1, exp_dataset2, exp_dataset3, exp_dataset4 = (
    os.path.join(exp_folder, sub)
    for sub in ("1", "2", "3", "4")
)

# Every class label that can appear across the datasets.
CLASS_NAMES = ['normal', 'aom', 'ome', 'csom', 'myringosclerosis', 'earwax', 'tube']

# One seed shared by the splits and by the notebook's own shuffling.
RANDOM_STATE = 42
randomiser = np.random.RandomState(RANDOM_STATE)

def timer(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function is called with the arguments it was given and its
    return value is passed through unchanged. After the call returns, the
    elapsed wall-clock time is printed. If ``func`` raises, the exception
    propagates and nothing is printed.
    """
    # Local import: functools is not among the notebook's top-level imports.
    from functools import wraps

    @wraps(func)  # keep func's __name__ and __doc__ on the wrapper
    def wrap_func(*args, **kwargs):
        t1 = time()
        result = func(*args, **kwargs)
        t2 = time()
        print(f'Function {func.__name__!r} executed in {(t2-t1):.4f}s')
        return result
    return wrap_func

In [3]:
"""
Used to load the original datasets
"""
@timer
def load_dataset(path, class_names):
    """Load every image under ``path/<class_name>/`` together with its label.

    Parameters
    ----------
    path : str
        Directory that contains one sub-directory per class.
    class_names : list of str
        Names of the class sub-directories. A class's position in this list
        is its integer label. Sub-directories that do not exist are skipped.

    Returns
    -------
    list
        One ``[image, label]`` object array per image file, in shuffled
        order. ``image`` is the result of ``tf.cast(..., tf.float32) / 255.0``.
    """
    image_extensions = ('.jpg', '.png', '.jpeg')
    class_dict = {name: i for i, name in enumerate(class_names)}

    full_data = []
    for label in class_names:
        dirpath = os.path.join(path, label)
        if not os.path.exists(dirpath):
            continue

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # seeded shuffle below reproducible across machines.
        image_files = sorted(
            f for f in os.listdir(dirpath) if f.endswith(image_extensions))

        for filename in image_files:
            # Convert inside the ``with`` block so the file handle is
            # released as soon as the pixel data has been read.
            with Image.open(os.path.join(dirpath, filename)) as image:
                image = tf.cast(image, tf.float32)/255.0

            data = np.array([image, class_dict[label]], dtype=object)
            full_data.append(data)

    # Shuffle the list itself. Shuffling ``np.array(full_data)`` only
    # reorders a temporary copy and leaves the returned data unshuffled.
    randomiser.shuffle(full_data)
    return full_data
            

In [4]:
"""
Splits the complete dataset and returns the train, validation and test sets
e.g. train = 0.7, validation = 0.1 splits the train:val:test into 70:10:20
"""
@timer
def split_data(full_data, train, validation):
    """Split ``full_data`` into stratified train, validation and test sets.

    ``train`` and ``validation`` are fractions of the complete dataset; the
    test set receives whatever is left. For example ``train=0.7`` and
    ``validation=0.1`` give a 70:10:20 train:val:test split.

    Parameters
    ----------
    full_data : sequence
        Items whose element ``[0]`` is the image and ``[1]`` is the label.
    train : float
        Fraction of the data used for training.
    validation : float
        Fraction of the data used for validation.

    Returns
    -------
    list of tuple
        ``[(X_train, y_train), (X_val, y_val), (X_test, y_test)]``. A set
        that receives no samples is returned as a pair of empty lists.
    """
    X_full = np.array([x[0] for x in full_data], dtype=object)
    y_full = np.array([y[1] for y in full_data])

    X_train, X_test, y_train, y_test = train_test_split(
            X_full, y_full, train_size=train, stratify=y_full, random_state=RANDOM_STATE)
    X_val = []
    y_val = []

    # Re-express the validation share as a fraction of the held-out remainder.
    validation = validation / (1.0 - train)

    # Floating-point rounding can put a "use the whole remainder" request
    # slightly below 1 (e.g. train=0.7, validation=0.3). Without the
    # tolerance that case would fall through to a second split that leaves
    # only a sample or so for the test set.
    takes_whole_remainder = validation >= 1 or bool(
        np.isclose(validation, 1.0, rtol=0.0, atol=1e-9))

    if takes_whole_remainder:
        X_val = X_test
        y_val = y_test
        X_test = []
        y_test = []

    elif validation > 0:
        X_val, X_test, y_val, y_test = train_test_split(
                X_test, y_test, train_size=validation, stratify=y_test, random_state=RANDOM_STATE)

    return [(X_train, y_train), (X_val, y_val), (X_test, y_test)]
    

In [5]:
"""
Performs data augmentation by a specific factor for a particular class
"""
@timer
def data_augmentation(num, X_train, y_train, da_dict=None, class_names=None):
    """Append randomly augmented copies of the training images.

    Parameters
    ----------
    num : int
        Used when ``da_dict`` is empty: every image of a class with ``n``
        training samples receives ``num // n`` augmented copies.
    X_train, y_train : sequences
        Training images and their integer labels.
    da_dict : dict, optional
        Maps a class name to the number of augmented copies generated for
        each image of that class. When non-empty it overrides ``num``.
    class_names : list of str, optional
        Maps an integer label to its class name for the ``da_dict`` lookup.
        When omitted, the notebook-level ``class_names`` variable is used,
        which is what this function always did implicitly.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` holding the original and augmented samples
        in shuffled order.
    """
    if da_dict and class_names is None:
        # Backward compatibility with the implicit global lookup.
        try:
            class_names = globals()['class_names']
        except KeyError:
            raise NameError(
                "name 'class_names' is not defined; pass class_names= "
                "when using da_dict") from None

    training_set = list(zip(X_train, y_train))

    # NOTE(review): load_dataset already divides pixel values by 255;
    # confirm that rescaling again here is intended.
    datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=180,
        horizontal_flip = True,
        vertical_flip = True,
        fill_mode='constant',
        cval=0.0,
        brightness_range=[0.7,1.0],
        zoom_range=[0.8,1.2],
        width_shift_range=0.2,
        height_shift_range=0.2,
    )

    # Samples per class; computed once instead of once per image.
    samples_per_class = np.bincount(y_train)

    augmented = []
    for img, label in training_set:
        if da_dict:
            count = da_dict[class_names[label]]
        else:
            count = num // samples_per_class[label]

        if count <= 0:
            # Nothing requested for this image; skip building an iterator.
            continue

        aug_iter = datagen.flow(np.expand_dims(img, 0))
        augmented.extend((next(aug_iter)[0], label) for _ in range(count))

    training_set.extend(augmented)

    randomiser.shuffle(training_set)
    X_train = np.array([x[0] for x in training_set], dtype=object)
    y_train = np.array([y[1] for y in training_set])
    return (X_train, y_train)

In [6]:
"""
Saves the images by naming them with a unique path
"""
@timer
def save_images(path_to_save, X, y, class_names):
    """Write each image to ``path_to_save/<class>/<class>[ (k)].jpeg``.

    Parameters
    ----------
    path_to_save : str
        Root folder; one sub-folder per class is created below it.
    X : sequence
        Images; each one is multiplied by 255 and cast to ``uint8``.
    y : sequence of int
        Labels; ``class_names[label]`` gives the class folder and file stem.
    class_names : list of str
        Class name for every integer label.

    The first image of a class is saved as ``<class>.jpeg``. Later ones get
    the first free `` (1)``, `` (2)``, ... suffix, so existing files are
    never overwritten.
    """
    # Base path -> first suffix not yet known to be taken. Remembering it
    # avoids re-checking every earlier suffix for each new image.
    next_suffix = {}

    def uniquify(path):
        filename, extension = os.path.splitext(path)
        counter = next_suffix.get(path, 0)
        candidate = path if counter == 0 else filename + " (" + str(counter) + ")" + extension

        while os.path.exists(candidate):
            counter += 1
            candidate = filename + " (" + str(counter) + ")" + extension

        next_suffix[path] = counter + 1
        return candidate

    # dirname is '' for a bare folder name, and os.makedirs('') raises.
    parent = os.path.dirname(path_to_save)
    if parent:
        os.makedirs(parent, exist_ok=True)

    for i in range(len(X)):
        img = np.array(X[i])
        label = y[i]
        path = (os.path.join(path_to_save, class_names[label], class_names[label]+'.jpeg'))
        os.makedirs(os.path.dirname(path), exist_ok=True)
        path = uniquify(path)
        im = Image.fromarray((img * 255).astype(np.uint8))
        im.save(path)


In [7]:
"""
Saves the data in three separate folders depending on whether the set is for training, validation or testing
"""
@timer
def save_data(path, sets, class_names):
    """Save the train, validation and test sets below ``path``.

    Parameters
    ----------
    path : str
        Root output folder.
    sets : sequence
        ``[(X_train, y_train), (X_val, y_val), (X_test, y_test)]``; the
        sets are written to the ``training``, ``validation`` and
        ``testing`` sub-folders in that order.
    class_names : list of str
        Class name for every integer label, forwarded to ``save_images``.
    """
    subfolders = ['training', 'validation', 'testing']
    for i, subfolder in enumerate(subfolders):
        X = sets[i][0]
        y = sets[i][1]
        save_images(os.path.join(path, subfolder), X, y, class_names)
    

In [8]:
@timer
def get_dataset(path, class_names, da=0, train=0.7, val=0.1, da_dict=None):
    """Load, split and augment one dataset, printing per-class sample counts.

    Parameters
    ----------
    path : str
        Dataset folder with one sub-folder per class.
    class_names : list of str
        Candidate class names; those without a sub-folder in ``path`` are
        dropped, and the remaining order defines the integer labels.
    da : int
        ``num`` argument forwarded to ``data_augmentation``.
    train, val : float
        Fractions of the data used for training and validation.
    da_dict : dict, optional
        Per-class augmentation counts forwarded to ``data_augmentation``.

    Returns
    -------
    tuple
        ``(sets, class_names)`` where ``sets`` is the train/val/test list
        with the training set augmented, and ``class_names`` is the
        filtered list.
    """
    if da_dict is None:
        da_dict = {}

    # List the folder once rather than once per candidate class.
    available = set(os.listdir(path))
    class_names = [c for c in class_names if c in available]

    full_data = load_dataset(path, class_names)
    sets = split_data(full_data, train, val)
    train_orig = sets[0]

    # NOTE(review): when da_dict is non-empty, data_augmentation looks class
    # names up in the notebook-level ``class_names`` variable, not in the
    # filtered list computed here; the two must agree.
    sets[0] = data_augmentation(da, train_orig[0], train_orig[1], da_dict)
    print(class_names)

#     original train size
    print(np.bincount(train_orig[1]))
#     train size after augmentation
    print(np.bincount(sets[0][1]))
#     the augmenting multiplication factor
    print(np.bincount(sets[0][1]) // np.bincount(train_orig[1]))

    print()

#     the samples for each class for each set
    for s in sets:
        print(np.bincount(s[1]))

    return (sets, class_names)
    

In [9]:
# Dataset 1: no augmentation, 63% train / 27% validation, remainder for test
res = get_dataset(orig_dataset1, CLASS_NAMES, da=0, train=0.63, val=0.27)

# uncomment to save the data
# save_data(exp_dataset1, res[0], res[1])

Function 'load_dataset' executed in 11.9943s
Function 'split_data' executed in 0.0058s
Function 'data_augmentation' executed in 0.0247s
['normal', 'ome', 'tube']
[113 113  60]
[113 113  60]
[1 1 1]

[113 113  60]
[48 48 26]
[18 18 10]
Function 'get_dataset' executed in 12.0275s


In [10]:
# Dataset 2: no augmentation, 80% train / 20% validation, nothing left for test
res = get_dataset(orig_dataset2, CLASS_NAMES, da=0, train=0.8, val=0.2)

# uncomment to save the data
# save_data(exp_dataset2, res[0], res[1])

Function 'load_dataset' executed in 3.2399s
Function 'split_data' executed in 13.8747s
Function 'data_augmentation' executed in 6.3304s
['normal', 'csom', 'myringosclerosis', 'earwax']
[144 144 144 144]
[144 144 144 144]
[1 1 1 1]

[144 144 144 144]
[36 36 36 36]
[]
Function 'get_dataset' executed in 24.2362s


In [11]:
# Dataset 3: 80% train / 20% test, no validation set.
# Each training image gets 350 // (training samples in its class) augmented copies.
class_names = ['aom', 'csom', 'earwax', 'normal']

res = get_dataset(orig_dataset3, class_names, da=350, train=0.8, val=0)

# uncomment to save the data
# save_data(exp_dataset3, res[0], res[1])

Function 'load_dataset' executed in 7.5558s
Function 'split_data' executed in 25.9060s
Function 'data_augmentation' executed in 72.6842s
['aom', 'csom', 'earwax', 'normal']
[ 95  50 112 428]
[380 400 448 428]
[4 8 4 1]

[380 400 448 428]
[]
[ 24  13  28 107]
Function 'get_dataset' executed in 107.6194s


In [12]:
# Dataset 4
# Each dict maps a class name to the number of augmented copies generated
# for every training image of that class.
data_aug_1 = {
    'normal': 2,
    'ome': 7,
    'tube': 7,
}

data_aug_2 = {
    'normal': 1,
    'csom': 3,
    'myringosclerosis': 2,
    'earwax': 2,
}

data_aug_3 = {
    'normal': 0,
    'aom': 11,
    'csom': 11,
    'myringosclerosis': 23,
    'earwax': 5,
    'tube': 47,
}

# change integer depending on dataset
aug = data_aug_1

# data_augmentation reads this notebook-level name when a dict is supplied,
# so it has to be set before get_dataset is called.
class_names = list(aug.keys())
res = get_dataset(orig_dataset1, class_names, da=0, train=0.7, val=0.1, da_dict=aug)

# uncomment to save the data
# save_data(exp_dataset4, res[0], res[1])

Function 'load_dataset' executed in 6.9116s
Function 'split_data' executed in 0.0036s
Function 'data_augmentation' executed in 262.1094s
['normal', 'ome', 'tube']
[125 125  67]
[ 375 1000  536]
[3 8 8]

[ 375 1000  536]
[18 18  9]
[36 36 20]
Function 'get_dataset' executed in 269.0266s
