In [1]:
import os
from PIL import Image
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [7]:
# Numeric class id -> class name, for every class id known to this notebook.
class_dict = dict(enumerate(
    ['normal', 'aom', 'ome', 'csom', 'myringosclerosis', 'earwax', 'tube'],
    start=1,
))

# Ids of the classes selected for the experiment.
classes = [1, 4, 5, 6]

# Names of the selected classes, in the same order as `classes`.
CLASS_NAMES = list(map(class_dict.__getitem__, classes))

# Single seed shared by the shuffler below and by the dataset splits.
RANDOM_STATE = 42

randomiser = np.random.RandomState(seed=RANDOM_STATE)

In [11]:
# Root folders for the source data and for the generated experiment splits.
orig_folder, exp_folder = "./original", "./experiment"

# Source datasets; dataset 2 is read from its "Training-validation" subfolder.
orig_dataset1, orig_dataset2, orig_dataset3 = (
    os.path.join(orig_folder, sub) for sub in ("1", "2/Training-validation", "3")
)

# Output folders, one per numbered experiment dataset.
exp_dataset1, exp_dataset2, exp_dataset3, exp_dataset4 = (
    os.path.join(exp_folder, str(number)) for number in range(1, 5)
)

In [4]:
def load_dataset(path, class_names):
    """Load every image under ``path/<class name>/`` together with its label.

    Args:
        path: Directory that holds one sub-directory per class.
        class_names: Names of the class sub-directories to read. A class's
            label is its index in this sequence.

    Returns:
        A shuffled list with one entry per image. Each entry is a 2-element
        object array ``[image, label]``: ``image`` is the picture cast to a
        float32 tensor and divided by 255, ``label`` is the integer class
        index.
    """
    full_data = []

    # Class name -> integer label (position in `class_names`).
    label_of = {name: index for index, name in enumerate(class_names)}

    for name in class_names:
        dirpath = os.path.join(path, name)
        # A class without a folder is skipped; the other labels are unaffected.
        if not os.path.exists(dirpath):
            continue

        # Suffix matching is case-sensitive.
        image_files = [f for f in os.listdir(dirpath)
                       if f.endswith(('.jpg', '.png', '.jpeg'))]

        for filename in image_files:
            # Convert inside the `with` block so the pixel data is read while
            # the file is open and the handle is released right afterwards.
            with Image.open(os.path.join(dirpath, filename)) as img:
                image = tf.cast(img, tf.float32) / 255.0

            full_data.append(np.array([image, label_of[name]], dtype=object))

    # Shuffle the list itself: shuffling np.array(full_data) would only
    # reorder a temporary copy and leave the returned list in load order.
    randomiser.shuffle(full_data)
    return full_data

In [5]:
def split_data(full_data, train, validation):
    """Split ``[image, label]`` records into stratified train/val/test sets.

    Args:
        full_data: Sequence of records whose item 0 is the image and item 1
            is the integer label.
        train: Fraction of all samples used for training, in (0, 1).
        validation: Fraction of *all* samples used for validation. Whatever
            is left after training and validation becomes the test set.

    Returns:
        ``[(X_train, y_train), (X_val, y_val), (X_test, y_test)]``. A split
        that receives no samples is returned as a pair of empty lists.
    """
    X_full = np.array([record[0] for record in full_data], dtype=object)
    y_full = np.array([record[1] for record in full_data])

    X_train, X_test, y_train, y_test = train_test_split(
            X_full, y_full, train_size=train, stratify=y_full, random_state=RANDOM_STATE)
    X_val = []
    y_val = []

    # Re-express the validation fraction relative to what is left after the
    # training split, since the second split only sees that remainder.
    validation = validation / (1.0 - train)

    # Floating-point rounding can put an intended ratio of exactly 1 slightly
    # below it (e.g. train=0.7, validation=0.3). Treat that as "everything
    # left is validation" instead of attempting a split with an almost-empty
    # test set.
    if validation >= 1 or np.isclose(validation, 1.0, rtol=0.0, atol=1e-9):
        X_val = X_test
        y_val = y_test
        X_test = []
        y_test = []

    elif validation > 0:
        X_val, X_test, y_val, y_test = train_test_split(
                X_test, y_test, train_size=validation, stratify=y_test, random_state=RANDOM_STATE)

    return [(X_train, y_train), (X_val, y_val), (X_test, y_test)]

In [6]:
def get_dataset(path, class_names, da=0, train=0.7, val=0.1, da_dict=None):
    """Load a dataset folder, split it and print the per-class sample counts.

    Args:
        path: Dataset directory containing one sub-directory per class.
        class_names: Candidate class names; only those that exist as entries
            of ``path`` are kept.
        da: Data-augmentation selector. Currently unused because the
            augmentation call below is commented out.
        train: Fraction of all samples used for training.
        val: Fraction of all samples used for validation.
        da_dict: Data-augmentation options. Currently unused; ``None`` is
            treated as an empty dict.

    Returns:
        ``(sets, class_names)`` where ``sets`` is the
        ``[(X_train, y_train), (X_val, y_val), (X_test, y_test)]`` list from
        ``split_data`` and ``class_names`` is the filtered list of classes.
    """
    # Avoid a shared mutable default; a fresh dict is created per call.
    if da_dict is None:
        da_dict = {}

    # List the directory once instead of once per candidate class name.
    available = set(os.listdir(path))
    class_names = [c for c in class_names if c in available]

    full_data = load_dataset(path, class_names)
    sets = split_data(full_data, train, val)
    train_orig = sets[0]
    # sets[0] = data_augmentation(da, train_orig[0], train_orig[1], da_dict)
    print(class_names)
    # Per-class training counts before and after augmentation, then their
    # ratio (all ones while augmentation is disabled).
    print(np.bincount(train_orig[1]))
    print(np.bincount(sets[0][1]))
    print(np.bincount(sets[0][1]) // np.bincount(train_orig[1]))

    print()
    # Per-class counts for the training, validation and test sets.
    for s in sets:
        print(np.bincount(s[1]))

    return (sets, class_names)

In [8]:
def save_images(path_to_save, X, y, class_names):
    """Write images to ``path_to_save/<class>/<class>[ (k)].jpeg``.

    The first image of a class is saved as ``<class>.jpeg``; later ones, or
    ones colliding with files already on disk, get the first free
    ``" (k)"`` suffix with k = 1, 2, ...

    Args:
        path_to_save: Directory that receives one sub-directory per class.
        X: Indexable collection of images. Values are multiplied by 255
            before saving, so they are expected to be scaled to [0, 1].
        y: Integer labels aligned with ``X``; each indexes ``class_names``.
        class_names: Class names used for directory and file names.
    """
    def candidate(stem, counter):
        # counter 0 is the plain name; k > 0 appends " (k)" before the extension.
        if counter == 0:
            return stem + '.jpeg'
        return stem + " (" + str(counter) + ")" + '.jpeg'

    # Create the target directory itself. Creating only its parent fails for
    # a bare name such as "out", whose dirname is the empty string.
    os.makedirs(path_to_save, exist_ok=True)

    # Class name -> next suffix to try. Files are only ever added here, so
    # every smaller suffix is already taken and need not be re-checked.
    next_counter = {}

    for i in range(len(X)):
        label = y[i]
        name = class_names[label]
        class_dir = os.path.join(path_to_save, name)
        os.makedirs(class_dir, exist_ok=True)

        stem = os.path.join(class_dir, name)
        counter = next_counter.get(name, 0)
        path = candidate(stem, counter)
        while os.path.exists(path):
            counter += 1
            path = candidate(stem, counter)
        next_counter[name] = counter + 1

        # Round before the integer cast: after the /255 and *255 round trip a
        # value can sit just below its integer, and a plain cast truncates.
        # Clipping keeps out-of-range values from wrapping around in uint8.
        pixels = np.clip(np.rint(np.array(X[i]) * 255), 0, 255).astype(np.uint8)
        im = Image.fromarray(pixels)
        # JPEG cannot store an alpha channel; convert such images to RGB
        # instead of letting save() raise.
        if im.mode not in ("L", "RGB"):
            im = im.convert("RGB")
        im.save(path)


In [9]:
def save_data(path, sets, class_names):
    """Save the train/validation/test sets as images under ``path``.

    Args:
        path: Output directory; receives the ``training``, ``validation``
            and ``testing`` sub-directories.
        sets: Three ``(X, y)`` pairs in the order training, validation,
            testing.
        class_names: Class names indexed by the labels in each ``y``.
    """
    split_dirs = ['training', 'validation', 'testing']
    for i, split_dir in enumerate(split_dirs):
        X = sets[i][0]
        y = sets[i][1]
        save_images(os.path.join(path, split_dir), X, y, class_names)

In [26]:
# Dataset 2: 86% of the samples for training, 12% for validation, the rest for testing.
res = get_dataset(orig_dataset2, CLASS_NAMES, da=0, train=0.86, val=0.12)
dataset_splits, kept_class_names = res
save_data(exp_dataset2, dataset_splits, kept_class_names)

['normal', 'csom', 'myringosclerosis', 'earwax']
[155 155 154 155]
[155 155 154 155]
[1 1 1 1]

[155 155 154 155]
[22 21 22 21]
[3 4 4 4]


In [17]:
# Dataset 3: default fractions of get_dataset (train=0.7, val=0.1, the rest for testing).
res = get_dataset(path=orig_dataset3, class_names=CLASS_NAMES)
dataset_splits, kept_class_names = res
save_data(exp_dataset3, dataset_splits, kept_class_names)

['normal', 'csom', 'myringosclerosis', 'earwax']
[374  44  20  98]
[374  44  20  98]
[1 1 1 1]

[374  44  20  98]
[53  6  3 14]
[108  13   5  28]


In [33]:
import os

# One-off clean-up: rename every file whose name starts with "tochange" in
# the directory below to "normal (k).jpeg", numbering from `count` upwards.
directory_path = "./experiment/2/testing/normal/"
count = 4

# os.listdir returns entries in arbitrary order; sorting makes the numbering
# reproducible between runs.
for filename in sorted(os.listdir(directory_path)):
    if not filename.startswith("tochange"):
        continue

    old_path = os.path.join(directory_path, filename)
    new_name = f"normal ({count}).jpeg"
    new_path = os.path.join(directory_path, new_name)

    # os.rename can silently replace an existing file, so skip numbers that
    # are already taken rather than overwrite another image.
    while os.path.exists(new_path):
        count += 1
        new_name = f"normal ({count}).jpeg"
        new_path = os.path.join(directory_path, new_name)

    os.rename(old_path, new_path)
    count += 1
