### Data augmentation to address high class imbalance
Reference: https://github.com/enrico310786/brain_tumor_classification/blob/master/augment_train_dataset.py

In [5]:
import os
import shutil
import pandas as pd
import cv2
import albumentations as A

In [6]:
# Install once if the package is missing: %pip install albumentations

In [7]:
# Not used: an earlier augmentation pipeline kept for reference only.
# Compared with `transform`, it additionally enables hue/saturation and
# brightness/contrast jitter, a Rotate step with no explicit limit, and a
# ShiftScaleRotate step with rotate_limit=45.
transform_old = A.Compose([
    A.HueSaturationValue(p=0.5),
    A.RandomBrightnessContrast(p=0.5),
    A.RandomGamma(p=0.5),
    A.Rotate(p=0.5),
    A.MultiplicativeNoise(multiplier=[0.5, 1.5], elementwise=True, per_channel=True, p=0.5),
    A.ShiftScaleRotate(shift_limit=0.0625, rotate_limit=45, p=0.5),
    A.Transpose(p=0.5),
])

In [8]:
# Augmentation pipeline used by make_data_augmentation, which calls it as
# transform(image=...)['image'] once per augmented copy.
# Each step has its own `p`; per the Albumentations API this is the probability
# that the step is applied on a given call, so every call yields a different mix.
transform = A.Compose([
    # Colour and brightness/contrast jitter are disabled in this version
    # (reason not recorded in the notebook -- TODO confirm with the author).
    #A.HueSaturationValue(p=0.5),
    #A.RandomBrightnessContrast(p=0.5),
    A.RandomGamma(gamma_limit=[80,120], p=0.5),
    A.Sharpen(p=0.7),
    # Rotation limit narrowed to [-15, 15].
    A.Rotate(limit=[-15,15], p=0.5),
    A.MultiplicativeNoise(multiplier=[0.5, 1.5], p=0.5),
    A.Transpose(p=0.5),
])


In [9]:
def clean_create_dir(path_dir):
    """Leave `path_dir` as a freshly created, empty directory.

    If the directory already exists it is removed with all of its contents and
    then created again; otherwise it is simply created (including any missing
    parent directories).

    Args:
        path_dir: Path of the directory to reset.

    Raises:
        OSError: Re-raised, after printing its message, when the existing
            directory cannot be removed.
    """
    # Fast path: nothing to delete, just create the directory.
    if not os.path.isdir(path_dir):
        print("Creating directory '{}'".format(path_dir))
        os.makedirs(path_dir)
        return

    print("The directory '{}' exists. Deleting".format(path_dir))
    try:
        shutil.rmtree(path_dir)
    except OSError as err:
        print("Error: {}".format(err.strerror))
        raise err

    # Recreate only once the old tree is confirmed gone.
    if not os.path.isdir(path_dir):
        print("Creating directory '{}'".format(path_dir))
        os.makedirs(path_dir)


def make_data_augmentation(path_original_dataset, path_augmented_dataset, root_csv, final_number, df, class2label):
    """Copy a class-per-folder image dataset and pad small classes with augmented copies.

    Every immediate sub-directory of `path_original_dataset` is treated as one
    class. All of its files are copied to `path_augmented_dataset/<class>`, and
    each image is additionally augmented `n_applications` times with the
    module-level `transform` pipeline, where
    `n_applications = max(0, round((final_number - n_files) / n_files))`.
    Augmented copies are written next to the originals as `<stem>_<k><ext>`.

    Args:
        path_original_dataset: Directory containing one sub-directory per class.
        path_augmented_dataset: Output directory; class sub-directories are
            created on demand.
        root_csv: Path prefix used for the 'PATH' column instead of the real
            output directory.
        final_number: Desired number of samples per class.
        df: DataFrame with columns 'CLASS', 'PATH', 'LABEL' to extend. It is
            not modified.
        class2label: Mapping from class directory name to label.

    Returns:
        A DataFrame made of the rows of `df` followed by one row per copied or
        augmented file. When nothing was processed, `df` itself is returned.

    Raises:
        KeyError: If a class directory name is missing from `class2label`.
    """
    if not os.path.isdir(path_original_dataset):
        print("Dataset directory '{}' not found: nothing to augment".format(path_original_dataset))
        return df

    # Rows are collected in a list and concatenated once at the end; calling
    # pd.concat for every file is quadratic in the number of files.
    records = []

    # Only the immediate sub-directories are classes. A recursive walk would
    # re-join nested directory names onto the dataset root.
    class_names = sorted(
        entry for entry in os.listdir(path_original_dataset)
        if os.path.isdir(os.path.join(path_original_dataset, entry))
    )

    for classe in class_names:
        path_class = os.path.join(path_original_dataset, classe)
        label = class2label[classe]
        print("CLASS: {}  - LABEL: {}".format(classe, label))

        # Regular files only: nested directories can be neither copied with
        # copy2 nor read as images, and must not inflate the class count.
        filenames = sorted(
            name for name in os.listdir(path_class)
            if os.path.isfile(os.path.join(path_class, name))
        )
        number_files = len(filenames)
        print("number of files in directory '{}': {}".format(path_class, number_files))

        path_directory_save = os.path.join(path_augmented_dataset, classe)
        path_directory_save_for_csv = os.path.join(root_csv, classe)
        if not os.path.isdir(path_directory_save):
            print("Create directory '{}'".format(path_directory_save))
            os.makedirs(path_directory_save)

        # Number of times the transformation must be applied to each single
        # image so that the class ends up close to `final_number` samples.
        # An empty class gets 0 (avoids a division by zero).
        if number_files > 0:
            n_applications = max(0, round((final_number - number_files) / number_files))
        else:
            n_applications = 0
        print('n_applications: ', n_applications)

        for filename in filenames:
            path_image = os.path.join(path_class, filename)
            # splitext keeps dots inside the stem ("a.v1.png" -> "a.v1", ".png"),
            # so distinct originals cannot collapse onto the same output name.
            stem, extension = os.path.splitext(filename)

            # Copy the original image from the source dir to the destination dir.
            shutil.copy2(path_image, os.path.join(path_directory_save, filename))
            records.append({'CLASS': classe,
                            'PATH': os.path.join(path_directory_save_for_csv, filename),
                            'LABEL': label})

            if n_applications == 0:
                # No augmentation needed for this class: skip decoding the image.
                continue

            image = cv2.imread(path_image)
            if image is None:
                # cv2.imread signals an unreadable / non-image file with None.
                print("Warning: could not read image '{}', skipping its augmentation".format(path_image))
                continue

            # Convert BGR -> RGB exactly once. Doing it inside the loop on the
            # same variable flips the channels back and forth on every pass.
            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            for i in range(n_applications):
                augmented_image = transform(image=image_rgb)['image']
                augmented_image = cv2.cvtColor(augmented_image, cv2.COLOR_RGB2BGR)
                new_file_name = "{}_{}{}".format(stem, i + 1, extension)
                dst_file = os.path.join(path_directory_save, new_file_name)

                if not cv2.imwrite(dst_file, augmented_image):
                    # Do not list a file in the CSV that was never written.
                    print("Warning: could not write image '{}'".format(dst_file))
                    continue
                records.append({'CLASS': classe,
                                'PATH': os.path.join(path_directory_save_for_csv, new_file_name),
                                'LABEL': label})

        number_files_augmented = len(os.listdir(path_directory_save))
        print("Final number of files in directory '{}': {}".format(path_directory_save, number_files_augmented))
        print("---------------------------------------------")

    if not records:
        return df
    return pd.concat([df, pd.DataFrame(records)], ignore_index=True)

In [10]:
# --- Configuration -----------------------------------------------------------
# Target number of samples per class, passed as `final_number` to
# make_data_augmentation.
FINAL_NUMBER_SAMPLES_PER_CLASSES = 1617
# Seed for the final row shuffle so the CSV order is reproducible between runs.
# The augmentations themselves are still random.
SHUFFLE_SEED = 42
# NOTE: machine-specific absolute path; adjust it before running elsewhere.
base_path = "C:/Users/anush/OneDrive/Documents/Sem3/AI in Health Technology/Project/"
path_dataset_train = base_path + "data/train"
path_augmented_dataset_train = base_path + "data_augmented/train_augmented"
path_augmented_csv_train = base_path + "data_augmented/train_augmented.csv"
df_train_augmented = pd.DataFrame(columns=['CLASS', 'PATH', 'LABEL'])
# Prefix written to the 'PATH' column of the CSV (relative to the CSV location).
root_csv = "train_augmented"

# the list of classes has to be in the same order as the original dataset
list_classes = ['0', '1', '2', '3', '4']
class2label = {k: v for (v, k) in enumerate(list_classes)}

# 1 - clean and create the dataset directory
# WARNING: this deletes any existing content of the augmented directory.
clean_create_dir(path_augmented_dataset_train)

# 2 - make data augmentation
print("Data augmentation")
print("Train set")
df_train = make_data_augmentation(path_dataset_train, path_augmented_dataset_train, root_csv, FINAL_NUMBER_SAMPLES_PER_CLASSES, df_train_augmented, class2label)

# 3 - create new csv (rows shuffled with a fixed seed)
df_train = df_train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_train.to_csv(path_augmented_csv_train, index=False)
print("df_train info")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that adds a stray "None" line).
df_train.info()
print('-------------------------------------------------------------')
print("df_train:  CLASS values count")
print(df_train[["CLASS"]].value_counts())
print('-------------------------------------------------------------')

La directory 'C:/Users/anush/OneDrive/Documents/Sem3/AI in Health Technology/Project/data_augmented/train_augmented' esiste. La cancello
Creo la directory 'C:/Users/anush/OneDrive/Documents/Sem3/AI in Health Technology/Project/data_augmented/train_augmented'
Data augmentation
Train set
CLASS: 0  - LABEL: 0
number of files in directory 'C:/Users/anush/OneDrive/Documents/Sem3/AI in Health Technology/Project/data/train\0': 1434
Create dir directory 'C:/Users/anush/OneDrive/Documents/Sem3/AI in Health Technology/Project/data_augmented/train_augmented\0'
n_applications:  0
Final number of files in directory 'C:/Users/anush/OneDrive/Documents/Sem3/AI in Health Technology/Project/data_augmented/train_augmented\0': 1434
---------------------------------------------
CLASS: 1  - LABEL: 1
number of files in directory 'C:/Users/anush/OneDrive/Documents/Sem3/AI in Health Technology/Project/data/train\1': 300
Create dir directory 'C:/Users/anush/OneDrive/Documents/Sem3/AI in Health Technology/Projec