In [1]:
"""
Split the labelled image data into train and test sets.
Author: Booy Faassen
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import shutil

# The label file has no header row and whitespace-separated fields, so the
# column names are supplied here. `sep=r"\s+"` is the documented replacement
# for `delim_whitespace=True`, which pandas deprecated in version 2.2.
labels = pd.read_csv("./data/labels.csv", header=None, sep=r"\s+", names=["image", "value", "type"])

#print(labels)
#print(labels.shape)

In [2]:
"""Make binary classification and assign 0.3 to 0.0 and assign 0.6 to 1.0"""

def binarize_dataset(labels):
    """Return a copy of ``labels`` with the intermediate scores collapsed to 0/1.

    Rows whose ``value`` equals 0.3333333333333333 become 0.0 and rows whose
    ``value`` equals 0.6666666666666666 become 1.0. Every other value is left
    untouched, and the input frame itself is not modified.

    Rows are selected with boolean masks instead of reading by position and
    writing by label, so the result is correct for any index (for example a
    frame that was filtered and still carries its original row labels).

    Parameters
    ----------
    labels : pandas.DataFrame
        Frame with at least a ``value`` column.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``labels`` with the ``value`` column binarized.
    """
    labels_binary = labels.copy(deep=True)

    # Compute both masks before writing so the second lookup cannot be
    # influenced by the first assignment.
    is_one_third = labels_binary['value'] == 0.3333333333333333
    is_two_thirds = labels_binary['value'] == 0.6666666666666666

    labels_binary.loc[is_one_third, 'value'] = 0.0
    labels_binary.loc[is_two_thirds, 'value'] = 1.0

    return labels_binary

In [3]:
"""Split based on mono and poly"""

def split_mono_poly(labels):
    """Split ``labels`` into the rows of type ``'mono'`` and of type ``'poly'``.

    Rows with any other ``type`` end up in neither result. Both returned
    frames contain only the ``image``, ``value`` and ``type`` columns and are
    re-indexed from 0.

    The selection uses boolean masks rather than appending one row at a
    time, which avoids quadratic growth and keeps the column dtypes of the
    input.

    Parameters
    ----------
    labels : pandas.DataFrame
        Frame with ``image``, ``value`` and ``type`` columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(mono_labels, poly_labels)``.
    """
    columns = ["image", "value", "type"]

    mono_labels = labels.loc[labels['type'] == 'mono', columns].reset_index(drop=True)
    poly_labels = labels.loc[labels['type'] == 'poly', columns].reset_index(drop=True)

    return mono_labels, poly_labels

In [4]:
""" stratified sampling """

def stratified_sampling(labels, train_fraction = 0.75, random_state = None):
    """Split ``labels`` into train and test sets, stratified by type and value.

    A fraction ``train_fraction`` of every ``(type, value)`` group is drawn
    for the train set. The test set consists of every row of ``labels`` whose
    ``image`` does not occur in the train set.

    ``GroupBy.sample`` is used instead of ``GroupBy.apply`` with a lambda: it
    always returns all columns and does not rely on ``apply`` operating on
    the grouping columns, which pandas 2.2 deprecates.

    Parameters
    ----------
    labels : pandas.DataFrame
        Frame with ``image``, ``value`` and ``type`` columns.
    train_fraction : float, default 0.75
        Fraction of each group that goes into the train set.
    random_state : int, numpy random generator or None, default None
        Forwarded to pandas so that the split can be reproduced. ``None``
        keeps the previous behaviour of a different split on every call.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set)``. ``train_set`` keeps the row labels of
        ``labels``; ``test_set`` holds the ``image``, ``value`` and ``type``
        columns and is re-indexed from 0.
    """
    train_set = labels.groupby(['type', 'value']).sample(frac=train_fraction,
                                                         random_state=random_state)

    # One vectorized membership test instead of recomputing unique() per row.
    in_train = labels['image'].isin(train_set['image'])
    test_set = labels.loc[~in_train, ["image", "value", "type"]].reset_index(drop=True)

    return train_set, test_set

In [5]:
"""ensure train and test folders are empty"""

def empty_train_and_test_folders(data_dir = './data'):
    """Delete everything inside the four class folders below ``data_dir``.

    The folders ``test/1.0``, ``test/0.0``, ``train/1.0`` and ``train/0.0``
    are emptied in that order; the folders themselves are kept.

    Sub-directories (for example an ``.ipynb_checkpoints`` folder) are
    removed together with their contents instead of making ``os.remove``
    fail on them.

    Parameters
    ----------
    data_dir : str, default './data'
        Directory that contains the ``train`` and ``test`` folders.

    Raises
    ------
    FileNotFoundError
        If one of the four class folders does not exist.
    """
    for split in ('test', 'train'):
        for label in ('1.0', '0.0'):
            folder = os.path.join(data_dir, split, label)
            for entry in os.listdir(folder):
                entry_path = os.path.join(folder, entry)
                # A symlink to a directory is unlinked, not followed.
                if os.path.isdir(entry_path) and not os.path.islink(entry_path):
                    shutil.rmtree(entry_path)
                else:
                    os.remove(entry_path)

In [6]:
""" copy images to the respective folders only works for binarized dataset"""

def move_images_to_folders(train_set, test_set, data_dir = './data', all_data_dir = './data/all_data'):
    """Copy every labelled image into the class folder of its split.

    Despite the name, files are copied (``shutil.copy``), not moved. Rows of
    ``train_set`` are handled first, then rows of ``test_set``. A row with
    value 1.0 goes to ``<data_dir>/<split>/1.0``, a row with value 0.0 goes to
    ``<data_dir>/<split>/0.0``, and rows with any other value are skipped,
    which is why the labels have to be binarized beforehand.

    Parameters
    ----------
    train_set, test_set : pandas.DataFrame
        Frames with ``image`` and ``value`` columns.
    data_dir : str, default './data'
        Directory containing the ``train`` and ``test`` folders.
    all_data_dir : str, default './data/all_data'
        Directory holding the source image files.
    """
    for subset, split_name in ((train_set, 'train'), (test_set, 'test')):
        positive_dir = data_dir + '/' + split_name + '/1.0'
        negative_dir = data_dir + '/' + split_name + '/0.0'

        for position in range(len(subset)):
            record = subset.iloc[position]
            # The first 7 characters of the image entry are dropped --
            # presumably an 'images/' prefix; TODO confirm against labels.csv.
            source = all_data_dir + '/' + str(record['image'])[7:]
            label = record['value']

            if label == 1.0:
                shutil.copy(source, positive_dir)
            elif label == 0.0:
                shutil.copy(source, negative_dir)

In [7]:
""" normal sampling (not stratified) """

def normal_sampling(labels, train_fraction = 0.75, random_state = None):
    """Split ``labels`` into train and test sets by plain random sampling.

    A fraction ``train_fraction`` of all rows is drawn for the train set. The
    test set consists of every row of ``labels`` whose ``image`` does not
    occur in the train set.

    Parameters
    ----------
    labels : pandas.DataFrame
        Frame with ``image``, ``value`` and ``type`` columns.
    train_fraction : float, default 0.75
        Fraction of the rows that goes into the train set.
    random_state : int, numpy random generator or None, default None
        Forwarded to ``DataFrame.sample`` so that the split can be
        reproduced. ``None`` keeps the previous behaviour of a different
        split on every call.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set)``. ``train_set`` keeps the row labels of
        ``labels``; ``test_set`` holds the ``image``, ``value`` and ``type``
        columns and is re-indexed from 0.
    """
    train_set = labels.sample(frac=train_fraction, random_state=random_state)

    # One vectorized membership test instead of recomputing unique() per row.
    in_train = labels['image'].isin(train_set['image'])
    test_set = labels.loc[~in_train, ["image", "value", "type"]].reset_index(drop=True)

    return train_set, test_set

In [14]:
""" Delete automatically created checkpoints """

def remove_checkpoints(path = './data'):
    """Delete the ``.ipynb_checkpoints`` folder from each class folder.

    The folders ``train/0.0``, ``train/1.0``, ``test/0.0`` and ``test/1.0``
    below ``path`` are visited in that order. A checkpoint folder that does
    not exist is skipped.

    Parameters
    ----------
    path : str, default './data'
        Directory that contains the ``train`` and ``test`` folders.
    """
    class_folders = ('/train/0.0', '/train/1.0', '/test/0.0', '/test/1.0')

    for class_folder in class_folders:
        checkpoint_dir = path + class_folder + '/.ipynb_checkpoints'
        try:
            shutil.rmtree(checkpoint_dir)
        except FileNotFoundError:
            # Nothing to delete for this class folder.
            continue

In [9]:
"""Make binary classification and remove labels 0.3 and 0.6 from the dataset"""

def remove_unconfident_labels(labels):
    """Return a copy of ``labels`` without the 0.333... and 0.666... rows.

    Rows whose ``value`` equals 0.3333333333333333 are dropped first, then
    rows whose ``value`` equals 0.6666666666666666. Rows are dropped by their
    index label, and the remaining rows keep their original labels. The input
    frame is left unmodified.
    """
    filtered = labels.copy(deep=True)

    for unconfident_value in (0.3333333333333333, 0.6666666666666666):
        # Look the labels up on the current frame so the second pass only
        # sees rows that survived the first one.
        to_drop = filtered[filtered['value'] == unconfident_value].index
        filtered = filtered.drop(to_drop)

    return filtered

In [10]:
""" Split 0.0 and 1.0 labels from 0.3 and 0.6 labels """

def split_confident_unconfident_labels(labels):
    """Split ``labels`` into confident (0.0 / 1.0) and unconfident rows.

    Rows whose ``value`` equals 1.0 or 0.0 are confident; rows whose ``value``
    equals 0.3333333333333333 or 0.6666666666666666 are unconfident. Rows
    with any other value end up in neither result. Both returned frames hold
    the ``image``, ``value`` and ``type`` columns and are re-indexed from 0.

    The selection uses boolean masks rather than appending one row at a
    time, which avoids quadratic growth and keeps the column dtypes of the
    input.

    Parameters
    ----------
    labels : pandas.DataFrame
        Frame with ``image``, ``value`` and ``type`` columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(confident_labels, unconfident_labels)``.
    """
    columns = ["image", "value", "type"]
    values = labels['value']

    is_confident = (values == 1.0) | (values == 0.0)
    is_unconfident = (values == 0.3333333333333333) | (values == 0.6666666666666666)

    confident_labels = labels.loc[is_confident, columns].reset_index(drop=True)
    unconfident_labels = labels.loc[is_unconfident, columns].reset_index(drop=True)

    return confident_labels, unconfident_labels

In [None]:
"""
Easy function to choose training setup

arguments:
option: takes values 1-8 for the dataset, see below for descriptions.
data_dir: the directory in which 'labels.csv' can be found.

Options:
1. binarize data, stratified sampling by Type (mono and poly) and Value (1.0 and 0.0)
2. binarize data, normal sampling (train=75%), only Type mono
3. binarize data, normal sampling (train=75%), only Type poly
4. Only Type mono, use 1.0 and 0.0 as training, 0.3 and 0.6 as testing
5. Only Type poly, use 1.0 and 0.0 as training, 0.3 and 0.6 as testing
6. Only labels 0.0 and 1.0 without including 0.3 and 0.6 labels, both Types poly and mono, stratified sampling
7. Only labels 0.0 and 1.0 without including 0.3 and 0.6 labels, only Type mono
8. Only labels 0.0 and 1.0 without including 0.3 and 0.6 labels, only Type poly
"""

def training_data_initialisation(option = 1, data_dir = './data'):
    """Build the train and test label frames for one of eight dataset setups.

    The labels are read from ``<data_dir>/labels.csv`` (no header row,
    whitespace-separated, columns ``image``, ``value``, ``type``).

    Options
    -------
    1. Binarize, stratified sampling by type and value.
    2. Binarize, normal sampling, type mono only.
    3. Binarize, normal sampling, type poly only.
    4. Type mono only: 1.0 / 0.0 rows for training, binarized 0.3 / 0.6 rows
       for testing.
    5. Type poly only: 1.0 / 0.0 rows for training, binarized 0.3 / 0.6 rows
       for testing.
    6. Drop the 0.3 / 0.6 rows, stratified sampling over both types.
    7. Drop the 0.3 / 0.6 rows, normal sampling, type mono only.
    8. Drop the 0.3 / 0.6 rows, normal sampling, type poly only.

    Every sampled option uses a train fraction of 0.75.

    Parameters
    ----------
    option : int, default 1
        Which of the setups above to build.
    data_dir : str, default './data'
        Directory that contains ``labels.csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set)``.

    Raises
    ------
    ValueError
        If ``option`` is not one of 1-8.
    """
    valid_input = [1, 2, 3, 4, 5, 6, 7, 8]
    if option not in valid_input:
        raise ValueError("Option must be set to a value 1-8 (1 default, being stratified sampling after dataset binarization)")

    # `sep=r'\s+'` replaces `delim_whitespace=True`, deprecated in pandas 2.2.
    labels = pd.read_csv(os.path.join(data_dir, 'labels.csv'), header = None, sep = r'\s+', names = ["image", "value", "type"])

    # Step 1: how the 0.3 / 0.6 labels are treated before any splitting.
    if option in (1, 2, 3):
        prepared_labels = binarize_dataset(labels)
    elif option in (6, 7, 8):
        prepared_labels = remove_unconfident_labels(labels)
    else:
        # Options 4 and 5 keep all four label values for the moment.
        prepared_labels = labels

    # Step 2: options 1 and 6 use both cell types with a stratified split.
    if option in (1, 6):
        return stratified_sampling(prepared_labels, train_fraction=0.75)

    # Step 3: every remaining option is restricted to a single cell type.
    mono_labels, poly_labels = split_mono_poly(prepared_labels)
    type_labels = mono_labels if option in (2, 4, 7) else poly_labels

    # Step 4: options 4 and 5 train on the confident rows and test on the
    # binarized unconfident rows; there is no random sampling involved.
    if option in (4, 5):
        confident_labels, unconfident_labels = split_confident_unconfident_labels(type_labels)
        return confident_labels, binarize_dataset(unconfident_labels)

    return normal_sampling(type_labels, train_fraction=0.75)

In [None]:
""" Create folders and sub folders for each dataset """

def create_dataset_folders(parent_dir = './data'):
    """Create the folders ``dataset01`` .. ``dataset08`` below ``parent_dir``.

    Each dataset folder is then populated through ``create_sub_folders``.
    A dataset folder that already exists is reused instead of raising
    ``FileExistsError``, so the cell can be run again.

    Parameters
    ----------
    parent_dir : str, default './data'
        Directory in which the dataset folders are created.
    """
    for dataset_number in range(1, 9):
        path = os.path.join(parent_dir, 'dataset0' + str(dataset_number))
        os.makedirs(path, exist_ok=True)
        create_sub_folders(path)


def create_sub_folders(parent_dir):
    """Create the ``train`` / ``test`` class folders below ``parent_dir``.

    The folders ``train/1.0``, ``train/0.0``, ``test/1.0`` and ``test/0.0``
    are created together with any missing parent folder. Folders that already
    exist are left as they are, so the function can be called repeatedly.

    Parameters
    ----------
    parent_dir : str
        Directory in which the ``train`` and ``test`` folders are created.
    """
    for split in ('train', 'test'):
        for label in ('1.0', '0.0'):
            os.makedirs(os.path.join(parent_dir, split, label), exist_ok=True)


In [14]:
""" populate dataset folders with the datasets """

def create_datasets(parent_dir = './data'):
    """Create all eight dataset folders and fill them with images.

    For option ``k`` (1-8) the train/test split comes from
    ``training_data_initialisation`` and its images are copied from
    ``<parent_dir>/all_data`` into ``<parent_dir>/dataset0k``, after the
    class folders of that dataset have been emptied.

    Parameters
    ----------
    parent_dir : str, default './data'
        Directory holding ``labels.csv``, ``all_data`` and the dataset folders.
    """
    create_dataset_folders(parent_dir = parent_dir)

    dataset_names = ['dataset01', 'dataset02', 'dataset03', 'dataset04', 'dataset05', 'dataset06', 'dataset07', 'dataset08']
    image_source_dir = parent_dir + '/all_data'

    # The position in the list (counted from 1) is the option number.
    for option_number, dataset_name in enumerate(dataset_names, start=1):
        train_set, test_set = training_data_initialisation(option = option_number, data_dir = parent_dir)
        dataset_path = os.path.join(parent_dir, dataset_name)
        empty_train_and_test_folders(data_dir = dataset_path)
        move_images_to_folders(train_set, test_set, data_dir = dataset_path, all_data_dir = image_source_dir)


In [15]:
""" Remove all datasets """

def remove_all_datasets(parent_dir = './data'):
    """Delete the folders ``dataset01`` .. ``dataset08`` below ``parent_dir``.

    Every dataset folder is removed together with all of its contents, so no
    separate clean-up of checkpoints or class folders is needed beforehand.
    A dataset folder that does not exist is skipped, which makes it safe to
    run this more than once.

    Parameters
    ----------
    parent_dir : str, default './data'
        Directory that contains the dataset folders.
    """
    datasets = ['dataset01', 'dataset02', 'dataset03', 'dataset04', 'dataset05', 'dataset06', 'dataset07', 'dataset08']

    for dataset_name in datasets:
        path = os.path.join(parent_dir, dataset_name)
        if os.path.isdir(path):
            shutil.rmtree(path)
    

In [22]:
""" Count number of images in a folder of a given path """

def count_images_in_folder(root = './data/dataset01', verbose: bool = False):
    """Count the entries in the four class folders of a dataset.

    Every directory entry is counted, so a stray sub-folder such as
    ``.ipynb_checkpoints`` is included in the numbers.

    Parameters
    ----------
    root : str, default './data/dataset01'
        Dataset directory containing ``train/1.0``, ``train/0.0``,
        ``test/1.0`` and ``test/0.0``.
    verbose : bool, default False
        When true, print the per-class counts for train, test and in total.

    Returns
    -------
    tuple of int
        ``(num_train, num_test, num_total)``.

    Raises
    ------
    FileNotFoundError
        If one of the four class folders does not exist.
    """
    num_train_1 = len(os.listdir(root + '/train/1.0'))
    num_train_0 = len(os.listdir(root + '/train/0.0'))
    num_test_1 = len(os.listdir(root + '/test/1.0'))
    num_test_0 = len(os.listdir(root + '/test/0.0'))

    num_train = num_train_1 + num_train_0
    num_test = num_test_1 + num_test_0
    num_total = num_train + num_test

    if verbose:
        print('train 1.0:', num_train_1)
        print('train 0.0:', num_train_0, '\n')
        print('test 1.0:', num_test_1)
        print('test 0.0:', num_test_0, '\n')
        print('total 1.0:', num_train_1 + num_test_1)
        print('total 0.0:', num_train_0 + num_test_0)

    return num_train, num_test, num_total


In [17]:
""" Main method """

# Creates dataset01..dataset08 under ./data and copies the labelled images
# from ./data/all_data into their train/test class folders.
create_datasets('./data')

In [16]:
""" Main method to remove dataset folders """

# Destructive: deletes dataset01..dataset08 under ./data. When the notebook is
# run top to bottom this removes what the previous cell created, so the count
# cell below would no longer find ./data/dataset01.
remove_all_datasets(parent_dir = './data')

In [30]:
# Prints the per-class counts for dataset01; the cell result is the tuple
# (num_train, num_test, num_total).
count_images_in_folder(root='./data/dataset01', verbose=True)

train 1.0: 616
train 0.0: 1353 

test 1.0: 205
test 0.0: 450 

total 1.0: 821
total 0.0: 1803


(1969, 655, 2624)