## Import packages

In [1]:
# Install the packages listed in the project's requirements.txt. `%pip` (rather
# than `!pip`) installs into the environment of the running kernel.
%pip install -r /workspace/SpheroidAnalyzer/requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import os
# Work from the project root so that the relative paths used by later cells
# (e.g. 'inputs/RawImages') resolve inside the project.
# NOTE(review): the absolute path is specific to this workspace - adjust it when
# running the notebook elsewhere.
os.chdir('/workspace/SpheroidAnalyzer')
print("You set a new current directory")
# Read the directory back only after the change; the value is displayed as the
# cell output to confirm that the change took effect.
current_dir = os.getcwd()
current_dir

You set a new current directory


'/workspace/SpheroidAnalyzer'

## Install Kaggle

In [3]:
# Install the Kaggle package, pinned to version 1.5.12, into the kernel's
# environment; the `kaggle datasets download` command further down relies on it.
%pip install kaggle==1.5.12

Note: you may need to restart the kernel to use updated packages.


In [6]:
# Tell the Kaggle client to look for its configuration in the current working
# directory.
# NOTE(review): this assumes the working directory is /workspace/SpheroidAnalyzer
# (the folder holding kaggle.json, see the chmod below) - confirm the chdir cell ran first.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
# Mode 600 = read/write for the owner only, so the API token is not readable by
# other users.
! chmod 600 /workspace/SpheroidAnalyzer/kaggle.json

## Set the Kaggle dataset and download it

In [9]:
# Dataset identifier handed to `kaggle datasets download -d` in the download cell.
KaggleDatasetPath = 'andrgndel/spheroidtestset'
# Target folder for the download (`-p`), relative to the current working directory.
DestinationFolder = 'inputs/RawImages'

Downloading spheroidtestset.zip to /workspace/SpheroidAnalyzer
100%|███████████████████████████████████████▉| 135M/135M [00:03<00:00, 44.6MB/s]
100%|████████████████████████████████████████| 135M/135M [00:03<00:00, 36.2MB/s]


In [21]:
# Download the dataset archive into DestinationFolder. The {…} placeholders are
# filled in by IPython with the values of the Python variables before the shell
# command runs.
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Downloading spheroidtestset.zip to inputs/RawImages
 95%|██████████████████████████████████████▏ | 129M/135M [00:03<00:00, 30.4MB/s]
100%|████████████████████████████████████████| 135M/135M [00:04<00:00, 34.9MB/s]


In [22]:
import zipfile
# Unpack the downloaded archive into the folder it was downloaded to, then
# delete the archive so that only the extracted files are left behind.
archive_path = DestinationFolder + '/spheroidtestset.zip'
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(DestinationFolder)

os.remove(archive_path)

## Check for and remove non-image files with the function `remove_non_image_file`

In [23]:
def remove_non_image_file(my_data_dir, image_extension=('.tif',)):
    """Delete every non-image file found directly inside the sub-folders of ``my_data_dir``.

    A file counts as an image when its name ends, case-insensitively, with one
    of ``image_extension``. For each sub-folder the number of image files kept
    and the number of non-image files removed are printed.

    Parameters
    ----------
    my_data_dir : str
        Directory whose immediate sub-folders are scanned.
    image_extension : str or iterable of str, optional
        Extension(s) treated as images. Defaults to ``('.tif',)``.

    Returns
    -------
    None

    Notes
    -----
    Entries of ``my_data_dir`` that are not directories, and directories nested
    inside a scanned sub-folder, are left untouched: ``os.listdir`` and
    ``os.remove`` would raise on them.
    """
    # str.endswith accepts a str or a tuple of str only; normalise the argument
    # so a bare string or a list works, and lower-case it because file names
    # are lower-cased before the comparison.
    if isinstance(image_extension, str):
        image_extension = (image_extension,)
    extensions = tuple(extension.lower() for extension in image_extension)

    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)
        if not os.path.isdir(folder_path):
            # A stray file next to the data folders cannot be listed.
            continue

        image_count = 0
        removed_count = 0
        for given_file in os.listdir(folder_path):
            file_location = os.path.join(folder_path, given_file)
            if not os.path.isfile(file_location):
                # os.remove cannot delete a directory; skip nested folders.
                continue
            if given_file.lower().endswith(extensions):
                image_count += 1
            else:
                os.remove(file_location)  # remove non image file
                removed_count += 1

        print(f"Folder: {folder} - has image file", image_count)
        print(f"Folder: {folder} - has non-image file", removed_count)
        

In [24]:
# Clean every sub-folder of 'inputs' (at this point: RawImages).
remove_non_image_file(my_data_dir='inputs')

Folder: RawImages - has image file 32
Folder: RawImages - has non-image file 0


## Split Data and get Labels

In [61]:
import os
import shutil
import random
import joblib
import math

# define function
def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio,
                                       seed=None):
    """Shuffle the raw images and move them into train / validation / test folders.

    Every entry of ``<my_data_dir>/RawImages`` is labelled from the two
    characters that precede its file extension: ``23`` -> ``alive`` (code 1),
    ``24`` -> ``dead`` (code 2), anything else -> ``unknown`` (code 0).
    Labelled files are moved to ``<my_data_dir>/<subset>/<label>/``; unlabelled
    files always go to ``<my_data_dir>/test/unknown/``.

    The subset sizes are computed from the total number of entries. Files with
    an unknown label are charged to the test quota first, then to the
    validation quota and finally to the train quota, so the three subsets
    always add up to the number of entries.

    Parameters
    ----------
    my_data_dir : str
        Directory that contains the ``RawImages`` folder and receives the
        ``train``, ``validation`` and ``test`` folders.
    train_set_ratio, validation_set_ratio, test_set_ratio : float
        Fractions of the data for each subset; they must sum to 1.0 (within
        floating-point tolerance).
    seed : int, optional
        When given, the shuffle uses its own generator seeded with this value,
        which makes the split reproducible. When omitted, the module-level
        ``random`` generator is used.

    Returns
    -------
    None
        The total number of entries and the list of label codes (in shuffled
        order) are printed. If the ratios do not sum to 1.0 a message is
        printed and nothing is moved.
    """
    # Exact equality is wrong for floats: 0.6 + 0.3 + 0.1 evaluates to
    # 0.9999999999999999 and would be rejected.
    ratio_sum = train_set_ratio + validation_set_ratio + test_set_ratio
    if not math.isclose(ratio_sum, 1.0, rel_tol=1e-9):
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    raw_dir = os.path.join(my_data_dir, 'RawImages')
    # os.listdir returns entries in arbitrary order; sort first so that a given
    # seed always produces the same shuffle.
    fileList = sorted(os.listdir(raw_dir))
    n = len(fileList)
    ntrain = math.floor(n*train_set_ratio)
    nval = math.floor(n*validation_set_ratio)
    ntest = n - ntrain - nval

    # create train, validation and test folders with one sub-folder per label
    for folder in ('train', 'validation', 'test'):
        for label in ('alive', 'dead'):
            os.makedirs(os.path.join(my_data_dir, folder, label), exist_ok=True)
    os.makedirs(os.path.join(my_data_dir, 'test', 'unknown'), exist_ok=True)

    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(fileList)

    # 23 means class 1 (alive) and 24 means class 2 (dead)
    folder_by_suffix = {'23': 'alive', '24': 'dead'}
    code_by_folder = {'unknown': 0, 'alive': 1, 'dead': 2}

    # First pass: classify only. Deciding the quotas needs the number of
    # unknown files, which is not known while the files are still being walked.
    lblList = []
    labelled_files = []
    unknown_files = []
    print(n)
    for file in fileList:
        stem = os.path.splitext(file)[0]
        lblFolder = folder_by_suffix.get(stem[-2:], 'unknown')
        lblList.append(code_by_folder[lblFolder])
        if lblFolder == 'unknown':
            unknown_files.append(file)
        else:
            labelled_files.append((file, lblFolder))

    # Unknown files use up the test quota; whatever does not fit is taken from
    # the validation quota and then from the train quota.
    overflow = max(len(unknown_files) - ntest, 0)
    taken_from_validation = min(overflow, nval)
    nval -= taken_from_validation
    ntrain = max(ntrain - (overflow - taken_from_validation), 0)

    for file in unknown_files:
        shutil.move(os.path.join(raw_dir, file),
                    os.path.join(my_data_dir, 'test', 'unknown', file))

    # Second pass: only labelled files advance the position, so an unknown file
    # can no longer use up a train or validation slot.
    for position, (file, lblFolder) in enumerate(labelled_files):
        if position < ntrain:
            subset = 'train'
        elif position < ntrain + nval:
            subset = 'validation'
        else:
            subset = 'test'
        shutil.move(os.path.join(raw_dir, file),
                    os.path.join(my_data_dir, subset, lblFolder, file))

    print(lblList)

In [62]:
# Split the images under 'inputs' into 70 % train, 10 % validation and 20 % test.
split_train_validation_test_images(
    my_data_dir="inputs",
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2,
)

8
[2, 1, 1, 1, 0, 1, 2, 2]
