In [None]:
import os
import shutil
import numpy as np

### Load functions

In [None]:
# Base directory under which the functions below read and write the dataset
# folders (b_ds, combined_ds, org_ds, final_ds). It is captured once, from the
# kernel's current working directory, at the moment this cell runs.
path = os.getcwd()

In [None]:
def move_to_combined():
    """Flatten the split dataset in ``b_ds`` into a single ``combined_ds`` folder.

    Any existing ``combined_ds`` directory under the module-level ``path`` is
    deleted and recreated with empty ``images`` and ``labels`` sub-folders.
    Every file found in ``b_ds/<split>/images`` and ``b_ds/<split>/labels``
    (for the ``test``, ``train`` and ``valid`` splits, in that order) is then
    copied across under its original file name, so a name that occurs in more
    than one split ends up as the copy from the last split processed.
    """
    combined_root = f"{path}/combined_ds"

    # Start from a clean slate so files from a previous run cannot linger.
    if os.path.exists(combined_root):
        shutil.rmtree(combined_root)
    for folder in (combined_root, f"{combined_root}/images", f"{combined_root}/labels"):
        os.mkdir(folder)

    for split_name in ('test', 'train', 'valid'):
        for kind in ('images', 'labels'):
            source_dir = f"{path}/b_ds/{split_name}/{kind}"
            for filename in os.listdir(source_dir):
                shutil.copy(f"{source_dir}/{filename}", f"{combined_root}/{kind}/{filename}")

In [None]:
def organize_to_names(base_dir=None):
    """Sort the combined dataset into one folder per class under ``org_ds``.

    Each ``.txt`` file in ``combined_ds/labels`` is classified by the integer
    class id at the start of its first line. The matching image
    (``combined_ds/images/<stem>.jpg``) is copied to
    ``org_ds/<class name>/<class name>_<k>.jpg`` and the label file to
    ``org_ds/labels/<class name>_<k>.txt``, where ``k`` counts up from 1 per
    class. All ``org_ds/<class name>`` folders and ``org_ds/labels`` are
    deleted and recreated first.

    Label files are processed in sorted order so the numbering is the same on
    every run and every filesystem. A label is skipped (and does not consume a
    number) when its first line has no integer class id, when the id is
    outside ``0 .. len(names) - 1``, or when the matching ``.jpg`` is missing.
    Any other I/O error is raised instead of being silently ignored.

    Args:
        base_dir: Directory containing ``combined_ds`` and receiving
            ``org_ds``. Defaults to the module-level ``path``.

    Prints the number of image/label pairs copied and, if any labels were
    skipped, a second line reporting how many.
    """
    names = ['anabaena', 'aphanizomenon', 'detritus', 'dolichospermum', 'microcystis', 'oscillatoria', 'synechococcus', 'water bubble', 'woronichinia']
    root = path if base_dir is None else base_dir
    images_dir = f"{root}/combined_ds/images"
    labels_dir = f"{root}/combined_ds/labels"
    org_root = f"{root}/org_ds"

    per_class_counts = [0] * len(names)
    copied = 0
    skipped = 0

    # Recreate every output folder so results of a previous run cannot linger.
    for folder in names + ["labels"]:
        target = f"{org_root}/{folder}"
        if os.path.exists(target):
            shutil.rmtree(target)
        os.makedirs(target)

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # <class>_<k> numbering reproducible.
    for label in sorted(os.listdir(labels_dir)):
        stem, extension = os.path.splitext(label)
        if extension.lower() != ".txt":
            # Not a label file (e.g. a stray OS metadata file).
            continue

        with open(f"{labels_dir}/{label}") as file:
            fields = file.readline().split()

        try:
            class_id = int(fields[0])
        except (IndexError, ValueError):
            # Empty label file or a first line that does not start with an integer.
            skipped += 1
            continue

        # Reject negative ids explicitly: names[-1] would otherwise silently
        # file the sample under the last class.
        if not 0 <= class_id < len(names):
            skipped += 1
            continue

        image_source = f"{images_dir}/{stem}.jpg"
        if not os.path.isfile(image_source):
            # Check before counting so a missing image leaves no gap in the numbering.
            skipped += 1
            continue

        real_name = names[class_id]
        per_class_counts[class_id] += 1
        new_stem = f"{real_name}_{per_class_counts[class_id]}"

        shutil.copy(image_source, f"{org_root}/{real_name}/{new_stem}.jpg")
        shutil.copy(f"{labels_dir}/{label}", f"{org_root}/labels/{new_stem}.txt")
        copied += 1

    print(copied)
    if skipped:
        print(f"skipped {skipped} label file(s) without a usable class id or matching image")

In [None]:
def get_train_val_test_splits(base_dir=None, seed=42):
    """Split the files of every class folder in ``org_ds`` 80/10/10.

    The split is stratified: each class folder is shuffled and cut on its own
    (first 80% train, next 10% validation, remainder test, with the cut
    points truncated to integers), and the per-class pieces are concatenated
    in class order.

    The directory listing is sorted before shuffling, because ``os.listdir``
    returns names in arbitrary order and the seed alone would not make the
    split reproducible. Shuffling uses a private ``RandomState`` that is
    re-created from ``seed`` for every class, so the global ``np.random``
    state is left untouched.

    Args:
        base_dir: Directory containing ``org_ds``. Defaults to the current
            working directory.
        seed: Seed for the per-class shuffle.

    Returns:
        Tuple ``(ftrain, fval, ftest)`` of 1-D NumPy string arrays holding
        file names (not full paths).

    Prints the three resulting lengths.
    """
    names = ['anabaena', 'aphanizomenon', 'detritus', 'dolichospermum', 'microcystis', 'oscillatoria', 'synechococcus', 'water bubble', 'woronichinia']
    root = os.getcwd() if base_dir is None else base_dir

    train_parts, val_parts, test_parts = [], [], []
    for name in names:
        class_files = sorted(os.listdir(f"{root}/org_ds/{name}"))
        # Same permutation the legacy np.random.seed(seed) + np.random.shuffle
        # pair produces for this input, without touching global RNG state.
        np.random.RandomState(seed).shuffle(class_files)

        count = len(class_files)
        # dtype=str keeps empty class folders as (empty) string arrays.
        train, val, test = np.split(
            np.array(class_files, dtype=str),
            [int(count * 0.8), int(count * 0.9)],
        )
        train_parts.append(train)
        val_parts.append(val)
        test_parts.append(test)

    ftrain = np.concatenate(train_parts)
    fval = np.concatenate(val_parts)
    ftest = np.concatenate(test_parts)
    print("final lengths after stratified split: ", len(ftrain), len(fval), len(ftest))

    return ftrain, fval, ftest

In [None]:
def check_freqs(ftrain, fval, ftest):
    """Print per-class file counts for the train, validation and test splits.

    Output, in order: a header line, the three split sizes, the fraction of
    all files that are in the training split, and then for every split the
    split itself, one ``<class name> <count>`` line per class and the sum of
    those counts.

    Args:
        ftrain: File names in the training split.
        fval: File names in the validation split.
        ftest: File names in the test split.
    """
    names = ['anabaena', 'aphanizomenon', 'detritus', 'dolichospermum', 'microcystis', 'oscillatoria', 'synechococcus', 'water bubble', 'woronichinia']
    sizes = (len(ftrain), len(fval), len(ftest))

    print("printing frequency information")
    print(*sizes)
    print(sizes[0] / sum(sizes))

    for split in (ftrain, fval, ftest):
        print("split", split)
        # A file counts towards a class when the class name occurs anywhere in
        # its file name.
        per_class = [
            sum(1 for filename in split if class_name in filename)
            for class_name in names
        ]
        for class_name, count in zip(names, per_class):
            print(class_name, count)
        print(sum(per_class))

In [None]:
def reorganize_to_final(ftrain, fval, ftest):
    """Materialise the train/valid/test splits as a ``final_ds`` folder.

    Deletes any existing ``final_ds`` under the module-level ``path`` and
    recreates it with ``train``, ``valid`` and ``test`` sub-folders, each
    holding ``images`` and ``labels``. For every file name in a split, the
    image is copied from ``org_ds/<class>/`` (the class being the part of the
    name before the first underscore) and the label from ``org_ds/labels/``
    (same name with its last four characters replaced by ``.txt``). Finally a
    ``data.yaml`` describing the three image folders and the nine class names
    is written into ``final_ds``.

    Args:
        ftrain: Image file names for the training split.
        fval: Image file names for the validation split.
        ftest: Image file names for the test split.
    """
    final_root = path + "/final_ds"
    files_by_split = {'train': ftrain, 'valid': fval, 'test': ftest}

    # Clear the output of any previous run.
    if os.path.exists(final_root):
        shutil.rmtree(final_root)
    os.mkdir(final_root)

    for split_name, filenames in files_by_split.items():
        split_root = f"{final_root}/{split_name}"
        for folder in (split_root, f"{split_root}/images", f"{split_root}/labels"):
            os.mkdir(folder)

        for filename in filenames:
            class_name = filename.split("_")[0]
            stem = filename[:-4]  # file name without its 4-character extension
            shutil.copy(f"{path}/org_ds/{class_name}/{filename}", f"{split_root}/images/{filename}")
            shutil.copy(f"{path}/org_ds/labels/{stem}.txt", f"{split_root}/labels/{stem}.txt")

    # Dataset description file; the last line is written without a trailing newline.
    yaml_lines = [
        "train: final_ds/train/images",
        "test: final_ds/test/images",
        "val: final_ds/valid/images",
        "nc: 9",
        "names: ['anabaena', 'aphanizomenon', 'detritus', 'dolichospermum', 'microcystis', 'oscillatoria', 'synechococcus', 'water bubble', 'woronichinia']",
    ]
    with open(final_root + "/data.yaml", "w") as file:
        file.write("\n".join(yaml_lines))

In [None]:
def rebalance_dataset():
    """Run the whole re-splitting pipeline and zip the result.

    Steps: merge the ``b_ds`` splits into ``combined_ds``, sort the samples
    into per-class folders in ``org_ds``, compute the stratified
    train/validation/test split, print the validation and test file names,
    write ``final_ds`` and finally pack ``final_ds`` into a ``final_ds`` zip
    archive.
    """
    move_to_combined()
    organize_to_names()

    train_files, valid_files, test_files = get_train_val_test_splits()
    for held_out in (valid_files, test_files):
        print(held_out)

    reorganize_to_final(train_files, valid_files, test_files)

    # Archive name is relative, so the zip lands in the current working directory.
    shutil.make_archive("final_ds", 'zip', path + "/final_ds")

In [None]:
# Run the full pipeline: rebuilds combined_ds, org_ds and final_ds under `path`
# (deleting any previous copies) and writes the final_ds zip archive.
rebalance_dataset()

In [None]:
# Number of images written to each split of final_ds (train, valid, test).
for split_name in ("train", "valid", "test"):
    print(len(os.listdir(path + f"/final_ds/{split_name}/images")))

In [None]:
# The split arrays only ever exist as locals inside rebalance_dataset(), so
# they are not defined at notebook level on a fresh kernel. Recompute them
# here from org_ds (the shuffle uses a fixed seed) before reporting the
# per-class frequencies.
ftrain, fval, ftest = get_train_val_test_splits()
check_freqs(ftrain, fval, ftest)