In [None]:
import ast
import json
import os
import shutil

import cv2
import matplotlib.pyplot as plt
import numpy as np

# Class names in class-index order: the leading integer of a label file is used as an index into this list.
# NOTE(review): a later cell rebinds `names` from b_ds/data.yaml - confirm both lists agree.
names = ['aphanizomenon', 'detritus', 'dolichospermum', 'microcystis', 'oscillatoria', 'water bubble', 'woronichinia']

In [None]:
# Take a one-time backup of the raw dataset so the reset cell can restore it.
# copytree raises FileExistsError when the destination already exists, so the
# guard keeps this cell re-runnable.
if not os.path.exists('b_ds_copy'):
    shutil.copytree('b_ds', 'b_ds_copy')

In [None]:
# reset b_ds from the backup copy

# Refuse to delete the working dataset unless the backup is really there;
# otherwise a failed restore would leave no dataset at all.
if not os.path.isdir('b_ds_copy'):
    raise FileNotFoundError("b_ds_copy not found; refusing to delete b_ds without a backup to restore from")

# b_ds may already be gone (e.g. after an interrupted reset); that is fine.
if os.path.isdir('b_ds'):
    shutil.rmtree('b_ds')
shutil.copytree('b_ds_copy', 'b_ds')

In [None]:
# read data.yaml from b_ds
with open('b_ds/data.yaml', 'r') as f:
    data = f.read()

# remove everything after "roboflow"
data = data.split('roboflow')[0] # this will be the same...

# get all names from data.yaml
names = data.split('names: ')[1]

# convert string names to list
# literal_eval only accepts Python literals, so a tampered data.yaml cannot
# execute code the way eval() would.
names = ast.literal_eval(names.strip())
print(names)

# rewrite train, val, test to go to final_ds/{split}/images instead of ../{split}/images
for split_dir in ('train', 'valid', 'test'):
    data = data.replace(f'../{split_dir}/images', f'final_ds/{split_dir}/images')

print(data)

### Load functions

In [None]:
# Current working directory; later cells build dataset paths by concatenating onto it.
path = os.getcwd()

In [None]:
# remove all background images (and their label files) from b_ds
for split in os.listdir('b_ds'):
    if split not in ('train', 'valid', 'test'):
        continue
    for filename in os.listdir(f'b_ds/{split}/images'):
        if 'background' not in filename:
            continue
        # Use the splitext stem rather than filename[:-4] so extensions that are
        # not exactly three characters long still map to the right label file.
        name, ext = os.path.splitext(filename)
        os.remove(f'b_ds/{split}/images/{filename}')
        label_path = f'b_ds/{split}/labels/{name}.txt'
        # Tolerate a missing label file instead of aborting halfway through.
        if os.path.exists(label_path):
            os.remove(label_path)

# total number of images left across the three splits
print(sum(len(os.listdir(f'b_ds/{split_dir}/images')) for split_dir in ('train', 'valid', 'test')))

In [None]:
# shutil.rmtree('b_ds')
# shutil.copytree('b_ds_copy', 'b_ds')
# clean up roboflow imported dataset

def clean_roboflow_dataset():
    """Rename the Roboflow export in ``b_ds`` to short, class-prefixed file names.

    For every image in the ``train``/``valid``/``test`` splits:

    1. The Roboflow suffix (``_<ext>.rf.<hash>``) is removed from the image and
       from its label file, e.g.
       ``woronichinia-259918842-original_jpg.rf.6aa0...jpg`` becomes
       ``woronichinia-259918842-original.jpg``.
    2. If the shortened name contains none of the class names, the class index
       stored as the first token of the label file is looked up and the class
       name is prepended (``<class>-<name>``) to both the image and the label.

    Names that do not carry a Roboflow suffix are left untouched, so the
    function can be re-run on an already cleaned dataset. Images whose label
    file is missing or empty are shortened but never class-prefixed.

    NOTE(review): two exports of the same source image shorten to the same
    name; os.rename may then replace the earlier file - confirm the dataset
    has no such duplicates.
    """
    names = ['aphanizomenon', 'detritus', 'dolichospermum', 'microcystis', 'oscillatoria', 'water bubble', 'woronichinia']
    for split in os.listdir('b_ds'):
        if split not in ('test', 'train', 'valid'):
            continue
        print(split)
        images_dir = f'b_ds/{split}/images'
        labels_dir = f'b_ds/{split}/labels'

        for image in os.listdir(images_dir):
            stem, ext = os.path.splitext(image)

            # Only strip when the tail after the last underscore is a Roboflow
            # suffix. rfind() returns -1 when there is no underscore, which
            # would otherwise chop the last character off the name, and a plain
            # underscore in an already cleaned name must not be stripped again.
            cut = image.rfind('_')
            if cut != -1 and '.rf.' in image[cut:]:
                new_image = image[:cut] + ext
            else:
                new_image = image
            new_stem = os.path.splitext(new_image)[0]

            if new_image != image:
                os.rename(f'{images_dir}/{image}', f'{images_dir}/{new_image}')
                # Not every image is guaranteed to have a label file.
                if os.path.exists(f'{labels_dir}/{stem}.txt'):
                    os.rename(f'{labels_dir}/{stem}.txt', f'{labels_dir}/{new_stem}.txt')

            lowered = new_image.lower()
            if any(comp_name.lower() in lowered for comp_name in names):
                continue

            # this is for lab images that do not have the name associated to it:
            # retrieve the class from the corresponding txt file
            label_path = f'{labels_dir}/{new_stem}.txt'
            if not os.path.exists(label_path):
                continue
            # Read and close the label before renaming it; renaming an open
            # file fails on some platforms.
            with open(label_path, 'r') as f:
                tokens = f.read().split()
            if not tokens:
                continue

            class_idx = int(tokens[0])
            print(new_image, class_idx)
            os.rename(f'{images_dir}/{new_image}', f'{images_dir}/{names[class_idx]}-{new_image}')
            os.rename(label_path, f'{labels_dir}/{names[class_idx]}-{new_stem}.txt')
# Run the clean-up on b_ds; this renames image and label files in place.
clean_roboflow_dataset()

In [None]:
def check_for_incorrect_labels(class_names=None):
    """Print every ``b_ds`` image whose file name disagrees with its label file.

    An image is consistent when its label file exists, is non-empty, and the
    class selected by the label's first token appears (case-insensitively) in
    the image file name. Everything else is printed as ``<split> <image>``.

    Args:
        class_names: class names in class-index order. Defaults to the
            notebook-level ``names`` list.
    """
    if class_names is None:
        class_names = names

    for split in os.listdir('b_ds'):
        if split not in ('train', 'valid', 'test'):
            continue
        for image in os.listdir(f'b_ds/{split}/images'):
            # splitext instead of image[:-4]: works for any extension length
            stem = os.path.splitext(image)[0]
            label_path = f'b_ds/{split}/labels/{stem}.txt'

            matches = False
            if os.path.exists(label_path):
                # Read the label once per image rather than once per class name.
                with open(label_path, 'r') as f:
                    tokens = f.read().split()
                if tokens:
                    class_idx = int(tokens[0])
                    # An index outside the class list is reported as a mismatch
                    # instead of raising IndexError or wrapping around.
                    if 0 <= class_idx < len(class_names):
                        matches = class_names[class_idx].lower() in image.lower()

            if not matches:
                print(split, image)

# Print every (split, image) pair the check reports.
check_for_incorrect_labels()

In [None]:
def delete_all_folders(base=None):
    """Delete the generated ``combined_ds``, ``org_ds`` and ``final_ds`` folders.

    Each folder is handled on its own, so a folder that does not exist no
    longer prevents the remaining ones from being removed.

    Args:
        base: directory containing the three folders. Defaults to the
            notebook-level ``path``.
    """
    root = path if base is None else base
    for folder in ("combined_ds", "org_ds", "final_ds"):
        target = root + "/" + folder
        if os.path.isdir(target):
            shutil.rmtree(target)
# Best-effort cleanup of earlier pipeline outputs.
try:
    delete_all_folders()
except FileNotFoundError:
    # nothing to clean up (e.g. first run on a fresh checkout)
    pass
except OSError as err:
    # Keep the notebook running, but do not hide real failures such as
    # permission problems.
    print(f"could not delete generated folders: {err}")

In [None]:

# Look for one specific source image and report which split(s) it ended up in.
splits = ['test', 'train', 'valid']

for split in splits:
    for img in os.listdir(f'{path}/b_ds/{split}/images'):
        if "0911191156c" not in img:
            continue
        print(img, split)

In [None]:
def move_to_combined(exclude_classes=('anabaena',), base=None):
    """Copy all split images and labels from ``b_ds`` into a flat ``combined_ds``.

    ``combined_ds`` is recreated from scratch with an ``images`` and a
    ``labels`` sub-folder. A file is skipped when its name contains ANY of the
    excluded class names (previously, with more than one excluded class, a
    file was copied unless it matched all of them).

    Args:
        exclude_classes: substrings identifying classes to leave out; an empty
            collection or None copies everything.
        base: directory containing ``b_ds``. Defaults to the notebook-level
            ``path``.
    """
    root = path if base is None else base
    excluded = tuple(exclude_classes or ())

    combined = root + "/combined_ds"
    if os.path.exists(combined):
        shutil.rmtree(combined)
    for kind in ("images", "labels"):
        os.makedirs(f"{combined}/{kind}")

    for split in ('test', 'train', 'valid'):
        for kind in ("images", "labels"):
            src_dir = f'{root}/b_ds/{split}/{kind}'
            for filename in os.listdir(src_dir):
                if any(cls in filename for cls in excluded):
                    continue
                shutil.copy(f'{src_dir}/{filename}', f'{combined}/{kind}/{filename}')

In [None]:
def preprocess(useCLAHE = False, grayscale = False):
    """Resize and optionally CLAHE-enhance every ``b_ds`` image into ``combined_ds/images``.

    Each image of the ``train``/``test``/``valid`` splits is read, resized to
    512x512, optionally contrast-enhanced and written to
    ``{path}/combined_ds/images`` under the same file name.

    Args:
        useCLAHE: apply CLAHE (clipLimit=4, 32x32 tiles). For colour images it
            is applied to the first LAB plane, for grayscale images directly.
        grayscale: read the images as single-channel grayscale.

    When neither option is set there is nothing to transform and the function
    returns without writing anything. (Previously that case either raised
    NameError or wrote whatever a stale global ``image`` happened to hold.)
    Unreadable images are reported and skipped.
    """
    if not (useCLAHE or grayscale):
        return

    read_flag = cv2.IMREAD_GRAYSCALE if grayscale else cv2.IMREAD_COLOR
    clahe = cv2.createCLAHE(clipLimit=4, tileGridSize=(32,32)) if useCLAHE else None

    for split in os.listdir(f'b_ds'):
        if split not in ('train', 'test', 'valid'):
            continue
        for img_path in os.listdir(f'b_ds/{split}/images'):
            image = cv2.imread(f'b_ds/{split}/images/{img_path}', read_flag)
            if image is None:
                # cv2.imread signals an unreadable file by returning None
                print(f"Could not read {img_path}, skipping")
                continue
            image = cv2.resize(image, (512, 512))

            if useCLAHE and grayscale:
                image = clahe.apply(image)
            elif useCLAHE:
                # Conversion chain kept exactly as before so outputs do not change.
                # NOTE(review): cv2.imread is documented to return BGR data, so the
                # leading RGB2BGR swap means LAB is computed on swapped red/blue
                # channels - confirm this is intended.
                lab = cv2.cvtColor(cv2.cvtColor(image, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2LAB)
                lab_planes = list(cv2.split(lab))
                lab_planes[0] = clahe.apply(lab_planes[0])
                lab = cv2.merge(tuple(lab_planes))
                bgr = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)
                image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

            cv2.imwrite(f'{path}/combined_ds/images/{img_path}', image)

In [None]:
def _clahe_background_image(image):
    """Return ``image`` resized to 512x512 with CLAHE applied to its first LAB plane.

    The conversion chain is the one the notebook has always used.
    NOTE(review): cv2.imread is documented to return BGR data, so the leading
    RGB2BGR swap means LAB is computed on swapped red/blue channels - confirm
    this is intended.
    """
    image = cv2.resize(image, (512, 512))
    lab = cv2.cvtColor(cv2.cvtColor(image, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2LAB)
    lab_planes = list(cv2.split(lab))

    clahe = cv2.createCLAHE(clipLimit=4, tileGridSize=(32,32))
    lab_planes[0] = clahe.apply(lab_planes[0])
    lab = cv2.merge(tuple(lab_planes))
    bgr = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)


def organize_to_names(backgrounds=0.1, augment=0):
    """Sort ``combined_ds`` into per-class folders under ``org_ds`` and add background images.

    ``org_ds`` is recreated with one folder per entry of ``names`` plus a
    ``labels`` folder. For every non-empty label in ``combined_ds/labels`` the
    class index on its first line selects the class folder; the matching
    ``.jpg`` and the label are copied unless the image is listed under the
    ``"null"`` key of ``licenses.json``.

    When ``backgrounds > 0`` a random, seeded subset of the lab images in
    ``Bacteria Dataset/Dataset-1`` and ``Dataset-2`` is CLAHE-processed and
    written to ``org_ds/backgrounds`` as ``<name>_background.jpg``. The subset
    size is ``int(int(backgrounds * n) / (1 - backgrounds))`` with ``n`` the
    number of copied images. Earlier versions computed this subset but still
    wrote every lab image.

    Args:
        backgrounds: target fraction of background images, must be below 1.
        augment: currently unused; kept for interface compatibility.

    Raises:
        ValueError: if ``backgrounds`` is 1 or larger.
    """
    if backgrounds >= 1:
        raise ValueError("backgrounds must be a fraction below 1")

    # Images filed under the "null" license key must not be used.
    with open('licenses.json') as f:
        licenses = json.load(f)
    cannot_use = set()
    for imgs in licenses.get('null', {}).values():
        cannot_use.update(imgs)

    # Start from an empty org_ds.
    org_root = path + "/org_ds"
    if os.path.exists(org_root):
        shutil.rmtree(org_root)
    os.mkdir(org_root)
    for name in names:
        os.makedirs(org_root + f"/{name}", exist_ok=True)
    os.makedirs(org_root + "/labels")
    if backgrounds > 0:
        os.makedirs(org_root + "/backgrounds")

    x = 0  # number of images copied into class folders
    for label in os.listdir(path + "/combined_ds/labels"):
        with open(path + f"/combined_ds/labels/{label}") as file:
            first_line = file.readline()
        if len(first_line) == 0:
            print(label, "is empty")
            continue

        real_name = names[int(first_line.split(" ")[0])]
        name, _ = os.path.splitext(label)
        if name in cannot_use:
            print(f"{name} is copyrighted, cannot be used")
            continue

        print(name)
        # assume that the file is already in jpg form
        shutil.copy(path + f"/combined_ds/images/{name}.jpg", org_root + f"/{real_name}/{name}.jpg")
        shutil.copy(path + f"/combined_ds/labels/{label}", org_root + f"/labels/{name}.txt")
        x += 1

    if backgrounds > 0:
        wanted_backgrounds = int(backgrounds * x)/(1-backgrounds)
        print(f"Total number of images: {x}")
        print(f"Total number of backgrounds: {wanted_backgrounds}")

        # Collect every candidate lab image first, then pick the subset.
        candidates = []
        for dataset in ("Dataset-1", "Dataset-2"):
            dataset_dir = path + "/Bacteria Dataset/" + dataset
            # sorted() makes the seeded selection below reproducible;
            # os.listdir returns entries in arbitrary order.
            for folder in sorted(os.listdir(dataset_dir)):
                folder_dir = dataset_dir + "/" + folder
                # Check the entry inside the dataset directory (the old
                # isfile(folder) test looked in the working directory instead),
                # and skip class folders and the FlowCam library.
                if not os.path.isdir(folder_dir) or (folder in names) or folder == "FlowCam Library 2021":
                    continue
                for file in sorted(os.listdir(folder_dir)):
                    if file == ".DS_Store":
                        continue
                    candidates.append((folder_dir, file))

        np.random.seed(0)
        order = np.random.permutation(len(candidates))
        selected = [candidates[i] for i in order[:int(wanted_backgrounds)]]
        print(len(selected))

        for folder_dir, file in selected:
            name, _ = os.path.splitext(file)
            image = cv2.imread(folder_dir + "/" + file)
            if image is None:
                # cv2.imread signals an unreadable file by returning None
                print(f"Could not read {file}, skipping")
                continue
            cv2.imwrite(f'{path}/org_ds/backgrounds/{name}_background.jpg', _clahe_background_image(image))

In [None]:
# Notebook-level copies of the most recent train/val/test file lists.
# They start out empty (and iterable) so cells that loop over them also work
# before a split has been computed.
globftrain = []
# `globfval` is the name the split function assigns and later cells read.
globfval = []
# original spelling, kept so existing references keep working
globfvalid = []
globftest = []

In [None]:
def get_train_val_test_splits(include_backgrounds=False, class_names=None):
    """Split the files of every ``org_ds`` class folder 80/10/10 into train/val/test.

    Each folder is split on its own (stratified by class): its file names are
    sorted, shuffled with a fixed seed, and cut at 80% and 90%. Sorting first
    makes the result reproducible, because ``os.listdir`` returns entries in
    arbitrary order.

    The three resulting arrays are also stored in the notebook-level
    ``globftrain``, ``globfval`` and ``globftest``.

    Args:
        include_backgrounds: also split ``org_ds/backgrounds`` the same way.
        class_names: class folder names. Defaults to the notebook-level
            ``names`` list.

    Returns:
        Tuple ``(ftrain, fval, ftest)`` of numpy arrays of file names.
    """
    if class_names is None:
        class_names = names
    path = os.getcwd()

    def split_folder(folder):
        files = sorted(os.listdir(path + f"/org_ds/{folder}"))
        np.random.seed(0)
        np.random.shuffle(files)
        first_cut = int(len(files)*0.8)
        second_cut = int(len(files)*0.9)
        return files[:first_cut], files[first_cut:second_cut], files[second_cut:]

    train_files, val_files, test_files = [], [], []

    # stratify splitting of data
    print("getting splits")
    for name in class_names:
        train, val, test = split_folder(name)
        train_files.extend(train)
        val_files.extend(val)
        test_files.extend(test)
        print(name, len(train), len(val), len(test))

    if include_backgrounds:
        train, val, test = split_folder("backgrounds")
        train_files.extend(train)
        val_files.extend(val)
        test_files.extend(test)

    # Build the arrays once from plain lists rather than concatenating string
    # arrays onto an empty float array.
    ftrain, fval, ftest = np.array(train_files), np.array(val_files), np.array(test_files)
    print("final lengths after stratified split: ", len(ftrain), len(fval), len(ftest))

    global globftrain
    global globfval
    global globftest
    globftrain = ftrain
    globfval = fval
    globftest = ftest

    return ftrain, fval, ftest

In [None]:
def check_freqs(ftrain, fval, ftest, class_names=None):
    """Print per-class and background file counts for an in-memory split.

    A file counts towards a class when the class name occurs in its file name
    (case-insensitive), and towards the backgrounds when its name contains
    ``"background"``. For each class the count and share per split are
    printed, followed by the class total. The per-split lines are skipped for
    classes (or backgrounds) without any file, and shares of an empty dataset
    are reported as 0.0, so the report never fails with a division by zero.

    Args:
        ftrain, fval, ftest: sequences of file names for the three splits.
        class_names: class names to count. Defaults to the notebook-level
            ``names`` list.
    """
    if class_names is None:
        class_names = names

    splits = [ftrain, fval, ftest]
    splitarr = ['train', 'val', 'test']
    total = len(ftrain) + len(fval) + len(ftest)

    print("printing frequency information")
    print(len(ftrain), len(fval), len(ftest))
    print(len(ftrain)/total if total else 0.0)

    # per-split counts, keyed by class name plus one entry for backgrounds
    freqs = {name: [0, 0, 0] for name in class_names}
    freqs["background"] = [0, 0, 0]

    print("total dataset size: ", total)
    for i, files in enumerate(splits):
        for file in files:
            lowered = file.lower()
            for name in class_names:
                if name.lower() in lowered:
                    freqs[name][i] += 1
            if "background" in file:
                freqs["background"][i] += 1

    for name in class_names:
        class_total = sum(freqs[name])
        if class_total > 0:
            for i, count in enumerate(freqs[name]):
                print(name, splitarr[i], count/class_total, count)

        # print total number of images
        print(f"total number of {name}: ", class_total)

    print("background frequencies")
    background_total = sum(freqs["background"])
    if background_total > 0:
        for i, count in enumerate(freqs["background"]):
            print("background", splitarr[i], count/background_total, count)

    print("total background images", background_total)
    # print proportion of background images
    print("proportion of background images: ", background_total/total if total else 0.0)

In [None]:
def check_freqs_dir(base = "final_ds"):
    """Print per-class and background image counts for a dataset on disk.

    Reads the ``train``, ``valid`` and ``test`` image folders below ``base``.
    A file counts towards a class when the class name occurs in its file name
    (case-insensitive), and towards the backgrounds when its name contains
    ``"background"``. Classes without any file are left out of the per-class
    report.
    """
    splitarr = ['train', 'valid', 'test']
    splits = [os.listdir(f'{base}/{split_name}/images') for split_name in splitarr]

    print("printing frequency information")
    trainlen, vallen, testlen = (len(files) for files in splits)
    print(trainlen, vallen, testlen)
    total = trainlen + vallen + testlen
    print("total dataset size: ", total)

    # per-split counts, keyed by class name plus one entry for backgrounds
    freqs = {name: [0, 0, 0] for name in names}
    freqs["background"] = [0, 0, 0]

    for i, files in enumerate(splits):
        for file in files:
            lowered = file.lower()
            for name in names:
                if name.lower() in lowered:
                    freqs[name][i] += 1
            if "background" in file:
                freqs["background"][i] += 1

    print("background frequencies")
    print(freqs)

    for name in names:
        class_total = sum(freqs[name])
        if class_total == 0:
            continue
        for split_name, count in zip(splitarr, freqs[name]):
            print(name, split_name, count/class_total, count)

        # print total number of images
        print(f"total number of {name}: ", class_total)

    background_total = sum(freqs["background"])
    if background_total != 0:
        for split_name, count in zip(splitarr, freqs["background"]):
            print("background", split_name, count/background_total, count)

    print("total background images", background_total)
    # print proportion of background images
    print("proportion of background images: ", background_total/total)

In [None]:
# final_ds is deleted by the cleanup cell above and only rebuilt further down,
# so skip the report when it is not there instead of failing the run.
if os.path.isdir("final_ds"):
    check_freqs_dir()

In [None]:
def reorganize_to_final(ftrain, fval, ftest):
    """Build ``final_ds`` (train/valid/test with images and labels) from ``org_ds``.

    ``final_ds`` is recreated from scratch. For every file name in a split:

    * if ``org_ds/labels/<stem>.txt`` exists, the label is copied together
      with the image from whichever class folder actually contains it;
    * otherwise, if the file exists in ``org_ds/backgrounds``, only the image
      is copied (backgrounds carry no label).

    The class folder is found by checking the folders on disk rather than by
    guessing from the file name, which pointed at the wrong (or at no) folder
    whenever the name did not contain the labelled class.

    Finally the notebook-level ``data`` string is written to
    ``final_ds/data.yaml``.

    Args:
        ftrain, fval, ftest: file names assigned to train, valid and test.
    """
    files_by_split = {'train': ftrain, 'valid': fval, 'test': ftest}
    org_root = path + "/org_ds"
    final_root = path + "/final_ds"

    # clear existing final ds
    if os.path.exists(final_root):
        shutil.rmtree(final_root)
    os.mkdir(final_root)

    for split, files in files_by_split.items():
        images_dir = final_root + f"/{split}/images"
        labels_dir = final_root + f"/{split}/labels"
        os.makedirs(images_dir)
        os.makedirs(labels_dir)

        for file in files:
            stem = os.path.splitext(file)[0]
            label_src = org_root + f"/labels/{stem}.txt"
            background_src = org_root + f"/backgrounds/{file}"

            if os.path.exists(label_src):
                image_src = None
                for name in names:
                    candidate = org_root + f"/{name}/{file}"
                    if os.path.exists(candidate):
                        image_src = candidate
                        break
                if image_src is None:
                    print(f"{file} has a label but no image in any class folder, skipping")
                    continue
                shutil.copy(label_src, labels_dir + f"/{stem}.txt")
                shutil.copy(image_src, images_dir + f"/{file}")
            elif os.path.exists(background_src):
                shutil.copy(background_src, images_dir + f"/{file}")

    # add data.yaml file
    with open(final_root + "/data.yaml", "w") as file:
        file.write(data)

In [None]:
# Scratch check: take the hash part of a Roboflow-style name and show its last
# ten characters.
s = "aphanizomenon-88539393-original_jpg.rf.9fa1e6ed949dae36fb066abackground.jpg"
secondPart = s.rsplit(".", 2)[1]
print(secondPart[-10:])

In [None]:
def rebalance_dataset(**kwargs):
    """Rebuild ``combined_ds``, ``org_ds`` and ``final_ds`` from ``b_ds`` and zip the result.

    Steps: delete earlier outputs, copy everything into ``combined_ds``,
    preprocess, sort into class folders, compute the stratified split, build
    ``final_ds`` and archive it as ``final_ds.zip``.

    Keyword Args:
        useCLAHE (bool): forwarded to ``preprocess``. Defaults to False.
        augment (int): forwarded to ``organize_to_names``. Defaults to 0.
        backgrounds (float): forwarded to ``organize_to_names``; backgrounds
            are included in the split when it is above 0. Defaults to 0.1.
    """
    print("deleting folders")
    # Remove each output folder independently and best-effort: a folder that
    # is missing must not stop the others from being deleted.
    for folder in ("combined_ds", "org_ds", "final_ds"):
        shutil.rmtree(path + "/" + folder, ignore_errors=True)

    useCLAHE = kwargs.get('useCLAHE', False)
    augmentImages = kwargs.get('augment', 0)
    backgrounds = kwargs.get('backgrounds', 0.1)

    print("moving everything to combined")
    move_to_combined()

    print("preprocessing")
    preprocess(useCLAHE = useCLAHE)

    print("organizing to names")
    organize_to_names(backgrounds, augment=augmentImages)

    ftrain, fval, ftest = get_train_val_test_splits(include_backgrounds = (backgrounds > 0))
    # check_freqs(ftrain, fval, ftest)
    print(fval)
    print(ftest)

    reorganize_to_final(ftrain, fval, ftest)
    # send to zip
    shutil.make_archive("final_ds", 'zip', path + "/final_ds")

In [None]:
# 030322Planktothrix_GHT3_031122-40X_jpg.rf.17da996a5f726ba5d2b9ffe62258c547.jpg
# Rebuild the whole dataset with CLAHE preprocessing and a 0.1 background fraction.
rebalance_dataset(useCLAHE=True, backgrounds=0.1)
# print(len(os.listdir(path + "/final_ds/train/images")))
# print(len(os.listdir(path + "/final_ds/valid/images")))
# print(len(os.listdir(path + "/final_ds/test/images")))

In [None]:
# Report every file in final_ds whose name does not end in .jpg.
# (The old version printed every file first, which buried the offenders, and
# accepted ".jpg" anywhere in the name.)
for split in os.listdir('final_ds'):
    if split not in ('train', 'test', 'valid'):
        continue
    for file in os.listdir(f'final_ds/{split}/images'):
        if not file.endswith(".jpg"):
            print(file)

In [None]:
# Number of entries in images/aphanizomenon.
# NOTE(review): no cell shown here creates an `images/` folder - confirm it exists before running.
print(len(os.listdir('images/aphanizomenon')))

In [None]:
# print(len(os.listdir(path + "/final_ds/train/labels")))
# print(len(os.listdir(path + "/final_ds/valid/labels")))
# print(len(os.listdir(path + "/final_ds/test/labels")))

# check if all labels have corresponding images
# For each split, print a label's file name once if the label is empty and
# once if no .jpg with the same stem exists next to it.
for split_name in ("train", "valid", "test"):
    labels_dir = path + f"/final_ds/{split_name}/labels"
    images_dir = path + f"/final_ds/{split_name}/images"
    for label in os.listdir(labels_dir):
        with open(f"{labels_dir}/{label}") as file:
            label_is_empty = len(file.readlines()) == 0
        if label_is_empty:
            print(label)
        if not os.path.exists(f"{images_dir}/{label[:-4]}.jpg"):
            print(label)

In [None]:
# Frequency report for the function's default base directory.
check_freqs_dir()

In [None]:
# total number of images across the three final_ds splits
print(sum(len(os.listdir(path + f"/final_ds/{split_name}/images")) for split_name in ("train", "valid", "test")))
# print(len(os.listdir(path + "/final_ds/valid/images")))
# print(len(os.listdir(path + "/final_ds/test/images")))

In [None]:
# find number of images from Bacteria Dataset folder that are in train, val, test of final_ds
# Per split, count the file names that contain "default".
dict_trainvaltest = {"train": 0, "val": 0, "test": 0}

for split_key, split_files in (("train", globftrain), ("val", globfval), ("test", globftest)):
    dict_trainvaltest[split_key] += sum(1 for file in split_files if "default" in file)

In [None]:
# Show the per-split counts stored in dict_trainvaltest.
print(dict_trainvaltest)

In [None]:
# compare CLAHE vs no CLAHE
# Load one training image as grayscale and apply CLAHE with two tile sizes.
image = cv2.imread(f'b_ds/train/images/113466794-medium_jpeg.rf.2d09325e3a8b7a92660be0f29ed856f7.jpg', cv2.IMREAD_GRAYSCALE) # need to convert to grayscale
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
clahe_grid = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(16,16))
clahe_image = clahe.apply(image)
clahe_grid_image = clahe_grid.apply(image)

# display original and CLAHE-ed images side by side
fig, ax = plt.subplots(1, 3, figsize=(15, 5))
panels = [('Original', image), ('CLAHE', clahe_image), ('CLAHE Grid', clahe_grid_image)]
for axis, (panel_title, panel_image) in zip(ax, panels):
    axis.imshow(panel_image, cmap='gray')
    axis.set_title(panel_title)
plt.show()


# save CLAHE-ed image

In [None]:
# number of images per final_ds split, in train / valid / test order
for split_name in ("train", "valid", "test"):
    print(len(os.listdir(path + f"/final_ds/{split_name}/images")))