In [14]:
import requests
import json
import os
import cv2
from PIL import Image
import shutil
import pandas as pd
import random
import numpy as np

In [None]:
## Module-level globals shared by the cells below.
# Working directory of the kernel process at the time this cell runs.
path = os.getcwd()
# Placeholder string. NOTE(review): get_initial_classes builds and returns its
# own local `data`; no visible cell assigns the result back here — confirm.
data = ""
# Class-name list read by check_for_incorrect_labels. NOTE(review): no visible
# cell fills it, so it stays empty unless assigned elsewhere — confirm.
names = []

#### This notebook assumes the dataset is laid out with the following directory structure
dataset

|--> train
<br>
    |--> images
    <br>
    |--> labels

|--> valid
<br>
    |--> images
    <br>
    |--> labels

|--> test
<br>
    |--> images
    <br>
    |--> labels

|--> data.yaml

#### 1. rename all files in the dataset to consistent names

#### 2. split samples based on how many objects there are per image (try to approximate as well as possible)

#### 3. create data.yaml file


# Functions

In [10]:
def get_initial_classes(new_base='final_ds', old_base='ds'):
    """Back up the source dataset and return its data.yaml text re-pointed at ``new_base``.

    A copy of ``old_base`` is written to ``{old_base}_copy`` (replacing any
    previous copy). ``{old_base}/data.yaml`` is then read, everything from the
    first occurrence of ``roboflow`` onwards is dropped, the class list after
    ``names: `` is parsed and printed, and the ``../{split}/images`` paths are
    rewritten to ``{new_base}/{split}/images``.

    Args:
        new_base: Directory the rewritten train/valid/test paths point to.
        old_base: Directory of the source dataset containing ``data.yaml``.

    Returns:
        The rewritten data.yaml text (str).

    Raises:
        FileNotFoundError: If ``old_base`` or its ``data.yaml`` is missing.
        IndexError: If the text contains no ``names: `` entry.
        ValueError, SyntaxError: If the class list is not a Python literal.

    Note:
        The parsed class list is only printed; it is not stored in the
        module-level ``names`` global.
    """
    import ast  # local import: only needed here to parse the class list safely

    # Refresh the backup. Only a stale backup is removed -- the source dataset
    # itself must survive, because data.yaml is read from it right below.
    backup_dir = f'{old_base}_copy'
    if os.path.isdir(backup_dir):
        shutil.rmtree(backup_dir)
    shutil.copytree(old_base, backup_dir)

    # read data.yaml from {old_base}
    with open(f'{old_base}/data.yaml', 'r') as f:
        data = f.read()

    # remove everything after "roboflow"
    data = data.split('roboflow')[0]

    # Parse the inline list after "names: ". literal_eval accepts only Python
    # literals, so text from the file can never be executed as code.
    names = ast.literal_eval(data.split('names: ')[1].strip())
    print(names)

    # rewrite train, val, test to go to {new_base}/{split}/images instead of ../{split}/images
    for split in ('train', 'valid', 'test'):
        data = data.replace(f'../{split}/images', f'{new_base}/{split}/images')

    print('data: ', data)

    return data

def remove_existing_backgrounds(old_base='ds', symbol='background'):
    """Delete every image whose filename contains ``symbol``, plus its label file.

    Only the ``train``, ``valid`` and ``test`` sub-directories of ``old_base``
    are visited; splits that do not exist are skipped. After the clean-up the
    total number of images left in the visited splits is printed.

    Args:
        old_base: Dataset root containing ``{split}/images`` and ``{split}/labels``.
        symbol: Substring that marks a filename as a background image.
    """
    remaining = 0
    for split in os.listdir(old_base):
        if split not in ('train', 'valid', 'test'):
            continue

        images_dir = f'{old_base}/{split}/images'
        labels_dir = f'{old_base}/{split}/labels'
        for filename in os.listdir(images_dir):
            if symbol not in filename:
                remaining += 1
                continue

            first_part, _ = os.path.splitext(filename)
            os.remove(f'{images_dir}/{filename}')

            # A background image may have no label file; a missing label must
            # not abort the clean-up after the image is already gone.
            label_path = f'{labels_dir}/{first_part}.txt'
            if os.path.exists(label_path):
                os.remove(label_path)

    # Counted while iterating so an absent split cannot crash the summary.
    print(remaining)


def clean_roboflow_dataset(old_base='ds', names=None):
    """Strip the Roboflow suffix from image/label filenames and prefix unnamed ones.

    For every image in the ``train``/``valid``/``test`` splits of ``old_base``:

    1. Everything from the last underscore of the filename onwards is dropped
       and the original extension re-attached; the label file is renamed to
       match. Files without an underscore keep their name.
    2. If the resulting filename contains none of the class names, the class
       index at the start of the label file is looked up in ``names`` and the
       image and label are renamed to ``{class}-{filename}``.

    Args:
        old_base: Dataset root containing ``{split}/images`` and ``{split}/labels``.
        names: Class names indexed by label id. Defaults to the built-in list.

    Raises:
        IndexError: If a label's class index is outside ``names``.
        ValueError: If a label file does not start with an integer.

    Note:
        Not idempotent: a second run strips at the last underscore again.
    """
    if names is None:
        names = ['aphanizomenon', 'detritus', 'dolichospermum', 'microcystis', 'oscillatoria', 'water bubble', 'woronichinia']

    for split in os.listdir(old_base):
        if split not in ('test', 'train', 'valid'):
            continue
        print(split)

        images_dir = f'{old_base}/{split}/images'
        labels_dir = f'{old_base}/{split}/labels'

        for image in os.listdir(images_dir):
            image_name, ext = os.path.splitext(image)

            # rfind returns -1 when there is no underscore; slicing with it
            # would silently chop the last character, so keep the name as is.
            cut = image.rfind('_')
            new_image_name = image[:cut] if cut > 0 else image_name
            new_image = new_image_name + ext

            if new_image != image:
                # Several exports of one source image collapse to the same
                # clean name; os.rename would overwrite the earlier one.
                if os.path.exists(f'{images_dir}/{new_image}'):
                    print(f'skipping {images_dir}/{image}: {new_image} already exists')
                    continue

                os.rename(f'{images_dir}/{image}', f'{images_dir}/{new_image}')
                old_label = f'{labels_dir}/{image_name}.txt'
                if os.path.exists(old_label):
                    os.rename(old_label, f'{labels_dir}/{new_image_name}.txt')

            if any(comp_name.lower() in new_image.lower() for comp_name in names):
                continue

            # this is for lab images that do not have the name associated to it:
            # retrieve the class from the corresponding txt file
            label_path = f'{labels_dir}/{new_image_name}.txt'
            if not os.path.exists(label_path):
                continue

            with open(label_path, 'r') as f:
                tokens = f.read().split()
            if not tokens:
                continue

            # Rename only after the file is closed; renaming an open file
            # fails on Windows.
            class_name = names[int(tokens[0])]
            os.rename(f'{images_dir}/{new_image}', f'{images_dir}/{class_name}-{new_image}')
            os.rename(label_path, f'{labels_dir}/{class_name}-{new_image_name}.txt')

def generate_CLAHE(image_path, output_path, grayscale = False, size=(512, 512), clip_limit=4, tile_grid_size=(32, 32)):
    """Resize an image, apply CLAHE contrast equalisation and write the result.

    In colour mode the image is converted to LAB, CLAHE is applied to the
    first (lightness) plane only, and the image is converted back before
    writing. In grayscale mode the image is loaded as a single channel and
    CLAHE is applied to it directly.

    Args:
        image_path: Path of the image to read.
        output_path: Path the processed image is written to (may equal ``image_path``).
        grayscale: Load and process the image as single-channel grayscale.
        size: Size passed to ``cv2.resize``.
        clip_limit: ``clipLimit`` passed to ``cv2.createCLAHE``.
        tile_grid_size: ``tileGridSize`` passed to ``cv2.createCLAHE``.

    Raises:
        ValueError: If ``image_path`` cannot be read as an image.
    """
    if grayscale:
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    else:
        image = cv2.imread(image_path)

    # imread signals failure by returning None instead of raising; fail here
    # with the offending path rather than deep inside cv2.resize.
    if image is None:
        raise ValueError(f'could not read image: {image_path}')

    image = cv2.resize(image, size)
    clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)

    if grayscale:
        image = clahe.apply(image)
    else:
        # NOTE(review): imread already returns BGR, so the RGB2BGR here swaps
        # red/blue before the LAB conversion and BGR2RGB swaps them back at the
        # end. Channel order on disk is unaffected, but lightness is computed
        # with red/blue exchanged. Kept as is so already-processed datasets
        # stay pixel-compatible -- confirm whether this is intended.
        lab = cv2.cvtColor(cv2.cvtColor(image, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2LAB)
        lab_planes = list(cv2.split(lab))
        lab_planes[0] = clahe.apply(lab_planes[0])
        lab = cv2.merge(tuple(lab_planes))
        bgr = cv2.cvtColor(lab, cv2.COLOR_LAB2BGR)
        image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

    cv2.imwrite(output_path, image)

def preprocess(useCLAHE = False, grayscale = False, base = 'final_ds'):
    """Resize every image of the dataset to 512x512 in place, optionally with CLAHE.

    Visits ``{base}/{split}/images`` for the ``train``, ``test`` and ``valid``
    splits and overwrites each image file with its processed version.

    Args:
        useCLAHE: Process images through ``generate_CLAHE`` (which resizes to
            512x512 itself) instead of only resizing them.
        grayscale: Forwarded to ``generate_CLAHE``; ignored when ``useCLAHE``
            is False.
        base: Dataset root directory.

    Note:
        Files are overwritten, so running this again with ``useCLAHE=True``
        applies CLAHE on top of already equalised images.
    """
    for split in os.listdir(base):
        if split not in ('train', 'test', 'valid'):
            continue

        images_dir = f'{base}/{split}/images'
        for img_path in os.listdir(images_dir):
            full_path = f'{images_dir}/{img_path}'

            # imread returns None for anything it cannot decode (stray
            # non-image files, truncated downloads); skip those loudly.
            image = cv2.imread(full_path)
            if image is None:
                print(f'skipping unreadable image: {full_path}')
                continue

            if useCLAHE:
                # generate_CLAHE reads and resizes the file itself, so no
                # intermediate write (and extra lossy re-encode) is needed.
                generate_CLAHE(full_path, full_path, grayscale)
            else:
                cv2.imwrite(full_path, cv2.resize(image, (512, 512)))

def check_for_incorrect_labels(old_base='ds', class_names=None):
    """Print the path of every image whose filename disagrees with its label.

    An image passes when the class named by the first class index in its
    label file appears (case-insensitively) in the image filename. Images
    with a missing, empty or unparsable label, or with a class index outside
    the class list, are printed as well.

    Args:
        old_base: Dataset root containing ``{split}/images`` and ``{split}/labels``.
        class_names: Class names indexed by label id. Defaults to the
            module-level ``names`` list.
    """
    if class_names is None:
        class_names = names

    for split in os.listdir(f'{old_base}'):
        if split not in ('train', 'valid', 'test'):
            continue

        images_dir = f'{old_base}/{split}/images'
        labels_dir = f'{old_base}/{split}/labels'

        for image in os.listdir(images_dir):
            # splitext instead of image[:-4]: extensions are not always three
            # characters long (e.g. ".jpeg").
            stem, _ = os.path.splitext(image)
            label_path = f'{labels_dir}/{stem}.txt'

            # Read the label once per image and resolve its class name.
            label_class = None
            if os.path.exists(label_path):
                with open(label_path, 'r') as f:
                    tokens = f.read().split()
                if tokens:
                    try:
                        class_index = int(tokens[0])
                    except ValueError:
                        class_index = -1
                    if 0 <= class_index < len(class_names):
                        label_class = class_names[class_index]

            matches = label_class is not None and label_class.lower() in image.lower()
            if not matches:
                print(f'{old_base}/{split}/images/{image}')



In [11]:
# delete_all_folders()
# NOTE(review): clean_names and set_name_and_paths are not defined in the
# visible cells above -- confirm they are defined elsewhere in the notebook;
# otherwise this cell raises NameError on a fresh kernel.
clean_names('dataset')
set_name_and_paths('dataset')

valid
test
train
