# Dataset Generator

## Configuration

### Customizing names of folders

Edit the values below if you want to change the names of the directories that will be created.

In [None]:
# Which dataset to generate; this value selects the generator that is called in
# the generation cell
datasetToGenerate = "regenerate" #@param ["cortex", "main", "mest_main", "mest_glom", "inflammation", "regenerate"]

# Path to the file with the info needed to recreate the same dataset
# (passed to the generator when datasetToGenerate is "regenerate")
recreateValDatasetFilePath = "dataset.json" #@param {type:"string"}

# Name of the folder where the base images and annotations are found
rawDataset = "raw_dataset" #@param {type:"string"}
# Name of the folder that will be used to store temporarily needed files
tempDataset = "temp_dataset" #@param {type:"string"}
# Name that will be used to create the folders for training and validation of
# the cortex training
cortexDatasetPath = "nephrology_cortex_dataset" #@param {type:"string"}
# Output folder name for the "mest_*" and "inflammation" datasets, built from
# the selected datasetToGenerate value
mestDatasetPath = f"nephrology_{datasetToGenerate}_dataset" #@param {type:"string"}
# Name of the folder that will contain all images and divisions that cannot be used
unusedDirPath = "nephrology_dataset_unused" #@param {type:"string"}

# Name that will be used to create the folders for training and validation of
# the main training
mainDataset = "nephrology_dataset" #@param {type:"string"}
# Name of the folder for files that cannot be used for the main training
mainDatasetUnusedDirPath = "nephrology_dataset_unused" #@param {type:"string"}

### Customizing reading and dividing of images

Here, you can change the division size and minimum overlap. You can also force the generator to use a specific annotation format if there are different ones in the raw directory.

In [None]:
# If you want to force a single annotation format to be read:
# 1) Import the AnnotationAdapter you want
# 2) Set the `adapter` variable to a reference to the imported adapter
# Leave `adapter` set to None if you do not want to force a format.
from datasetTools.AnnotationAdapter import *
from datasetTools.ASAPAdapter import ASAPAdapter
from datasetTools.LabelMeAdapter import LabelMeAdapter
adapter = None #@param ["ASAPAdapter", "LabelMeAdapter", "None"] {type:"raw", allow-input: true}

# The image format to use for the datasets
imageFormat = "jpg" #@param ["jpg", "png"]

# Side size of the divisions that will be created
divisionSize = 1024 #@param {type:"slider", min:896, max:1024, step:1}

# Minimum overlap between divisions, between 0.0 and 1.0
minDivisionOverlap = 0.33 #@param {type:"slider", min:0, max:1, step:0.01}

# Side size used for the cortex dataset; cortexResize is the corresponding
# square (cortexSize, cortexSize) tuple passed to the cortex generator
cortexSize = 2048 #@param {type:"slider", min:1024, max:4096, step:1}
cortexResize = (cortexSize, cortexSize)

# Minimum overlap between cortex divisions, between 0.0 and 1.0
minCortexDivisionOverlap = 0.00 #@param {type:"slider", min:0, max:1, step:0.01}

### Customizing generator's behaviour

In [None]:
# Set to True if you want the previous directories to be deleted,
# keeping only the new ones
cleanBeforeStart = False #@param {type:"boolean"}

# When creating masks, if an image has more than one cortex mask, they will be
# fused. Setting this to True will delete the original masks and keep only the
# fused one.
deleteBaseCortexMasks = True #@param {type:"boolean"}

# If False, some images will be separated from the dataset before being divided
# to create the validation dataset. This avoids having divisions from the same
# image in both the training and validation datasets.
# If True, images will be divided first, then some of the divisions will be
# separated to create the validation dataset.
separateDivInsteadOfImage = False #@param {type:"boolean"}

# If True and separateDivInsteadOfImage is False, some patients will be selected
# from the dataset and the validation dataset will be created with all the
# images of these patients
separateByPatient = True #@param {type:"boolean"}

## Generation of the dataset

In [None]:
import warnings
# Import the dataset formatter with all warnings silenced; the "ignore" filter
# only applies inside this `with` block and is restored afterwards.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    from datasetTools import datasetFormator as df

In [None]:
# Run the generator matching `datasetToGenerate`, using the settings defined in
# the configuration cells.
#
# Raises:
#     ValueError: if `datasetToGenerate` is not one of the supported modes, so
#         that a typo does not silently generate nothing.

# How the validation data is separated from the training data: by divisions,
# by patient, or by images. Computed once because the MEST-C and inflammation
# generators both use it.
if separateDivInsteadOfImage:
    separateMode = "div"
elif separateByPatient:
    separateMode = "patient"
else:
    separateMode = "images"

if datasetToGenerate == "main":
    df.generateDataset(
        rawDataset=rawDataset, tempDataset=tempDataset, unusedDirPath=unusedDirPath,
        mainDataset=mainDataset, mainDatasetUnusedDirPath=mainDatasetUnusedDirPath,
        deleteBaseCortexMasks=deleteBaseCortexMasks, cleanBeforeStart=cleanBeforeStart,
        recreateValList=None, divisionSize=divisionSize, imageFormat=imageFormat,
        separateDivInsteadOfImage=separateDivInsteadOfImage, adapter=adapter,
        separateByPatient=separateByPatient, minDivisionOverlapping=minDivisionOverlap
    )
elif datasetToGenerate == "cortex":
    df.generateCortexDataset(
        rawDataset=rawDataset, outputDataset=cortexDatasetPath,
        cleanBeforeStart=cleanBeforeStart, resize=cortexResize,
        overlap=minCortexDivisionOverlap, recreateValList=None,
        separateDivInsteadOfImage=separateDivInsteadOfImage
    )
elif datasetToGenerate.startswith("mest_"):
    # The part after the prefix is the sub-mode: "mest_main" -> "main",
    # "mest_glom" -> "glom".
    submode = datasetToGenerate.split('_')[-1]
    df.generateMESTCDataset(
        rawDataset=rawDataset, outputDataset=mestDatasetPath, mode=submode,
        cleanBeforeStart=cleanBeforeStart, imageFormat=imageFormat,
        divisionSize=divisionSize, overlap=minDivisionOverlap,
        separate=separateMode, adapter=adapter
    )
elif datasetToGenerate == "inflammation":
    df.generateInflammationDataset(
        rawDataset=rawDataset, outputDataset=mestDatasetPath,
        cleanBeforeStart=cleanBeforeStart, imageFormat=imageFormat,
        divisionSize=divisionSize, overlap=minDivisionOverlap,
        separate=separateMode, adapter=adapter
    )
elif datasetToGenerate == "regenerate":
    # Only this mode reads the recreate file.
    df.regenerateDataset(
        rawDataset=rawDataset, adapter=adapter,
        recreateFilePath=recreateValDatasetFilePath
    )
else:
    # Without this branch an unknown value would skip every generator and still
    # print the end marker, making the run look successful.
    raise ValueError(f"Unknown datasetToGenerate value: {datasetToGenerate!r}")
print("##### END #####")