## Console for running data-related jobs

### Unzip Data

In [1]:
from subprocess import run

# Source of the zipped data. Enter either:
#   * the path to a single zip file, which is expected to contain one
#     directory with zipped subdirectories for each category
#     e.g., /home/danny/mavira/FashionTraining/data/classifier/outfits-zipped.zip
#   * the path to a directory containing zip files for each category
#     e.g., /home/danny/mavira/FashionTraining/data/classifier/outfits-zipped/
dataset_path = (
    "/home/danny/mavira/FashionTraining/data/classifier/outfits-zipped.zip"
)

# Enter a path to a directory where the data will be stored once unzipped
# This must be within the data directory of the project for later steps to work
# e.g., /home/danny/mavira/FashionTraining/data/classifier/classifier_main/
unzip_path = (
    "/home/danny/mavira/FashionTraining/data/classifier/classifier362/"
)

# Enter a wildcard pattern to match the category archives to unzip.
# NOTE(review): the example "*.zip" is shell-style wildcard syntax, not a
# regular expression -- confirm how unzip_data.py interprets this argument.
# e.g., "*.zip"
pattern = "*.zip"

# Make sure that you have verified all parameters before running this cell
unzip_result = run(
    [
        "python3",
        "/home/danny/mavira/FashionTraining/scripts/unzip_data.py",
        dataset_path,
        unzip_path,
        pattern,
    ]
)

# run() does not raise on a non-zero exit status by default, so report it
# explicitly instead of letting a failed unzip go unnoticed.
if unzip_result.returncode != 0:
    print(f"unzip_data.py exited with status {unzip_result.returncode}")

# Note that "caution: filename not matched" warnings are normal

02:37:01 - Unzipping /home/danny/mavira/FashionTraining/data/classifier/outfits-zipped.zip to temp directory /home/danny/mavira/FashionTraining/data/classifier/outfits-zipped-temp...
02:37:19 - Unzipping files matching *.zip at /home/danny/mavira/FashionTraining/data/classifier/outfits-zipped-temp/outfits-zipped to ../data/classifier/classifier362/...
caution: excluded filename not matched:  **/.DS_Store
caution: excluded filename not matched:  **/*([0-9]).jpg
caution: excluded filename not matched:  **/*([0-9]).png
caution: excluded filename not matched:  **/.DS_Store
caution: excluded filename not matched:  **/*([0-9]).jpg
caution: excluded filename not matched:  **/*([0-9]).png
caution: excluded filename not matched:  **/*([0-9]).jpg
caution: excluded filename not matched:  **/*([0-9]).png
caution: excluded filename not matched:  **/.DS_Store
caution: excluded filename not matched:  **/*([0-9]).jpg
caution: excluded filename not matched:  **/*([0-9]).png
caution: excluded filename n

### Dataset Registration (only for raw datasets)

In [2]:
from maviratrain.utils.registration.register_data import register_dataset

# Directory of the dataset to register, given as a string
# e.g., "/home/danny/mavira/FashionTraining/data/classifier/classifier_main"
data_path = (
    "/home/danny/mavira/FashionTraining/data/classifier/classifier362/"
)

# Notes to include about the dataset, given as a string
# Use None when there are no notes to include
# e.g., "This dataset is for testing only"
notes = "362 categories"

# Make sure that you have verified all parameters before running this cell
dataset_id = register_dataset(
    notes=notes,
    data_path=data_path,
)
print(f"Dataset registered with id {dataset_id}")

02:44:28 - Registering dataset at ../data/classifier/classifier362/ in database...
02:44:29 - Dataset registered successfully!


Dataset registered with id 1


### Data Processing Jobs

In [3]:
from subprocess import run

# Display the help message for the run_data_processing.py script.
# The result is bound to `help_result` rather than `help` so that the
# built-in help() function is not shadowed for the rest of the session.
help_result = run(
    [
        "python",
        "/home/danny/mavira/FashionTraining/scripts/run_data_processing.py",
        "-h",
    ]
)

usage: run_data_processing.py [-h] [-r RAW] [--resized RESIZED]
                              [--split SPLIT] [--normalized NORMALIZED]
                              [--converted CONVERTED] [--height HEIGHT]
                              [--width WIDTH] [--interp INTERP]
                              [--deduplicate DEDUPLICATE] [--ratios RATIOS]
                              [--seed SEED] [--norm_method NORM_METHOD]
                              [--final_format FINAL_FORMAT]
                              [--quality QUALITY]
                              [--working_format WORKING_FORMAT] [-c CLEANUP]
                              [--notes NOTES]
                              {-1,0,1,2,3,4}

Run the data processing pipeline

positional arguments:
  {-1,0,1,2,3,4}        Specifies after which stage the processing
                        pipelineshould stop. If -1, then the whole processing
                        pipeline is run on the data at --raw according to any
                      

In [1]:
from subprocess import run

# The stage at which to stop the processing pipeline. If -1, run the entire
# pipeline. If 0, stop after cleaning the filenames. If 1, stop after resizing.
# If 2, stop after splitting. If 3, stop after normalizing the split images.
# If 4, stop after converting the normalized images to a different format.
stage = -1

# If applicable, the path to the directory of the raw dataset as a string.
# Ignored if not starting from raw data.
raw = None
# raw = "/home/danny/mavira/FashionTraining/data/classifier/classifier362/"

# If starting from raw data, this is a path to where the resized images
# will be stored. If not starting from raw data, this is a
# path to the directory of a dataset that has already been resized.
# Ignored if not resizing images.
resized = None
# resized = (
#     "/home/danny/mavira/FashionTraining/data/classifier/classifier362-r2/"
# )

# If starting from raw or resized data, this is the path to
# where the split images will be stored. If not starting from raw or resized
# data, this is a path to the directory of a dataset that has already
# been split into train/val/test sets. Ignored if not splitting images.
split = None
# split = (
#     "/home/danny/mavira/FashionTraining/data/classifier/classifier362-r2-s3/"
# )

# If starting from raw, resized, or split data, this is a path
# to where the normalized images will be stored. If starting from
# normalized data, this is a path to the directory of a dataset that has
# already been normalized. Ignored if not normalizing images.
normalized = None

# If performing format conversion, this is a path to where the
# converted images will be stored. Ignored if not converting formats.
converted = None

# The height that images will be resized to. Ignored if not resizing images.
height = 224

# The width that images will be resized to. Ignored if not resizing images.
width = 224

# The interpolation method used for resizing. Ignored if not resizing images.
interp = "bicubic"

# Whether or not to deduplicate images during resizing (0 for no, 1 for yes).
# Ignored if not resizing images.
deduplicate = 1

# Ratio of the dataset to use for training/validation/testing.
# Ignored if not splitting into train/val/test sets.
ratios = "64/16/20"

# The random seed to use for splitting the dataset. Ignored if not splitting.
seed = 42

# The normalization method to use. Ignored if not normalizing images.
# Options are "zscore", "pixelz", "localz", "minmax", "minmaxextended",
# "localminmax", and "localminmaxextended".
norm_method = "zscore"

# The format to convert images to in final conversion, if performed.
# Options are "jpg", "npy", and "pt". Ignored if no final conversion.
final_format = "npy"

# The quality 0-100 to use for JPEG compression.
# Ignored if not converting to JPEG. Default is 95.
quality = 95

# The format to use for intermediate processing steps.
# Options are "npy" and "pt".
working_format = "npy"

# Whether or not to remove any intermediate files created during processing
# (0 for no, 1 for yes). Ignored if not running the entire pipeline.
cleanup = 1

# Any notes to include about the dataset. Leave as an empty string if no notes.
notes = ""

# Command-line options in the order they are passed to the script. Each entry
# becomes "--<name>=<value>". A value of None is formatted as the text "None";
# presumably the script treats that as "not provided" -- TODO confirm against
# run_data_processing.py.
processing_options = {
    "raw": raw,
    "resized": resized,
    "split": split,
    "normalized": normalized,
    "converted": converted,
    "height": height,
    "width": width,
    "interp": interp,
    "deduplicate": deduplicate,
    "ratios": ratios,
    "seed": seed,
    "norm_method": norm_method,
    "final_format": final_format,
    "quality": quality,
    "working_format": working_format,
    "cleanup": cleanup,
    "notes": notes,
}

# Make sure that you have verified all parameters before running this cell
processing_result = run(
    [
        "python",
        "/home/danny/mavira/FashionTraining/scripts/run_data_processing.py",
        *(f"--{name}={value}" for name, value in processing_options.items()),
        f"{stage}",  # positional argument: the stage after which to stop
    ]
)

# run() does not raise on a non-zero exit status by default, so report it
# explicitly instead of letting a failed job go unnoticed.
if processing_result.returncode != 0:
    print(
        "run_data_processing.py exited with status "
        f"{processing_result.returncode}"
    )

04:08:15 - Running data processing pipeline with job ID 4...
04:08:15 - Registering data processing job 4 in database...
04:08:15 - Data processing job registered successfully!
04:08:15 - Beginning normalization of data at ../data/classifier/classifier362-r2-s3/ using zscore and saving at ../data/classifier/classifier362-r2-s3-n4...
04:37:24 - Registering dataset at ../data/classifier/classifier362-r2-s3-n4/ in database...
04:37:24 - Dataset registered successfully!
04:37:24 - Done normalizing dataset!
04:37:24 - Updating data processing job 4 in database...
04:37:24 - Data processing job updated successfully!
04:37:24 - Logs zipped and moved to ../logs/archive/data_processing/j4.zip
04:37:24 - Finished running job 4!


  adding: ../logs/archive/data_processing/j4 (deflated 95%)
