Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
# Conflicts:
#	sla_cli/__init__.py
  • Loading branch information
DavidWalshe93 committed Apr 15, 2021
2 parents 389a61f + cfbd288 commit c92ca8a
Show file tree
Hide file tree
Showing 35 changed files with 911 additions and 94 deletions.
4 changes: 3 additions & 1 deletion .sla_cli_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

# Template configuration file for sla-cli.

# The path where data is to be downloaded and referenced from.
data_directory: "./data/"

# Unzip archives as they are downloaded or leave them as an archive file.
unzip: true
Expand All @@ -15,6 +17,6 @@ convert: jpeg

isic:
# Number of images to download per batch, API upper limit is >> 300 <<.
batch_size: 300
batch_size: 25
# Number of concurrent download workers, setting this too high can have adverse effects.
max_workers: 5
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ their research efforts.
- Dataset summaries and label distribution.
- Console-based dashboards.
- Full support for downloading datasets + metadata from the ISIC Archive API.
- Full support for downloading PH2 dataset.
- Full support for downloading PAD-UFES-20 dataset.
- Full support for downloading MEDNODE dataset.

**WIP**

Expand Down Expand Up @@ -58,14 +61,14 @@ The table below shows the dataset currently available to acquire via the tool.
| JID Editorial Images 2018 ||
| MClass (Dermoscopy) | ⚠️ |
| MClass (Dermoscopy) | ⚠️ |
| MEDNODE | |
| MEDNODE | |
| MSK-1 ||
| MSK-2 ||
| MSK-3 ||
| MSK-4 ||
| MSK-5 ||
| PAD-UFES-20 | |
| PH2 | |
| PAD-UFES-20 | |
| PH2 | |
| SONIC ||
| Sydney MIA SMDC 2020 ISIC Challenge Contribution ||
| UDA-1 ||
Expand Down
2 changes: 2 additions & 0 deletions alias.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Convenience alias for running the CLI straight from a development checkout.
# NOTE(review): aliases only expand in interactive shells — this file is
# presumably meant to be `source`d from one; confirm intended usage.
alias cli="python ./sla_cli/entry.py"
# Immediately invoke the alias with -v as a smoke test that it resolves
# (presumably -v prints the version or enables verbose output — TODO confirm).
cli -v
2 changes: 1 addition & 1 deletion dev_scripts/db_schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ datasets:
info:
availability: public
capture_method: dermoscopy
size: 11
size: 112
references:
- "https://www.fc.up.pt/addi/ph2%20database.html"
download:
Expand Down
3 changes: 3 additions & 0 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ coverage==5.5
coveralls==3.0.1
cycler==0.10.0
docopt==0.6.2
fuzzywuzzy==0.18.0
httpretty==1.0.5
idna==2.10
importlib-metadata==3.10.0
Expand All @@ -19,6 +20,7 @@ matplotlib==3.3.4
numpy==1.20.2
packaging==20.9
pandas==1.2.3
patool==1.12
pep517==0.10.0
Pillow==8.2.0
pluggy==0.13.1
Expand All @@ -31,6 +33,7 @@ pytz==2021.1
PyYAML==5.4.1
requests==2.25.1
six==1.15.0
-e git+https://github.com/DavidWalshe93/SL-CLI.git@f3b3213157d4fe9c4be9b5df39b292944b658df9#egg=sla_cli
tabulate==0.8.9
toml==0.10.2
typing-extensions==3.7.4.3
Expand Down
2 changes: 1 addition & 1 deletion sla_cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
Date: 08 April 2021
"""

__version__ = '0.2.0'
__version__ = '0.2.1'
2 changes: 1 addition & 1 deletion sla_cli/db/db.json
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@
"info": {
"availability": "public",
"capture_method": "dermoscopy",
"size": 11,
"size": 112,
"references": [
"https://www.fc.up.pt/addi/ph2%20database.html"
],
Expand Down
7 changes: 5 additions & 2 deletions sla_cli/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

from sla_cli.src.common.logger.init_logger import init_logger
from sla_cli.src.common.versioning import get_version
from sla_cli.src.common.console import init_colorama
from sla_cli.src.common.console import init_colorama, init_progress_bars
from sla_cli.src.common.config import Config

from sla_cli.src.cli.context import GROUP_CONTEXT_SETTINGS
Expand All @@ -29,6 +29,7 @@
# Commands
from sla_cli.src.cli.commands.ls import ls
from sla_cli.src.cli.commands.download import download
from sla_cli.src.cli.commands.organise import organise

logger = logging.getLogger(__name__)

Expand All @@ -45,6 +46,7 @@ class CliParameters:
@click.option("-d", "--debug", is_flag=True, help="Runs the tool in debug mode.")
@click.option("-f", "--config-file", type=click.STRING, help="Explicitly load a file configuration from a given path.")
@init_colorama
@init_progress_bars
@init_logger
@kwargs_to_dataclass(CliParameters)
@click.pass_context
Expand Down Expand Up @@ -75,7 +77,8 @@ def cli(ctx: Context, params: CliParameters):
# ==================================================
commands = [
ls,
download
download,
organise
]

for command in commands:
Expand Down
17 changes: 0 additions & 17 deletions sla_cli/src/cli/autocompletion.py

This file was deleted.

62 changes: 27 additions & 35 deletions sla_cli/src/cli/commands/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,15 @@
import click
from click import Context

from sla_cli.src.common.console import init_colorama

from sla_cli.src.cli.context import COMMAND_CONTEXT_SETTINGS
from sla_cli.src.cli.utils import kwargs_to_dataclass
from sla_cli.src.cli.utils import kwargs_to_dataclass, default_from_context
from sla_cli.src.db.accessors import AccessorFactory
from sla_cli.src.download import Downloader, DownloaderOptions, DummyDownloader

from sla_cli.src.download.isic import IsicMetadataDownloader, IsicImageDownloader
from sla_cli.src.download.ph2 import Ph2Downloader
from sla_cli.src.download.pad_ufes_20 import PadUfes20Downloader
from sla_cli.src.download.mednode import MednodeDownloader

logger = logging.getLogger(__name__)

Expand All @@ -28,34 +29,42 @@ class DownloadParameters:
datasets: List[str]
directory: str
force: bool
clean: bool
skip: bool
metadata_as_name: bool
isic_meta: bool


@click.command(**COMMAND_CONTEXT_SETTINGS, short_help="Downloads available datasets.")
@click.argument("datasets", type=click.STRING, nargs=-1)
@click.option("-d", "--directory", type=click.STRING, default=os.getcwd(), help="The destination directory for the download.")
@click.option("-f", "--force", is_flag=True, help="Force download a dataset, even if it already exists on the filesystem.")
@click.option("--isic-meta", is_flag=True, help="Download the ISIC Archive metadata instead of a dataset.")
@click.option("--metadata-as-name", is_flag=True, help="Saves the dataset metadata as the dataset name. Helpful for viewing in excel, not optimal for ML pipelines.")
@click.option("-d", "--directory", type=click.STRING, cls=default_from_context("data_directory"), help="The destination directory for the downloaded content. Default is the current work directory.")
@click.option("-f", "--force", type=click.BOOL, is_flag=True, help="Force download a dataset, even if it already exists on the filesystem.")
@click.option("-c", "--clean", type=click.BOOL, is_flag=True, help="Remove archive files directly after extraction.")
@click.option("-s", "--skip", type=click.BOOL, is_flag=True, help="Skip the download phase, useful for running builds on previously downloaded archives.")
@click.option("--isic-meta", type=click.BOOL, is_flag=True, help="Download the ISIC Archive metadata instead of a dataset.")
@click.option("--metadata-as-name", type=click.BOOL, is_flag=True, help="Saves the dataset metadata as the dataset name. Helpful for viewing in excel, not optimal for ML pipelines.")
@kwargs_to_dataclass(DownloadParameters)
@click.pass_context
def download(ctx: Context, params: DownloadParameters):
datasets = AccessorFactory.create_datasets()

removals = []
# Remove datasets that dont exist in the tool before continuing.
keep = []
for dataset in params.datasets:
if dataset not in datasets.datasets.names:
if dataset in datasets.datasets.names:
keep.append(dataset)
else:
logger.warning(f"'{dataset}' does not exist for download, removing...")
removals.append(dataset)

params.datasets = [dataset for dataset in params.datasets if dataset not in removals]
params.datasets = keep

options = DownloaderOptions(
destination_directory=params.directory,
config=ctx.obj,
force=params.force,
metadata_as_name=params.metadata_as_name
metadata_as_name=params.metadata_as_name,
clean=params.clean,
skip=params.skip
)

# Download only the ISIC metadata.
Expand All @@ -71,8 +80,9 @@ def download(ctx: Context, params: DownloadParameters):
downloader = downloader_factory(dataset)

# Add dataset to options.
options.dataset = convert(dataset=dataset)
options.dataset = dataset
options.url = datasets.datasets[dataset].info.download[0]
options.size = datasets.datasets[dataset].info.size

# Download the dataset.
downloader = downloader(options=options)
Expand All @@ -97,11 +107,14 @@ def downloader_factory(dataset) -> Downloader:
"isic_2020_vienna_part_1": IsicImageDownloader,
"isic_2020_vienna_part_2": IsicImageDownloader,
"jid_editorial_images_2018": IsicImageDownloader,
"mednode": MednodeDownloader,
"msk_1": IsicImageDownloader,
"msk_2": IsicImageDownloader,
"msk_3": IsicImageDownloader,
"msk_4": IsicImageDownloader,
"msk_5": IsicImageDownloader,
"pad_ufes_20": PadUfes20Downloader,
"ph2": Ph2Downloader,
"sonic": IsicImageDownloader,
"sydney_mia_smdc_2020_isic_challenge_contribution": IsicImageDownloader,
"uda_1": IsicImageDownloader,
Expand All @@ -112,25 +125,4 @@ def downloader_factory(dataset) -> Downloader:
)


def convert(dataset: str) -> str:
"""Translates the CLI argument name into the Metadata value."""
return dict(
bcn_20000="BCN_20000",
bcn_2020_challenge="BCN_2020_Challenge",
brisbane_isic_challenge_2020="Brisbane ISIC Challenge 2020",
dermoscopedia_cc_by="Dermoscopedia (CC-BY)",
ham10000="HAM10000",
isic_2020_challenge_mskcc_contribution="ISIC 2020 Challenge - MSKCC contribution",
isic_2020_vienna_part_1="ISIC_2020_Vienna_part_1",
isic_2020_vienna_part_2="ISIC_2020_Vienna_part2",
jid_editorial_images_2018="2018 JID Editorial Images",
msk_1="MSK-1",
msk_2="MSK-2",
msk_3="MSK-3",
msk_4="MSK-4",
msk_5="MSK-5",
sonic="SONIC",
sydney_mia_smdc_2020_isic_challenge_contribution="Sydney (MIA / SMDC) 2020 ISIC challenge contribution",
uda_1="UDA-1",
uda_2="UDA-2"
).get(dataset, dataset)

59 changes: 59 additions & 0 deletions sla_cli/src/cli/commands/organise.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""
Author: David Walshe
Date: 14 April 2021
"""

import logging
import logging
import os
from typing import List
from dataclasses import dataclass, asdict
import hashlib

import click
from click import Context
from click.exceptions import BadOptionUsage

from sla_cli.src.cli.context import COMMAND_CONTEXT_SETTINGS
from sla_cli.src.cli.utils import kwargs_to_dataclass, default_from_context
from sla_cli.src.cli.converters import match_datasets_cb

logger = logging.getLogger(__name__)


@dataclass
class OrganiseParameters:
    """Parameters for the 'organise' command, populated via `kwargs_to_dataclass`."""

    # Dataset names supplied as positional CLI arguments (normalised by match_datasets_cb).
    datasets: List[str]
    # Directory holding previously downloaded datasets (default comes from the
    # 'data_directory' config key via default_from_context).
    directory: str
    # Class names to include; mutually exclusive with `exclude`.
    include: List[str]
    # Class names to exclude; mutually exclusive with `include`.
    exclude: List[str]


@click.command(**COMMAND_CONTEXT_SETTINGS, short_help="Organises datasets into train/validation/splits.")
@click.argument("datasets", type=click.STRING, callback=match_datasets_cb, nargs=-1)
@click.option("-d", "--directory", type=click.STRING, cls=default_from_context("data_directory"), help="The directory where downloaded datasets reside. Default is the configured data directory.")
@click.option("-i", "--include", type=click.STRING, multiple=True, default=None, callback=match_datasets_cb,
              help="Used to include specific classes in the data. Option is mutually exclusive with '-e/--exclude'.")
@click.option("-e", "--exclude", type=click.STRING, multiple=True, default=None, callback=match_datasets_cb,
              help="Used to exclude specific classes in the data. Option is mutually exclusive with '-i/--include'.")
@kwargs_to_dataclass(OrganiseParameters)
@click.pass_context
def organise(ctx: Context, params: OrganiseParameters):
    """
    Organises previously downloaded datasets into train/validation splits.

    :param ctx: The click context object, carrying the loaded Config.
    :param params: The parsed CLI parameters for this command.
    :raises BadOptionUsage: If both '--include' and '--exclude' are supplied.
    """
    # The two filter options are mutually exclusive by design.
    if params.include and params.exclude:
        raise BadOptionUsage("include", "'-i/--include' and '-e/--exclude' switches cannot be used together.")

    # Entries without a "." are taken to be dataset directories; entries with a
    # "." (archive files etc.) are filtered out. Names are lower-cased to match
    # the tool's canonical dataset naming.
    available_datasets = [entry.lower() for entry in os.listdir(params.directory) if "." not in entry]
    logger.debug(f"Available datasets: {available_datasets}")

    # Warn about any requested dataset that has not been downloaded yet.
    for dataset in params.datasets:
        if dataset not in available_datasets:
            logger.error(f"Missing data for '{dataset}', use 'sla-cli download <DATASET>' to continue.")


def keep_includes(params: OrganiseParameters) -> List[str]:
    """
    Placeholder for the '--include' filtering step.

    Not yet implemented — currently returns None despite the annotation.
    TODO: return the class names retained after applying `params.include`.
    """


def remove_excludes(params: OrganiseParameters) -> List[str]:
    """
    Placeholder for the '--exclude' filtering step.

    Not yet implemented — currently returns None despite the annotation.
    TODO: return the class names remaining after removing `params.exclude`.
    """

0 comments on commit c92ca8a

Please sign in to comment.