In [None]:
from pathlib import Path
import zipfile
import subprocess
from tqdm.auto import tqdm

In [None]:
# Root folder for everything this notebook downloads and unpacks
# (relative to the kernel's working directory).
dataset_dir = Path("dataset")
# Only the leaf folder is created; an already existing folder is left as is.
dataset_dir.mkdir(parents=False, exist_ok=True)

In [None]:
# Both archives come from the Zenodo record https://zenodo.org/records/5036991
# "url" is the direct download link, "filename" the name used for the local copy.
scenes = dict(
    url="https://zenodo.org/records/5036991/files/Sentinel-2_L1C.zip?download=1",
    filename="Sentinel-2_L1C.zip",
)
val_data = dict(
    url="https://zenodo.org/records/5036991/files/PixBox-S2-CMIX.zip?download=1",
    filename="PixBox-S2-CMIX.zip",
)

In [None]:
def download_file(url: str, file_path: Path, force_download=False):
    """Download ``url`` to ``file_path`` with aria2c over 8 parallel connections.

    A finished local copy is reused unless ``force_download`` is set. An
    unfinished download (recognised by aria2c's ``<name>.aria2`` control file
    next to the target) is resumed instead of being mistaken for a finished one.

    Args:
        url: Address of the file to fetch.
        file_path: Destination path; missing parent directories are created.
        force_download: When true, discard any existing local copy (and its
            control file) and fetch the file again from scratch.

    Raises:
        subprocess.CalledProcessError: If aria2c exits with a non-zero status.
        FileNotFoundError: If the ``aria2c`` executable cannot be found.
    """
    # aria2c tracks an in-progress download in "<name>.aria2" beside the
    # target; per the aria2 manual it is removed once the download completes.
    control_path = file_path.with_name(file_path.name + ".aria2")
    is_partial = control_path.exists()

    if file_path.exists() and not is_partial and not force_download:
        print(
            f"File {file_path.name} already exists. "
            "Set force_download=True to re-download."
        )
        return

    if force_download:
        # With --continue=true aria2c treats an existing file as data it has
        # already downloaded, so a complete-but-corrupt file would never be
        # fetched again. Remove the stale copy to really start over.
        for stale_path in (file_path, control_path):
            if stale_path.exists():
                stale_path.unlink()

    file_path.parent.mkdir(parents=True, exist_ok=True)

    aria2_cmd = [
        "aria2c",
        "--continue=true",
        "--max-connection-per-server=8",
        "--split=8",
        "--min-split-size=1M",
        "--max-tries=5",
        "--retry-wait=3",
        "--timeout=60",
        "--connect-timeout=30",
        "--auto-file-renaming=false",
        "--allow-overwrite=true",
        "--summary-interval=1",
        "--header=User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36",
        "--header=Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;"
        "q=0.8",
        "--header=Accept-Language: en-US,en;q=0.5",
        f"--dir={file_path.parent}",
        f"--out={file_path.name}",
        url,
    ]

    print(f"Downloading {file_path.name} with 8 parallel connections...")
    # check=True turns a failed aria2c run into CalledProcessError.
    subprocess.run(aria2_cmd, check=True)
    print(f"✓ Successfully downloaded {file_path.name}")

In [None]:
def download_and_extract(
    url: str, file_path: Path, force_download=False, extract_dir=None
):
    """Download a zip archive and unpack it, re-downloading once if it is broken.

    Args:
        url: Address of the archive.
        file_path: Where the downloaded archive is stored.
        force_download: Passed to ``download_file`` for the first attempt.
        extract_dir: Folder to unpack into. Defaults to the notebook-level
            ``dataset_dir``.

    Raises:
        AssertionError: If the archive is still not a valid zip file after a
            forced download. (The type is kept from the earlier ``assert`` so
            existing callers see the same exception.)
    """
    target_dir = dataset_dir if extract_dir is None else Path(extract_dir)

    force = force_download
    while True:
        download_file(url, file_path, force)
        try:
            with zipfile.ZipFile(file_path, "r") as zip_ref:
                zip_ref.extractall(target_dir)
            return
        except zipfile.BadZipFile as err:
            if force:
                # Raised explicitly instead of via `assert`, which is removed
                # under `python -O` and would let this retry forever.
                raise AssertionError(
                    "Failed to unzip even after a fresh download."
                ) from err
            print(f"Error: {file_path} is not a valid zip file.")
            print("Trying to download again...")
            force = True

In [None]:
# Fetch and unpack both archives: validation data first, then the scenes.
for data in (val_data, scenes):
    url, filename = data["url"], data["filename"]
    file_path = dataset_dir / filename
    download_and_extract(url, file_path)

In [None]:
# Unzip scenes: collect the per-scene archives found in the scenes folder.
scenes_dir = dataset_dir / "Sentinel-2_L1C"
zipped_scenes = [*scenes_dir.glob("*.zip")]
rar_scenes = [*scenes_dir.glob("*.rar")]
# Show how many archives of each kind were found.
len(zipped_scenes), len(rar_scenes)

In [None]:
# Unpack every scene archive next to the others, then delete the archive.
for zipped_scene in tqdm(zipped_scenes):
    with zipfile.ZipFile(zipped_scene) as scene_archive:
        scene_archive.extractall(path=scenes_dir)
    # The archive is only removed after it was extracted without an error.
    zipped_scene.unlink()

For some reason, one of the scenes is a .rar file. It's not simple to extract this with Python, so head to the path below and extract the scene manually.

In [None]:
# Display the path(s) of the .rar scene archive(s) found by the glob above.
rar_scenes