In [1]:
from pathlib import Path
import zipfile
import subprocess

In [2]:
# Folder (relative to the working directory) that the archives are saved to
# and extracted into. Creating it is a no-op when it already exists.
dataset_dir = Path("dataset")
dataset_dir.mkdir(parents=False, exist_ok=True)

In [3]:
# Both archives come from the Zenodo record https://zenodo.org/records/5040271
# Each entry pairs the download URL with the file name used for the local copy.
scenes = dict(
    url="https://zenodo.org/records/5040271/files/Landsat8_L1.zip?download=1",
    filename="Landsat8_L1.zip",
)
val_data = dict(
    url="https://zenodo.org/records/5040271/files/PixBox-L8-CMIX.zip?download=1",
    filename="PixBox-L8-CMIX.zip",
)

In [None]:
def download_file(url: str, filepath: Path, force_download=False):
    """Download ``url`` to ``filepath`` with aria2c over parallel connections.

    Args:
        url: Address of the file to fetch.
        filepath: Destination path. Its parent directory is created if needed.
        force_download: When true, any existing copy of the file (and its
            aria2c control file) is removed first so the download restarts
            from scratch.

    Returns:
        None. The function returns early, without calling aria2c, when a
        complete copy already exists and ``force_download`` is false.

    Raises:
        subprocess.CalledProcessError: aria2c exited with a non-zero status.
        FileNotFoundError: the ``aria2c`` executable could not be started.
    """
    # aria2c tracks an unfinished download in "<file name>.aria2" next to the
    # target file; its presence means the file on disk is only partial.
    control_file = filepath.with_name(filepath.name + ".aria2")

    if force_download:
        # --continue=true makes aria2c resume from the bytes already on disk,
        # so a complete-but-corrupt file would otherwise never be replaced.
        for stale in (filepath, control_file):
            if stale.exists():
                stale.unlink()
    elif filepath.exists() and not control_file.exists():
        print(
            f"File {filepath.name} already exists. "
            "Set force_download=True to re-download."
        )
        return

    filepath.parent.mkdir(parents=True, exist_ok=True)

    aria2_cmd = [
        "aria2c",
        "--continue=true",
        "--max-connection-per-server=8",
        "--split=8",
        "--min-split-size=1M",
        "--max-tries=5",
        "--retry-wait=3",
        "--timeout=60",
        "--connect-timeout=30",
        "--auto-file-renaming=false",
        "--allow-overwrite=true",
        "--summary-interval=1",
        "--header=User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36",
        "--header=Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;"
        "q=0.8",
        "--header=Accept-Language: en-US,en;q=0.5",
        f"--dir={filepath.parent}",
        f"--out={filepath.name}",
        url,
    ]

    print(f"Downloading {filepath.name} with 8 parallel connections...")
    # Argument list (no shell) so the URL and paths are passed verbatim;
    # check=True turns a failed download into an exception.
    subprocess.run(aria2_cmd, check=True)
    print(f"✓ Successfully downloaded {filepath.name}")

In [5]:
def download_and_extract(url: str, file_path: Path, force_download=False):
    """Download a zip archive and extract it under ``dataset_dir``.

    The archive is fetched with ``download_file``. If it cannot be opened as a
    zip file, one more attempt is made with ``force_download=True``.

    Args:
        url: Address of the archive.
        file_path: Where the archive is stored locally.
        force_download: Passed through to ``download_file``; when true, a bad
            archive is treated as a final failure instead of being retried.

    Raises:
        zipfile.BadZipFile: The archive is still invalid after a forced
            download.
    """
    download_file(url, file_path, force_download)

    # "Landsat8_L1.zip" is unpacked into its own sub-folder; any other archive
    # is unpacked straight into dataset_dir.
    if file_path.name == "Landsat8_L1.zip":
        target_dir = dataset_dir / "Landsat8_L1"
    else:
        target_dir = dataset_dir

    try:
        with zipfile.ZipFile(file_path, "r") as zip_ref:
            print(f"Extracting {file_path.name}...")
            zip_ref.extractall(target_dir)
    except zipfile.BadZipFile as exc:
        if force_download:
            # Explicit raise rather than `assert`: assertions are stripped
            # under `python -O`, which would make this retry recurse forever.
            raise zipfile.BadZipFile(
                f"Failed to unzip {file_path} even after a fresh download."
            ) from exc
        print(f"Error: {file_path} is not a valid zip file.")
        print("Trying to download again...")
        download_and_extract(url, file_path, force_download=True)

In [6]:
# Fetch and unpack each archive in turn: validation data first, then scenes.
for data in (val_data, scenes):
    url, filename = data["url"], data["filename"]
    file_path = dataset_dir / filename
    download_and_extract(url, file_path)

File PixBox-L8-CMIX.zip already exists. Set force_download=True to re-download.
Extracting PixBox-L8-CMIX.zip...
File Landsat8_L1.zip already exists. Set force_download=True to re-download.
Extracting Landsat8_L1.zip...
