In [None]:
from tqdm.auto import tqdm
import requests
from pathlib import Path
import zipfile

In [None]:
# Folder (relative to the current working directory) that holds the downloaded
# archives; it is created here if it does not exist yet.
dataset_dir = Path("dataset")
dataset_dir.mkdir(exist_ok=True)

In [None]:
# Both archives are published in the Zenodo record
# https://zenodo.org/records/5036991
# Each entry pairs the download URL with the local file name to save it under.
scenes = dict(
    url="https://zenodo.org/records/5036991/files/Sentinel-2_L1C.zip?download=1",
    filename="Sentinel-2_L1C.zip",
)
val_data = dict(
    url="https://zenodo.org/records/5036991/files/PixBox-S2-CMIX.zip?download=1",
    filename="PixBox-S2-CMIX.zip",
)

In [None]:
def download_file(
    url: str,
    file_path: Path,
    force_download=False,
    block_size: int = 1024 * 1024,
    timeout: float = 60,
):
    """Stream ``url`` to ``file_path`` while showing a progress bar.

    Args:
        url: Address of the file to fetch.
        file_path: Destination on disk.
        force_download: When false and ``file_path`` already exists, the
            download is skipped. When true, an existing file is replaced.
        block_size: Number of bytes requested per chunk from the response.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Nothing is written to ``file_path`` in that case.
    """
    if file_path.exists() and not force_download:
        print(f"File {file_path} already exists, skipping download.")
        return

    # Write to a sibling ".part" file first: an interrupted or failed download
    # must never leave a truncated file at ``file_path``, because the next run
    # would see it, assume it is complete and skip the download.
    partial_path = file_path.with_name(file_path.name + ".part")

    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Without this an HTTP error page would be saved as the "archive".
            response.raise_for_status()

            total_size = int(response.headers.get("content-length", 0))

            with tqdm(
                # No content-length header -> unknown total instead of "0".
                total=total_size or None,
                unit="B",
                unit_scale=True,
                desc=f"Downloading {file_path.name}",
            ) as progress_bar, open(partial_path, "wb") as file:
                for data in response.iter_content(block_size):
                    file.write(data)
                    progress_bar.update(len(data))

        # Only a fully received file is moved into place.
        partial_path.replace(file_path)
    except BaseException:
        # Also covers KeyboardInterrupt from stopping the cell mid-download.
        if partial_path.exists():
            partial_path.unlink()
        raise

In [None]:
def download_and_extract(
    url: str, file_path: Path, force_download=False, extract_dir=None
):
    """Download a zip archive and unpack it.

    The archive is fetched with ``download_file``. If it then turns out not to
    be a valid zip file, it is downloaded once more with ``force_download=True``
    and extraction is attempted again.

    Args:
        url: Address of the zip archive.
        file_path: Where the archive is stored on disk.
        force_download: Passed through to ``download_file``. When true, a
            corrupt archive is treated as final and no retry is made.
        extract_dir: Directory to unpack into. Defaults to the notebook-level
            ``dataset_dir``.

    Raises:
        zipfile.BadZipFile: If the archive is still invalid after a forced
            download.
    """
    target_dir = dataset_dir if extract_dir is None else extract_dir

    download_file(url, file_path, force_download)
    try:
        with zipfile.ZipFile(file_path, "r") as zip_ref:
            zip_ref.extractall(target_dir)
    except zipfile.BadZipFile as err:
        # An explicit raise (rather than ``assert``) keeps this guard active
        # under ``python -O``, where it is the only thing stopping the retry
        # below from recursing forever on a persistently corrupt download.
        if force_download:
            raise zipfile.BadZipFile(
                f"Failed to unzip {file_path} even after a fresh download."
            ) from err
        print(f"Error: {file_path} is not a valid zip file.")
        print("Trying to download again...")
        download_and_extract(
            url, file_path, force_download=True, extract_dir=target_dir
        )

In [None]:
# Fetch and unpack the validation data first, then the scenes archive.
for archive in (val_data, scenes):
    file_path = dataset_dir / archive["filename"]
    download_and_extract(archive["url"], file_path)

In [None]:
# unzip scenes: collect the per-scene archives found in the scenes folder
scenes_dir = dataset_dir / "Sentinel-2_L1C"
zipped_scenes = [*scenes_dir.glob("*.zip")]
rar_scenes = [*scenes_dir.glob("*.rar")]
# Displayed as the cell output: (number of .zip scenes, number of .rar scenes)
(len(zipped_scenes), len(rar_scenes))

In [None]:
# Unpack every zipped scene next to its archive, then remove the archive.
for scene_archive in tqdm(zipped_scenes):
    with zipfile.ZipFile(scene_archive, "r") as scene_zip:
        scene_zip.extractall(scenes_dir)
    # Reached only after extraction finished without raising.
    scene_archive.unlink()

For some reason, one of the scenes is a `.rar` file. It's not simple to extract this with Python, so head to the path below and extract the scene manually.

In [None]:
# Display the .rar file(s) found in the scenes folder; the zip extraction
# above does not touch them.
rar_scenes