This notebook downloads the Kappaset dataset from Zenodo, extracts it, and converts each NetCDF file into a label GeoTIFF and an image GeoTIFF.

The only setting you need to change is `base_dataset_dir`: point it at a local drive with at least 300 GB of available storage.

In [1]:
from pathlib import Path
import rasterio as rio
from tqdm.auto import tqdm
import netCDF4
import numpy as np
import requests
import zipfile
from multiprocessing import Pool

In [None]:
# Root folder for everything this notebook downloads and writes.
# Change this to a directory on a local drive with enough free space.
base_dataset_dir = Path("/media/nick/4TB Working 7/Datasets/OCM datasets")

In [None]:
# Direct link to the Kappaset zip archive hosted on Zenodo (record 7100327).
dataset_url = "https://zenodo.org/records/7100327/files/kappaset.zip"

In [None]:
# Folder that holds the downloaded archive and its extracted contents.
kappaset_raw_dir = base_dataset_dir.joinpath("Kappaset raw")
kappaset_raw_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Folder that receives the converted label and image GeoTIFFs.
dst_dir = base_dataset_dir.joinpath("Kappaset")
dst_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Full path for the downloaded file
download_path = kappaset_raw_dir / "kappaset.zip"

if download_path.exists():
    # Nothing to do: do not even open a connection when the archive is present.
    print(f"Already downloaded: {download_path.absolute()}")
else:
    # Stream into a temporary ".part" file and rename it only after the whole
    # body has been written, so an interrupted download is never mistaken for
    # a complete archive on the next run.
    partial_path = download_path.with_name(download_path.name + ".part")

    # The context manager releases the connection even if the download fails.
    # The timeout guards against a stalled connection hanging the cell forever.
    with requests.get(dataset_url, stream=True, timeout=60) as response:
        response.raise_for_status()

        # Read the size from the GET response itself: it has already followed
        # any redirects, whereas requests.head() does not follow them by default.
        total_size = int(response.headers.get("content-length", 0))

        with (
            partial_path.open("wb") as file,
            tqdm(
                desc="Downloading",
                total=total_size or None,  # unknown size -> open-ended bar
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            ) as pbar,
        ):
            # 1 MiB chunks keep per-chunk Python overhead low for a large file.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)
                pbar.update(len(chunk))

    # Atomic rename on the same filesystem: the final name appears only once
    # the data is complete.
    partial_path.replace(download_path)
    print(f"Downloaded to: {download_path.absolute()}")

In [None]:
# Unpack the downloaded archive into the raw-data folder.
with zipfile.ZipFile(download_path, mode="r") as archive:
    archive.extractall(path=kappaset_raw_dir)

print(f"Extracted to: {kappaset_raw_dir.absolute()}")

In [None]:
# NetCDF variables read from each file, in the order they are stacked into the
# output image (presumably Sentinel-2 band names — confirm against the
# dataset documentation).
bands = ["B04", "B03", "B8A"]
# Edge length, in pixels, of the square crop written to every output GeoTIFF.
clip_size = 509

In [None]:
# Collect every NetCDF file under the raw folder, skipping names that start
# with a dot. Sorted so that nc_files[0] and the processing order are the same
# on every run instead of depending on filesystem listing order.
nc_files = sorted(kappaset_raw_dir.rglob("[!.]*.nc"))
len(nc_files)

In [None]:
# Show the first matched path as a quick sanity check of the file search.
nc_files[0]

In [None]:
# Open the first file read-only for interactive inspection.
# NOTE(review): this handle is never closed in the cells shown here — call
# netcdf_file.close() once you are done exploring.
netcdf_file = netCDF4.Dataset(nc_files[0], "r")

In [None]:
def remap_label(label):
    """Translate Kappaset label codes into the target class codes.

    Mapping (source -> target): 0 -> 99, 1 -> 0, 2 -> 3, 3 -> 2, 4 -> 1,
    5 -> 99. Any value not listed is left at 0, the fill value of the
    output array.

    Args:
        label: Array of source label codes.

    Returns:
        A ``uint8`` array with the same shape as ``label`` holding the
        remapped codes.
    """
    source_to_target = {0: 99, 1: 0, 2: 3, 3: 2, 4: 1, 5: 99}

    remapped = np.zeros_like(label, dtype=np.uint8)
    # Masks are always computed on the untouched input, so the swapped
    # codes (2 <-> 3) cannot overwrite each other.
    for source_code, target_code in source_to_target.items():
        remapped[label == source_code] = target_code
    return remapped

In [None]:
# rasterio profile for the single-band uint8 label rasters.
# No CRS or transform is set; only the pixel grid is written.
label_profile = {
    "driver": "GTiff",
    "dtype": "uint8",
    "count": 1,
    "crs": None,
    "transform": None,
    "width": clip_size,
    "height": clip_size,
    "nodata": None,
    # The GTiff creation option is named "compress"; the previous key
    # "compression" is not recognised, so LZW was not being applied.
    "compress": "lzw",
}

In [None]:
# rasterio profile for the float32 image rasters, one band per entry in `bands`.
# No CRS or transform is set; only the pixel grid is written.
image_profile = dict(
    driver="GTiff",
    dtype="float32",
    count=len(bands),
    crs=None,
    transform=None,
    width=clip_size,
    height=clip_size,
    nodata=None,
)

In [None]:
def process_nc_file(nc_file):
    """Convert one Kappaset NetCDF file into a label GeoTIFF and an image GeoTIFF.

    The ``Label`` variable and every variable named in ``bands`` are read,
    cropped to the first ``clip_size`` rows and columns, and written into
    ``dst_dir``. The label is passed through ``remap_label`` first. If both
    output files already exist the file is skipped, so the conversion can be
    re-run after an interruption.

    Args:
        nc_file: ``pathlib.Path`` of the NetCDF file to convert.

    Returns:
        None. The results are the two files written to ``dst_dir``.
    """
    # Path.stem strips only the trailing ".nc" suffix, whereas
    # str.replace(".nc", ...) would also rewrite any ".nc" appearing earlier
    # in the file name.
    out_stem = f"Kappaset_{nc_file.stem}"

    label_out_path = dst_dir / f"{out_stem}_train_509_high_label.tif"
    image_out_path = dst_dir / f"{out_stem}_train_509_high_image_l1c.tif"

    if label_out_path.exists() and image_out_path.exists():
        return

    # The context manager closes the NetCDF handle even if a read fails;
    # otherwise each long-lived worker process would leak one handle per file.
    with netCDF4.Dataset(nc_file, "r") as dataset:
        label = dataset["Label"][:].data[:clip_size, :clip_size]
        image = np.array(
            [dataset[band][:].data[:clip_size, :clip_size] for band in bands]
        )

    new_label = remap_label(label)

    with rio.open(label_out_path, "w", **label_profile) as dst:
        dst.write(new_label, 1)

    with rio.open(image_out_path, "w", **image_profile) as dst:
        dst.write(image)


In [None]:
# Convert all files using a pool of worker processes, with a progress bar.
# To debug a failure, call process_nc_file(nc_file) in a plain serial loop
# over nc_files instead.
with Pool() as worker_pool:
    conversions = worker_pool.imap(process_nc_file, nc_files)
    for _ in tqdm(conversions, total=len(nc_files)):
        pass