In [1]:
import os
import sys
from datetime import datetime
from pathlib import Path, PosixPath
import json
from typing import Literal

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

import gc
from tqdm.notebook import tqdm

In [2]:
class CFG:
    """Notebook-wide settings, grouped into nested namespaces.

    Values are read as class attributes (e.g. ``CFG.model.target``).
    """

    # ---------------------------------------------------------------- data
    # Index on the last axis of a band array selected by
    # ``open_record_false_color``.
    N_TIMES_BEFORE: int = 4
    N_TIMES_AFTER: int = 3
    # Band numbers 8..16 (inclusive); ``open_record`` loads one
    # ``band_XX.npy`` file per number.
    band_interval = range(8, 17)
    img_size = [256, 256]
    output_channels: int = 1

    # ------------------------------------------------------------ training
    epochs = 10  # 15  # 20  # 10
    # A ``verbose`` option (0 = silent, 1 = progress bar, 2 = single line) is
    # currently disabled. A progress bar is not useful when logged to a file,
    # so 2 is the recommended value when not running interactively.
    thresholds: float = 0.30  # to compute metrics & submit prediction

    class model:
        """Model identity and the file names tied to it."""

        name: Literal["u_mobilenet_v3", "u_mobilenet_v2"] = "u_mobilenet_v3"
        full: str = "u_mobilenet_v3.h5"  # "full_unet_model.h5"
        best_weights: str = "best_weight_u_mobilenet_v3.h5"  # "best_weight_cnn_06-02.h5"
        # Aggregation from human_individual_masks.npy.
        target: str = "human_pixel_masks.npy"

    class paths:
        """Input locations on the Kaggle file system."""

        # Outputs from last run (weights, history & config).
        last_run = Path("/kaggle/input/identify-contrails")
        # Competition input data folder.
        base_dir: Path = Path("/kaggle/input/google-research-identify-contrails-reduce-global-warming")
        # Model & weights folder.
        model_dir: Path = Path("/kaggle/input/cnn-unet-identify-contrails")
        bands_bound: Path = Path("/kaggle/input/bands-bounds/bounds_min_max.csv")

    class train:
        """Settings for the training split."""

        name_dir: str = "train"
        batch_size: int = 128
        start_stop_range = slice(0, None)  # slice(None)

    class validation:
        """Settings for the validation split."""

        name_dir: str = "validation"
        batch_size = 32
        start_stop_range = slice(0, None)  # slice(None)

    class test:
        """Settings for the test split."""

        name_dir: str = "test"
        batch_size: int = 128

    class execute:
        """Boolean switches selecting which stages of the notebook run."""

        compute_bands_bound = False
        all_layers_train = False
        train = True
        evaluate = False
        fast_test = False

In [3]:
class npfast:
    """Minimal ``.npy`` reader that parses the header at fixed byte offsets.

    Adapted from
    https://www.kaggle.com/competitions/google-research-identify-contrails-reduce-global-warming/discussion/414549
    """

    @staticmethod
    def load(file):
        """Read one ``.npy`` file into an array.

        The parser relies on a fixed layout rather than on the header's own
        length field:

        * the first 128 bytes are treated as the complete header;
        * the dtype descriptor is taken from bytes 19-24 (a three-character
          descriptor such as ``<f4`` wrapped in quotes);
        * the shape tuple is taken from bytes 60-119;
        * the ``fortran_order`` entry of the header is never inspected.

        Files that do not follow this layout are not supported.

        Args:
            file: Path (``str`` or path-like) of the ``.npy`` file.

        Returns:
            ``np.ndarray`` with the parsed shape and dtype, built directly on
            the bytes read from the file (a bytes-backed array is read-only).

        Raises:
            OSError: if the file cannot be opened.
            ValueError / TypeError: if the header cannot be parsed or the file
                holds fewer bytes than the header announces.
        """
        # The context manager closes the handle even when parsing fails;
        # without it every call would leak one open file descriptor.
        with open(file, "rb") as handle:
            header = handle.read(128)
            descr = str(header[19:25], "utf-8").replace("'", "").replace(" ", "")
            shape_tokens = (
                str(header[60:120], "utf-8")
                .replace(", }", "")
                .replace("(", "")
                .replace(")", "")
                .split(",")
            )
            # A 1-D shape is written "(n,)" and a 0-D shape "()": both leave a
            # blank token after splitting, which must be skipped, not parsed.
            shape = tuple(int(token) for token in shape_tokens if token.strip())
            datasize = np.lib.format.descr_to_dtype(descr).itemsize
            for dimension in shape:
                datasize *= dimension
            payload = handle.read(datasize)
        return np.ndarray(shape, dtype=descr, buffer=payload)


def open_record(record_path: Path):
    """Load every configured band and the target mask of a single record.

    Args:
        record_path: Folder holding the record's ``band_XX.npy`` files and the
            mask file named by ``CFG.model.target``.

    Returns:
        Tuple ``(bands, pixel_mask)``: ``bands`` stacks the arrays of all band
        numbers in ``CFG.band_interval`` along a new leading axis, in that
        order; ``pixel_mask`` is the mask array as loaded from disk.
    """
    band_arrays = []
    for band_num in CFG.band_interval:
        band_file = record_path / f"band_{band_num:02d}.npy"
        band_arrays.append(npfast.load(band_file))

    target_mask = npfast.load(record_path / CFG.model.target)
    return np.array(band_arrays), target_mask


def false_color(band11, band14, band15):
    """
    Build the false-color RGB composite that labelers saw.

    Each channel is rescaled linearly with fixed ``(low, high)`` limits and
    the stacked result is clipped to ``[0, 1]``:

    * red   = ``band15 - band14`` with limits ``(-4, 2)``
    * green = ``band14 - band11`` with limits ``(-4, 5)``
    * blue  = ``band14`` with limits ``(243, 303)``

    The three inputs must share a shape with at least two dimensions, because
    the channels are stacked on axis 2.
    """
    tdiff_low, tdiff_high = -4, 2
    cloud_top_low, cloud_top_high = -4, 5
    t11_low, t11_high = 243, 303

    red = ((band15 - band14) - tdiff_low) / (tdiff_high - tdiff_low)
    green = ((band14 - band11) - cloud_top_low) / (cloud_top_high - cloud_top_low)
    blue = (band14 - t11_low) / (t11_high - t11_low)

    rgb = np.stack([red, green, blue], axis=2)
    return np.clip(rgb, 0, 1)


def open_record_false_color(
    record_path: Path, pixel_mask=True, N_TIMES_BEFORE=CFG.N_TIMES_BEFORE, fast=True
):
    """Load one record as a false-color image plus, optionally, its mask.

    Args:
        record_path: Folder holding ``band_11.npy``, ``band_14.npy``,
            ``band_15.npy`` and the mask file named by ``CFG.model.target``.
        pixel_mask: When it compares equal to ``True`` the mask is loaded from
            disk; any other value is returned unchanged in the mask's place.
        N_TIMES_BEFORE: Index taken on the last axis of every band array.
        fast: Read with ``npfast.load`` when true, otherwise with ``np.load``.

    Returns:
        Tuple ``(image, pixel_mask)`` where ``image`` is the output of
        ``false_color`` for the selected frame.
    """
    reader = npfast if fast else np

    def _selected_frame(file_name):
        # keep only the requested index of the last axis
        return reader.load(record_path / file_name)[..., N_TIMES_BEFORE]

    band11, band14, band15 = (
        _selected_frame(file_name)
        for file_name in ("band_11.npy", "band_14.npy", "band_15.npy")
    )

    if pixel_mask == True:
        pixel_mask = reader.load(record_path / CFG.model.target)

    rgb_image = false_color(band11, band14, band15)
    return rgb_image, pixel_mask

In [4]:
# Folder of training records. Built from CFG instead of repeating the literal
# path, so it cannot drift from the configured competition input folder; the
# result is the same path as before.
data_dir = CFG.paths.base_dir / CFG.train.name_dir

In [5]:
# Export the false-color image and the mask of every record whose mask has a
# positive sum, into a folder named after the record in the working directory.
for i, record_path in enumerate(data_dir.iterdir()):
    if not record_path.is_dir():
        # a stray file in data_dir cannot be opened as a record folder
        continue
    image, label = open_record_false_color(record_path)
    if label.sum() > 0:
        # save only records with contrails; the folder is created here, not
        # before the check, so skipped records leave no empty directory behind
        out_dir = Path(record_path.name)
        out_dir.mkdir(exist_ok=True)
        np.save(out_dir / "image.npy", image)
        np.save(out_dir / "label.npy", label)
    del image, label
    if i % 100 == 0:
        # periodic collection to keep memory flat over the long loop
        gc.collect()