In [1]:
import rasterio as rio
import pandas as pd
from pathlib import Path
import numpy as np
import sys

sys.path.append(Path.cwd().parents[1].as_posix())
import omnicloudmask

In [2]:
# Record the installed OmniCloudMask version; it is reused further down to
# build the name of the predictions directory.
ocm_version = omnicloudmask.__version__
print(f"OmniCloudMask version: {ocm_version}")

OmniCloudMask version: 1.3.1


In [3]:
# Root folder of the validation inputs and the CSV of validation points inside it
dataset_dir = Path("dataset")
val_points_path = dataset_dir.joinpath(
    "PixBox-L8-CMIX/pixbox_landsat8_cmix_20150527.csv"
)

In [4]:
# Locate the folder of OCM predictions that matches the installed version,
# then show whether it is present on disk
preds_dir = dataset_dir.joinpath(f"OCM preds v{ocm_version}")
print(f"Predictions directory: {preds_dir}")
preds_dir.exists()

Predictions directory: dataset/OCM preds v1.3.1


True

In [5]:
# Load the validation data (the PixBox points CSV) and preview the first rows.
# Later cells read PRODUCT_ID, PIXEL_X, PIXEL_Y, PIXEL_SURFACE_TYPE_ID and
# CLOUD_SHADOW_ID from this frame.
val_data = pd.read_csv(val_points_path)
val_data.head()

Unnamed: 0,ID,PRODUCT_ID,PIXEL_X,PIXEL_Y,LATITUDE,LONGITUDE,PIXEL_SURFACE_TYPE_ID,ATMOSPHERIC_PROPERTIES_ID,WATER_BODY_TYPE_ID,WATER_BODY_CHARACTERISTICS_ID,GLINT_ID,CLOUD_CHARACTERISTICS_ID,CLOUD_HEIGHT_ID,CLOUD_SHADOW_ID,SHALLOWNESS_ID,SURFACE_TYPE_ID,CLIMATE_ZONE_ID,SEASON_ID,DAY_TIME_ID
0,1191608739,1191608737,5871,2068,43.661255,5.609816,2,0,2,3,0,0,0,0,0,14,3,4,1
1,1191608740,1191608737,6015,1963,43.688351,5.664592,1,0,2,3,0,0,0,0,0,12,3,4,1
2,1191608741,1191608737,6075,1939,43.694302,5.687191,1,0,2,3,0,0,0,0,0,12,3,4,1
3,1191608742,1191608737,6096,1947,43.691959,5.694904,1,0,2,3,0,0,0,0,0,12,3,4,1
4,1191608743,1191608737,6049,1948,43.692101,5.677414,1,0,2,3,0,0,0,0,0,12,3,4,1


In [6]:
# Add mapping from product_id to scene_name
# The scene name is used later to find each scene's prediction raster.
product_id_to_scene_name = {
    1093075446: "LC81970222014109LGN00",
    1113395679: "LC81980232014276LGN00",
    1113398638: "LC82030242015058LGN00",
    1191608737: "LC81960302014022LGN00",
    1278115010: "LC81970182015080LGN00",
    1278117898: "LC81970222013186LGN00",
    1710711454: "LC81980222014260LGN00",
    1797484437: "LC81990242014075LGN00",
    1978132186: "LC81990242014107LGN00",
    2055810806: "LC82030242014103LGN00",
    2055813758: "LC82040212013251LGN00",
}
# A PRODUCT_ID missing from the dict gets a missing SCENE_NAME (Series.map
# leaves unmatched keys as NaN); such rows are also not visited by the
# per-scene loop, which iterates over this dict's keys.
val_data["SCENE_NAME"] = val_data["PRODUCT_ID"].map(product_id_to_scene_name)

In [7]:
# Reclassify the validation data into one boolean column per class.
# `isin` already returns booleans, so no extra cast is needed.
val_data["CLOUD"] = val_data["PIXEL_SURFACE_TYPE_ID"].isin([0, 1])
val_data["SHADOW"] = val_data["CLOUD_SHADOW_ID"].isin([1])
# Clear means neither cloud nor shadow.
val_data["CLEAR"] = ~(val_data["CLOUD"] | val_data["SHADOW"])
# Clear = 0 | Cloud = 1 | Shadow = 2
# CLOUD and SHADOW are derived from different columns and can both be True for
# the same point. Summing weighted flags would give such rows the undefined
# code 3, so pick a single code per row instead: np.select uses the first
# matching condition, i.e. cloud wins over shadow.
# NOTE(review): cloud-over-shadow precedence is an assumption - confirm it
# matches the intended label priority.
val_data["TARGET"] = np.select(
    [val_data["CLOUD"], val_data["SHADOW"]], [1, 2], default=0
)

# Per-class label counts (the flags may overlap, so these need not sum to the
# number of rows)
print(f"Clouds: {val_data['CLOUD'].sum()}")
print(f"Shadows: {val_data['SHADOW'].sum()}")
print(f"Clear: {val_data['CLEAR'].sum()}")

Clouds: 5478
Shadows: 1396
Clear: 12365


In [8]:
# Loop over scenes and add OCM predictions
scene_results = []
for PRODUCT_ID, scene_name in product_id_to_scene_name.items():
    val_df_filt = val_data[val_data["PRODUCT_ID"] == PRODUCT_ID].copy()
    # Sort the matches so the chosen raster is reproducible if several files
    # match the scene, and fail with a clear message if none does.
    matching_preds = sorted(preds_dir.glob(f"{scene_name}_OCM_*.tif"))
    if not matching_preds:
        raise FileNotFoundError(
            f"No OCM prediction found for scene {scene_name} in {preds_dir}"
        )
    ocm_pred_path = matching_preds[0]
    # The context manager closes the dataset handle once band 1 is read,
    # instead of leaving one open file per scene.
    with rio.open(ocm_pred_path) as src:
        pred_array = src.read(1)
    # Sample the prediction at every validation point: the first array axis is
    # indexed with PIXEL_Y and the second with PIXEL_X.
    val_df_filt["OCM_PRED"] = pred_array[
        val_df_filt["PIXEL_Y"].values, val_df_filt["PIXEL_X"].values
    ]
    scene_results.append(val_df_filt)

In [9]:
# Combine the results back into a single dataframe
# (one frame per scene, stacked row-wise; each keeps its original row index)
scene_results_df = pd.concat(scene_results)

In [10]:
# Persist the per-point results next to the dataset. This runs before the
# reclassified OCM_* columns are added, so the CSV holds the raw OCM_PRED codes.
scene_results_df.to_csv(dataset_dir / "scene_results.csv", index=False)

In [11]:
# Collapse the raw OCM codes into three boolean masks; thick and thin cloud
# are merged into a single cloud class
ocm_codes = scene_results_df["OCM_PRED"]
scene_results_df["OCM_CLOUD"] = ocm_codes.isin([1, 2])
scene_results_df["OCM_SHADOW"] = ocm_codes.isin([3])
scene_results_df["OCM_CLEAR"] = ocm_codes.isin([0])
# clear = 0 | cloud = 1 and 2 | shadow = 3
# Weighted sum of the masks gives the reclassified code: 0 clear, 1 cloud, 2 shadow
scene_results_df["OCM_PRED_RECLASS"] = (
    scene_results_df["OCM_CLOUD"] * 1
    + scene_results_df["OCM_SHADOW"] * 2
    + scene_results_df["OCM_CLEAR"] * 0
)

In [12]:
# Get the stats for the predictions
def get_stats(labels, preds):
    """Compute binary confusion-matrix counts and accuracy scores for one class.

    Args:
        labels: Array-like of ground-truth flags (bool or 0/1, e.g. a list,
            NumPy array or pandas Series). Truthy means the sample belongs to
            the class.
        preds: Array-like of predicted flags with the same shape as ``labels``.
            Values are compared position by position.

    Returns:
        dict with the counts ``TP``, ``TN``, ``FP``, ``FN`` and the scores
        ``UA`` = tp / (tp + fp), ``PA`` = tp / (tp + fn),
        ``OA`` = (tp + tn) / total and ``BOA`` = mean of ``PA`` and
        tn / (tn + fp). A score whose denominator is zero is NaN.

    Raises:
        ValueError: If ``labels`` and ``preds`` differ in shape.
    """
    # Coerce to boolean arrays so lists, 0/1 integers and boolean Series are
    # all handled the same way.
    labels = np.asarray(labels).astype(bool)
    preds = np.asarray(preds).astype(bool)
    if labels.shape != preds.shape:
        raise ValueError(
            f"labels and preds must have the same shape, "
            f"got {labels.shape} and {preds.shape}"
        )

    def _ratio(numerator, denominator):
        # An empty class or an empty input yields NaN instead of a
        # divide-by-zero RuntimeWarning.
        return numerator / denominator if denominator else float("nan")

    tp = int(np.sum(labels & preds))
    tn = int(np.sum(~labels & ~preds))
    fp = int(np.sum(~labels & preds))
    fn = int(np.sum(labels & ~preds))
    ua = _ratio(tp, tp + fp)
    pa = _ratio(tp, tp + fn)
    # True-negative rate, the second half of the balanced score
    tnr = _ratio(tn, tn + fp)
    return {
        "TP": tp,
        "TN": tn,
        "FP": fp,
        "FN": fn,
        "UA": ua,
        "PA": pa,
        "OA": _ratio(tp + tn, tp + tn + fp + fn),
        "BOA": 0.5 * (pa + tnr),
    }

In [13]:
# Score every class separately, pairing each label column with its OCM_ twin
class_stats = {}
for class_name in ("CLEAR", "CLOUD", "SHADOW"):
    labels, preds = (
        scene_results_df[class_name],
        scene_results_df[f"OCM_{class_name}"],
    )
    stats = get_stats(labels, preds)
    class_stats[class_name] = stats

In [14]:
# Build the summary table; transposing puts one row per class and one column
# per statistic.
model_summary = pd.DataFrame(class_stats).T

In [15]:
# Turn the numbers into display strings: scores as one-decimal percentages,
# counts as whole numbers
for col in ["UA", "PA", "OA", "BOA"]:
    model_summary[col] = model_summary[col].map("{:.1%}".format)
for col in ["TP", "TN", "FP", "FN"]:
    model_summary[col] = model_summary[col].map("{:.0f}".format)

In [16]:
# Show the formatted per-class summary table
model_summary

Unnamed: 0,TP,TN,FP,FN,UA,PA,OA,BOA
CLEAR,12314,5887,578,51,95.5%,99.6%,96.7%,95.3%
CLOUD,4975,13318,34,503,99.3%,90.8%,97.1%,95.3%
SHADOW,908,17413,21,488,97.7%,65.0%,97.3%,82.5%
