In [None]:
import os
import pathlib

import pandas as pd

In [None]:
# Settings of an alternative experiment, kept for reference:
# exp_dir = "../datasets/oor/inference/experiment_2907/"
# total_images = 23322

# Folder with one sub-folder per experiment.
exp_dir = "../datasets/oor/inference/experiment_1208/"

# Frames (counters 7425 up to and including 7614 of one recording) that are
# left out of all statistics below.
exclude_set = {
    f"0-D12M08Y2024-H10M10S02-{frame_id}" for frame_id in range(7425, 7615)
}
# exclude_set = set()

# Image count used as denominator of the per-image rate, corrected for the
# excluded frames.
total_images = 14986 - len(exclude_set)

# The experiment names are the immediate sub-folders of exp_dir, sorted by name.
exp_names = sorted(next(os.walk(exp_dir))[1])

In [None]:
# Frames that are a true positive in at least one experiment: the union of the
# image stems found in every experiment's "actual_containers" folder, minus
# the excluded frames.
# The union starts from an empty set so that it is also defined when exp_names
# is empty; set.union(*[]) would raise a TypeError in that case.
all_tps = set().union(
    *(
        {f.stem for f in pathlib.Path(exp_dir, name, "actual_containers").glob("*.jpg")}
        for name in exp_names
    )
) - exclude_set

print(f"Total number of TPs: {len(all_tps)}")

In [None]:
def _safe_ratio(numerator: int, denominator: int) -> float:
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# One record per experiment with its detected frames, its true positives and
# the metrics derived from them.
experiments = []

for name in exp_names:
    # Stems of the images in the experiment's output folders, without the
    # excluded frames.
    detections = {
        f.stem
        for f in pathlib.Path(exp_dir, name, "detected_images").glob("*.jpg")
        if f.stem not in exclude_set
    }
    tps = {
        f.stem
        for f in pathlib.Path(exp_dir, name, "actual_containers").glob("*.jpg")
        if f.stem not in exclude_set
    }
    # tps = set.intersection(detections, all_tps)
    experiments.append(
        {
            "name": name,
            "detections": detections,
            "n_detections": len(detections),
            "true_positives": tps,
            "n_true_positives": len(tps),
            # NaN instead of a ZeroDivisionError when an experiment has no
            # detections at all.
            "precision": _safe_ratio(len(tps), len(detections)),
            # "Partial" recall: relative to the TPs found by any experiment
            # (all_tps), not to a complete ground truth.
            "partial_recall": _safe_ratio(len(tps), len(all_tps)),
            # NOTE(review): this is the number of detections that are not true
            # positives (i.e. false positives) per image, although the key is
            # called "fnr" - confirm the intended metric name.
            "fnr": len(detections - tps) / total_images,
        }
    )

In [None]:
# Overview table with the scalar metrics per experiment; the set-valued
# "detections" and "true_positives" entries are left out.
exp_df = pd.DataFrame(data=experiments).loc[
    :,
    ['name', 'n_detections', 'n_true_positives', 'precision', 'partial_recall', 'fnr'],
]
exp_df

In [None]:
# Load frame metadata

import pathlib
import geopandas as gpd

# Folder that is searched for the frame metadata CSV files (*.csv).
# metadata_folder = "../datasets/oor/experiment_240812/"
metadata_folder = "../datasets/oor/metadata_240826/3/"

RD_CRS = "EPSG:28992"  # CRS code for the Dutch Rijksdriehoek coordinate system
LAT_LON_CRS = "EPSG:4326"  # CRS code for WGS84 latitude/longitude coordinate system

def load_metadata_csv(metadata_file: str) -> pd.DataFrame:
    """
    Read one frame metadata CSV file and index it by frame name.

    The frame name of a row is "0-<rest>-<counter>", where <rest> is the part
    of the file name (without extension) after its first "-" and <counter> is
    the row's value in the "pylon://0_frame_counter" column.

    Parameters
    ----------
    metadata_file:
        Path of the CSV file. Its stem must contain at least one "-".

    Returns
    -------
    The CSV content as a DataFrame with the frame names as index (index name
    "frame_name").

    Raises
    ------
    IndexError
        If the file stem contains no "-".
    """
    table = pd.read_csv(metadata_file)
    stem_remainder = pathlib.Path(metadata_file).stem.split(sep='-', maxsplit=1)[1]
    frame_names = [
        f"0-{stem_remainder}-{counter}"
        for counter in table["pylon://0_frame_counter"]
    ]
    return table.assign(frame_name=frame_names).set_index("frame_name")

# Read every CSV file in metadata_folder and stack the results into one table
# indexed by frame name.
metadata_files = pathlib.Path(metadata_folder).glob("*.csv")
metadata_df = pd.concat(
    [load_metadata_csv(metadata_file) for metadata_file in metadata_files]
)

# Turn the GPS longitude/latitude columns into point geometries (WGS84), order
# the frames by frame counter and reproject them to Rijksdriehoek coordinates.
# NOTE(review): the sort key is the frame counter only; if counters restart in
# each CSV file, frames of different files get interleaved - confirm that this
# is intended.
metadata_gdf = gpd.GeoDataFrame(
    metadata_df,
    geometry=gpd.points_from_xy(
        x=metadata_df.gps_lon,
        y=metadata_df.gps_lat,
        crs=LAT_LON_CRS,
    ),
).sort_values(by="pylon://0_frame_counter").to_crs(RD_CRS)

# Drop the intermediate objects; they are not needed once metadata_gdf exists.
del metadata_df, metadata_files

In [None]:
import shapely.geometry as sg

# Keep the geometry of the frames that lie within 250000 coordinate units of
# the point (121000, 488000); metadata_gdf is in Rijksdriehoek coordinates here.
# NOTE(review): presumably this filters out frames without a valid GPS fix -
# confirm.
valid_points = metadata_gdf[metadata_gdf.distance(sg.Point(121000, 488000)) < 250000].geometry
# Length of the line through the remaining points, in the row order of
# metadata_gdf, divided by 1000 and reported as km.
print(f"Total distance: {sg.LineString(valid_points).length / 1000:.3f} km")

In [None]:
# Metadata (frame counter and location) of the frames that are a true positive
# in at least one experiment.
# The membership test is vectorized with Index.isin, and the explicit copy
# makes tp_gdf an independent frame, so that columns can be added to it later
# without triggering pandas' SettingWithCopyWarning.
tp_gdf = (
    metadata_gdf.loc[
        metadata_gdf.index.isin(all_tps),
        ["pylon://0_frame_counter", "geometry"],
    ]
    .rename(columns={"pylon://0_frame_counter": "frame_counter"})
    .copy()
)

In [None]:
# Add one boolean column per experiment, named after the experiment, that
# tells whether the frame is a true positive of that experiment.
for e in experiments:
    tp_gdf[e["name"]] = tp_gdf.index.isin(e["true_positives"])

In [None]:

def _color_red_or_green(val):
    """Return the CSS text colour of a cell: green for a truthy value, red otherwise."""
    return f"color: {'green' if val else 'red'}"


# Model versions to compare, in the column order of the exported sheet.
comparison_versions = ['v1.0', 'v1.1', 'v2.0.0', 'v2.0.1', 'v2.1a', 'v2.1b', 'v2.1c0.01']

# The per-experiment columns of tp_gdf carry the full experiment name
# ("<version>_<run name>"), so selecting them by version tag directly raises a
# KeyError. Reduce every column name to the part before its first "_" before
# selecting; names without "_" are unaffected.
comparison_df = tp_gdf.rename(
    columns=lambda column: column.split(sep="_", maxsplit=1)[0]
)[comparison_versions]

comparison_styler = comparison_df.style
# Styler.applymap is deprecated since pandas 2.1 in favour of Styler.map; fall
# back to applymap on pandas versions that do not have Styler.map yet.
_style_elementwise = getattr(comparison_styler, "map", None) or comparison_styler.applymap
_style_elementwise(_color_red_or_green).to_excel('1208_comparison.xlsx', engine = 'openpyxl')

In [None]:
# Count how often each distinct point geometry occurs among the frames.
metadata_gdf.geometry.value_counts()

In [None]:
# Number of frames whose name starts with the given prefix.
sum(metadata_gdf.index.str.startswith("8-D26M08Y2024-H10M55S09"))

## Confidence threshold

In [None]:
import os
from typing import Union

import numpy as np

def get_conf_for_frame(frame: str, labels_dir: Union[str, os.PathLike], target_class: int = 2) -> float:
    """
    Return the highest confidence among the `target_class` detections of a frame.

    The label file `<labels_dir>/<frame>.txt` is read line by line. Every
    non-blank line must hold at least six whitespace-separated fields: the
    first is the object class and the sixth the confidence; the four fields in
    between are ignored. Blank lines are skipped, and fields may be separated
    by any amount of whitespace.

    Parameters
    ----------
    frame:
        Name of the frame, i.e. the label file name without ".txt".
    labels_dir:
        Folder that contains the label files.
    target_class:
        Object class whose confidences are considered.

    Returns
    -------
    The maximum confidence over all lines of class `target_class`, or 0.0 when
    the file has no such line.

    Raises
    ------
    FileNotFoundError
        If the frame has no label file in `labels_dir`.
    ValueError
        If a non-blank line has fewer than six fields, or its class or
        confidence field is not numeric.
    """
    label_file = os.path.join(labels_dir, f"{frame}.txt")
    max_conf = 0.
    with open(label_file, "r") as f:
        for line in f:
            # split() without separator tolerates repeated spaces, tabs and the
            # trailing newline, unlike split(" ").
            fields = line.split()
            if not fields:
                # Blank line (e.g. at the end of the file): nothing to parse.
                continue
            obj_class, _, _, _, _, conf = fields[0:6]
            obj_class = int(obj_class)
            conf = float(conf)
            if (obj_class == target_class) and (conf > max_conf):
                max_conf = conf
    return max_conf

def fb_score(precision, recall, beta=1.):
    """
    Compute the F-beta score of a precision / recall pair.

    The score is (1 + beta^2) * precision * recall / (beta^2 * precision + recall).
    Only arithmetic operators are used, so the inputs may be plain numbers or,
    as elsewhere in this notebook, pandas Series (evaluated element-wise).

    There is no guard for a zero denominator: with plain numbers a precision
    and recall of 0 raise a ZeroDivisionError.
    """
    beta_squared = beta**2
    weighted_product = (1 + beta_squared) * (precision * recall)
    weighted_sum = beta_squared * precision + recall
    return weighted_product / weighted_sum

In [None]:
# Experiment whose detections are re-thresholded in the cells below.
name = "v2.1c0.01_solar_spaceship_10_conf_0.01"

# True positives of this experiment, without the excluded frames.
tps = {
    image.stem
    for image in pathlib.Path(exp_dir, name, "actual_containers").glob("*.jpg")
    if image.stem not in exclude_set
}
# Confidence per detected frame, as returned by get_conf_for_frame() for the
# frame's label file in "detected_labels".
detections = {
    image.stem: get_conf_for_frame(image.stem, os.path.join(exp_dir, name, "detected_labels"))
    for image in pathlib.Path(exp_dir, name, "detected_images").glob("*.jpg")
    if image.stem not in exclude_set
}

In [None]:
# Confidence thresholds 0.0, 0.1, ..., 0.9. Rounding removes the float drift
# of np.arange (e.g. 0.30000000000000004), which would otherwise exclude a
# detection whose confidence is exactly 0.3 from the 0.3 threshold.
confs = np.round(np.arange(0., 1., 0.1), 1)

# Column-wise storage of the metrics per threshold.
data = {
    "conf": [],
    "n_det": [],
    "n_tp": [],
    "precision": [],
    "partial_recall": [],
    "fnr": [],
}

for conf in confs:
    # Frames that are still detected when only confidences >= conf count.
    conf_detections = {frame for frame, confidence in detections.items() if confidence >= conf}
    conf_tps = conf_detections & tps
    data["conf"].append(conf)
    data["n_det"].append(len(conf_detections))
    data["n_tp"].append(len(conf_tps))
    # Without any detection at this threshold the precision is undefined:
    # store NaN instead of failing with a ZeroDivisionError.
    data["precision"].append(
        len(conf_tps) / len(conf_detections) if conf_detections else float("nan")
    )
    data["partial_recall"].append(len(conf_tps) / len(all_tps))
    # NOTE(review): this is the number of detections that are not true
    # positives (i.e. false positives) per image, although the key is called
    # "fnr" - confirm the intended metric name.
    data["fnr"].append(len(conf_detections - conf_tps) / total_images)


In [None]:
# Metrics per confidence threshold (the index), extended with the F-beta
# scores for beta = 1, 0.5 and 2 computed from precision and partial recall.
conf_df = pd.DataFrame(data=data).set_index("conf")
conf_df = conf_df.assign(
    **{
        "f1": fb_score(conf_df["precision"], conf_df["partial_recall"]),
        "f0.5": fb_score(conf_df["precision"], conf_df["partial_recall"], beta=0.5),
        "f2": fb_score(conf_df["precision"], conf_df["partial_recall"], beta=2),
    }
)

In [None]:
# Plot precision and partial recall against the confidence threshold (the
# index of conf_df).
ax = conf_df[["precision", "partial_recall"]].plot()

In [None]:
# Save the figure that `ax` belongs to as a PNG file (path relative to the
# working directory).
ax.get_figure().savefig("exp1208_v2.1_pr_curve.png")

In [None]:
# Plot the three F-beta scores against the confidence threshold (the index of
# conf_df).
ax = conf_df[["f1", "f0.5", "f2"]].plot()

In [None]:
# Save the figure that `ax` belongs to as a PNG file (path relative to the
# working directory).
ax.get_figure().savefig("exp1208_v2.1_f1_curve.png")

## Gather dataset

In [None]:
import os
import pathlib
import shutil

# Files are gathered from the experiment folders below exp_dir ...
input_base_dir = pathlib.Path(exp_dir)
# ... into this dataset folder.
output_dir = pathlib.Path("../datasets/oor/240812_all_tps")

In [None]:
# all_tps
#   minus: tps of both v2.1 models
# fps of v2.1 standard model

# v21_tps = set.intersection(*(e["true_positives"] for e in experiments if e["name"].split(sep="_", maxsplit=1)[0] in ('v2.1a', 'v2.1b')))
# reduced_all_tps = all_tps - v21_tps
# The reduction above is switched off: all true positives are used.
reduced_all_tps = all_tps

# Union of the detections of the experiments with version tag v2.1a or v2.1b.
# The union starts from an empty set so that it is also defined when no
# experiment matches; set.union(*[]) would raise a TypeError in that case.
v21_detections = set().union(
    *(
        e["detections"]
        for e in experiments
        if e["name"].split(sep="_", maxsplit=1)[0] in ('v2.1a', 'v2.1b')
    )
)
# Detections of those experiments that are not a true positive of any experiment.
v21_fps = v21_detections - all_tps

In [None]:
# Number of rows in tp_gdf, i.e. true-positive frames that have a metadata row.
len(tp_gdf)

In [None]:
# Experiments in order of preference: a frame is taken from the first
# experiment in this list that has it as a true positive.
models_sorted = [
    "v2.1b_vital-armadillo-11",
    "v2.1a_solar_spaceship_10",
    "v2.0.1_expert_jazz_9",
    "v2.0.0_rosy_grass_5",
    "v1.1_600_best",
    "v1.0_norect_500_100",
    "v2.1c0.01_solar_spaceship_10_conf_0.01",
]

# True positives per experiment name. Looking frames up here instead of in
# tp_gdf gives the same answer for frames that have a metadata row, and also
# works for true positives without one (tp_gdf.loc would raise a KeyError for
# those).
tps_per_model = {e["name"]: e["true_positives"] for e in experiments}

# Maps every selected frame to the experiment folder its files are copied from.
all_tps_map = {}

for frame in reduced_all_tps:
    for model in models_sorted:
        if frame in tps_per_model[model]:
            all_tps_map[frame] = input_base_dir / model
            break

In [None]:
# Number of frames that were assigned a source experiment folder.
len(all_tps_map)

In [None]:
# Detections of the v2.1a and v2.1b experiments, keyed by experiment name.
# The experiments are selected by their version tag instead of by position in
# the list, so that adding or removing an experiment folder cannot silently
# select different runs.
v21_detections = {
    e["name"]: e["detections"]
    for e in experiments
    if e["name"].split(sep="_", maxsplit=1)[0] in ('v2.1a', 'v2.1b')
}

In [None]:
# Experiments in order of preference for the false positives.
models_sorted = [
    "v2.1a_solar_spaceship_10",
    "v2.1b_vital-armadillo-11",
]

# Maps every false-positive frame to the folder of the first experiment in
# models_sorted that detected it.
v21_fps_map = {}

for frame in v21_fps:
    for model in models_sorted:
        if frame not in v21_detections[model]:
            continue
        v21_fps_map[frame] = input_base_dir / model
        break

In [None]:
# Frames to gather, mapped to their source experiment folder. Only the true
# positives are used; the commented-out line would add the v2.1 false
# positives as well.
# new_data_map = {**all_tps_map, **v21_fps_map}
new_data_map = all_tps_map

In [None]:
# Create the two destination folders if they do not exist yet.
(output_dir / "detections").mkdir(parents=True, exist_ok=True)
(output_dir / "labels").mkdir(parents=True, exist_ok=True)

# Copy the image and the label file of every selected frame from its source
# experiment folder (copy2 also copies the file metadata).
for frame, src_path in new_data_map.items():
    shutil.copy2(
        src_path / "detected_images" / f"{frame}.jpg",
        output_dir / "detections",
    )
    shutil.copy2(
        src_path / "detected_labels" / f"{frame}.txt",
        output_dir / "labels",
    )