# Comparison Analysis Notebook

This notebook loads `comparison_report_distance.csv` and `comparison_report_classes.csv`, computes metrics, and visualizes distributions. 
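You can apply the filters `AVAILABLE_IMAGES_ONLY` and `MAX_DISTANCE` to restrict the analysis, respectively, to:
- only images we can manually verify (rows whose `image_available` flag is set),
- only matches whose distance (`distance_m`, in metres) is at or below the cutoff; setting `MAX_DISTANCE = None` disables the cutoff.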

In [180]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# === Configurable Filters ===
# Both names are read as module-level globals by filtered_analysis(),
# compute_metrics() and plot_recall_vs_distance(); re-run this cell after
# editing them so the functions pick up the new values.
#
# When True, only rows whose `image_available` column is truthy are analysed.
AVAILABLE_IMAGES_ONLY = False
# Set to a numeric distance cutoff (e.g. 20), or to None for no cutoff.
# Rows are kept when `distance_m <= MAX_DISTANCE` (the cutoff is inclusive).
# NOTE: plot_recall_vs_distance() temporarily overwrites this global while it
# sweeps over cutoffs and writes the original value back afterwards.
MAX_DISTANCE = None

In [182]:
# Load both comparison reports and coerce the availability flag to a real
# boolean column, so it can be used directly as a row mask later on.
dist_df = pd.read_csv("comparison_report_distance.csv").astype({"image_available": bool})
class_df = pd.read_csv("comparison_report_classes.csv").astype({"image_available": bool})

# Quick sanity check on how many rows each report contributes.
print(f"Distance report: {len(dist_df)} rows")
print(f"Class report:    {len(class_df)} rows")


Distance report: 56 rows
Class report:    56 rows


In [None]:
# CDF helper
def plot_cdf(data, label):
    """
    Draw the empirical CDF of `data` as a line via `plt.plot`.

    The values are sorted ascending and the i-th smallest value (1-based)
    is plotted at height i / n, where n is the number of values.
    `label` is forwarded to matplotlib as the legend label of the line.
    """
    values = np.sort(data)
    n = len(values)
    # Cumulative fraction of samples that are <= each sorted value
    cumulative_fraction = np.arange(1, n + 1) / n
    plt.plot(values, cumulative_fraction, label=label)


def filtered_analysis(df_dist, df_class):
    """
    Apply the AVAILABLE_IMAGES_ONLY and MAX_DISTANCE filters to both reports,
    then print and plot the following:
    1) Percentage of distance-based matches that also match in classes
    2) CDF of distances in both distance and class reports
    3) Boxplots of distances by automatic class

    Parameters
    ----------
    df_dist : DataFrame
        Distance report. Columns used: `image_available`, `distance_m`,
        `manual_image`, `matched_auto_image`.
    df_class : DataFrame
        Class report. Columns used: the same four as above plus
        `auto_classes` (class ids separated by ";").

    Returns
    -------
    None. Output goes to stdout and to matplotlib figures.

    Neither input frame is modified. The filters are read from the
    module-level globals AVAILABLE_IMAGES_ONLY and MAX_DISTANCE.
    """

    ddf = df_dist[df_dist["image_available"]] if AVAILABLE_IMAGES_ONLY else df_dist.copy()
    cdf = df_class[df_class["image_available"]] if AVAILABLE_IMAGES_ONLY else df_class.copy()

    if MAX_DISTANCE is not None:
        # Inclusive cutoff; rows with a missing distance compare False and are dropped
        ddf = ddf[ddf["distance_m"] <= MAX_DISTANCE]
        cdf = cdf[cdf["distance_m"] <= MAX_DISTANCE]

    print("-"*60)
    if AVAILABLE_IMAGES_ONLY:
        print("Restricting to images available")
    if MAX_DISTANCE is not None:
        print(f"Applying distance cutoff: ≤ {MAX_DISTANCE} m")
    print(f"Working on {len(ddf)} distance‐rows (from {len(df_dist)}) and {len(cdf)} (from {len(df_class)}) class‐rows\n")

    # 1) % of distance-based matches that also match in classes
    pair_keys = ["manual_image", "matched_auto_image"]
    # De-duplicate the right-hand side: a left merge emits one output row per
    # matching right row, so repeated pairs in the class report would count a
    # single distance match several times and could push `both` above `total`.
    merged = ddf[pair_keys].merge(
        cdf[pair_keys].drop_duplicates(),
        on=pair_keys,
        how="left",
        indicator=True,
    )
    total = len(ddf)
    both = (merged["_merge"] == "both").sum()
    pct = both / total * 100 if total else 0
    print(f"{both}/{total} = {pct:.1f}% distance‐based matches also share a class")
    print("-"*60)

    # 2) CDF of distance_m
    plt.figure(figsize=(8, 4))
    plot_cdf(ddf["distance_m"], "Distance Report")
    plot_cdf(cdf["distance_m"], "Class Report")
    plt.xlabel("Distance (m)")
    plt.ylabel("CDF")
    plt.title("Cumulative Distribution of Distances")
    plt.legend()
    plt.grid(True)
    plt.show()
    print("-"*60)

    # 3) Boxplots by automatic class
    # Rows without any automatic class cannot be assigned to a box.
    class_rows = cdf.dropna(subset=["auto_classes"])
    # astype(str) keeps `.str` usable when pandas parsed the column as numbers
    # (which happens when every cell holds a single class id).
    exp = class_rows.assign(
        auto_class=class_rows["auto_classes"].astype(str).str.split(";")
    ).explode("auto_class")
    exp["auto_class"] = exp["auto_class"].str.strip()
    # Drop empty tokens produced by stray/trailing separators such as "3;"
    exp = exp[exp["auto_class"] != ""].copy()
    # Going through float also accepts tokens rendered as "3.0"
    exp["auto_class"] = exp["auto_class"].astype(float).astype(int)

    if exp.empty:
        print("No class rows left after filtering; skipping boxplot")
    else:
        # Hand our own axes to pandas: with `by=` and no `ax`, DataFrame.boxplot
        # opens a new figure, which left an empty extra figure behind and
        # ignored the requested figsize.
        fig, ax = plt.subplots(figsize=(4, 2))
        exp.boxplot(column="distance_m", by="auto_class", ax=ax)
        ax.set_xlabel("Object Class")
        ax.set_ylabel("Distance (m)")
        ax.set_title("Distance by Object Class")
        fig.suptitle("")  # remove the default "Boxplot grouped by ..." title
        plt.show()
    print("-"*60)
    
def compute_metrics(df):
    """
    Applies AVAILABLE_IMAGES_ONLY and MAX_DISTANCE filters,
    computes TP/FP/FN and returns metrics as a dict.

    True Positives (TP) are objects matched by class,
    False Positives (FP) are objects matched by distance but not by class,
    False Negatives (FN) are objects not matched by class.
    If using class_df, FP is always 0 and precision is always 1.0 because we match by classes by design.
    If there is no distance cutoff, FP == FN.
    If there is a distance cutoff, additional FNs may be added from the beyond-cutoff set, so FN >= FP.

    Parameters
    ----------
    df : DataFrame
        Report with columns `manual_classes` and `auto_classes` (class ids
        separated by ";"), `distance_m` and `image_available`. The frame is
        not modified.

    Returns
    -------
    dict
        Keys: "TP", "FP", "FN", "N_manual" (rows left after the image filter,
        before the distance cutoff), "N_after_cut" (rows left after both
        filters), "precision", "recall", "f1". Each ratio is 0.0 when its
        denominator is zero.

    A row counts as a class match when its manual and automatic class sets
    share at least one id. A missing/empty class cell is treated as an empty
    set, so such a row never matches.
    """
    if AVAILABLE_IMAGES_ONLY:
        df = df[df["image_available"]]

    # Recall denominator: counted BEFORE the distance cutoff, so rows beyond
    # the cutoff still count as missed objects.
    N_manual = len(df)

    if MAX_DISTANCE is not None:
        df = df[df["distance_m"] <= MAX_DISTANCE]
    N_after_cut = len(df)

    def class_ids(cell):
        # Missing cells (NaN/None from the CSV) carry no classes. Casting them
        # to str would yield "nan"/"None", which int() cannot parse.
        if pd.isna(cell):
            return set()
        # int(float(...)) also accepts "3.0", which is how an integer column
        # is rendered once pandas upcasts it to float because of missing cells.
        return {int(float(tok)) for tok in str(cell).split(";") if tok.strip()}

    # Kept as a plain list rather than a new column, so a filtered slice of
    # the caller's frame is never written to.
    class_match = [
        bool(class_ids(m) & class_ids(a))
        for m, a in zip(df["manual_classes"], df["auto_classes"])
    ]

    TP = sum(class_match)
    FP = N_after_cut - TP
    FN = N_manual - TP

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / N_manual if N_manual > 0 else 0.0
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0

    return {
        "TP":          int(TP),
        "FP":          int(FP),
        "FN":          int(FN),
        "N_manual":    N_manual,
        "N_after_cut": N_after_cut,
        "precision":   precision,
        "recall":      recall,
        "f1":          f1
    }
    
def plot_recall_vs_distance(df, distance_steps=None):
    """
    Plot Recall as a function of distance cutoff,
    respecting AVAILABLE_IMAGES_ONLY and MAX_DISTANCE.

    Parameters
    ----------
    df : DataFrame
        Report passed unchanged to compute_metrics() for every cutoff.
    distance_steps : iterable of numbers, optional
        Cutoffs to evaluate. When None, the sorted unique `distance_m` values
        of `df` are used, after applying AVAILABLE_IMAGES_ONLY and the preset
        MAX_DISTANCE. Explicitly supplied steps are used as given and are NOT
        clipped to MAX_DISTANCE.

    Returns
    -------
    None. A matplotlib figure is shown.

    Side effect: the module-level MAX_DISTANCE is temporarily overwritten for
    each step, because compute_metrics() reads the cutoff from that global.
    The original value is restored on exit, including when an error occurs.
    """

    if distance_steps is None:
        df_for_steps = df[df["image_available"]] if AVAILABLE_IMAGES_ONLY else df
        if MAX_DISTANCE is not None:
            df_for_steps = df_for_steps[df_for_steps["distance_m"] <= MAX_DISTANCE]
        # A missing distance is not a usable cutoff (NaN also breaks sorting)
        distance_steps = sorted(df_for_steps["distance_m"].dropna().unique())
    else:
        # Materialise once: the steps are iterated here and again when plotting
        distance_steps = list(distance_steps)

    # Sweep and collect recalls
    recalls = []
    original_cut = globals().get("MAX_DISTANCE", None)
    try:
        for d in distance_steps:
            globals()["MAX_DISTANCE"] = d
            recalls.append(compute_metrics(df)["recall"])
    finally:
        # Always put the user's cutoff back, even if compute_metrics() raised;
        # otherwise later cells would silently run with the last swept value.
        globals()["MAX_DISTANCE"] = original_cut

    plt.figure(figsize=(8, 4))
    plt.plot(distance_steps, recalls, lw=1)
    plt.scatter(distance_steps, recalls, marker="o")
    plt.xlabel("Distance cutoff (m)")
    plt.ylabel("Recall")
    plt.title("Recall vs. Distance cutoff\n"
              f"{'Images only, ' if AVAILABLE_IMAGES_ONLY else ''}"
              f"Max preset cutoff: {original_cut if original_cut is not None else 'None'}")
    plt.grid(True)
    plt.show()



In [None]:
# Driver cell: runs the whole analysis with the filter values currently set
# in AVAILABLE_IMAGES_ONLY / MAX_DISTANCE.
print("Running analysis with current configuration:",
      "AVAILABLE_IMAGES_ONLY =", AVAILABLE_IMAGES_ONLY,
      "MAX_DISTANCE =", MAX_DISTANCE)
# Prints the distance/class overlap percentage, then shows the distance CDFs
# and the per-class boxplots.
filtered_analysis(dist_df, class_df)

print("Computing metrics...")
# Metrics are computed on the distance report; pass class_df instead to
# evaluate the class-based matching.
m = compute_metrics(dist_df)
print(f"TP={m['TP']}, FP={m['FP']}, FN={m['FN']}")
print(f"Precision={m['precision']:.3f}, Recall={m['recall']:.3f}, F1={m['f1']:.3f}")
print("-"*60)
# Recall may not reach 1.0 if using dist_df: recall is TP / N_manual, and a
# distance-based match only counts as TP when the classes also overlap.
# Per the compute_metrics docstring, class_df rows are matched by class by
# design, so with class_df recall reaches 1.0 when no distance cutoff is set.
plot_recall_vs_distance(dist_df)