In [2]:
import pandas as pd
import glob
import os

### !--- PARAMETERS
### Parameters of the buffer-width validity check (read by categorize_buffer_case).
MIN_WIDTH = 0.1   # Lower bound (inclusive) for valid width (in meters)
MAX_WIDTH = 3.0   # Upper bound (inclusive) for valid width (in meters)

# !--- INPUT DIRECTORY
# Directory searched (non-recursively) for "*.csv" files to load.
# NOTE(review): this looks like a placeholder path -- set it before running.
CSV_DIR = "../YOUR/PATH/outputs"

# !--- OUTPUT DIRECTORY
# Destination for valid_cases.csv and invalid_cases.csv; created if missing.
OUTPUT_DIR = "/home/slieu3/CS_street_buffer/complete_street/v3/manual_annotation"
os.makedirs(OUTPUT_DIR, exist_ok=True)

### ---------- LOAD ALL CSVs ----------
# glob returns matches in arbitrary, filesystem-dependent order. Sorting keeps
# the row order of the combined frame (and of the CSVs saved from it)
# reproducible between runs and machines.
csv_files = sorted(glob.glob(os.path.join(CSV_DIR, "*.csv")))

# pd.concat fails with an opaque "No objects to concatenate" on an empty list,
# so stop early with a message that names the directory that was searched.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {CSV_DIR!r}")

df_list = []
for file in csv_files:
    df = pd.read_csv(
        file,
        # Read widths as raw text so numeric values and the literal "None"
        # can be told apart later.
        dtype={"buffer_width": "object"},
        keep_default_na=False,   # preserve "None" as literal string
        na_values=[""]           # still treat empty cells as NaN
    )
    df_list.append(df)

# Stack every file into one frame with a fresh 0..n-1 index.
combined_df = pd.concat(df_list, ignore_index=True)
print(f"Total CSV files loaded: {len(csv_files)}")
print(f"Combined shape: {combined_df.shape}")

# ---------- CATEGORIZE Cases AS VALID / INVALID ----------
def categorize_buffer_case(row):
    """Classify one row's ``buffer_width`` as valid or invalid.

    Args:
        row: Mapping-like object exposing ``get`` (a DataFrame row or a
            plain dict). Only the ``"buffer_width"`` entry is read; a
            missing entry is treated like a missing value.

    Returns:
        A ``(validity, validity_detail)`` tuple of strings. ``validity`` is
        ``"valid"`` or ``"invalid"``; ``validity_detail`` names the reason:

        * numeric width inside ``[MIN_WIDTH, MAX_WIDTH]`` -> valid
        * numeric width outside that range -> invalid
        * the literal text ``"None"`` (no street buffer) -> valid
        * NaN / missing -> invalid
        * anything else (e.g. unparseable text) -> invalid, "other"
    """
    width = row.get("buffer_width", None)

    if isinstance(width, str):
        # float() already ignores surrounding whitespace on numeric text, so
        # trim here to give the "None" sentinel the same tolerance
        # (" None" / "None " would otherwise fall through to "other").
        width = width.strip()
        try:
            width = float(width)
        except ValueError:
            pass  # keep "None" or other non-numeric strings

    # Case 1: numeric width (NaN is excluded here and handled in Case 3)
    if isinstance(width, (int, float)) and not pd.isna(width):
        if MIN_WIDTH <= width <= MAX_WIDTH:
            return "valid", "Valid: width in range"
        return "invalid", "Invalid: width out of range"

    # Case 2: "None" (no street buffer)
    if isinstance(width, str) and width == "None":
        return "valid", "Valid: None (no street buffer)"

    # Case 3: NaN / missing (also covers text such as "nan" parsed above)
    if pd.isna(width):
        return "invalid", "Invalid: NaN (no width / detection failed)"

    # Fallback: non-numeric text other than "None", or an unexpected type
    return "invalid", "Invalid: other"


# Label every row: the (validity, validity_detail) tuple returned for each row
# is expanded into two columns and stored under those names.
combined_df[["validity", "validity_detail"]] = combined_df.apply(
    categorize_buffer_case, axis=1, result_type="expand"
)


# ---------- SUMMARY TABLE ----------
# Count rows per (validity, reason) pair and express each count as a share of
# all rows in the combined frame.
total_rows = len(combined_df)

reason_counts = combined_df.groupby(["validity", "validity_detail"]).size()
summary_detail = reason_counts.reset_index(name="count").sort_values(
    ["validity", "count"], ascending=[True, False]
)

summary_detail["percentage"] = summary_detail["count"].map(
    lambda n_rows: f"{100 * n_rows / total_rows:.2f}%"
)

# Print the table framed by 60-character rules.
divider = "=" * 60
print("\n" + divider)
print("DETAILED SUMMARY: Breakdown by Reason")
print(divider)
print(summary_detail.to_string(index=False))
print(divider)

# ---------- SAVE VALID / INVALID Cases ----------
# Split the combined frame on its "validity" label and write each subset to
# its own CSV inside OUTPUT_DIR (row index omitted).
valid_df = combined_df.loc[combined_df["validity"].eq("valid")]
invalid_df = combined_df.loc[combined_df["validity"].eq("invalid")]

valid_csv_path = os.path.join(OUTPUT_DIR, "valid_cases.csv")
invalid_csv_path = os.path.join(OUTPUT_DIR, "invalid_cases.csv")

for subset, destination in (
    (valid_df, valid_csv_path),
    (invalid_df, invalid_csv_path),
):
    subset.to_csv(destination, index=False)

print(f"✅ Saved valid street buffer cases to: {valid_csv_path}")
print(f"✅ Saved invalid street buffer cases to: {invalid_csv_path}")


Total CSV files loaded: 1360
Combined shape: (2720, 11)

DETAILED SUMMARY: Breakdown by Reason
validity                            validity_detail  count percentage
 invalid Invalid: NaN (no width / detection failed)    584     21.47%
 invalid                Invalid: width out of range    174      6.40%
   valid             Valid: None (no street buffer)   1630     59.93%
   valid                      Valid: width in range    332     12.21%
✅ Saved valid street buffer cases to: /home/slieu3/CS_street_buffer/complete_street/v3/manual_annotation/valid_cases.csv
✅ Saved invalid street buffer cases to: /home/slieu3/CS_street_buffer/complete_street/v3/manual_annotation/invalid_cases.csv
