In [1]:
import xgboost as xgb

import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

CSV_PATH = r"C:\Hackathon\gunshot\modelB_predictors.csv"   # input with columns: t, outward_fraction, mean_outward_speed_mps


EVENT_START_S = 25202.5
EVENT_END_S   = 25227.5

TRAIN_FRAC = 0.70   # first 70% of time for train/tune
VAL_FRAC   = 0.15   # next 15% for validation (threshold tuning)
# last 15% becomes the test slice

PERSIST_TICKS = 2   # require >= 2 consecutive ticks above the cutoff

CUTOFF_GRID = np.linspace(0.50, 0.95, 10)  # candidate probability cutoffs to try on validation


In [2]:
df = pd.read_csv(CSV_PATH)

# Basic sanity
assert {"t","outward_fraction","mean_outward_speed_mps"}.issubset(df.columns), \
       "CSV must contain t, outward_fraction, mean_outward_speed_mps"

# Label: 1 inside the event window, else 0
df["label"] = ((df["t"] >= EVENT_START_S) & (df["t"] <= EVENT_END_S)).astype(int)

# Sort by time (very important for time-based split)
df = df.sort_values("t").reset_index(drop=True)

# Work out the time-based indices for train / val / test
unique_t = df["t"].unique()
nT = len(unique_t)
t_train_end = unique_t[int(nT * TRAIN_FRAC) - 1]
t_val_end   = unique_t[int(nT * (TRAIN_FRAC + VAL_FRAC)) - 1]

# Split masks
is_train = df["t"] <= t_train_end
is_val   = (df["t"] > t_train_end) & (df["t"] <= t_val_end)
is_test  = df["t"] > t_val_end

print("Time split:")
print(f"  Train: t <= {t_train_end}")
print(f"  Val:   {t_train_end} < t <= {t_val_end}")
print(f"  Test:  t > {t_val_end}")

# Build feature matrices (2 columns only) and labels
FEATS = ["outward_fraction", "mean_outward_speed_mps"]

X_train, y_train = df.loc[is_train, FEATS].values, df.loc[is_train, "label"].values
X_val,   y_val   = df.loc[is_val,   FEATS].values, df.loc[is_val,   "label"].values
X_test,  y_test  = df.loc[is_test,  FEATS].values, df.loc[is_test,  "label"].values

print("Shapes:", X_train.shape, X_val.shape, X_test.shape)


Time split:
  Train: t <= 30237.5
  Val:   30237.5 < t <= 36717.5
  Test:  t > 36717.5
Shapes: (12096, 2) (2592, 2) (2593, 2)


In [3]:
# Handle class imbalance a bit (helps if evac frames are rare)
pos = (y_train == 1).sum()
neg = (y_train == 0).sum()
scale_pos_weight = (neg / max(pos, 1)) if pos > 0 else 1.0

clf = xgb.XGBClassifier(
    n_estimators=60,         # small model
    max_depth=2,            # very shallow trees
    learning_rate=0.10,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    objective="binary:logistic",
    eval_metric="logloss",
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
    random_state=42
)

clf.fit(X_train, y_train)

# Quick AUC check (optional)
try:
    print("Train AUC:", roc_auc_score(y_train, clf.predict_proba(X_train)[:,1]).round(3))
    print("Val   AUC:", roc_auc_score(y_val,   clf.predict_proba(X_val)[:,1]).round(3))
except Exception as e:
    print("AUC calc skipped:", e)


Train AUC: 0.999
AUC calc skipped: Only one class present in y_true. ROC AUC score is not defined in that case.


In [4]:
proba_val = clf.predict_proba(X_val)[:, 1]

best = None
rows = []
for thr in CUTOFF_GRID:
    preds = (proba_val >= thr).astype(int)
    p, r, f1, _ = precision_recall_fscore_support(y_val, preds, average="binary", zero_division=0)
    rows.append((thr, p, r, f1))
    if (best is None) or (p > best[1]) or (p == best[1] and f1 > best[3]):  # prefer precision, break ties by F1
        best = (thr, p, r, f1)

cutoff = float(best[0])
val_precision, val_recall, val_f1 = best[1], best[2], best[3]

print("Validation sweep (thr, precision, recall, f1):")
for r in rows:
    print("  thr=%.2f  P=%.3f  R=%.3f  F1=%.3f" % r)

print("\nChosen cutoff (prefers higher precision):", cutoff)
print("Val metrics at cutoff →  P=%.3f  R=%.3f  F1=%.3f" % (val_precision, val_recall, val_f1))


Validation sweep (thr, precision, recall, f1):
  thr=0.50  P=0.000  R=0.000  F1=0.000
  thr=0.55  P=0.000  R=0.000  F1=0.000
  thr=0.60  P=0.000  R=0.000  F1=0.000
  thr=0.65  P=0.000  R=0.000  F1=0.000
  thr=0.70  P=0.000  R=0.000  F1=0.000
  thr=0.75  P=0.000  R=0.000  F1=0.000
  thr=0.80  P=0.000  R=0.000  F1=0.000
  thr=0.85  P=0.000  R=0.000  F1=0.000
  thr=0.90  P=0.000  R=0.000  F1=0.000
  thr=0.95  P=0.000  R=0.000  F1=0.000

Chosen cutoff (prefers higher precision): 0.5
Val metrics at cutoff →  P=0.000  R=0.000  F1=0.000


In [5]:
# Get predicted probabilities for the full df (so we can report by time)
df["proba"] = clf.predict_proba(df[FEATS].values)[:, 1]

# Determine the typical tick size (seconds) from the data
tick = df["t"].diff().dropna().mode().iloc[0] if df["t"].nunique() > 1 else 2.5

# Base decision per frame (before persistence)
df["pred_raw"] = (df["proba"] >= cutoff).astype(int)

# Apply persistence ONLY within each split (train/val/test) to keep evaluation clean
def apply_persistence(sub):
    sub = sub.sort_values("t").copy()
    roll = sub["pred_raw"].rolling(window=PERSIST_TICKS, min_periods=PERSIST_TICKS).sum()
    sub["pred_persist"] = (roll >= PERSIST_TICKS).astype(int)
    return sub

df.loc[is_train, "pred_persist"] = apply_persistence(df.loc[is_train]).loc[is_train, "pred_persist"]
df.loc[is_val,   "pred_persist"] = apply_persistence(df.loc[is_val]).loc[is_val,   "pred_persist"]
df.loc[is_test,  "pred_persist"] = apply_persistence(df.loc[is_test]).loc[is_test,  "pred_persist"]

# Test metrics
test = df.loc[is_test].sort_values("t")
p, r, f1, _ = precision_recall_fscore_support(test["label"].values,
                                              test["pred_persist"].values,
                                              average="binary",
                                              zero_division=0)
print("\nTEST metrics with persistence (P,R,F1):", round(p,3), round(r,3), round(f1,3))



TEST metrics with persistence (P,R,F1): 0.0 0.0 0.0


In [6]:
test = test.reset_index(drop=True)
on = test.loc[test["pred_persist"] == 1, "t"].to_numpy()

if on.size == 0:
    print("No Trigger B intervals on TEST with current cutoff & persistence.")
else:
    blocks = []
    start = on[0]; prev = on[0]
    for tt in on[1:]:
        if tt - prev > tick + 1e-6:       # gap → close the block
            blocks.append((start, prev))
            start = tt
        prev = tt
    blocks.append((start, prev))
    print("Trigger B active intervals on TEST (t_start → t_end):")
    for a,b in blocks:
        dur = b - a + tick
        print(f"  {a:.1f}s → {b:.1f}s   (~{dur:.1f}s)")


No Trigger B intervals on TEST with current cutoff & persistence.


In [7]:
# Per-frame classifier outputs (full day)
out_cols = ["t", "outward_fraction", "mean_outward_speed_mps", "label", "proba", "pred_raw", "pred_persist"]
df[out_cols].to_csv(r"C:\Hackathon\gunshot\modelB_classifier_outputs.csv", index=False)

print("\nSaved: modelB_classifier_outputs.csv  with", df.shape[0], "rows")
print("Columns:", out_cols)
print("\nTip: Use 'pred_persist' as Trigger B when fusing with Trigger A (A AND B within 3–5s).")



Saved: modelB_classifier_outputs.csv  with 17281 rows
Columns: ['t', 'outward_fraction', 'mean_outward_speed_mps', 'label', 'proba', 'pred_raw', 'pred_persist']

Tip: Use 'pred_persist' as Trigger B when fusing with Trigger A (A AND B within 3–5s).


In [8]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

# ---- Small knobs you can tune (must match your earlier choices) ----
EVENT_START_S = 25202.5
EVENT_END_S   = 25227.5

TRAIN_FRAC = 0.70   # first 70% of time → train
VAL_FRAC   = 0.15   # next 15% → validation (choose cutoff here)
PERSIST_TICKS = 2   # require >= 2 consecutive ticks above cutoff

CUTOFF_GRID = np.linspace(0.50, 0.95, 10)  # thresholds to try on validation
FEATS = ["outward_fraction", "mean_outward_speed_mps"]  # only two features


In [9]:
# Load the predictors you already built locally (no SageMaker needed)
df = pd.read_csv(r"C:\Hackathon\gunshot\modelB_predictors.csv")
assert set(["t"]+FEATS).issubset(df.columns), "modelB_predictors.csv missing required columns."

# Label = 1 inside the event window, else 0 (used only for validation/testing metrics)
df = df.sort_values("t").reset_index(drop=True)
df["label"] = ((df["t"] >= EVENT_START_S) & (df["t"] <= EVENT_END_S)).astype(int)

# Time-based split (prevents leakage)
unique_t = df["t"].unique()
t_train_end = unique_t[int(len(unique_t)*TRAIN_FRAC) - 1]
t_val_end   = unique_t[int(len(unique_t)*(TRAIN_FRAC+VAL_FRAC)) - 1]

is_train = df["t"] <= t_train_end
is_val   = (df["t"] > t_train_end) & (df["t"] <= t_val_end)
is_test  = df["t"] > t_val_end

# Detect the typical tick spacing (seconds), used later when summarizing intervals
tick = df["t"].diff().dropna().mode().iloc[0] if df["t"].nunique() > 1 else 2.5

print("Time split:")
print(f"  Train: t <= {t_train_end}")
print(f"  Val:   {t_train_end} < t <= {t_val_end}")
print(f"  Test:  t > {t_val_end}")


Time split:
  Train: t <= 30237.5
  Val:   30237.5 < t <= 36717.5
  Test:  t > 36717.5


In [10]:
proba_all = None

# Case 1: Use an in-memory model called `clf` if it exists (from your modelB.ipynb workflow)
if "clf" in globals():
    try:
        proba_all = clf.predict_proba(df[FEATS].values)[:, 1]
        print("Used in-memory classifier `clf` to get probabilities.")
    except Exception as e:
        print("Found `clf` but couldn't use predict_proba:", e)

# Case 2: Try to load a saved local model (JSON from xgboost Booster or a joblib/sklearn pickle)
if proba_all is None:
    import os
    try:
        if os.path.exists("modelB_xgb.json"):
            booster = xgb.Booster()
            booster.load_model("modelB_xgb.json")
            dmat = xgb.DMatrix(df[FEATS].values)
            proba_all = booster.predict(dmat)
            print("Loaded probabilities from modelB_xgb.json (XGBoost Booster).")
        elif os.path.exists("modelB_xgb.pkl"):
            import joblib
            clf = joblib.load("modelB_xgb.pkl")
            proba_all = clf.predict_proba(df[FEATS].values)[:, 1]
            print("Loaded probabilities from modelB_xgb.pkl (sklearn-style model).")
    except Exception as e:
        print("Tried to load a saved model but failed:", e)

# Case 3: Tiny fallback training here (quick MVP)
if proba_all is None:
    print("Training a tiny fallback XGBoost model locally (MVP).")
    from xgboost import XGBClassifier
    X_tr, y_tr = df.loc[is_train, FEATS].values, df.loc[is_train, "label"].values
    pos = (y_tr == 1).sum()
    neg = (y_tr == 0).sum()
    spw = (neg / max(pos, 1)) if pos > 0 else 1.0  # handle imbalance a bit

    clf = XGBClassifier(
        n_estimators=80,
        max_depth=2,
        learning_rate=0.10,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        objective="binary:logistic",
        n_jobs=-1,
        scale_pos_weight=spw,
        random_state=42
    )
    clf.fit(X_tr, y_tr)
    proba_all = clf.predict_proba(df[FEATS].values)[:, 1]

# Attach probabilities to the full table
df["proba"] = proba_all

# Quick sanity: AUCs (optional)
try:
    auc_tr  = roc_auc_score(df.loc[is_train, "label"], df.loc[is_train, "proba"])
    auc_val = roc_auc_score(df.loc[is_val,   "label"], df.loc[is_val,   "proba"])
    print(f"AUC Train={auc_tr:.3f}  AUC Val={auc_val:.3f}")
except Exception:
    pass


Used in-memory classifier `clf` to get probabilities.


In [13]:
# 1) Pick a cutoff on the validation slice (prefer higher precision, tie-break by F1)
val_tbl = df.loc[is_val, ["t","label","proba"]].reset_index(drop=True)
best = None
for thr in CUTOFF_GRID:
    preds = (val_tbl["proba"].values >= thr).astype(int)
    p, r, f1, _ = precision_recall_fscore_support(val_tbl["label"].values, preds,
                                                  average="binary", zero_division=0)
    if (best is None) or (p > best[1]) or (p == best[1] and f1 > best[3]):
        best = (float(thr), float(p), float(r), float(f1))
cutoff = best[0]

# 2) Persistence: require >= PERSIST_TICKS consecutive ticks above cutoff
def add_persistence(sub_df, cutoff, persist_ticks):
    sub_df = sub_df.sort_values("t").reset_index(drop=True).copy()
    sub_df["pred_raw"] = (sub_df["proba"] >= cutoff).astype(int)
    roll = sub_df["pred_raw"].rolling(window=persist_ticks, min_periods=persist_ticks).sum()
    sub_df["pred_persist"] = (roll >= persist_ticks).astype(int)
    return sub_df

df_out = add_persistence(df[["t","label","proba"]], cutoff, PERSIST_TICKS)

# 3) Save the exact file your fusion script looks for
df_out.to_csv(r"C:\Hackathon\gunshot\modelB_classifier_outputs.csv", index=False)
print("Saved modelB_classifier_outputs_sagemaker.csv with columns:", list(df_out.columns))

Saved modelB_classifier_outputs_sagemaker.csv with columns: ['t', 'label', 'proba', 'pred_raw', 'pred_persist']
