In [None]:
import os, json, numpy as np, pandas as pd, boto3, sagemaker
from sagemaker import Session
from sagemaker.s3 import S3Uploader, S3Downloader
from sagemaker.xgboost.estimator import XGBoost
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

# --- SageMaker session / IAM context ---------------------------------------
from sagemaker import get_execution_role

sess: Session = sagemaker.Session()
# boto3 reports None when no default region is configured; fall back to us-west-2.
region = boto3.Session().region_name or "us-west-2"
role = get_execution_role()

bucket = sess.default_bucket()  # or set your own bucket name
prefix = "gunshot-mvp/modelB-xgb"  # S3 folder prefix for this project

print("Region:", region)
print("Role:", role)
print("Bucket:", bucket)
print("S3 prefix:", prefix)

# --- Tunable configuration ----------------------------------------------------
# Predictors CSV. Set the MODELB_INPUT_CSV environment variable to point at a
# different file without editing the notebook; the default is the original path.
LOCAL_INPUT = os.environ.get(
    "MODELB_INPUT_CSV",
    r"C:\Hackathon\gunshot\modelB_predictors.csv",
)

# Rows whose time `t` falls inside [EVENT_START_S, EVENT_END_S] are labelled 1.
EVENT_START_S = 25202.5
EVENT_END_S   = 25227.5

TRAIN_FRAC = 0.70   # first 70% of time
VAL_FRAC   = 0.15   # next 15% (tune cutoff here)

PERSIST_TICKS = 2   # e.g., 2 ticks × 2.5s ≈ 5 seconds

# Candidate probability cutoffs swept on the validation split.
CUTOFF_GRID = np.linspace(0.50, 0.95, 10)

# Fail early on configuration mistakes rather than deep inside a later cell.
assert EVENT_START_S <= EVENT_END_S, "EVENT_START_S must not be after EVENT_END_S"
assert TRAIN_FRAC > 0 and VAL_FRAC > 0 and TRAIN_FRAC + VAL_FRAC < 1, \
    "TRAIN_FRAC and VAL_FRAC must be positive and leave room for a test split"
assert PERSIST_TICKS >= 1, "PERSIST_TICKS must be at least 1"
assert len(CUTOFF_GRID) > 0, "CUTOFF_GRID must contain at least one cutoff"


In [None]:
# Load your predictors (2 features + time)
df = pd.read_csv(LOCAL_INPUT)
required_cols = {"t", "outward_fraction", "mean_outward_speed_mps"}
missing_cols = required_cols - set(df.columns)
assert not missing_cols, f"{LOCAL_INPUT} is missing required columns: {sorted(missing_cols)}"

# Label: 1 for evac-like frames inside the event window, else 0
df["label"] = ((df["t"] >= EVENT_START_S) & (df["t"] <= EVENT_END_S)).astype(int)

# Sort by time so that every later positional slice is chronological
df = df.sort_values("t").reset_index(drop=True)

# Distinct timestamps, in ascending order because df is already sorted
unique_t = df["t"].unique()
nT = len(unique_t)

# Cadence = most common gap between *distinct* timestamps. Measuring it on the
# unique values keeps repeated timestamps from producing a 0-second tick.
tick_mode = pd.Series(unique_t).diff().dropna().mode()
tick = float(tick_mode.iloc[0]) if len(tick_mode) else 2.5
print(f"Detected tick ~ {tick:.2f} s")

# Time-based split borders (positions into unique_t of the last train / val timestamp)
train_last = int(nT * TRAIN_FRAC) - 1
val_last   = int(nT * (TRAIN_FRAC + VAL_FRAC)) - 1
# With very few timestamps the positions can go negative (silently wrapping to
# the end of the array) or coincide, leaving a split empty. Stop here instead.
if not (0 <= train_last < val_last < nT - 1):
    raise ValueError(
        f"Only {nT} distinct timestamps: cannot form non-empty train/val/test splits "
        f"with TRAIN_FRAC={TRAIN_FRAC} and VAL_FRAC={VAL_FRAC}"
    )
t_train_end = unique_t[train_last]
t_val_end   = unique_t[val_last]

is_train = df["t"] <= t_train_end
is_val   = (df["t"] > t_train_end) & (df["t"] <= t_val_end)
is_test  = df["t"] > t_val_end

print("Time split:")
print(f"  Train: t <= {t_train_end}")
print(f"  Val:   {t_train_end} < t <= {t_val_end}")
print(f"  Test:  t > {t_val_end}")

# Keep only the two features for XGBoost
FEATS = ["outward_fraction", "mean_outward_speed_mps"]

# Build CSVs for SageMaker XGBoost (label FIRST, no header) for TRAIN/VAL
train_csv = df.loc[is_train, ["label"] + FEATS]
val_csv   = df.loc[is_val,   ["label"] + FEATS]

train_path_local = "train.csv"
val_path_local   = "validation.csv"
train_csv.to_csv(train_path_local, index=False, header=False)
val_csv.to_csv(val_path_local,     index=False, header=False)

# For inference, features ONLY (no label, no header) for VAL and TEST
val_feats_local  = "val_features.csv"
test_feats_local = "test_features.csv"
df.loc[is_val,  FEATS].to_csv(val_feats_local,  index=False, header=False)
df.loc[is_test, FEATS].to_csv(test_feats_local, index=False, header=False)

# Keep time and labels locally to rejoin later; row order matches the feature
# files written above because both are selected with the same masks.
val_meta  = df.loc[is_val,  ["t","label"]].reset_index(drop=True)
test_meta = df.loc[is_test, ["t","label"]].reset_index(drop=True)

print("Local files ready:",
      train_path_local, val_path_local, val_feats_local, test_feats_local)


In [None]:
# Push each local CSV under the project prefix. The value returned by every
# upload is kept because later cells pass it on as the data location.
s3_root = f"s3://{bucket}/{prefix}"

train_s3_uri = S3Uploader.upload(train_path_local, s3_root + "/train")
val_s3_uri = S3Uploader.upload(val_path_local, s3_root + "/validation")
valX_s3_uri = S3Uploader.upload(val_feats_local, s3_root + "/inference/val")
testX_s3_uri = S3Uploader.upload(test_feats_local, s3_root + "/inference/test")

# Echo where everything landed.
print("S3 URIs:")
for tag, uri in (
    ("  train :", train_s3_uri),
    ("  val   :", val_s3_uri),
    ("  val X :", valX_s3_uri),
    ("  test X:", testX_s3_uri),
):
    print(tag, uri)


In [None]:
# Output folders (S3) for batch predictions
batch_out_val  = f"s3://{bucket}/{prefix}/batch-out/val"
batch_out_test = f"s3://{bucket}/{prefix}/batch-out/test"

# NOTE(review): no cell visible in this notebook creates or fits `xgb`. On a
# fresh kernel this cell cannot run until a training cell defines it, so fail
# with an explicit message instead of a bare NameError.
if "xgb" not in globals():
    raise NameError(
        "`xgb` is not defined: fit (or attach) the trained estimator in an "
        "earlier cell before running batch transform."
    )


def _run_batch_transform(estimator, input_uri, output_uri, instance_type="ml.m5.large"):
    """Run one batch-transform job over a headerless CSV and block until it ends.

    Parameters
    ----------
    estimator : object exposing ``.transformer(...)`` (here the fitted ``xgb``).
    input_uri : S3 URI of the features-only CSV to score.
    output_uri : S3 folder that receives the job output.
    instance_type : instance type for the single transform instance.

    Returns
    -------
    The transformer object created for the job.
    """
    transformer = estimator.transformer(
        instance_count=1,
        instance_type=instance_type,
        output_path=output_uri,
    )
    # One CSV line is one record.
    transformer.transform(
        data=input_uri,
        content_type="text/csv",
        split_type="Line",
    )
    transformer.wait()
    return transformer


transformer_val = _run_batch_transform(xgb, valX_s3_uri, batch_out_val)
transformer_test = _run_batch_transform(xgb, testX_s3_uri, batch_out_test)

print("Batch transform complete.")


In [None]:
# Helper: download first/only output file from the transform folder
def download_single_output(s3_uri_folder, local_name):
    """Copy the first ``.out`` object found under an S3 folder into a local file.

    Parameters
    ----------
    s3_uri_folder : str
        S3 URI of the folder to search.
    local_name : str
        Path of the local *file* to write.

    Returns
    -------
    str
        ``local_name``, which is a regular file once this function returns.

    Raises
    ------
    AssertionError
        If no listed object ends with ``.out``.
    """
    files = S3Downloader.list(s3_uri_folder)
    # Find the *.out file(s); sort so "first" is deterministic.
    out_files = sorted(u for u in files if u.endswith(".out"))
    assert out_files, f"No .out files found under {s3_uri_folder}"
    # S3Downloader.download() interprets its second argument as a destination
    # directory, so passing `local_name` to it would create a folder of that name
    # and the returned path could not be opened as a file. Read the object body
    # and write it to `local_name` ourselves instead.
    body = S3Downloader.read_file(out_files[0])
    with open(local_name, "w", encoding="utf-8", newline="") as fh:
        fh.write(body)
    return local_name

# Fetch the prediction files produced for each split
val_pred_file = download_single_output(batch_out_val, "val_preds.out")
test_pred_file = download_single_output(batch_out_test, "test_preds.out")

# The first column of each headerless file is read as the per-row probability
val_proba = pd.read_csv(val_pred_file, header=None).iloc[:, 0].astype(float).to_numpy()
test_proba = pd.read_csv(test_pred_file, header=None).iloc[:, 0].astype(float).to_numpy()

# Attach probabilities to the (t, label) rows kept aside earlier
val_tbl = val_meta.assign(proba=val_proba)
test_tbl = test_meta.assign(proba=test_proba)

# Sweep the cutoff grid on validation. The winner has the highest precision;
# among equal precision the higher F1 wins; on a full tie the earlier cutoff stays.
rows = []
best = None
val_truth = val_tbl["label"].values
val_scores = val_tbl["proba"].values
for thr in CUTOFF_GRID:
    hard = (val_scores >= thr).astype(int)
    p, r, f1, _ = precision_recall_fscore_support(
        val_truth, hard, average="binary", zero_division=0
    )
    candidate = (thr, p, r, f1)
    rows.append(candidate)
    if best is None or (p, f1) > (best[1], best[3]):
        best = candidate

cutoff = float(best[0])
print("Validation sweep (thr, P, R, F1):")
for row in rows:
    print("  thr=%.2f  P=%.3f  R=%.3f  F1=%.3f" % row)
print("\nChosen cutoff (prefers precision): %.2f  (P=%.3f, R=%.3f, F1=%.3f)" % best)

# Persistence rule: require >= PERSIST_TICKS consecutive ticks with proba >= cutoff
def apply_persistence(df_in, cutoff, persist_ticks):
    """Flag rows that end a run of ``persist_ticks`` above-cutoff probabilities.

    The input is ordered by ``t`` and re-indexed from 0; the caller's frame is
    not modified. Two integer (0/1) columns are added to the returned frame:

    * ``pred_raw``     - 1 where ``proba >= cutoff``.
    * ``pred_persist`` - 1 where this row and the ``persist_ticks - 1`` rows
      before it all have ``pred_raw == 1``. The first ``persist_ticks - 1``
      rows are therefore always 0.

    "Consecutive" means consecutive rows after sorting; gaps in ``t`` are not
    inspected.
    """
    ordered = df_in.sort_values("t").reset_index(drop=True)
    above = ordered["proba"].ge(cutoff).astype(int)
    # Sum over a full trailing window; incomplete windows yield NaN, which
    # compares as "not enough" below.
    run_total = above.rolling(window=persist_ticks, min_periods=persist_ticks).sum()
    sustained = run_total.ge(persist_ticks).astype(int)
    return ordered.assign(pred_raw=above, pred_persist=sustained)

val_tbl  = apply_persistence(val_tbl,  cutoff, PERSIST_TICKS)
test_tbl = apply_persistence(test_tbl, cutoff, PERSIST_TICKS)

# Metrics (TEST), computed on the persistence-filtered predictions
p, r, f1, _ = precision_recall_fscore_support(test_tbl["label"], test_tbl["pred_persist"],
                                              average="binary", zero_division=0)


def _auc_or_nan(labels, scores):
    """Return ROC AUC, or NaN when scikit-learn rejects the inputs with ValueError.

    A split with only one class present (likely here, since the single event
    window can fall entirely inside one time split) makes AUC undefined.
    """
    try:
        return roc_auc_score(labels, scores)
    except ValueError:
        return np.nan


# Evaluate each split on its own so an undefined AUC on one split does not
# blank out the other.
auc_val  = _auc_or_nan(val_tbl["label"],  val_tbl["proba"])
auc_test = _auc_or_nan(test_tbl["label"], test_tbl["proba"])

print("\nTEST metrics with persistence:")
print(f"  Precision={p:.3f}  Recall={r:.3f}  F1={f1:.3f}  (AUC Val={auc_val:.3f}, AUC Test={auc_test:.3f})")

# Show intervals when Trigger B is ON in TEST
on = test_tbl.loc[test_tbl["pred_persist"] == 1, "t"].to_numpy()
if on.size == 0:
    print("\nNo Trigger B intervals on TEST with current cutoff/persistence.")
else:
    # Find contiguous blocks: a gap larger than one tick (plus a small float
    # tolerance) closes the current block and opens a new one.
    blocks = []
    start = on[0]
    prev = on[0]
    for tt in on[1:]:
        if tt - prev > tick + 1e-6:
            blocks.append((start, prev))
            start = tt
        prev = tt
    blocks.append((start, prev))
    print("\nTrigger B active intervals on TEST (t_start → t_end):")
    for a, b in blocks:
        # Add one tick so a single-row block is reported as one tick long.
        dur = b - a + tick
        print(f"  {a:.1f}s → {b:.1f}s   (~{dur:.1f}s)")
