# In [None]:
import pandas as pd
import numpy as np

# ---------- Generic helpers ----------
def _to_dt(s, dayfirst=False):
    """Safe datetime parser (returns NaT on bad rows)."""
    return pd.to_datetime(s, errors="coerce", dayfirst=dayfirst)

def _to_binary_fraud(s):
    """Map common fraud flags to {0,1}. Unmapped -> NaN."""
    m = {"y":1, "yes":1, "true":1, "t":1, "1":1,
         "n":0, "no":0, "false":0, "f":0, "0":0}
    return s.astype(str).str.strip().str.lower().map(m)

# ============================================================
# 1) Temporal features from incident_date (and optional time)
# ============================================================
def add_incident_month(df, date_col="incident_date"):
    """Return a copy of ``df`` with an ``incident_month`` column.

    The month number is taken from ``date_col`` after parsing it with
    ``_to_dt``; the input frame is left untouched.
    """
    incident_dates = _to_dt(df[date_col])
    return df.assign(incident_month=incident_dates.dt.month)

def add_incident_dayofweek(df, date_col="incident_date"):
    """Return a copy of ``df`` with an ``incident_dayofweek`` column.

    The weekday is derived from ``date_col`` (parsed with ``_to_dt``) and
    encoded as Monday=0 ... Sunday=6.
    """
    result = df.copy()
    weekday = _to_dt(result[date_col]).dt.dayofweek  # 0 = Monday, 6 = Sunday
    result["incident_dayofweek"] = weekday
    return result

def add_incident_hour(df, time_col="incident_hour_of_the_day"):
    """Return a copy of ``df`` with a nullable-integer ``incident_hour`` column.

    Accepted inputs in ``time_col``:

    * numbers or numeric strings (``14``, ``"14"``, ``"14.0"``) -- used as is;
      a fractional value such as ``14.5`` is floored to the hour it falls in
      instead of failing the integer cast;
    * clock strings ``"HH:MM"`` or ``"HH:MM:SS"`` (surrounding whitespace is
      ignored) -- the hour component is extracted.

    Anything else (including missing values) becomes ``<NA>``.  Numeric
    values are not range-checked.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    time_col : str
        Column holding the hour or time-of-day.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``incident_hour`` (dtype ``Int64``) added.
    """
    out = df.copy()
    raw = out[time_col]

    # Work on a private float array so NaN can mark "not parsed yet" and
    # so writes are positional (independent of the frame's index labels).
    hour = pd.to_numeric(raw, errors="coerce").astype("float64").to_numpy(copy=True)

    # Only non-missing values that failed the numeric parse need the
    # clock-string fallback.
    pending = np.isnan(hour) & raw.notna().to_numpy()
    if pending.any():
        text = raw[pending].astype(str).str.strip()
        parsed_hours = np.full(len(text), np.nan)
        for fmt in ("%H:%M", "%H:%M:%S"):
            todo = np.isnan(parsed_hours)
            if not todo.any():
                break
            parsed = pd.to_datetime(text[todo], errors="coerce", format=fmt)
            parsed_hours[todo] = parsed.dt.hour.to_numpy(dtype="float64")
        hour[pending] = parsed_hours

    # Floor first: Int64 refuses to cast non-integral floats.
    out["incident_hour"] = pd.Series(np.floor(hour), index=out.index).astype("Int64")
    return out

# ============================================================
# 2) days_to_report = police_report_date - incident_date
# ============================================================
def add_days_to_report(df, incident_col="incident_date", report_col="police_report_date"):
    """Return a copy of ``df`` with ``days_to_report`` added.

    ``days_to_report`` is the whole-day component of
    ``report_col - incident_col`` after both columns are parsed with
    ``_to_dt``.
    """
    result = df.copy()
    incident_dt, report_dt = (
        _to_dt(result[col]) for col in (incident_col, report_col)
    )
    delay = report_dt - incident_dt
    result["days_to_report"] = delay.dt.days
    return result

# ============================================================
# 3) policy_tenure (years from bind date to incident)
# ============================================================
def add_policy_tenure_years(df, bind_col="policy_bind_date", incident_col="incident_date"):
    """Return a copy of ``df`` with ``policy_tenure_years`` added.

    Tenure is the number of whole days between ``bind_col`` and
    ``incident_col`` (both parsed with ``_to_dt``), expressed in years and
    rounded to three decimals.
    """
    result = df.copy()
    start = _to_dt(result[bind_col])
    end = _to_dt(result[incident_col])
    elapsed_days = (end - start).dt.days
    # 365.25 days per year accounts for leap years
    result["policy_tenure_years"] = elapsed_days.div(365.25).round(3)
    return result

# ============================================================
# 4) policy_deductible_ratio = deductible / annual_premium
# ============================================================
def add_policy_deductible_ratio(df, ded_col="policy_deductable", prem_col="policy_annual_premium"):
    """Return a copy of ``df`` with ``policy_deductible_ratio`` added.

    The ratio is ``ded_col / prem_col`` after coercing both columns to
    numbers.  A premium of zero is treated as missing, so the ratio is NaN
    there rather than infinite.
    """
    result = df.copy()
    deductible = pd.to_numeric(result[ded_col], errors="coerce")
    premium = pd.to_numeric(result[prem_col], errors="coerce")
    # A zero premium would divide to +/-inf; blank it out first.
    premium = premium.replace({0: np.nan})
    result["policy_deductible_ratio"] = deductible.div(premium)
    return result

# ============================================================
# 5) vehicle_age = current_year - auto_year
# ============================================================
def add_vehicle_age(df, auto_year_col="auto_year", current_year=None):
    """Return a copy of ``df`` with ``vehicle_age = current_year - auto_year``.

    When ``current_year`` is not given, the most frequent year found in the
    ``incident_date`` column is used if that column exists and has at least
    one parseable date; otherwise today's year is used.
    """
    result = df.copy()
    model_year = pd.to_numeric(result[auto_year_col], errors="coerce")

    if current_year is None:
        reference_year = pd.Timestamp.today().year
        if "incident_date" in result:
            incident_dt = _to_dt(result["incident_date"])
            if incident_dt.notna().any():
                # Most common incident year across the whole frame
                reference_year = int(incident_dt.dt.year.mode().iloc[0])
        current_year = reference_year

    result["vehicle_age"] = current_year - model_year
    return result

# ============================================================
# 6) total_claim_amount = injury + property + vehicle (robust)
# ============================================================
def add_total_claim_amount(df, injury_col="injury_claim", property_col="property_claim", vehicle_col="vehicle_claim"):
    """Return a copy of ``df`` with ``total_claim_amount`` added.

    The total is the sum of the injury, property and vehicle components.
    Each component is coerced to a number first, and non-numeric or missing
    entries count as 0.
    """
    result = df.copy()
    components = (
        pd.to_numeric(result[col], errors="coerce").fillna(0)
        for col in (injury_col, property_col, vehicle_col)
    )
    result["total_claim_amount"] = sum(components)
    return result

# ============================================================
# 7) high_claim_flag (>= 90th percentile of total_claim_amount)
# ============================================================
def add_high_claim_flag(df, total_col="total_claim_amount", q=90):
    """Return a copy of ``df`` with a 0/1 ``high_claim_flag`` column.

    A row is flagged when its ``total_col`` value is at or above the
    ``q``-th percentile (0-100 scale, NaN ignored) of that column.  The
    cutoff is also stored in ``attrs["high_claim_threshold"]`` of the
    returned frame; it is NaN when the column has no finite value, in which
    case no row is flagged.
    """
    result = df.copy()
    amounts = pd.to_numeric(result[total_col], errors="coerce")

    if np.isfinite(amounts).any():
        cutoff = np.nanpercentile(amounts, q)
    else:
        cutoff = np.nan

    result["high_claim_flag"] = amounts.ge(cutoff).astype("Int64")
    # Keep the cutoff alongside the data for later reference.
    result.attrs["high_claim_threshold"] = cutoff
    return result

# ============================================================
# 8) Geographic: mismatch + frequency risk by state
# ============================================================
def add_state_mismatch(df, insured_state_col="policy_state", incident_state_col="incident_state"):
    """Return a copy of ``df`` flagging rows whose two state columns differ.

    Both columns are compared through their string form after stripping
    surrounding whitespace and upper-casing, so ``" oh "`` and ``"OH"`` count
    as the same state.  Because the comparison goes through ``astype(str)``,
    how a missing state compares depends on how pandas renders it as a
    string.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    insured_state_col, incident_state_col : str
        Columns holding the policy state and the incident state.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``insured_vs_incident_state_mismatch``
        (dtype ``Int64``; 1 = states differ, 0 = same) added.
    """
    out = df.copy()

    def _normalized(col):
        # Whitespace and letter case are formatting noise, not a different state.
        return out[col].astype(str).str.strip().str.upper()

    out["insured_vs_incident_state_mismatch"] = (
        _normalized(insured_state_col) != _normalized(incident_state_col)
    ).astype("Int64")
    return out

def add_incident_state_risk(df, state_col="incident_state", fraud_col="fraud_reported"):
    """Return a copy of ``df`` with ``incident_state_risk`` added.

    Each row receives the mean of the binary fraud flag (``fraud_col`` mapped
    through ``_to_binary_fraud``) over all rows sharing its ``state_col``
    value.
    """
    result = df.copy()
    fraud_flag = _to_binary_fraud(result[fraud_col])
    by_state = fraud_flag.groupby(result[state_col])
    result["incident_state_risk"] = by_state.transform("mean")
    return result

# ============================================================
# 9) Employment/occupation fraud rate (target/mean encoding)
# ============================================================
def add_fraud_rate_by_occupation(df, occ_col="insured_occupation", fraud_col="fraud_reported", min_count=10):
    """Return a copy of ``df`` with ``fraud_rate_by_occupation`` added.

    Each row receives the mean of the binary fraud flag over its occupation
    category.  Categories whose mean rests on fewer than ``min_count``
    *labelled* rows (rows where the fraud flag could be mapped to 0/1) fall
    back to the global fraud mean, as do categories with no usable label at
    all.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    occ_col : str
        Categorical column to encode.
    fraud_col : str
        Fraud indicator column, mapped through ``_to_binary_fraud``.
    min_count : int
        Minimum number of labelled rows required to trust a category mean.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``fraud_rate_by_occupation`` column added.
    """
    out = df.copy()
    y = _to_binary_fraud(out[fraud_col])
    by_occ = y.groupby(out[occ_col])
    grp_mean = by_occ.transform("mean")
    # "count" skips NaN, so this is the number of rows actually contributing
    # to grp_mean -- not merely the number of rows in the category.
    grp_cnt = by_occ.transform("count")
    # Smoothing: fall back to the global mean when the category is too small
    # or has no computable mean.
    global_mean = y.mean()
    trusted = (grp_cnt >= min_count) & grp_mean.notna()
    out["fraud_rate_by_occupation"] = np.where(trusted, grp_mean, global_mean)
    return out

# ============================================================
# 10) Multiple claims flag per insured (repeat customer behavior)
# ============================================================
def add_multiple_claims_flag(df, id_col="policy_number"):
    """Return a copy of ``df`` with a 0/1 ``multiple_claims_flag`` column.

    The flag is 1 when the row's ``id_col`` value occurs more than once in
    the frame.  Swap ``id_col`` for a better claimant key if one exists.
    """
    result = df.copy()
    ids = result[id_col]
    # Number of rows sharing each identifier, broadcast back to every row
    occurrences = ids.groupby(ids).transform("count")
    result["multiple_claims_flag"] = occurrences.gt(1).astype("Int64")
    return result
