In [16]:
# --- config ---
MODEL_PATH = "xgb_best_model.pkl"   # path to your saved model
TRAIN_CSV  = "fdr_training_view_no_feature_engineering.csv"  # path to your training CSV
TARGET_COL = "label_undelivered_cb"
DROP_COLS  = ["merchant_id"]   # any columns you dropped during training
CATEGORICALS = ["vertical", "country"]  # the columns you label-encoded

In [14]:
# --- imports ---
import pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [17]:
# --- 1) Load model ---
with open(MODEL_PATH, "rb") as f:
    model = pickle.load(f)
print("Loaded model:", type(model).__name__)


Loaded model: XGBClassifier


In [18]:
# --- 2) Recreate training schema & encoders (only needed if you didn't save them separately) ---
df_train = pd.read_csv(TRAIN_CSV)
df_train = df_train.drop(columns=[c for c in DROP_COLS if c in df_train.columns], errors="ignore")

# Make sure target exists and build feature list
if TARGET_COL not in df_train.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found in {TRAIN_CSV}.")
feature_cols = [c for c in df_train.columns if c != TARGET_COL]

# Fit label encoders on training data to mirror notebook preprocessing
encoders = {}
for col in CATEGORICALS:
    if col in feature_cols:
        le = LabelEncoder()
        # Use astype(str) to be robust if values were strings in training
        le.fit(df_train[col].astype(str))
        encoders[col] = le
        # Replace training (not strictly needed for prediction, but shows mapping)
        # df_train[col] = le.transform(df_train[col].astype(str))

# Helper: apply the encoders to a single-row frame
def encode_categoricals(row_df: pd.DataFrame) -> pd.DataFrame:
    for col, le in encoders.items():
        if col in row_df.columns:
            # accept either raw strings or already-encoded ints
            # if value is unseen, raise a helpful error
            val = row_df.at[row_df.index[0], col]
            if pd.isna(val):
                continue
            if isinstance(val, (int, np.integer)) and 0 <= val < len(le.classes_):
                # assume already-encoded
                continue
            # otherwise, treat as raw label and encode
            label = str(val)
            if label not in set(le.classes_):
                raise ValueError(
                    f"Unseen category '{label}' for column '{col}'. "
                    f"Known: {list(le.classes_)[:10]}{'...' if len(le.classes_)>10 else ''}"
                )
            row_df.at[row_df.index[0], col] = int(le.transform([label])[0])
    return row_df


In [None]:
# --- 3) Build a full single-row feature frame in the correct order ---
def make_feature_row(custom_values: dict) -> pd.DataFrame:
    # start with all-zero row (or NaN if you prefer)
    row = pd.DataFrame([ {c: 0 for c in feature_cols} ])
    # update with provided values
    for k, v in custom_values.items():
        if k not in row.columns:
            raise KeyError(f"Provided key '{k}' is not a known feature. Expected one of: {feature_cols}")
        row.at[0, k] = v
    # encode categoricals using the same mapping as training
    row = encode_categoricals(row)

    # order columns to match training/model
    if hasattr(model, "feature_names_in_"):
        # scikit-learn style models
        ordered = list(model.feature_names_in_)
        # sanity check
        missing = [c for c in ordered if c not in row.columns]
        if missing:
            raise ValueError(f"Missing required features for the model: {missing}")
        row = row[ordered]
    else:
        # fallback to training order
        row = row[feature_cols]

    # ensure numeric dtypes
    row = row.apply(pd.to_numeric, errors="coerce")
    # optional: fill any NaNs (tweak to match your training-time imputation, if any)
    row = row.fillna(0)
    return row

In [23]:
# --- 4) Predict helper ---
def predict_label_undelivered(custom_values: dict, decision_threshold: float = 0.5):
    X_row = make_feature_row(custom_values)
    # predict probability for positive class
    if hasattr(model, "predict_proba"):
        proba = float(model.predict_proba(X_row)[:, 1][0])
    else:
        # xgboost native booster or clf without predict_proba
        proba = float(model.predict(X_row, output_margin=False)[0])
    label = int(proba >= decision_threshold)
    return proba, label

In [24]:
# --- 5) EXAMPLE USAGE ---
# Replace these with the REAL values for your custom merchant.
# For categoricals:
#   - if you pass strings (e.g., "Retail", "RO"), they will be label-encoded using training mapping
#   - if you already know the encoded ints, you may pass the ints directly

# custom_merchant = {
#     "vertical": "Retail",   # or the encoded int, e.g., 2
#     "country": "RO",        # or encoded int
#     # ---- numeric features below: fill ALL features you used in training ----
#     # Examples (replace with your actual feature names):
#     # "avg_ticket": 145.0,
#     # "txn_count_30d": 27,
#     # "chargebacks_90d": 1,
#     # "refund_ratio_30d": 0.03,
#     # "hour_of_day": 13,
#     # "is_mobile": 1,
#     # ... continue until you've covered every feature in feature_cols ...
# }

custom_merchant = {
    "vertical": "subscription_box",
    "mcc": 5968,
    "country": "SG",
    "age_months": 60,
    "new_merchant": 0.0289,
    "trust_score": 69,
    "prior_cb_rate": 0.034,
    "refund_rate": 0.0708,
    "cancel_rate": 0.1328,
    "website_uptime": 0.9847,
    "sentiment": 0.0775,
    "sales_growth_3m": 0.0497,
    "payout_delay_days": 5.5359,
    "reserve_percent": 7.11,
    "deposit_policy_percent": 10.31,
    "days_in_advance": 49,
    "booking_amount": 69.73,
    "shock_flag": 0,
    "typical_horizon": 49,
    "base_fdr": 0.1958,
}


# Quick sanity: show which features you're missing (optional helper)
missing = [c for c in feature_cols if c not in custom_merchant]
if missing:
    print(f"NOTE: You haven't provided values for {len(missing)} feature(s). "
          f"They will default to 0 unless you set them explicitly.\n"
          f"Missing feature names (first 20): {missing[:20]}")

prob, pred = predict_label_undelivered(custom_merchant, decision_threshold=0.5)
print(f"Predicted probability (label_undelivered_cb=1): {prob:.4f}")
print(f"Predicted label_undelivered_cb (threshold=0.5): {pred}")

NameError: name 'make_feature_row' is not defined

In [26]:
# predict_one.py
# Minimal script: load model, prep one custom row, predict.

import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

MODEL_PATH = "xgb_best_model.pkl"
TRAIN_CSV  = "fdr_training_view_no_feature_engineering.csv"
TARGET_COL = "label_undelivered_cb"
CATEGORICALS = ["vertical", "country"]   # adjust if you added/changed categoricals
DROP_COLS = ["merchant_id"]              # adjust if you dropped other columns during training
THRESHOLD = 0.5

def load_model(path=MODEL_PATH):
    with open(path, "rb") as f:
        return pickle.load(f)

def load_training_schema():
    df = pd.read_csv(TRAIN_CSV)
    for c in DROP_COLS:
        if c in df.columns:
            df = df.drop(columns=c)
    if TARGET_COL not in df.columns:
        raise ValueError(f"Target column '{TARGET_COL}' not found in {TRAIN_CSV}")
    feature_cols = [c for c in df.columns if c != TARGET_COL]
    return df, feature_cols

def fit_encoders(df):
    encoders = {}
    for col in CATEGORICALS:
        if col in df.columns:
            le = LabelEncoder()
            le.fit(df[col].astype(str))
            encoders[col] = le
    return encoders

def apply_encoders(row_df, encoders):
    for col, le in encoders.items():
        if col not in row_df.columns:
            continue
        val = row_df.at[row_df.index[0], col]
        if pd.isna(val):
            continue
        # if already an int in valid range, accept it
        if isinstance(val, (int, np.integer)) and 0 <= val < len(le.classes_):
            continue
        # otherwise, encode string label
        label = str(val)
        if label not in set(le.classes_):
            raise ValueError(
                f"Unseen category '{label}' for '{col}'. "
                f"Known examples: {list(le.classes_)[:10]}{'...' if len(le.classes_)>10 else ''}"
            )
        row_df.at[row_df.index[0], col] = int(le.transform([label])[0])
    return row_df

def build_row(custom_merchant, feature_cols, model, encoders):
    # start with zeros for all features, then fill provided values
    row = pd.DataFrame([{c: 0 for c in feature_cols}])
    for k, v in custom_merchant.items():
        if k not in row.columns:
            raise KeyError(f"Provided key '{k}' not in training features.\nExpected: {feature_cols}")
        row.at[0, k] = v

    # encode categoricals
    row = apply_encoders(row, encoders)

    # ensure numeric types and fill any NaNs
    row = row.apply(pd.to_numeric, errors="coerce").fillna(0)

    # align to model input order if available
    if hasattr(model, "feature_names_in_"):
        missing = [c for c in model.feature_names_in_ if c not in row.columns]
        if missing:
            raise ValueError(f"Missing features required by model: {missing}")
        row = row[list(model.feature_names_in_)]
    else:
        # keep training order (less strict fallback)
        row = row[feature_cols]

    return row

def predict_one(custom_merchant):
    model = load_model()
    df_train, feature_cols = load_training_schema()
    encoders = fit_encoders(df_train)

    # Warn if you forgot some features
    missing = [c for c in feature_cols if c not in custom_merchant]
    if missing:
        print(f"[NOTE] You did not provide {len(missing)} feature(s). They will default to 0.")
        print(f"       Missing (first 20): {missing[:20]}")

    X_row = build_row(custom_merchant, feature_cols, model, encoders)

    # probability (positive class) + binary prediction
    if hasattr(model, "predict_proba"):
        prob = float(model.predict_proba(X_row)[:, 1][0])
    else:
        # for rare cases (e.g., booster) without predict_proba
        prob = float(model.predict(X_row)[0])
    label = int(prob >= THRESHOLD)
    return prob, label

if __name__ == "__main__":
    # EXAMPLE: replace with the real merchant values
    custom_merchant = {
    "vertical": "subscription_box",
    "mcc": 5968,
    "country": "SG",
    "age_months": 60,
    "new_merchant": 0.0289,
    "trust_score": 69,
    "prior_cb_rate": 0.034,
    "refund_rate": 0.0708,
    "cancel_rate": 0.1328,
    "website_uptime": 0.9847,
    "sentiment": 0.0775,
    "sales_growth_3m": 0.0497,
    "payout_delay_days": 5.5359,
    "reserve_percent": 7.11,
    "deposit_policy_percent": 10.31,
    "days_in_advance": 49,
    "booking_amount": 69.73,
    "shock_flag": 0,
    "typical_horizon": 49,
    "base_fdr": 0.1958,
}

    prob, label = predict_one(custom_merchant)
    print(f"Predicted probability (label_undelivered_cb=1): {prob:.4f}")
    print(f"Predicted label (threshold={THRESHOLD}): {label}")


Predicted probability (label_undelivered_cb=1): 0.2396
Predicted label (threshold=0.5): 0


  row.at[0, k] = v
  row.at[0, k] = v
  row.at[0, k] = v
  row.at[0, k] = v
  row.at[0, k] = v
  row.at[0, k] = v
  row.at[0, k] = v
  row.at[0, k] = v
  row.at[0, k] = v
  row.at[0, k] = v
  row.at[0, k] = v
  row.at[0, k] = v
  row.at[0, k] = v
  row.at[0, k] = v


In [44]:
# predict_one.py
# Minimal script: load model, prep one custom row, predict.

import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

MODEL_PATH = "xgb_best_model.pkl"
TRAIN_CSV  = "fdr_training_view_no_feature_engineering.csv"
TARGET_COL = "label_undelivered_cb"
CATEGORICALS = ["vertical", "country"]   # adjust if you added/changed categoricals
DROP_COLS = ["merchant_id"]              # adjust if you dropped other columns during training
THRESHOLD = 0.5

def load_model(path=MODEL_PATH):
    with open(path, "rb") as f:
        return pickle.load(f)

def load_training_schema():
    df = pd.read_csv(TRAIN_CSV)
    for c in DROP_COLS:
        if c in df.columns:
            df = df.drop(columns=c)
    if TARGET_COL not in df.columns:
        raise ValueError(f"Target column '{TARGET_COL}' not found in {TRAIN_CSV}")
    feature_cols = [c for c in df.columns if c != TARGET_COL]
    return df, feature_cols

def fit_encoders(df):
    encoders = {}
    for col in CATEGORICALS:
        if col in df.columns:
            le = LabelEncoder()
            le.fit(df[col].astype(str))
            encoders[col] = le
    return encoders

def apply_encoders(row_df, encoders):
    # Expect categorical columns to be object at this point
    for col, le in encoders.items():
        if col not in row_df.columns:
            continue
        val = row_df.at[row_df.index[0], col]

        # Accept already-encoded ints in range
        if isinstance(val, (int, np.integer)) and 0 <= val < len(le.classes_):
            continue

        if pd.isna(val):
            # If missing, you may choose a default like the most frequent class (index 0)
            row_df.at[row_df.index[0], col] = 0
            continue

        # Otherwise encode string label
        label = str(val)
        if label not in set(le.classes_):
            raise ValueError(
                f"Unseen category '{label}' for '{col}'. "
                f"Known examples: {list(le.classes_)[:10]}{'...' if len(le.classes_)>10 else ''}"
            )
        row_df.at[row_df.index[0], col] = int(le.transform([label])[0])

    # Cast encoded categoricals to int to avoid mixed dtypes downstream
    for col in encoders.keys():
        if col in row_df.columns:
            row_df[col] = pd.to_numeric(row_df[col], errors="raise").astype("int64")

    return row_df

def build_row(custom_merchant, feature_cols, model, encoders):
    # Initialize with proper dtypes:
    # - object for categoricals (to safely hold strings before encoding)
    # - float for everything else (more permissive than int)
    init_data = {}
    for c in feature_cols:
        if c in encoders:  # categorical
            init_data[c] = pd.Series([None], dtype="object")
        else:
            init_data[c] = pd.Series([np.nan], dtype="float64")
    row = pd.DataFrame(init_data)

    # Fill provided values, coercing numerics where appropriate
    for k, v in custom_merchant.items():
        if k not in row.columns:
            raise KeyError(f"Provided key '{k}' not in training features.\nExpected: {feature_cols}")

        if k in encoders:
            # leave as string/int; encoder will handle it
            row.at[0, k] = v
        else:
            # numeric-ish: coerce to numeric
            row.at[0, k] = pd.to_numeric(v, errors="coerce")

    # Encode categoricals and cast to int
    row = apply_encoders(row, encoders)

    # Ensure numeric columns are numeric and fill any NaNs
    non_cat = [c for c in feature_cols if c not in encoders]
    if non_cat:
        row[non_cat] = row[non_cat].apply(pd.to_numeric, errors="coerce").fillna(0.0)

    # Align to model input order if available
    if hasattr(model, "feature_names_in_"):
        missing = [c for c in model.feature_names_in_ if c not in row.columns]
        if missing:
            raise ValueError(f"Missing features required by model: {missing}")
        row = row[list(model.feature_names_in_)]
    else:
        row = row[feature_cols]

    return row

def predict_one(custom_merchant):
    model = load_model()
    df_train, feature_cols = load_training_schema()
    encoders = fit_encoders(df_train)

    missing = [c for c in feature_cols if c not in custom_merchant]
    if missing:
        print(f"[NOTE] You did not provide {len(missing)} feature(s). They will default to 0.")
        print(f"       Missing (first 20): {missing[:20]}")

    X_row = build_row(custom_merchant, feature_cols, model, encoders)

    if hasattr(model, "predict_proba"):
        prob = float(model.predict_proba(X_row)[:, 1][0])
    else:
        prob = float(model.predict(X_row)[0])
    label = int(prob >= THRESHOLD)

    category = classify_partner(prob)

    return prob, label, category

def classify_partner(prob: float) -> str:
    """
    Map predicted probability to a category label.
    """
    if 0.0 <= prob <= 0.05:
        return "Trusted Partner"
    elif 0.06 <= prob <= 0.10:
        return "Established Operator"
    elif 0.11 <= prob <= 0.20:
        return "Developing Organization"
    elif 0.21 <= prob <= 0.40:
        return "High-Risk Counterparty"
    elif 0.41 <= prob <= 1.0:
        return "Fraudulent Actor"
    else:
        return "Unclassified"  # catch edge cases


if __name__ == "__main__":
    # EXAMPLE: replace with the real merchant values
    custom_merchant = {
    "vertical": "subscription_box",
    "mcc": 5968,
    "country": "SG",
    "age_months": 24,
    "new_merchant": 0,
    "trust_score": 25,
    "prior_cb_rate": 0.034,
    "refund_rate": 0.0708,
    "cancel_rate": 0.1328,
    "website_uptime": 0.9847,
    "sentiment": 0.0775,
    "sales_growth_3m": 0.0497,
    "payout_delay_days": 5.5359,
    "reserve_percent": 7.11,
    "deposit_policy_percent": 10.31,
    "days_in_advance": 49,
    "booking_amount": 69.73,
    "shock_flag": 0,
    "typical_horizon": 30,
    "base_fdr": 0.1958,
}

prob, label, category = predict_one(custom_merchant)
print(f"Predicted probability (label_undelivered_cb=1): {prob:.4f}")
print(f"Predicted label (threshold={THRESHOLD}): {label}")
print(f"Category: {category}")



Predicted probability (label_undelivered_cb=1): 0.3044
Predicted label (threshold=0.5): 0
Category: High-Risk Counterparty
