In [1]:
# 05_Inference.ipynb
# Cell 1: Imports + Load model pipeline

import numpy as np
import pandas as pd
import joblib
from pathlib import Path

# Load model (choose your best-performing one)
MODEL_NAME = "pipeline_random_forest.joblib"     # change to logistic/xgboost if needed

if not Path(MODEL_NAME).exists():
    raise FileNotFoundError(f"{MODEL_NAME} not found! Make sure the model is saved.")

model = joblib.load(MODEL_NAME)
print(f"Loaded model: {MODEL_NAME}")


Loaded model: pipeline_random_forest.joblib


In [5]:
# Create a template of all features required by the model
required_features = list(model.feature_names_in_)

feature_template = pd.DataFrame([{col: np.nan for col in required_features}])

print("Total features required:", len(required_features))
feature_template.head()


Total features required: 108


Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,disbursement_method,debt_settlement_flag
0,,,,,,,,,,,...,,,,,,,,,,


In [8]:
def predict_single(model, data_dict, threshold=0.5):
    # Create a template with all features
    required = list(model.feature_names_in_)
    row = {col: np.nan for col in required}

    # Fill user-provided values
    for key, value in data_dict.items():
        if key in row:
            row[key] = value
        else:
            print(f"Warning: '{key}' is not a known feature and will be ignored.")

    df = pd.DataFrame([row])

    # Predict
    proba = model.predict_proba(df)[0, 1]
    pred = int(proba >= threshold)

    return {"prediction": pred, "probability": round(proba, 4)}


In [9]:
# Cell 3: Example new loan application (edit values)

new_application = {
    "loan_amnt": 15000,
    "funded_amnt": 15000,
    "term": " 36 months",
    "int_rate": 12.5,
    "installment": 350.0,
    "grade": "C",
    "sub_grade": "C4",
    "emp_length": "5 years",
    "home_ownership": "RENT",
    "annual_inc": 60000,
    "verification_status": "Verified",
    "purpose": "credit_card",
    "zip_code": "123xx",
    "addr_state": "CA",
    "dti": 18.3,
    "delinq_2yrs": 0,
    "revol_bal": 8500,
    "revol_util": 55.5,
    "total_acc": 25,
    # add more fields depending on what your model needs
}

result = predict_single(model, new_application)
result


{'prediction': 0, 'probability': np.float64(0.4953)}

In [10]:
# Cell 4: Batch inference for CSV files

def predict_batch(model, csv_path, threshold=0.5):
    df = pd.read_csv(csv_path)

    # Predict probabilities
    try:
        proba = model.predict_proba(df)[:, 1]
    except:
        proba = model.decision_function(df)

    preds = (proba >= threshold).astype(int)

    df["predicted_default"] = preds
    df["probability_of_default"] = proba.round(4)

    return df

# Example usage:
# output_df = predict_batch(model, "new_applicants.csv")
# output_df.to_csv("predicted_new_applicants.csv", index=False)
# output_df.head()


In [12]:
# Build feature template with all required model columns
required_features = list(model.feature_names_in_)
feature_template = {col: np.nan for col in required_features}


In [13]:
# Cell 5: Try different thresholds

def classify_with_threshold(model, data_dict, threshold=0.5):
    
    # Start with empty feature template
    row = feature_template.copy()

    # Fill provided values
    for key, value in data_dict.items():
        if key in row:
            row[key] = value
        else:
            print(f"Warning: '{key}' ignored (not used by model).")
    
    df = pd.DataFrame([row])

    proba = model.predict_proba(df)[:, 1]
    preds = (proba >= threshold).astype(int)

    return preds, proba


In [14]:
# Cell 6: Deployment-friendly function

def score_applicant(data_dict, model=model, threshold=0.5):
    df = pd.DataFrame([data_dict])
    proba = model.predict_proba(df)[0,1]
    pred = int(proba >= threshold)

    return {
        "default_prediction": pred,
        "default_probability": round(proba, 4),
        "threshold": threshold
    }

# Example:
score_applicant(new_application)


ValueError: columns are missing: {'debt_settlement_flag', 'acc_now_delinq', 'total_bal_ex_mort', 'mths_since_last_delinq', 'il_util', 'mo_sin_old_rev_tl_op', 'open_il_12m', 'tot_hi_cred_lim', 'num_bc_sats', 'total_bc_limit', 'max_bal_bc', 'application_type', 'initial_list_status', 'num_sats', 'pct_tl_nvr_dlq', 'hardship_flag', 'collections_12_mths_ex_med', 'num_il_tl', 'recoveries', 'mths_since_recent_inq', 'total_rec_late_fee', 'tot_coll_amt', 'dti_joint', 'open_act_il', 'mo_sin_old_il_acct', 'mths_since_recent_revol_delinq', 'mths_since_last_major_derog', 'total_rec_int', 'num_tl_30dpd', 'total_rec_prncp', 'out_prncp_inv', 'mths_since_recent_bc_dlq', 'num_bc_tl', 'num_tl_op_past_12m', 'open_acc', 'mths_since_rcnt_il', 'policy_code', 'total_rev_hi_lim', 'num_op_rev_tl', 'collection_recovery_fee', 'num_actv_bc_tl', 'bc_open_to_buy', 'last_pymnt_amnt', 'percent_bc_gt_75', 'disbursement_method', 'open_rv_24m', 'pub_rec', 'total_pymnt', 'annual_inc_joint', 'num_rev_tl_bal_gt_0', 'tot_cur_bal', 'mths_since_recent_bc', 'num_actv_rev_tl', 'mo_sin_rcnt_tl', 'out_prncp', 'mort_acc', 'open_rv_12m', 'pub_rec_bankruptcies', 'acc_open_past_24mths', 'total_pymnt_inv', 'chargeoff_within_12_mths', 'pymnt_plan', 'mths_since_last_record', 'bc_util', 'num_tl_120dpd_2m', 'num_rev_accts', 'inq_last_12m', 'total_cu_tl', 'open_acc_6m', 'next_pymnt_d', 'inq_last_6mths', 'num_accts_ever_120_pd', 'funded_amnt_inv', 'last_pymnt_d', 'inq_fi', 'num_tl_90g_dpd_24m', 'emp_title', 'open_il_24m', 'delinq_amnt', 'earliest_cr_line', 'issue_d', 'tax_liens', 'verification_status_joint', 'last_credit_pull_d', 'avg_cur_bal', 'mo_sin_rcnt_rev_tl_op', 'all_util', 'total_il_high_credit_limit', 'total_bal_il'}