
 # Phase 3: Deployment Simulation

 Load the trained model pipeline (Phase 2 artifact) and demonstrate prediction on sample data using the `CreditPredictor` class.


 ## 1. Setup and Imports

In [1]:
# %% 

import pandas as pd
import numpy as np
import os
import sys
import joblib # Use joblib as artifact was saved with it


from src.predictor import CreditPredictor

# Import the custom transformer classes explicitly so that they are resolvable
# by name when the pickled pipeline artifact is loaded later in the notebook.
# ImportError (the parent class of ModuleNotFoundError) is caught so that a
# missing or renamed class inside an existing `src.transformers` module is
# reported with the same warning as a missing module, instead of aborting the
# cell with an uncaught exception.
try:
    from src.transformers import EmpLengthConverter, CreditHistoryCalculator, CountBinarizer
except ImportError:
    print("Warning: Could not import transformers directly.")


 ## 2. Define Artifact Paths and Load Predictor

In [2]:
# %% 

# Location handed to CreditPredictor as `artifact_path`. '.' resolves against
# the current working directory, so the notebook is expected to be run from
# the directory that holds the saved pipeline artifact.
ARTIFACT_PATH = '.'

# Start from "no predictor" so later cells can simply test for None; the name
# is only rebound when construction succeeds.
predictor = None
try:
    predictor = CreditPredictor(artifact_path=ARTIFACT_PATH)
except Exception as load_error:
    # Best-effort: report the failure and keep the notebook running.
    print(f"Error loading predictor: {load_error}")
else:
    print("CreditPredictor initialized and pipeline loaded.")

Loading pipeline from: ./credit_risk_pipeline_v1.joblib
Pipeline loaded successfully.
CreditPredictor initialized with pipeline.
CreditPredictor initialized and pipeline loaded.


configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.

  setstate(state)



 ## 3. Prepare Sample Input Data for Prediction
 Input data must be in the raw format expected by the *original* pipeline, i.e. before any feature engineering (FE) or preprocessing is applied.


 ### 3.A Define Sample Data Directly (Hardcoded)

In [3]:
# %% 

# One hand-written application record, kept as a list of dicts so that more
# records can be appended later. Key order defines the DataFrame column order.
# The record must carry all 69 features expected by the pipeline.
sample_data_hardcoded = [
    {
        'loan_amnt': 15000,
        'funded_amnt': 15000,
        'funded_amnt_inv': 15000,
        'term': ' 36 months',
        'int_rate': 10.5,
        'installment': 487.5,
        'grade': 'B',
        'sub_grade': 'B3',
        'emp_length': '5 years',
        'home_ownership': 'RENT',
        'annual_inc': 65000.0,
        'verification_status': 'Source Verified',
        'issue_d': 'Dec-2016',  # Needed for CreditHistoryCalculator
        'purpose': 'debt_consolidation',
        'addr_state': 'NY',
        'dti': 22.5,
        'delinq_2yrs': 0.0,
        'earliest_cr_line': 'Aug-2008',  # Needed for CreditHistoryCalculator
        'fico_range_low': 680.0,
        'fico_range_high': 684.0,
        'inq_last_6mths': 0.0,
        'open_acc': 12.0,
        'pub_rec': 0.0,  # Needed for CountBinarizer
        'revol_bal': 18000.0,
        'revol_util': 75.2,
        'total_acc': 30.0,
        'initial_list_status': 'w',
        'collections_12_mths_ex_med': 0.0,
        'application_type': 'Individual',
        'acc_now_delinq': 0.0,
        'tot_coll_amt': 0.0,
        'tot_cur_bal': 150000.0,
        'total_rev_hi_lim': 25000.0,
        'acc_open_past_24mths': 4.0,
        'avg_cur_bal': 12500.0,
        'bc_open_to_buy': 3000.0,
        'bc_util': 80.0,
        'chargeoff_within_12_mths': 0.0,
        'delinq_amnt': 0.0,
        'mo_sin_old_il_acct': 120.0,
        'mo_sin_old_rev_tl_op': 150.0,
        'mo_sin_rcnt_rev_tl_op': 5.0,
        'mo_sin_rcnt_tl': 5.0,
        'mort_acc': 1.0,  # Needed for CountBinarizer
        'mths_since_recent_bc': 10.0,
        'mths_since_recent_inq': 3.0,
        'num_accts_ever_120_pd': 0.0,
        'num_actv_bc_tl': 4.0,
        'num_actv_rev_tl': 6.0,
        'num_bc_sats': 4.0,
        'num_bc_tl': 8.0,
        'num_il_tl': 10.0,
        'num_op_rev_tl': 6.0,
        'num_rev_accts': 15.0,
        'num_rev_tl_bal_gt_0': 6.0,
        'num_sats': 12.0,
        'num_tl_120dpd_2m': 0.0,
        'num_tl_30dpd': 0.0,
        'num_tl_90g_dpd_24m': 0.0,
        'num_tl_op_past_12m': 2.0,
        'pct_tl_nvr_dlq': 100.0,
        'percent_bc_gt_75': 75.0,
        'pub_rec_bankruptcies': 0.0,  # Needed for CountBinarizer
        'tax_liens': 0.0,
        'tot_hi_cred_lim': 180000.0,
        'total_bal_ex_mort': 40000.0,
        'total_bc_limit': 15000.0,
        'total_il_high_credit_limit': 30000.0,
        'disbursement_method': 'Cash',
    },
]

# One row per record, one column per raw feature.
sample_df_hardcoded = pd.DataFrame(sample_data_hardcoded)

print("Hardcoded Sample DataFrame prepared.")
print(f"Shape: {sample_df_hardcoded.shape}")

Hardcoded Sample DataFrame prepared.
Shape: (1, 69)



 ### 3.B Load Sample Data from CSV

Adjust the path below so that it points to the CSV file containing the applications to score.

In [4]:
# %% 

# Resolve the directory this code runs from: the file's own directory when
# `__file__` is defined (script execution), otherwise the current working
# directory (e.g. inside a notebook kernel, where `__file__` does not exist).
if '__file__' in globals():
    script_dir = os.path.dirname(os.path.abspath(__file__))
else:
    script_dir = os.getcwd()

# The sample CSV is looked up under <parent of script_dir>/submission/data.
project_root = os.path.abspath(os.path.join(script_dir, os.pardir))
SAMPLE_CSV_PATH = os.path.join(project_root, 'submission/data', 'sample_applications.csv')
print(f"\nPath to sample CSV: {SAMPLE_CSV_PATH}")

# Stays None when loading fails so later cells can detect the missing data.
sample_df_csv = None
try:
    # No dtype overrides are passed; pandas infers the column types.
    sample_df_csv = pd.read_csv(SAMPLE_CSV_PATH)
except FileNotFoundError:
    print(f"Error: Sample CSV file not found at {SAMPLE_CSV_PATH}.")
except Exception as read_error:
    print(f"Error loading sample CSV: {read_error}.")
else:
    print("Successfully loaded sample data from CSV.")
    print(f"CSV Sample DataFrame shape: {sample_df_csv.shape}")


Path to sample CSV: /Users/ua00104/Downloads/submission/data/sample_applications.csv
Successfully loaded sample data from CSV.
CSV Sample DataFrame shape: (6, 151)



 ## 4. Make Predictions
 Use the loaded predictor to generate probabilities and binary classifications.


 ### 4.A Predictions using Hardcoded Sample

In [5]:
# %% 

print("\n--- Predictions (Hardcoded Sample) ---")

# Guard clause: bail out when the predictor, its pipeline or the sample frame
# is missing (each of them is None after a failed earlier cell).
if predictor is None or predictor.pipeline is None or sample_df_hardcoded is None:
    print("Skipping prediction: Predictor or sample data not available.")
else:
    try:
        # Score the single hardcoded record.
        probabilities_hardcoded = predictor.predict_proba(sample_df_hardcoded)
        # NOTE(review): predict() presumably applies a default 0.5 threshold
        # unless CreditPredictor was changed - confirm in src/predictor.py.
        predictions_hardcoded = predictor.predict(sample_df_hardcoded)

        # Only one record was submitted, so report element 0 of each result.
        print(f"Predicted Probability (is_bad=1): {probabilities_hardcoded[0]:.4f}")
        print(f"Binary Prediction (0=Good, 1=Bad): {predictions_hardcoded[0]}")
    except Exception as prediction_error:
        # Best-effort demo: report the failure rather than stopping the notebook.
        print(f"Error during prediction: {prediction_error}")


--- Predictions (Hardcoded Sample) ---
Received 1 records for prediction.
Input DataFrame columns: ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'purpose', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'collections_12_mths_ex_med', 'application_type', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim', 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_inq', 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl', 'num_re


 ### 4.B Predictions using CSV Sample

In [6]:
# %% 

print("\n--- Predictions (CSV Sample) ---")

# Guard clauses first, each with its own reason, then the actual scoring.
if predictor is None or predictor.pipeline is None:
    print("Skipping CSV prediction: Predictor not initialized.")
elif sample_df_csv is None:
    print("Skipping CSV prediction: Sample CSV data not loaded.")
else:
    try:
        # Score every record loaded from the CSV file.
        probabilities_csv = predictor.predict_proba(sample_df_csv)
        predictions_csv = predictor.predict(sample_df_csv)

        # Report the results per record, indexed by position in the CSV frame.
        print("Predicted Probabilities (is_bad=1) for CSV records:")
        for record_idx, probability in enumerate(probabilities_csv):
            print(f"  Record {record_idx}: {probability:.4f}")

        print("\nBinary Predictions (0=Good, 1=Bad) for CSV records:")
        for record_idx, label in enumerate(predictions_csv):
            print(f"  Record {record_idx}: {label}")

    except Exception as prediction_error:
        # Best-effort demo: report the failure rather than stopping the notebook.
        print(f"Error during prediction on CSV data: {prediction_error}")


--- Predictions (CSV Sample) ---
Received 6 records for prediction.
Input DataFrame columns: ['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'last_fico_range_high', 'last_fico_range_low', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'appl