# Part 3: Modeling

### Content List
- [Imports](#Imports)
- [Read in CSV](#Read-in-CSV)
- [Defining Inputs](#Defining-Inputs)
- [Functions](#Functions)
- [Modeling: Logistic Regression](#Modeling:-Logistic-Regression)
- [Modeling: Random Forest](#Modeling:-Random-Forest)
- [Modeling: XGBoost](#Modeling:-XGBoost)
- [Conclusions](#Conclusions)

### Imports

In [30]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Import sklearn elements
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, r2_score, f1_score
from sklearn.ensemble import RandomForestClassifier

#import xgboost
from xgboost import XGBClassifier

### Read in CSV

In [31]:
#read in file from csv to dataframe
data_read = pd.read_csv('./dataset/cleaned_FEATURES.csv')


In [32]:
data = data_read

In [33]:
num_cols = data.select_dtypes(include=('int64', 'float64')).columns

In [34]:
list(num_cols)

['loan_amnt',
 'int_rate',
 'installment',
 'grade',
 'emp_length',
 'annual_inc',
 'dti',
 'delinq_2yrs',
 'fico_range_low',
 'fico_range_high',
 'inq_last_6mths',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'out_prncp',
 'out_prncp_inv',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_amnt',
 'last_fico_range_high',
 'last_fico_range_low',
 'collections_12_mths_ex_med',
 'acc_now_delinq',
 'tot_coll_amt',
 'tot_cur_bal',
 'acc_open_past_24mths',
 'avg_cur_bal',
 'bc_open_to_buy',
 'bc_util',
 'chargeoff_within_12_mths',
 'delinq_amnt',
 'mort_acc',
 'num_accts_ever_120_pd',
 'num_actv_bc_tl',
 'num_actv_rev_tl',
 'num_bc_sats',
 'num_bc_tl',
 'num_il_tl',
 'num_op_rev_tl',
 'num_rev_accts',
 'num_rev_tl_bal_gt_0',
 'num_sats',
 'num_tl_30dpd',
 'num_tl_90g_dpd_24m',
 'num_tl_op_past_12m',
 'pct_tl_nvr_dlq',
 'percent_bc_gt_75',
 'pub_rec_bankruptcies',
 'tax_liens',
 'tot_hi_cred_lim',
 'total_bal_ex_mort',
 'total_il_high_credit_limit',
 'classes',
 

In [35]:
data.dtypes.value_counts()

bool       75
float64    71
int64       9
object      4
Name: count, dtype: int64

## Defining Inputs

In [36]:
X = data[num_cols].drop(columns='classes')
y = data['classes']

In [37]:
X.dtypes.value_counts()

float64    71
int64       8
Name: count, dtype: int64

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=420,
                                                    stratify = y)

In [39]:
# save the test set
X_test.to_csv('./dataset/test_set.csv', index=False)

In [40]:
X_train.shape

(847634, 79)

In [41]:
y_train.shape

(847634,)

## Functions

In [42]:
def metrics(model):
    preds = model.predict(X_test) #generate predictions
    
    test_conf = confusion_matrix(y_test,# True values.
                                  preds) # Predicted values.
    
    tn, fp, fn, tp = test_conf.ravel() #unravel values to use in metrics
    
    accuracy = accuracy_score(y_test, preds)
    print("Accuracy score: %.2f%%" % (accuracy * 100.0))

    recall = recall_score(y_test, preds)
    print("Recall score: %.2f%%" % (recall * 100.0))

    precision = precision_score(y_test, preds)
    print("Precision score: %.2f%%" % (precision * 100.0))

    specificity = tn / (tn+fp)
    specificity = specificity * 100.0
    print(f"Specificity score: {round((specificity),2)}%")
    
    F1_SCORE = f1_score(y_test, preds)
    F1_SCORE = F1_SCORE * 100.0
    print(f'F1 score: {round((F1_SCORE),2)}%')
    
    df_conf= pd.DataFrame(test_conf, index =  ['Actual Failure', 'Actual Success'],
                    columns = ['Predicted Failure', 'Predicted Success'])
    return(df_conf)

In [43]:
def cv_score(model):
    cv_train = cross_val_score(model, X_train, y_train, cv=3).mean()
    cv_test = cross_val_score(model, X_test, y_test, cv=3).mean()
    print(f'Mean CV Score for Training: {cv_train}')
    print(f'Mean CV Score for Testing: {cv_test}')

In [44]:
avg_rate = data['int_rate'].mean()
avg_rate = round((avg_rate),2)
print(f'The Average Interest Rate is: {avg_rate}%')

avg_loan = data['loan_amnt'].mean()
avg_loan = round((avg_loan),2)
print(f'The Average Loan amount is: {avg_loan}$')

The Average Interest Rate is: 13.3%
The Average Loan amount is: 14821.58$


## Modeling 1: Logistic Regression

**Assumptions of Logistic Regression:**

In [45]:
# Step 1: Instantiate our model.
logreg = LogisticRegression(solver='liblinear')

# Step 2: Fit our model.
logreg.fit(X_train, y_train)

In [46]:
#return the intercept and coefficients
print(f'Logistic Regression Intercept: {logreg.intercept_}')
print(f'Logistic Regression Coefficient: {logreg.coef_}')

Logistic Regression Intercept: [4.97078702e-12]
Logistic Regression Coefficient: [[ 2.91914731e-08  3.53466521e-11  1.04877728e-09  3.42339921e-11
   2.96135692e-11  3.00868089e-07  6.52821497e-11  1.01689814e-12
   3.50751240e-09  3.52739584e-09  1.73492930e-12  4.96904129e-11
   9.98035748e-13  4.67828286e-08  2.24275970e-10  1.13395583e-10
  -1.41957672e-08 -1.41928000e-08 -7.66230050e-09 -1.21761550e-09
   8.78324149e-08  4.76390364e-09  5.17227386e-09  3.56217580e-14
   1.30469390e-14  1.62396532e-09  3.13507973e-07  1.49739729e-11
   3.42623447e-08  5.46713815e-08  2.63656063e-10  3.76256088e-14
  -5.64474480e-11  7.36504947e-12  2.24643665e-12  1.48900035e-11
   2.17816178e-11  2.05269969e-11  3.92897777e-11  3.53295077e-11
   3.61029112e-11  6.95429067e-11  2.18604811e-11  4.94837665e-11
   6.40150594e-15  3.21958698e-13  6.74402308e-12  4.69021894e-10
   1.78814213e-10  7.04406806e-13  1.59727259e-13  4.29567399e-07
   1.27876872e-07  1.09983315e-07  2.86811513e-08  1.13471360

In [47]:
#how to visualize these results?
print(f'Logistic Regression train score: {logreg.score(X_train, y_train)}')
print(f'Logistic Regression test score: {logreg.score(X_test, y_test)}')

Logistic Regression train score: 0.8501534860564819
Logistic Regression test score: 0.8504564073201347


In [48]:
cv_score(logreg)

Mean CV Score for Training: 0.8006757467245151
Mean CV Score for Testing: 0.803040101371134


In [49]:
metrics(logreg)

Accuracy score: 85.05%
Recall score: 99.75%
Precision score: 84.17%
Specificity score: 30.72%
F1 score: 91.3%


Unnamed: 0,Predicted Failure,Predicted Success
Actual Failure,23774,53616
Actual Success,709,285173


In [50]:
data['classes'].value_counts(normalize=True)

classes
1    0.786963
0    0.213037
Name: proportion, dtype: float64

## Modeling 2: Random Forest

In [51]:
rf = RandomForestClassifier(max_depth= 5, max_features= 5, n_estimators= 100)
rf.fit(X_train, y_train)

In [52]:
cv_score(rf)

Mean CV Score for Training: 0.9393618038467381
Mean CV Score for Testing: 0.9406780510791665


In [53]:
print(f'Random Forest train score: {rf.score(X_train, y_train)}')
print(f'Random Forest test score: {rf.score(X_test, y_test)}')

Random Forest train score: 0.9437811602649256
Random Forest test score: 0.9432381246008501


In [54]:
metrics(rf)

Accuracy score: 94.32%
Recall score: 99.85%
Precision score: 93.39%
Specificity score: 73.91%
F1 score: 96.51%


Unnamed: 0,Predicted Failure,Predicted Success
Actual Failure,57197,20193
Actual Success,427,285455


In [55]:
feature_importance = pd.Series(data = rf.feature_importances_,
                              index = X.columns)
feature_importance.sort_values(ascending = False)

recoveries*collection_recovery_fee          1.661125e-01
recoveries                                  1.496501e-01
last_fico_range_low                         1.366130e-01
last_fico_range_high*last_fico_range_low    1.283434e-01
collection_recovery_fee                     1.123957e-01
                                                ...     
num_tl_30dpd                                2.294738e-08
acc_now_delinq                              0.000000e+00
tot_coll_amt                                0.000000e+00
delinq_amnt                                 0.000000e+00
chargeoff_within_12_mths                    0.000000e+00
Length: 79, dtype: float64

### Modeling 3: XGBoost

In [56]:
xgb = XGBClassifier()

In [57]:
xgb.fit(X_train, y_train)

In [58]:
cv_score(xgb)

Mean CV Score for Training: 0.9805175346558305
Mean CV Score for Testing: 0.9794368953145876


In [59]:
print(f'XGBoost train score: {xgb.score(X_train, y_train)}')
print(f'XGBoost test score: {xgb.score(X_test, y_test)}')

XGBoost train score: 0.9835907950837272
XGBoost test score: 0.9810472593538726


In [60]:
metrics(xgb)

Accuracy score: 98.10%
Recall score: 99.03%
Precision score: 98.57%
Specificity score: 94.69%
F1 score: 98.8%


Unnamed: 0,Predicted Failure,Predicted Success
Actual Failure,73280,4110
Actual Success,2775,283107


In [73]:
feature_importance = pd.Series(data = xgb.feature_importances_,
                              index = X.columns)
feature_importance.sort_values(ascending = False)

recoveries                                  0.655819
out_prncp                                   0.115541
last_fico_range_high                        0.108619
last_pymnt_amnt                             0.035102
desc_length*all_text_length                 0.013337
                                              ...   
last_fico_range_high*last_fico_range_low    0.000000
fico_range_high                             0.000000
last_fico_range_low                         0.000000
collection_recovery_fee                     0.000000
out_prncp_inv                               0.000000
Length: 79, dtype: float32

In [77]:
print(f"Shape of X: {X.shape}")
print(f"Number of features in model: {len(model.feature_importances_)}")
print(f"Columns in X: {X.columns.tolist()}")


Shape of X: (1210906, 79)
Number of features in model: 10
Columns in X: ['loan_amnt', 'int_rate', 'installment', 'grade', 'emp_length', 'annual_inc', 'dti', 'delinq_2yrs', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'recoveries', 'collection_recovery_fee', 'last_pymnt_amnt', 'last_fico_range_high', 'last_fico_range_low', 'collections_12_mths_ex_med', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt', 'mort_acc', 'num_accts_ever_120_pd', 'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl', 'num_il_tl', 'num_op_rev_tl', 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies', 'tax_liens', 'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_il_high_credit_li

In [78]:
# Get features used by the model
if hasattr(model, "get_booster"):
    model_features = model.get_booster().feature_names
else:
    model_features = [f"f{i}" for i in range(len(model.feature_importances_))]

print(f"Features used by the model: {model_features}")


Features used by the model: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9']


In [79]:
# If the model was trained on a DataFrame, get the original column names
X_train_columns = X_train.columns.tolist()[:10]  # Adjust this based on how many features you selected

# Map the generic model features to dataset columns
feature_mapping = {f"f{i}": X_train_columns[i] for i in range(len(X_train_columns))}
print("Feature Mapping:")
print(feature_mapping)


Feature Mapping:
{'f0': 'loan_amnt', 'f1': 'int_rate', 'f2': 'installment', 'f3': 'grade', 'f4': 'emp_length', 'f5': 'annual_inc', 'f6': 'dti', 'f7': 'delinq_2yrs', 'f8': 'fico_range_low', 'f9': 'fico_range_high'}


In [81]:
# Assuming `model` is your trained XGBoost model
feature_importances = model.feature_importances_  # Extract importance values

# Create a DataFrame with generic feature names
importance_df = pd.DataFrame({
    "Feature": [f"f{i}" for i in range(len(feature_importances))],
    "Importance": feature_importances
}).sort_values(by="Importance", ascending=False)

# Replace generic names with actual column names
importance_df["Feature"] = importance_df["Feature"].replace(feature_mapping)
print(importance_df)


           Feature  Importance
6              dti    0.402437
2      installment    0.210418
8   fico_range_low    0.109162
0        loan_amnt    0.103498
5       annual_inc    0.035053
1         int_rate    0.030565
9  fico_range_high    0.028768
4       emp_length    0.028494
7      delinq_2yrs    0.025808
3            grade    0.025797


In [85]:
print("Class distribution in training set:")
print(y_train.value_counts())


Class distribution in training set:
classes
1    667056
0    180578
Name: count, dtype: int64


In [86]:
from collections import Counter
from xgboost import XGBClassifier

# Calculate the ratio of majority to minority class
class_counts = Counter(y_train)
scale_pos_weight = class_counts[0] / class_counts[1]

# Define the model with the imbalance ratio
model = XGBClassifier(scale_pos_weight=scale_pos_weight)


In [None]:
import os
os.environ["OMP_NUM_THREADS"] = "1"
from imblearn.over_sampling import RandomOverSampler

# Apply RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)



In [92]:
from sklearn.metrics import classification_report, roc_auc_score

model.fit(X_train, y_train)


# Predictions on test set
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Classification report
print(classification_report(y_test, y_pred))

# ROC-AUC score
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))


              precision    recall  f1-score   support

           0       0.91      0.97      0.94     77390
           1       0.99      0.98      0.98    285882

    accuracy                           0.97    363272
   macro avg       0.95      0.97      0.96    363272
weighted avg       0.98      0.97      0.98    363272

ROC-AUC Score: 0.9970211288564386


In [94]:
import joblib

# Assuming `model` is your trained XGBoost model
joblib.dump(model, "model/xgb_model.pkl")
print("Model saved as xgb_model.pkl")


Model saved as xgb_model.pkl
