# Credit Risk Resampling Techniques

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import os
from collections import Counter
import numpy as np
import pandas as pd

# sklearn dependencies
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# imblearn dependencies
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN

# Read the CSV and Perform Basic Data Cleaning

In [3]:
columns = [
    "loan_amnt", "int_rate", "installment", "home_ownership",
    "annual_inc", "verification_status", "issue_d", "loan_status",
    "pymnt_plan", "dti", "delinq_2yrs", "inq_last_6mths",
    "open_acc", "pub_rec", "revol_bal", "total_acc",
    "initial_list_status", "out_prncp", "out_prncp_inv", "total_pymnt",
    "total_pymnt_inv", "total_rec_prncp", "total_rec_int", "total_rec_late_fee",
    "recoveries", "collection_recovery_fee", "last_pymnt_amnt", "next_pymnt_d",
    "collections_12_mths_ex_med", "policy_code", "application_type", "acc_now_delinq",
    "tot_coll_amt", "tot_cur_bal", "open_acc_6m", "open_act_il",
    "open_il_12m", "open_il_24m", "mths_since_rcnt_il", "total_bal_il",
    "il_util", "open_rv_12m", "open_rv_24m", "max_bal_bc",
    "all_util", "total_rev_hi_lim", "inq_fi", "total_cu_tl",
    "inq_last_12m", "acc_open_past_24mths", "avg_cur_bal", "bc_open_to_buy",
    "bc_util", "chargeoff_within_12_mths", "delinq_amnt", "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op", "mo_sin_rcnt_rev_tl_op", "mo_sin_rcnt_tl", "mort_acc",
    "mths_since_recent_bc", "mths_since_recent_inq", "num_accts_ever_120_pd", "num_actv_bc_tl",
    "num_actv_rev_tl", "num_bc_sats", "num_bc_tl", "num_il_tl",
    "num_op_rev_tl", "num_rev_accts", "num_rev_tl_bal_gt_0",
    "num_sats", "num_tl_120dpd_2m", "num_tl_30dpd", "num_tl_90g_dpd_24m",
    "num_tl_op_past_12m", "pct_tl_nvr_dlq", "percent_bc_gt_75", "pub_rec_bankruptcies",
    "tax_liens", "tot_hi_cred_lim", "total_bal_ex_mort", "total_bc_limit",
    "total_il_high_credit_limit", "hardship_flag", "debt_settlement_flag"
]

target = ["loan_status"]

In [4]:
# Load the data
file_path = os.path.join("data",'LoanStats_2019Q1.csv')
df = pd.read_csv(file_path, skiprows=1)[:-2]
df = df.loc[:, columns].copy()

# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')

# Drop the null rows
df = df.dropna()

# Remove the `Issued` loan status
issued_mask = df['loan_status'] != 'Issued'
df = df.loc[issued_mask]

# convert interest rate to numerical
df['int_rate'] = df['int_rate'].str.replace('%', '')
df['int_rate'] = df['int_rate'].astype('float') / 100


# Convert the target column values to low_risk and high_risk based on their values
x = {'Current': 'low_risk'}   
df = df.replace(x)

x = dict.fromkeys(['Late (31-120 days)', 'Late (16-30 days)', 'Default', 'In Grace Period'], 'high_risk')    
df = df.replace(x)

df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N


In [5]:
# look at what still needs cleaned
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68817 entries, 0 to 68816
Data columns (total 86 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   68817 non-null  float64
 1   int_rate                    68817 non-null  float64
 2   installment                 68817 non-null  float64
 3   home_ownership              68817 non-null  object 
 4   annual_inc                  68817 non-null  float64
 5   verification_status         68817 non-null  object 
 6   issue_d                     68817 non-null  object 
 7   loan_status                 68817 non-null  object 
 8   pymnt_plan                  68817 non-null  object 
 9   dti                         68817 non-null  float64
 10  delinq_2yrs                 68817 non-null  float64
 11  inq_last_6mths              68817 non-null  float64
 12  open_acc                    68817 non-null  float64
 13  pub_rec                     688

In [6]:
df["issue_d"] = pd.to_datetime(df["issue_d"])
df["issue_d"] = pd.to_numeric(df["issue_d"])
df["next_pymnt_d"] = pd.to_datetime(df["next_pymnt_d"])
df["next_pymnt_d"] = pd.to_numeric(df["next_pymnt_d"])
df["hardship_flag"] = df["hardship_flag"] != 'N'  # N = False
df["debt_settlement_flag"] = df["debt_settlement_flag"] != "N" # N = False
df["pymnt_plan"] = df["pymnt_plan"] != 'n'  # n = False
df["loan_status"] = df["loan_status"] != "low_risk" # low_risk = FALSE ; high_risk = TRUE
pd.get_dummies(df, columns=["home_ownership","verification_status", "application_type", "initial_list_status"])


Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,issue_d,loan_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,application_type_Individual,application_type_Joint App,initial_list_status_f,initial_list_status_w
0,10500.0,0.1719,375.35,66000.0,1551398400000000000,False,False,27.24,0.0,0.0,...,0,0,1,0,1,0,1,0,0,1
1,25000.0,0.2000,929.09,105000.0,1551398400000000000,False,False,20.23,0.0,0.0,...,1,0,0,0,0,1,1,0,0,1
2,20000.0,0.2000,529.88,56000.0,1551398400000000000,False,False,24.26,0.0,0.0,...,1,0,0,0,0,1,1,0,0,1
3,10000.0,0.1640,353.55,92000.0,1551398400000000000,False,False,31.44,0.0,1.0,...,0,0,1,0,0,1,1,0,0,1
4,22000.0,0.1474,520.39,52000.0,1551398400000000000,False,False,18.76,0.0,1.0,...,1,0,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68812,10000.0,0.1502,346.76,26000.0,1546300800000000000,False,False,9.60,0.0,0.0,...,0,0,1,0,1,0,1,0,0,1
68813,12000.0,0.2727,368.37,63000.0,1546300800000000000,False,False,29.07,0.0,0.0,...,0,0,1,1,0,0,1,0,0,1
68814,5000.0,0.1992,185.62,52000.0,1546300800000000000,False,False,14.86,0.0,0.0,...,1,0,0,0,1,0,1,0,0,1
68815,40000.0,0.0646,1225.24,520000.0,1546300800000000000,False,False,9.96,0.0,1.0,...,1,0,0,0,0,1,1,0,1,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68817 entries, 0 to 68816
Data columns (total 86 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   68817 non-null  float64
 1   int_rate                    68817 non-null  float64
 2   installment                 68817 non-null  float64
 3   home_ownership              68817 non-null  object 
 4   annual_inc                  68817 non-null  float64
 5   verification_status         68817 non-null  object 
 6   issue_d                     68817 non-null  int64  
 7   loan_status                 68817 non-null  bool   
 8   pymnt_plan                  68817 non-null  bool   
 9   dti                         68817 non-null  float64
 10  delinq_2yrs                 68817 non-null  float64
 11  inq_last_6mths              68817 non-null  float64
 12  open_acc                    68817 non-null  float64
 13  pub_rec                     688

In [8]:
cleaned_df = df.drop(columns=["home_ownership","verification_status", "application_type", "initial_list_status"])
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68817 entries, 0 to 68816
Data columns (total 82 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_amnt                   68817 non-null  float64
 1   int_rate                    68817 non-null  float64
 2   installment                 68817 non-null  float64
 3   annual_inc                  68817 non-null  float64
 4   issue_d                     68817 non-null  int64  
 5   loan_status                 68817 non-null  bool   
 6   pymnt_plan                  68817 non-null  bool   
 7   dti                         68817 non-null  float64
 8   delinq_2yrs                 68817 non-null  float64
 9   inq_last_6mths              68817 non-null  float64
 10  open_acc                    68817 non-null  float64
 11  pub_rec                     68817 non-null  float64
 12  revol_bal                   68817 non-null  float64
 13  total_acc                   688

In [9]:
data_scaler = StandardScaler()
scaled_data = data_scaler.fit_transform(cleaned_df)

# Split the Data into Training and Testing

In [10]:
# Create our features
X = data_scaler.fit_transform(cleaned_df.drop(columns="loan_status"))


# Create our target
y = cleaned_df["loan_status"]

In [11]:
X[:5,]

array([[-6.01092723e-01,  9.17984364e-01, -3.65558353e-01,
        -1.92194797e-01,  1.67060072e+00,  0.00000000e+00,
         2.70400563e-01, -3.03142532e-01, -6.56491511e-01,
        -7.61659174e-01, -3.74204323e-01, -7.32521829e-01,
        -8.65857295e-01, -5.81245133e-01, -5.80819581e-01,
        -5.00465445e-01, -5.00293345e-01, -3.22863900e-01,
        -8.99604190e-01, -3.22736265e-02,  0.00000000e+00,
         0.00000000e+00, -1.73835281e-01,  7.88142711e-01,
        -1.20112470e-01,  0.00000000e+00,  0.00000000e+00,
        -5.89694662e-03, -7.26922276e-01,  1.67997825e+00,
         2.39253304e-01,  1.19137226e+00,  1.35299632e+00,
        -8.08956027e-01, -1.34187152e-01, -4.57175710e-01,
        -1.63101750e-01, -2.48640217e-01, -8.32039942e-01,
         1.24221966e-01, -1.04213361e+00, -2.47582393e-01,
        -5.91660154e-01,  7.55780406e-01,  3.72749228e-01,
        -5.90524188e-01, -8.09152878e-01,  1.04494582e+00,
        -7.82090839e-02, -7.78953140e-03, -5.69940224e-0

In [12]:
# Check the balance of our target values
y.value_counts()

False    68470
True       347
Name: loan_status, dtype: int64

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=9, stratify=y) 
classifier = LogisticRegression(random_state=9, max_iter=200)

# Oversampling

In this section, you will compare two oversampling algorithms to determine which algorithm results in the best performance. You will oversample the data using the naive random oversampling algorithm and the SMOTE algorithm. For each algorithm, be sure to complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

### Naive Random Oversampling

In [14]:
# Resample the training data with the RandomOversampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train,y_train)
print(Counter(y_train), "\n", Counter(y_resampled))   # False = low_risk ; True = high_risk

Counter({False: 51352, True: 260}) 
 Counter({False: 51352, True: 51352})


In [15]:
# Train the Logistic Regression model using the resampled data
classifier.fit(X_resampled, y_resampled)
# scaled_classifier.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=200, random_state=9)

In [16]:
over_sampled_predict = classifier.predict(X_test)
print("Accuracy Score: ", metrics.balanced_accuracy_score(y_test, over_sampled_predict))

Accuracy Score:  0.8069596029184847


In [17]:
# Display the confusion matrix
metrics.confusion_matrix(y_test, over_sampled_predict)

array([[14641,  2477],
       [   21,    66]], dtype=int64)

In [18]:
# Print the imbalanced classification report
over_sampled_predict = classifier.predict(X_test)
print("False = low_risk", "\n True = high_risk")
print("===============================================")
print(classification_report_imbalanced(y_test, over_sampled_predict))


False = low_risk 
 True = high_risk
                   pre       rec       spe        f1       geo       iba       sup

      False       1.00      0.86      0.76      0.92      0.81      0.66     17118
       True       0.03      0.76      0.86      0.05      0.81      0.64        87

avg / total       0.99      0.85      0.76      0.92      0.81      0.66     17205



### SMOTE Oversampling

In [19]:
# Resample the training data with SMOTE
X_resampled, y_resampled = SMOTE(random_state=9, sampling_strategy='auto').fit_resample(X_train, y_train)


In [20]:
# Train the Logistic Regression model using the resampled data
classifier.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=200, random_state=9)

In [21]:
# Calculated the balanced accuracy score
smote_predict = classifier.predict(X_test)
metrics.balanced_accuracy_score(y_test, smote_predict)


0.7935516556478157

In [22]:
# Display the confusion matrix
metrics.confusion_matrix(y_test, smote_predict)

array([[14969,  2149],
       [   25,    62]], dtype=int64)

In [23]:
# Print the imbalanced classification report
print("False = low_risk", "\n True = high_risk")
print("===============================================")
print(classification_report_imbalanced(y_test, smote_predict))


False = low_risk 
 True = high_risk
                   pre       rec       spe        f1       geo       iba       sup

      False       1.00      0.87      0.71      0.93      0.79      0.63     17118
       True       0.03      0.71      0.87      0.05      0.79      0.61        87

avg / total       0.99      0.87      0.71      0.93      0.79      0.63     17205



# Undersampling

In this section, you will test an undersampling algorithms to determine which algorithm results in the best performance compared to the oversampling algorithms above. You will undersample the data using the Cluster Centroids algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [24]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
X_resampled, y_resampled = ClusterCentroids(random_state=9).fit_resample(X_train, y_train)

In [25]:
# Train the Logistic Regression model using the resampled data
classifier.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=200, random_state=9)

In [26]:
# Calculated the balanced accuracy score
cc_predict = classifier.predict(X_test)
print(metrics.balanced_accuracy_score(y_test, cc_predict))

0.7727638984573608


In [27]:
# Display the confusion matrix
print(metrics.confusion_matrix(y_test, cc_predict))

[[12880  4238]
 [   18    69]]


In [28]:
# Print the imbalanced classification report
print("False = low_risk", "\n True = high_risk")
print("===============================================")
print(classification_report_imbalanced(y_test, smote_predict))

False = low_risk 
 True = high_risk
                   pre       rec       spe        f1       geo       iba       sup

      False       1.00      0.87      0.71      0.93      0.79      0.63     17118
       True       0.03      0.71      0.87      0.05      0.79      0.61        87

avg / total       0.99      0.87      0.71      0.93      0.79      0.63     17205



# Combination (Over and Under) Sampling

In this section, you will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. You will resample the data using the SMOTEENN algorithm and complete the folliowing steps:

1. View the count of the target classes using `Counter` from the collections library. 
3. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classication report using the `imbalanced_classification_report` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [29]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
x_resampled, y_resampled = SMOTEENN(random_state=9).fit_resample(X_train, y_train)

In [30]:
# Train the Logistic Regression model using the resampled data
classifier.fit(x_resampled, y_resampled)

LogisticRegression(max_iter=200, random_state=9)

In [31]:
# Calculated the balanced accuracy score
smoteenn_predict = classifier.predict(X_test)
print("Accuracy Score:  ", metrics.accuracy_score(y_test, smoteenn_predict))

Accuracy Score:   0.863644289450741


In [32]:
# Display the confusion matrix
print(metrics.confusion_matrix(y_test, smoteenn_predict))

[[14796  2322]
 [   24    63]]


In [33]:
# Print the imbalanced classification report
print("False = low_risk", "\n True = high_risk")
print("===============================================")
print(classification_report_imbalanced(y_test, smoteenn_predict))

False = low_risk 
 True = high_risk
                   pre       rec       spe        f1       geo       iba       sup

      False       1.00      0.86      0.72      0.93      0.79      0.63     17118
       True       0.03      0.72      0.86      0.05      0.79      0.62        87

avg / total       0.99      0.86      0.72      0.92      0.79      0.63     17205



# Results: 
The models above differ in the samples used for the training and use the same X values to test their performance. When you are comparing models that have the same output you can use the Adjusted Rand score of the model to help determine which is better, not just the rand score. If the models had different outputs then this metric would not be able to be compared.

In [34]:
print("Over Sampled Model: \n    ",  metrics.rand_score(y_test, over_sampled_predict), "    Rand Socre\n    ",metrics.adjusted_rand_score(y_test, over_sampled_predict), "   Adjusted Rand Score")

Over Sampled Model: 
     0.7517653450827282     Rand Socre
     0.03382714271996137    Adjusted Rand Score


In [35]:
print("Under Sampled ClusterCentroids Model: \n    ",  metrics.rand_score(y_test, cc_predict), "     Rand Socre\n    ",metrics.adjusted_rand_score(y_test, cc_predict), "   Adjusted Rand Score")

Under Sampled ClusterCentroids Model: 
     0.6276222401459594      Rand Socre
     0.014519957339254248    Adjusted Rand Score


In [36]:
print("SMOTE Over Sampled Model: \n    ",  metrics.rand_score(y_test, smote_predict), "    Rand Socre\n    ",metrics.adjusted_rand_score(y_test, smote_predict), "   Adjusted Rand Score")

SMOTE Over Sampled Model: 
     0.7792029333486309     Rand Socre
     0.03815906414934229    Adjusted Rand Score


In [37]:
print("SMOTEENN Sampled Model: \n    ",  metrics.rand_score(y_test, smoteenn_predict), "     Rand Socre\n    ",metrics.adjusted_rand_score(y_test, smoteenn_predict), "   Adjusted Rand Score")

SMOTEENN Sampled Model: 
     0.7644606483316161      Rand Socre
     0.035002026849903486    Adjusted Rand Score


The SMOTE model is the best of those four, however the Adjusted Rand score is not high enough to consider any of them for production level code. I believe the Adjusted Rand Score is so low because the sensitivity detection for high risk is nearly non-existant. Finding a different model is highly recommended.