# Ensemble Learning

## Initial Imports

In [148]:
import warnings
warnings.filterwarnings('ignore')

In [149]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [150]:
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score

## Read the CSV and Perform Basic Data Cleaning

In [205]:
# Locate the 2019 Q1 loan statistics file relative to the notebook
file_path = Path('Resources') / 'LoanStats_2019Q1.csv'
# Read the CSV, then keep only the rows that contain no missing values
raw_loans = pd.read_csv(file_path)
loan_df = raw_loans.dropna()
# Show the first ten rows as a quick sanity check
loan_df.head(n=10)

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,dti,...,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,Mar-2019,low_risk,n,27.24,...,85.7,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N
1,25000.0,0.2,929.09,MORTGAGE,105000.0,Verified,Mar-2019,low_risk,n,20.23,...,91.2,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N
2,20000.0,0.2,529.88,MORTGAGE,56000.0,Verified,Mar-2019,low_risk,n,24.26,...,66.7,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N
3,10000.0,0.164,353.55,RENT,92000.0,Verified,Mar-2019,low_risk,n,31.44,...,100.0,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,Mar-2019,low_risk,n,18.76,...,100.0,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N
5,25000.0,0.1797,634.43,MORTGAGE,90000.0,Source Verified,Mar-2019,low_risk,n,18.49,...,93.1,66.7,0.0,0.0,550717.0,418934.0,18800.0,369051.0,N,N
6,20400.0,0.2,540.48,RENT,51315.0,Source Verified,Mar-2019,low_risk,n,25.63,...,100.0,0.0,1.0,0.0,40023.0,23469.0,3500.0,27523.0,N,N
7,10000.0,0.1557,349.46,OWN,66000.0,Source Verified,Mar-2019,low_risk,n,9.0,...,100.0,16.7,0.0,0.0,29300.0,22377.0,8400.0,12500.0,N,N
8,14000.0,0.0881,443.96,RENT,45000.0,Not Verified,Mar-2019,low_risk,n,10.24,...,100.0,0.0,0.0,0.0,25100.0,9724.0,12100.0,3000.0,N,N
9,10000.0,0.1474,345.39,RENT,63295.0,Verified,Mar-2019,low_risk,n,39.34,...,97.2,71.4,0.0,0.0,119822.0,102924.0,15300.0,80722.0,N,N


## Split the Data into Training and Testing

In [206]:
# Separate the feature columns (X) from the target column (y).
# Column names are compared with `!=`: `('loan_status')` is a plain string, not a
# tuple, so the previous `not in` was a substring test that would also have dropped
# any column whose name is a fragment of 'loan_status' (e.g. 'loan' or 'status').
x_cols = [col for col in loan_df.columns if col != 'loan_status']
# One-hot encode the categorical feature columns. Read from `loan_df`: the name
# `df` is not defined anywhere in the visible notebook and raises NameError on a
# fresh kernel.
X = pd.get_dummies(loan_df[x_cols]).copy()
y = loan_df['loan_status']

In [203]:
# Check the balance of our target values: describe() on the label column reports
# the row count, the number of distinct classes, the most frequent class and how
# often that class occurs
y.describe()

count        68817
unique           2
top       low_risk
freq         68470
Name: loan_status, dtype: object

In [204]:
# Hold out a test set with a fixed seed; stratifying on y keeps the class
# proportions the same in the training and testing parts
split_parts = train_test_split(X, y, stratify=y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

## Data Pre-Processing

Scale the training and testing data using the `StandardScaler` from `sklearn`. Remember that when scaling the data, you only scale the features data (`X_train` and `X_test`).

In [184]:
# Create the StandardScaler instance (not fitted yet; nothing is learned from the data here)
scaler = StandardScaler()

In [185]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
# so that no statistics from the test set influence the transformation.
# X_scaler holds whatever fit() returns (in scikit-learn, the fitted scaler itself).
X_scaler = scaler.fit(X_train)

In [186]:
# Apply the already-fitted scaler to the training features and then the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Ensemble Learners

In this section, you will compare two ensemble algorithms to determine which algorithm results in the best performance. You will train a Balanced Random Forest Classifier and an Easy Ensemble Classifier. For each algorithm, be sure to complete the following steps:

1. Train the model using the training data. 
2. Calculate the balanced accuracy score from sklearn.metrics.
3. Display the confusion matrix from sklearn.metrics.
4. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.
5. For the Balanced Random Forest Classifier only, print the feature importance sorted in descending order (most important feature to least important) along with the feature score

Note: Use a random state of 1 for each algorithm to ensure consistency between tests

### Balanced Random Forest Classifier

In [187]:
# Build a 100-tree Balanced Random Forest with a fixed seed and
# train it on the scaled training features
brf = BalancedRandomForestClassifier(random_state=1, n_estimators=100)
brf.fit(X_train_scaled, y_train)

BalancedRandomForestClassifier(random_state=1)

In [188]:
# Predict the test labels with the forest, then compute the balanced accuracy score
y_pred_brf = brf.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred_brf)

0.7871246640962729

In [189]:
# Confusion matrix for the Balanced Random Forest predictions
# (rows are the true classes, columns the predicted classes)
confusion_matrix(y_true=y_test, y_pred=y_pred_brf)

array([[   58,    29],
       [ 1582, 15536]])

In [190]:
# Build the imbalanced classification report for the forest, then print it
brf_report = classification_report_imbalanced(y_test, y_pred_brf)
print(brf_report)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.04      0.67      0.91      0.07      0.78      0.59        87
   low_risk       1.00      0.91      0.67      0.95      0.78      0.62     17118

avg / total       0.99      0.91      0.67      0.95      0.78      0.62     17205



In [191]:
# List the features sorted in descending order by feature importance.
# `importances` was never assigned in the visible notebook, so take the scores
# from the fitted forest; this keeps the cell working on a fresh kernel.
importances = brf.feature_importances_
# Pair each score with its column name (X.columns has the same order as the
# columns the forest was trained on) and sort from most to least important.
importances_sorted = sorted(zip(importances, X.columns), reverse=True)
# Show the ten most important features
importances_sorted[:10]

[(0.07376667607601396, 'total_rec_prncp'),
 (0.06390324452717588, 'total_rec_int'),
 (0.06073336071656837, 'total_pymnt_inv'),
 (0.05811195697921674, 'total_pymnt'),
 (0.04951778391272079, 'last_pymnt_amnt'),
 (0.02458051789018817, 'int_rate'),
 (0.020398879691407974, 'out_prncp'),
 (0.018625883307901298, 'dti'),
 (0.018378884967316288, 'max_bal_bc'),
 (0.017480030880564042, 'issue_d_Jan-2019')]

### Easy Ensemble Classifier

In [197]:
# Train the Easy Ensemble Classifier (100 estimators, fixed seed) on the
# scaled training features
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train_scaled, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [198]:
# Predict the test labels with the Easy Ensemble, then compute the balanced accuracy score
y_pred_eec = eec.predict(X_test_scaled)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred_eec)

0.9254565671948463

In [199]:
# Confusion matrix for the Easy Ensemble predictions
# (rows are the true classes, columns the predicted classes)
confusion_matrix(y_true=y_test, y_pred=y_pred_eec)

array([[   79,     8],
       [  978, 16140]])

In [200]:
# Build the imbalanced classification report for the Easy Ensemble, then print it
eec_report = classification_report_imbalanced(y_test, y_pred_eec)
print(eec_report)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.07      0.91      0.94      0.14      0.93      0.85        87
   low_risk       1.00      0.94      0.91      0.97      0.93      0.86     17118

avg / total       0.99      0.94      0.91      0.97      0.93      0.86     17205



### Final Questions

1. Which model had the best balanced accuracy score?

 The Easy Ensemble Classifier had the best balanced accuracy score at 93%. 

2. Which model had the best recall score?

The Easy Ensemble Classifier had the best recall score at 94%.

3. Which model had the best geometric mean score?

The Easy Ensemble Classifier had the best geometric mean score at 93%.
   
4. What are the top three features?

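The top three features (from the Balanced Random Forest) are total_rec_prncp (importance 0.074), total_rec_int (0.064) and total_pymnt_inv (0.061), i.e. roughly 7.4%, 6.4% and 6.1% of the total importance.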