## Credit_risk_anaylsis_external_data_ver

### Library Installation and Import

In [103]:
# # Install the basic library required
# !pip install pandas
# !pip install numpy
# !pip install matplotlib
# !pip install seaborn

# # Install the library required for model building and evaluation
# !pip install lightgbm
# !pip install hyperopt
# !pip install xgboost

In [104]:
# Import the basic library
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint

# Import the library for model building and evaluation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

In [105]:
# Model Evaluation Function
def model_performance(y_test, y_pred) :
    #Show metrics
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    Accuracy  =  ((tp+tn)/(tp+tn+fp+fn))
    Precision =  (tp/(tp+fp))
    Recall    =  (tp/(tp+fn))
    F1_score  =  (2*(((tp/(tp+fp))*(tp/(tp+fn)))/((tp/(tp+fp))+(tp/(tp+fn)))))

    print(f'Accuracy :{Accuracy}')
    print(f'Precision :{Precision}')
    print(f'Recall :{Recall}')
    print(f'F1_score :{F1_score}')

    return Accuracy, Precision, Recall, F1_score

### Import Data and Overview

In [106]:
# Read the csv file using pandas to a dataframe.
df = pd.read_csv('credit_risk_dataset.csv')
# Review the first 5 data for overview
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [107]:
# Check the info of data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


## Section: Data Cleansing and Preprocessing
In this section, we are going to clean and preprocess our raw dataset before we can draw some insights from the cleaned data, as well as fit the processed data into our model candidates. We are going to do as follows:
1. Check and handle for any missing value, incorrectly formatted and duplicate data
2. Remove Outliers
3. Handle categorical variables by Encoding
4. Feature Scaling
5. Splitting Train and Test data for one-hot encoding and label encoding

###  Check and handle any missing value

In [108]:
df.isnull().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [109]:
df.fillna((df.median()), inplace=True)

  df.fillna((df.median()), inplace=True)


### Handling outliers

In [110]:
# Get the Descriptive statistics
df.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,4.767994,9589.371106,11.00962,0.218164,0.170203,5.804211
std,6.348078,61983.12,4.087372,6322.086646,3.081611,0.413006,0.106782,4.055001
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,38500.0,2.0,5000.0,8.49,0.0,0.09,3.0
50%,26.0,55000.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,79200.0,7.0,12200.0,13.11,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


From the above, we can see that the maximum age in the dataset is 144, which is abnormally high and we believe that they are outliers based on our domain knowledge and the fact from the guinness world records (the oldest person in this world is 122), so we decided to find out all the abnormally high person age and remove them.

In [111]:
# Find out people with abnormally high age
df[df['person_age'] > 100]

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
81,144,250000,RENT,4.0,VENTURE,C,4800,13.57,0,0.02,N,3
183,144,200000,MORTGAGE,4.0,EDUCATION,B,6000,11.86,0,0.03,N,2
575,123,80004,RENT,2.0,EDUCATION,B,20400,10.25,0,0.25,N,3
747,123,78000,RENT,7.0,VENTURE,B,20000,10.99,0,0.26,N,4
32297,144,6000000,MORTGAGE,12.0,PERSONAL,C,5000,12.73,0,0.0,N,25


In [112]:
# Remove the outliers
df = df.drop(df[df['person_age'] > 100].index)

In [113]:
# Check the dimension of the dataset
df.shape

(32576, 12)

### Handling categorical variable

In [114]:
# Label Encoding for tree based MLA
df_le = df.apply(LabelEncoder().fit_transform)
df_le.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,2,2238,3,35,4,3,752,239,1,59,1,1
1,1,25,2,5,1,1,7,92,0,10,0,0
2,5,25,0,1,3,2,176,141,1,57,0,1
3,3,2539,3,4,3,2,752,217,1,53,0,0
4,4,1980,3,8,3,2,752,186,1,55,1,2


In [115]:
df_le.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32576 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   person_age                  32576 non-null  int64
 1   person_income               32576 non-null  int64
 2   person_home_ownership       32576 non-null  int64
 3   person_emp_length           32576 non-null  int64
 4   loan_intent                 32576 non-null  int64
 5   loan_grade                  32576 non-null  int64
 6   loan_amnt                   32576 non-null  int64
 7   loan_int_rate               32576 non-null  int64
 8   loan_status                 32576 non-null  int64
 9   loan_percent_income         32576 non-null  int64
 10  cb_person_default_on_file   32576 non-null  int64
 11  cb_person_cred_hist_length  32576 non-null  int64
dtypes: int64(12)
memory usage: 3.2 MB


## Modeling

### Train-Test split

In [116]:
# Create Train & Test Data
X_train, X_test, y_train, y_test = train_test_split(df_le.drop(columns=['loan_status']), df_le['loan_status'].values, test_size=0.2, stratify=df_le['loan_status'].values, random_state=0)

In [123]:
gbMod = GradientBoostingClassifier(learning_rate=0.1, verbose=2)

In [124]:
gbMod.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.9544            3.83s
         2           0.8876            3.61s
         3           0.8373            3.51s
         4           0.7967            3.45s
         5           0.7637            3.40s
         6           0.7364            3.34s
         7           0.7136            3.32s
         8           0.6940            3.27s
         9           0.6767            3.23s
        10           0.6625            3.19s
        11           0.6495            3.15s
        12           0.6383            3.11s
        13           0.6288            3.08s
        14           0.6196            3.05s
        15           0.6116            3.01s
        16           0.6049            2.98s
        17           0.5983            2.94s
        18           0.5931            2.90s
        19           0.5892            2.87s
        20           0.5843            2.83s
        21           0.5800            2.80s
        2

In [125]:
gb_pred = gbMod.predict(X_test)
gb_Accuracy, gb_Precision, gb_Recall, gb_F1_score = model_performance(y_test, gb_pred)

Accuracy :0.9220380601596071
Precision :0.9462890625
Recall :0.6814345991561181
F1_score :0.7923139820114473


### Hyperparameter tuning

In [126]:
gbHyperParams = {'n_estimators': randint(10, 500),
                 'max_depth': randint(1,10)}

In [127]:
gridSearchGB = RandomizedSearchCV(estimator=gbMod, param_distributions=gbHyperParams, n_iter=10,
                                   scoring='roc_auc', cv=None, verbose=2)

gridSearchGB.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
      Iter       Train Loss   Remaining Time 
         1           0.9198           26.41s
         2           0.8310           26.23s
         3           0.7640           25.85s
         4           0.7104           25.45s
         5           0.6646           25.27s
         6           0.6259           25.22s
         7           0.5928           25.20s
         8           0.5640           25.18s
         9           0.5381           25.18s
        10           0.5156           25.15s
        11           0.4960           25.03s
        12           0.4790           24.94s
        13           0.4629           24.89s
        14           0.4491           24.79s
        15           0.4359           24.74s
        16           0.4237           24.68s
        17           0.4135           24.57s
        18           0.4041           24.49s
        19           0.3956           24.43s
        20           0.3865           

In [128]:
bestGbModFitted = gridSearchGB.best_estimator_.fit(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.9271           41.81s
         2           0.8442           37.70s
         3           0.7809           36.29s
         4           0.7308           35.91s
         5           0.6898           35.66s
         6           0.6549           35.15s
         7           0.6256           34.86s
         8           0.5999           34.52s
         9           0.5771           34.47s
        10           0.5576           34.24s
        11           0.5411           34.03s
        12           0.5260           33.88s
        13           0.5117           33.73s
        14           0.4998           33.64s
        15           0.4901           33.49s
        16           0.4799           33.44s
        17           0.4700           33.32s
        18           0.4626           33.19s
        19           0.4552           33.08s
        20           0.4484           32.99s
        21           0.4425           32.89s
        2

In [135]:
test_labels=bestGbModFitted.predict(X_test)

In [137]:
gb_Accuracy, gb_Precision, gb_Recall, gb_F1_score = model_performance(y_test, test_labels)

Accuracy :0.9386126457949663
Precision :0.9595323741007195
Recall :0.750351617440225
F1_score :0.8421468034727704
