In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [65]:
# Load the loan dataset into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv('loan_data.csv')

In [66]:
# Cast person_age to an integer dtype (presumably parsed as float from the CSV — TODO confirm).
# astype raises if the column holds missing or non-numeric values.
df['person_age'] = df['person_age'].astype('int')

In [67]:
# Partition the column names by dtype: object-typed columns are treated as
# categorical, every other column as numeric. Column order follows df.columns.
is_object = df.dtypes == 'object'
cat_cols = is_object[is_object].index.tolist()
num_cols = is_object[~is_object].index.tolist()

In [68]:
# Numeric features that are passed through StandardScaler in the scaling cells below.
skewed_columns = [
    'person_age',
    'person_income',
    'person_emp_exp',
    'loan_amnt',
    'loan_percent_income',
    'cb_person_cred_hist_length',
    'credit_score',
]

# Numeric feature that is passed through MinMaxScaler instead.
normal_column = ['loan_int_rate']

In [69]:
# Standardise the columns named in skewed_columns, overwriting them in df.
# NOTE(review): the scaler is fitted on the whole frame, before train_test_split
# is called further down, so test-set rows influence the fitted statistics
# (data leakage). Re-running this cell also re-scales already-scaled values.
scaler = StandardScaler().fit(df[skewed_columns])
df[skewed_columns] = scaler.transform(df[skewed_columns])

In [70]:
# Min-max scale the column named in normal_column, overwriting it in df.
# NOTE(review): like the StandardScaler step, this is fitted on the whole frame
# before the train/test split, so the test rows contribute to the min and max.
mms = MinMaxScaler().fit(df[normal_column])
df[normal_column] = mms.transform(df[normal_column])

In [71]:
# Integer-encode the five categorical columns of df in place.
#
# NOTE(review): loan_intent and person_home_ownership look nominal, yet the codes
# below give them an order that the linear and distance-based models further down
# will read as magnitude — consider one-hot encoding; confirm intent.
education_order = {'High School' : 0, 'Associate' : 1, 'Bachelor' : 2, 'Master' : 3, 'Doctorate' : 4}
loan_order = {'EDUCATION' : 0, 'MEDICAL' : 1, 'VENTURE': 2, 'PERSONAL' : 3, 'DEBTCONSOLIDATION' : 4, 'HOMEIMPROVEMENT' : 5}
person_home_ownership = {'RENT' : 0, 'MORTGAGE' : 1, 'OWN' : 2, 'OTHER' : 3}

category_codes = {
    'person_gender': {'female' : 0, 'male' : 1},
    'previous_loan_defaults_on_file': {'No' : 0, 'Yes' : 1},
    'person_education': education_order,
    'loan_intent': loan_order,
    'person_home_ownership': person_home_ownership,
}

# Series.map turns every value that is missing from the dict into NaN without
# complaint. That hides unexpected labels, and re-running this cell on the
# already-encoded frame would wipe the columns to NaN. Encode into temporaries,
# fail loudly on any value that has no code, and only then write back, so df is
# never left half-encoded.
encoded_columns = {}
for column, codes in category_codes.items():
    encoded = df[column].map(codes)
    # Only NaNs created by the mapping are an error; NaNs already in the data pass through.
    newly_missing = encoded.isna() & df[column].notna()
    if newly_missing.any():
        unknown = df.loc[newly_missing, column].unique().tolist()
        raise ValueError(
            f"No integer code defined for {column!r} values {unknown!r}; "
            "the column may already be encoded (cell re-run) or the data has new categories."
        )
    encoded_columns[column] = encoded

for column, encoded in encoded_columns.items():
    df[column] = encoded

In [72]:
# Per-column count of missing values; this also confirms the encoding step left no NaN behind.
df.isna().sum()

person_age                        0
person_gender                     0
person_education                  0
person_income                     0
person_emp_exp                    0
person_home_ownership             0
loan_amnt                         0
loan_intent                       0
loan_int_rate                     0
loan_percent_income               0
cb_person_cred_hist_length        0
credit_score                      0
previous_loan_defaults_on_file    0
loan_status                       0
dtype: int64

In [73]:
from feature_engine.outliers import OutlierTrimmer

# Columns handed to the trimmer: all thirteen feature columns (the target
# loan_status is not listed), including the integer-encoded categoricals.
trim_variables = [
    'person_gender',
    'person_education',
    'person_home_ownership',
    'loan_intent',
    'previous_loan_defaults_on_file',
    'person_age',
    'person_income',
    'person_emp_exp',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
    'credit_score',
]

# IQR rule applied to the right tail only; the result is stored in df2 and df is
# left as it was.
trimmer = OutlierTrimmer(capping_method='iqr', tail='right', variables=trim_variables)
df2 = trimmer.fit_transform(df)
# NOTE(review): no later cell shown here reads df2 — X and y are built from df —
# so the models are trained and scored on the untrimmed rows. Confirm whether the
# trimmed frame was meant to be used (and whether encoded categoricals belong in
# the variable list at all).

In [74]:
# Feature matrix: every column of df except the target.
# NOTE(review): built from df, not from the outlier-trimmed df2 created in the previous cell.
X = df.drop('loan_status', axis=1)

In [75]:
# Target vector (binary 0/1 per the classification reports below), taken from the same frame as X.
y = df["loan_status"]

In [76]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): no stratify= argument is passed, and the test-set support printed below
# is 6990 vs 2010, so the classes are imbalanced — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

## Model 1 : Logistic Regression

In [77]:
# Logistic regression with the iteration limit set to 1000 and a fixed seed.
model = LogisticRegression(max_iter = 1000, random_state = 42)
# Trailing expression, so the fitted estimator is shown as the cell output.
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [90]:
# Predict test-set labels with the logistic-regression model.
# NOTE(review): y_pred is overwritten by every model section below, and this cell's
# execution count (In [90]) is higher than that of the metrics cell after it (In [87]),
# so the notebook was run out of order — verify with Restart & Run All.
y_pred = model.predict(X_test)

In [87]:
# Score whatever predictions are currently stored in y_pred against the test labels
# and print accuracy, the per-class report and the confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
for heading, value in (
    ('Accuracy :', accuracy),
    ('Classification Report : \n', classification_report(y_test, y_pred)),
    ('Confusion Matrix : \n', confusion_matrix(y_test, y_pred)),
):
    print(heading, value)

Accuracy : 0.8911111111111111
Classification Report : 
               precision    recall  f1-score   support

           0       0.92      0.94      0.93      6990
           1       0.77      0.73      0.75      2010

    accuracy                           0.89      9000
   macro avg       0.85      0.83      0.84      9000
weighted avg       0.89      0.89      0.89      9000

Confusion Matrix : 
 [[6557  433]
 [ 547 1463]]


## Model 2 : Random Forest Classifier

In [88]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default hyperparameters; only the seed is fixed.
model2 = RandomForestClassifier(random_state=42)
# Trailing expression, so the fitted estimator is shown as the cell output.
model2.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [91]:
# Predict test-set labels with the random forest; this overwrites the previous model's y_pred.
y_pred = model2.predict(X_test)

In [92]:
# Print accuracy, the per-class report and the confusion matrix for the
# predictions currently stored in y_pred.
metric_lines = [
    ('Accuracy :', accuracy_score(y_test, y_pred)),
    ('Classification Report : \n', classification_report(y_test, y_pred)),
    ('Confusion Matrix : \n', confusion_matrix(y_test, y_pred)),
]
for heading, value in metric_lines:
    print(heading, value)

Accuracy : 0.9276666666666666
Classification Report : 
               precision    recall  f1-score   support

           0       0.94      0.97      0.95      6990
           1       0.89      0.77      0.83      2010

    accuracy                           0.93      9000
   macro avg       0.91      0.87      0.89      9000
weighted avg       0.93      0.93      0.93      9000

Confusion Matrix : 
 [[6794  196]
 [ 455 1555]]


## Model 3 : Decision Tree Classifier

In [93]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with library-default hyperparameters; only the seed is fixed.
model3 = DecisionTreeClassifier(random_state = 42)
# Trailing expression, so the fitted estimator is shown as the cell output.
model3.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [94]:
# Predict test-set labels with the decision tree; this overwrites the previous model's y_pred.
y_pred = model3.predict(X_test)

In [95]:
# Print accuracy, the per-class report and the confusion matrix for the
# predictions currently stored in y_pred.
metric_lines = [
    ('Accuracy :', accuracy_score(y_test, y_pred)),
    ('Classification Report : \n', classification_report(y_test, y_pred)),
    ('Confusion Matrix : \n', confusion_matrix(y_test, y_pred)),
]
for heading, value in metric_lines:
    print(heading, value)

Accuracy : 0.8995555555555556
Classification Report : 
               precision    recall  f1-score   support

           0       0.94      0.93      0.94      6990
           1       0.77      0.78      0.78      2010

    accuracy                           0.90      9000
   macro avg       0.85      0.86      0.86      9000
weighted avg       0.90      0.90      0.90      9000

Confusion Matrix : 
 [[6534  456]
 [ 448 1562]]


## Model 4 : Support Vector Classifier

In [98]:
from sklearn.svm import SVC

# Support vector classifier; the displayed parameters below show the kernel is 'rbf'.
# NOTE(review): probability=True is requested, but no cell shown here calls
# predict_proba — presumably it only adds fitting cost; confirm it is needed.
model4 = SVC(C=1.0, random_state=42, probability =True)
# Trailing expression, so the fitted estimator is shown as the cell output.
model4.fit(X_train, y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [99]:
# Predict test-set labels with the SVC; this overwrites the previous model's y_pred.
y_pred = model4.predict(X_test)

In [100]:
# Print accuracy, the per-class report and the confusion matrix for the
# predictions currently stored in y_pred.
metric_lines = [
    ('Accuracy :', accuracy_score(y_test, y_pred)),
    ('Classification Report : \n', classification_report(y_test, y_pred)),
    ('Confusion Matrix : \n', confusion_matrix(y_test, y_pred)),
]
for heading, value in metric_lines:
    print(heading, value)

Accuracy : 0.9064444444444445
Classification Report : 
               precision    recall  f1-score   support

           0       0.93      0.96      0.94      6990
           1       0.83      0.73      0.78      2010

    accuracy                           0.91      9000
   macro avg       0.88      0.84      0.86      9000
weighted avg       0.90      0.91      0.90      9000

Confusion Matrix : 
 [[6688  302]
 [ 540 1470]]


## Model 5 : K-Nearest Neighbors (KNN) Classifier

In [101]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 3 closest training rows.
# NOTE(review): the choice of k=3 is not justified anywhere in the visible cells.
model5 = KNeighborsClassifier(n_neighbors=3)
# Trailing expression, so the fitted estimator is shown as the cell output.
model5.fit(X_train, y_train)

0,1,2
,n_neighbors,3
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [102]:
# Predict test-set labels with the KNN model; this overwrites the previous model's y_pred.
y_pred = model5.predict(X_test)

In [103]:
# Print accuracy, the per-class report and the confusion matrix for the
# predictions currently stored in y_pred.
metric_lines = [
    ('Accuracy :', accuracy_score(y_test, y_pred)),
    ('Classification Report : \n', classification_report(y_test, y_pred)),
    ('Confusion Matrix : \n', confusion_matrix(y_test, y_pred)),
]
for heading, value in metric_lines:
    print(heading, value)

Accuracy : 0.867
Classification Report : 
               precision    recall  f1-score   support

           0       0.90      0.94      0.92      6990
           1       0.74      0.63      0.68      2010

    accuracy                           0.87      9000
   macro avg       0.82      0.78      0.80      9000
weighted avg       0.86      0.87      0.86      9000

Confusion Matrix : 
 [[6537  453]
 [ 744 1266]]


## Model 6 : XGBoost Classifier

In [105]:
from xgboost import XGBClassifier

# Gradient-boosted trees via XGBoost's scikit-learn wrapper, evaluated with log-loss
# and a fixed seed.
# `use_label_encoder=False` is no longer passed: the installed XGBoost reported
# `Parameters: { "use_label_encoder" } are not used.` on every fit, so the argument
# had no effect on the model and only produced warning noise.
model6 = XGBClassifier(eval_metric='logloss', random_state=42)
# Trailing expression, so the fitted estimator is shown as the cell output.
model6.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [106]:
# Predict test-set labels with the XGBoost model; this overwrites the previous model's y_pred.
y_pred = model6.predict(X_test)

In [107]:
# Print accuracy, the per-class report and the confusion matrix for the
# predictions currently stored in y_pred.
metric_lines = [
    ('Accuracy :', accuracy_score(y_test, y_pred)),
    ('Classification Report : \n', classification_report(y_test, y_pred)),
    ('Confusion Matrix : \n', confusion_matrix(y_test, y_pred)),
]
for heading, value in metric_lines:
    print(heading, value)

Accuracy : 0.9347777777777778
Classification Report : 
               precision    recall  f1-score   support

           0       0.95      0.97      0.96      6990
           1       0.89      0.81      0.85      2010

    accuracy                           0.93      9000
   macro avg       0.92      0.89      0.90      9000
weighted avg       0.93      0.93      0.93      9000

Confusion Matrix : 
 [[6789  201]
 [ 386 1624]]


## Model 7 : LightGBM Classifier

In [110]:
from lightgbm import LGBMClassifier

# LightGBM classifier with a fixed seed; verbosity=-1 is passed, presumably to silence training logs.
model7 = LGBMClassifier(verbosity=-1, random_state=42)
# Trailing expression, so the fitted estimator is shown as the cell output.
model7.fit(X_train, y_train)

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [111]:
# Predict test-set labels with the LightGBM model; this overwrites the previous model's y_pred.
y_pred = model7.predict(X_test)

In [112]:
# Print accuracy, the per-class report and the confusion matrix for the
# predictions currently stored in y_pred.
metric_lines = [
    ('Accuracy :', accuracy_score(y_test, y_pred)),
    ('Classification Report : \n', classification_report(y_test, y_pred)),
    ('Confusion Matrix : \n', confusion_matrix(y_test, y_pred)),
]
for heading, value in metric_lines:
    print(heading, value)

Accuracy : 0.9317777777777778
Classification Report : 
               precision    recall  f1-score   support

           0       0.94      0.97      0.96      6990
           1       0.89      0.80      0.84      2010

    accuracy                           0.93      9000
   macro avg       0.91      0.88      0.90      9000
weighted avg       0.93      0.93      0.93      9000

Confusion Matrix : 
 [[6783  207]
 [ 407 1603]]
