In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from perpetual import PerpetualBooster
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')
from sklearn.metrics import roc_auc_score, precision_score,recall_score,accuracy_score
from sklearn.model_selection import train_test_split

In [35]:
df = pd.read_csv('Data/train.csv') 

In [36]:
num_cols = [col for col in df.columns if df[col].dtype in ('int64','double','float')]
cat_cols = [col for col in df.columns if df[col].dtype =='object']

In [37]:
df = pd.get_dummies(df,columns=['loan_intent','cb_person_default_on_file','person_home_ownership']) 

In [38]:
lbe = LabelEncoder()
df['loan_grade'] = lbe.fit_transform(df['loan_grade'])

In [39]:
X = df.drop(columns=['id','loan_status'],axis=1)
y = df['loan_status']

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=42)

In [41]:
print('X_train shape: ',X_train.shape)
print('y_train shape: ',y_train.shape)
print('X_test shape: ',X_test.shape)
print('y_test shape: ',y_test.shape)

X_train shape:  (41051, 20)
y_train shape:  (41051,)
X_test shape:  (17594, 20)
y_test shape:  (17594,)


In [42]:
def modelevaluation(y_true,y_pred):
    roc_auc = roc_auc_score(y_true,y_pred)
    accuracy= accuracy_score(y_true,y_pred)
    precision = precision_score(y_true,y_pred)
    recall = recall_score(y_true,y_pred)
    return roc_auc,accuracy,precision,recall

def print_evaluation(y_true,y_pred):
    roc_auc,accuracy,precision,recall = modelevaluation(y_true,y_pred)
    print('Roc_Auc: ',roc_auc)
    print('accuracy: ',accuracy)
    print('precision: ',precision)
    print('recall: ',recall)

In [43]:
models = {
    "logistic regression":LogisticRegression(),
    "Xgboost": XGBClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN classification":KNeighborsClassifier(),
    "Randomforest classification":RandomForestClassifier(),
    "catboost classification":CatBoostClassifier()
    
}
model_list=[]
r2_list = []

In [45]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_pred = model.predict(X_test)
    print("Model performance of: ",list(models.keys())[i])
    print_evaluation(y_test,y_pred)
    print("")

Model performance of:  logistic regression
Roc_Auc:  0.6497608098455951
accuracy:  0.8851881323178357
precision:  0.6958041958041958
recall:  0.3225283630470016

Model performance of:  Xgboost
Roc_Auc:  0.8627034798561881
accuracy:  0.9528248266454473
precision:  0.9090909090909091
recall:  0.7374392220421394

Model performance of:  Decision Tree
Roc_Auc:  0.8326635676846097
accuracy:  0.9119586222575878
precision:  0.6735927465054778
recall:  0.7224473257698542

Model performance of:  KNN classification
Roc_Auc:  0.7365717117220212
accuracy:  0.8959872683869501
precision:  0.6675420168067226
recall:  0.5149918962722853

Model performance of:  Randomforest classification
Roc_Auc:  0.8528788216796307
accuracy:  0.9522564510628624
precision:  0.9284210526315789
recall:  0.7147487844408428

Learning rate set to 0.05033
0:	learn: 0.6272880	total: 147ms	remaining: 2m 26s
1:	learn: 0.5702028	total: 161ms	remaining: 1m 20s
2:	learn: 0.5220119	total: 174ms	remaining: 57.7s
3:	learn: 0.4805472	

In [46]:
model = CatBoostClassifier() 
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("Model performance of: ",list(models.keys())[i])
print_evaluation(y_test,y_pred)
print("")

Learning rate set to 0.05033
0:	learn: 0.6272880	total: 13ms	remaining: 13s
1:	learn: 0.5702028	total: 25.4ms	remaining: 12.7s
2:	learn: 0.5220119	total: 37.5ms	remaining: 12.4s
3:	learn: 0.4805472	total: 48.3ms	remaining: 12s
4:	learn: 0.4446567	total: 58.9ms	remaining: 11.7s
5:	learn: 0.4133925	total: 70.9ms	remaining: 11.7s
6:	learn: 0.3852820	total: 82.6ms	remaining: 11.7s
7:	learn: 0.3610829	total: 96.3ms	remaining: 11.9s
8:	learn: 0.3396516	total: 109ms	remaining: 12s
9:	learn: 0.3215818	total: 122ms	remaining: 12.1s
10:	learn: 0.3060462	total: 134ms	remaining: 12.1s
11:	learn: 0.2917380	total: 146ms	remaining: 12s
12:	learn: 0.2804242	total: 157ms	remaining: 12s
13:	learn: 0.2700662	total: 170ms	remaining: 11.9s
14:	learn: 0.2608038	total: 182ms	remaining: 11.9s
15:	learn: 0.2525959	total: 196ms	remaining: 12.1s
16:	learn: 0.2448786	total: 209ms	remaining: 12.1s
17:	learn: 0.2386459	total: 222ms	remaining: 12.1s
18:	learn: 0.2326186	total: 237ms	remaining: 12.3s
19:	learn: 0.227

In [57]:
model = PerpetualBooster() 
model.fit(X_train,y_train,budget =2)
y_pred = model.predict(X_test)
print("Model performance of: ",list(models.keys())[i])
print_evaluation(y_test,y_pred)
print("")

Model performance of:  catboost classification
Roc_Auc:  0.8598128234981746
accuracy:  0.9533932022280323
precision:  0.9216990788126919
recall:  0.729740680713128



In [58]:
test_df = pd.read_csv('Data/test.csv') 

In [59]:
test_df = pd.get_dummies(test_df,columns=['loan_intent','cb_person_default_on_file','person_home_ownership']) 
lbe = LabelEncoder()
test_df['loan_grade'] = lbe.fit_transform(test_df['loan_grade'])

In [60]:
y_pred = model.predict_proba(test_df.drop(columns=['id'],axis=1))

In [61]:
y_pred_class_1 = y_pred[:, 1]

In [62]:
final_df = pd.DataFrame({
    'id': test_df['id'],
    'loan_status': y_pred_class_1
})

In [63]:
print(final_df)

          id  loan_status
0      58645     0.981820
1      58646     0.012979
2      58647     0.622719
3      58648     0.004611
4      58649     0.036534
...      ...          ...
39093  97738     0.043020
39094  97739     0.008044
39095  97740     0.006147
39096  97741     0.388968
39097  97742     0.969070

[39098 rows x 2 columns]


In [64]:
final_df.to_csv('Data/final_predictions.csv', index=False)