In [1]:
import pandas as pd
pd.set_option("display.max_columns", 100)
import numpy as np


import matplotlib.pyplot as plt 
%matplotlib inline

import seaborn as sns

from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import ExtraTreeRegressor, DecisionTreeClassifier

import warnings # To ignore warnings
# NOTE(review): this silences every warning category for the rest of the notebook.
warnings.filterwarnings("ignore")

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# One estimator per candidate classifier, otherwise with library defaults.
# The tree-based models draw random numbers while fitting, so they get a fixed
# seed to make repeated runs of the notebook report the same scores.
g = GaussianNB()
b = BernoulliNB()
k = KNeighborsClassifier()
gbc = GradientBoostingClassifier(random_state = 42)
log = LogisticRegression()
d = DecisionTreeClassifier(random_state = 42)
r = RandomForestClassifier(random_state = 42)

In [3]:
# Parallel lists: names[i] is the row label reported for algorithms[i].
algorithms = [g, b, k, log, gbc, r, d]
names = ["GaussianNB", "BernoulliNB", "K Nearest", "Logistic", "GradientBoosting", "RandomForest", "DecisionTree"]

In [4]:
def algo_test(X, y, algorithms = algorithms, names = names, test_size = 0.2, random_state = 42):
    """Fit each classifier on a training split and score it on held-out rows.

    Parameters
    ----------
    X : feature table accepted by ``train_test_split`` and the estimators.
    y : binary target; a single-column frame or column vector is flattened to 1-D.
    algorithms : sequence of estimators exposing ``fit`` and ``predict``.
        They are fitted in place, so the caller's objects end up trained.
    names : row labels for the result, parallel to ``algorithms``.
    test_size : share (or count) of rows held out for scoring, forwarded to
        ``train_test_split``.
    random_state : seed for the split so repeated calls score the same rows.

    Returns
    -------
    pandas.DataFrame indexed by ``names`` with the columns Accuracy, Precision,
    Recall and F1, sorted by F1 from best to worst.

    Raises
    ------
    ValueError
        If ``algorithms`` and ``names`` differ in length.
    """
    if len(algorithms) != len(names):
        raise ValueError("algorithms and names must have the same length")

    # The estimators expect a 1-D target; an (n, 1) frame would be reshaped by
    # them with a warning on every fit.
    y = np.ravel(y)

    # Scoring on the rows used for fitting rewards memorisation, so part of the
    # data is kept aside and only that part is scored.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = test_size, random_state = random_state
    )

    rows = []
    for model in algorithms:
        fitted = model.fit(X_train, y_train)
        # Predict once and reuse the result for all four metrics.
        predicted = fitted.predict(X_test)
        rows.append({
            "Accuracy": accuracy_score(y_test, predicted),
            "Precision": precision_score(y_test, predicted),
            "Recall": recall_score(y_test, predicted),
            "F1": f1_score(y_test, predicted),
        })

    metrics = pd.DataFrame(
        rows,
        columns = ["Accuracy", "Precision", "Recall", "F1"],
        index = list(names),
    )
    return metrics.sort_values("F1", ascending = False)

In [5]:
# Load the raw loan records; the path is relative to the notebook's working directory.
df = pd.read_csv("LoansTrainingSet.csv")

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256984 entries, 0 to 256983
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Loan ID                       256984 non-null  object 
 1   Customer ID                   256984 non-null  object 
 2   Loan Status                   256984 non-null  object 
 3   Current Loan Amount           256984 non-null  int64  
 4   Term                          256984 non-null  object 
 5   Credit Score                  195308 non-null  float64
 6   Years in current job          245508 non-null  object 
 7   Home Ownership                256984 non-null  object 
 8   Annual Income                 195308 non-null  float64
 9   Purpose                       256984 non-null  object 
 10  Monthly Debt                  256984 non-null  object 
 11  Years of Credit History       256984 non-null  float64
 12  Months since last delinquent  116601 non-nul

In [7]:
df.sample(50)

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Years in current job,Home Ownership,Annual Income,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
148453,939239eb-4b2b-459a-95d5-8d71e1e2362c,0cf58a23-56fa-4c43-8199-960267ac4163,Fully Paid,22667,Short Term,,10+ years,Home Mortgage,,Debt Consolidation,"$1,085.24",24.5,13.0,13,0,29530,41185,0.0,0.0
169126,a82de79b-cd3c-4ef4-bd6d-5ddd2056ae31,760beda2-9613-4db2-86bf-7a6f2c4fbaa4,Fully Paid,7142,Short Term,,6 years,Home Mortgage,,Debt Consolidation,"$1,371.03",10.0,13.0,12,0,8581,11425,0.0,0.0
187251,ba4cad95-55b3-4880-944a-39694f8e9a94,a0a198a0-7953-494f-81fc-bf345fd2dd0c,Charged Off,20842,Long Term,678.0,4 years,Rent,101670.0,Debt Consolidation,"$2,440.08",19.7,56.0,12,0,9114,11183,0.0,0.0
17003,10e79fc7-eb4f-4cb7-9d98-d8f594467612,b837f549-067f-473b-b206-52ede388c3f4,Fully Paid,13163,Short Term,736.0,4 years,Rent,35438.0,Debt Consolidation,$502.03,16.4,4.0,5,0,5773,12660,0.0,0.0
186871,b9e2e2df-11e4-4a43-945c-6510d9dadb44,55281d6a-51fe-489e-9c0c-163419868a9b,Charged Off,29554,Long Term,620.0,10+ years,Rent,82152.0,Debt Consolidation,"$2,115.41",18.5,50.0,21,0,43900,101385,0.0,0.0
117305,748cd2e3-20bd-41a4-a294-49576f79dd0c,9f2ba56f-e357-4187-9c91-197dfe57fb25,Fully Paid,99999999,Short Term,733.0,4 years,Home Mortgage,95414.0,Debt Consolidation,$199.58,15.8,,6,0,9990,24365,0.0,0.0
590,0096267e-ee4e-4405-af80-41d30d3d386e,dcc36523-102e-4f63-a270-f56aa191bc86,Fully Paid,22688,Short Term,,6 years,Home Mortgage,,Debt Consolidation,"$1,959.35",22.2,,13,0,30712,69642,0.0,0.0
66143,41d02eea-1e41-400f-b784-82a73ed6e66d,d30a0c5d-ce99-4274-8311-7056e783f636,Charged Off,14013,Long Term,672.0,1 year,Rent,55050.0,Debt Consolidation,"$1,192.74",16.3,62.0,11,1,19702,25520,1.0,0.0
73520,49079d92-b009-4093-ac27-9ce68ddbc25e,f6d4e843-a524-4829-bdd1-53dadf0afd24,Charged Off,9436,Long Term,,4 years,Home Mortgage,,Debt Consolidation,$454.84,31.0,30.0,15,0,16272,31843,0.0,0.0
140263,8b8cf28c-5db8-4885-960b-704467c67209,0131dc92-63d3-4505-8889-21c46988419e,Charged Off,11891,Short Term,7450.0,4 years,Home Mortgage,118908.0,Debt Consolidation,"$1,555.71",18.5,,15,0,20208,37423,0.0,0.0


In [8]:
df.isnull().sum()

Loan ID                              0
Customer ID                          0
Loan Status                          0
Current Loan Amount                  0
Term                                 0
Credit Score                     61676
Years in current job             11476
Home Ownership                       0
Annual Income                    61676
Purpose                              0
Monthly Debt                         0
Years of Credit History              0
Months since last delinquent    140383
Number of Open Accounts              0
Number of Credit Problems            0
Current Credit Balance               0
Maximum Open Credit                  0
Bankruptcies                       529
Tax Liens                           23
dtype: int64

In [9]:
df.describe()

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Bankruptcies,Tax Liens
count,256984.0,195308.0,195308.0,256984.0,116601.0,256984.0,256984.0,256984.0,256455.0,256961.0
mean,13713310.0,1251.116099,71952.72,18.290195,34.88145,11.106267,0.156628,15406.56,0.110316,0.027203
std,34381310.0,1762.016848,58877.57,7.075747,21.854165,4.982982,0.460731,19665.06,0.336229,0.24595
min,505.0,585.0,0.0,3.4,0.0,0.0,0.0,0.0,0.0,0.0
25%,8299.0,714.0,44321.0,13.5,16.0,8.0,0.0,5974.0,0.0,0.0
50%,14298.0,733.0,61242.0,17.0,32.0,10.0,0.0,11078.0,0.0,0.0
75%,24367.0,744.0,86462.0,21.7,51.0,14.0,0.0,19319.0,0.0,0.0
max,100000000.0,7510.0,8713547.0,70.5,176.0,76.0,11.0,1731412.0,7.0,11.0


In [10]:
# Replace missing values with 0 so the column has no NaN left.
# NOTE(review): 0 then reads the same as "delinquent this month" — confirm that is acceptable for rows with no recorded delinquency.
# The result is assigned back rather than using fillna(inplace=True) on the
# selected column: under pandas Copy-on-Write that call only changes a temporary object.
df["Months since last delinquent"] = df["Months since last delinquent"].fillna(0)

In [11]:
df.isnull().sum()

Loan ID                             0
Customer ID                         0
Loan Status                         0
Current Loan Amount                 0
Term                                0
Credit Score                    61676
Years in current job            11476
Home Ownership                      0
Annual Income                   61676
Purpose                             0
Monthly Debt                        0
Years of Credit History             0
Months since last delinquent        0
Number of Open Accounts             0
Number of Credit Problems           0
Current Credit Balance              0
Maximum Open Credit                 0
Bankruptcies                      529
Tax Liens                          23
dtype: int64

In [12]:
# Remove the leading "$" from each amount. lstrip only removes that character,
# so a value without the symbol keeps its first digit and re-running the cell
# on already stripped strings does not eat into the number.
df["Monthly Debt"] = df["Monthly Debt"].str.lstrip("$")

In [13]:
# Drop the thousands separators; the comma is matched as plain text.
df["Monthly Debt"] = df["Monthly Debt"].str.replace(",", "", regex = False)

In [14]:
df.head(50)

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Years in current job,Home Ownership,Annual Income,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,000025bb-5694-4cff-b17d-192b1a98ba44,5ebc8bb1-5eb9-4404-b11b-a6eebc401a19,Fully Paid,11520,Short Term,741.0,10+ years,Home Mortgage,33694.0,Debt Consolidation,584.03,12.3,41.0,10,0,6760,16056,0.0,0.0
1,00002c49-3a29-4bd4-8f67-c8f8fbc1048c,927b388d-2e01-423f-a8dc-f7e42d668f46,Fully Paid,3441,Short Term,734.0,4 years,Home Mortgage,42269.0,other,1106.04,26.3,0.0,17,0,6262,19149,0.0,0.0
2,00002d89-27f3-409b-aa76-90834f359a65,defce609-c631-447d-aad6-1270615e89c4,Fully Paid,21029,Short Term,747.0,10+ years,Home Mortgage,90126.0,Debt Consolidation,1321.85,28.8,0.0,5,0,20967,28335,0.0,0.0
3,00005222-b4d8-45a4-ad8c-186057e24233,070bcecb-aae7-4485-a26a-e0403e7bb6c5,Fully Paid,18743,Short Term,747.0,10+ years,Own Home,38072.0,Debt Consolidation,751.92,26.2,0.0,9,0,22529,43915,0.0,0.0
4,0000757f-a121-41ed-b17b-162e76647c1f,dde79588-12f0-4811-bab0-e2b07f633fcd,Fully Paid,11731,Short Term,746.0,4 years,Rent,50025.0,Debt Consolidation,355.18,11.5,0.0,12,0,17391,37081,0.0,0.0
5,0000a149-b055-4a57-b762-280783ccc25e,62ddc017-7023-4ba7-af23-1a7cd16c1ce5,Fully Paid,10208,Short Term,716.0,10+ years,Rent,41853.0,Business Loan,561.52,13.2,0.0,4,1,2289,4671,1.0,0.0
6,0000afa6-8902-4f8f-b870-25a8fdad0aeb,e49c1a82-a0f7-45e8-9f46-2f75c43f9fbc,Charged Off,24613,Long Term,6640.0,6 years,Rent,49225.0,Business Loan,542.29,17.6,73.0,7,0,14123,16954,0.0,0.0
7,0000afa6-8902-4f8f-b870-25a8fdad0aeb,e49c1a82-a0f7-45e8-9f46-2f75c43f9fbc,Charged Off,24613,Long Term,,6 years,Rent,,Business Loan,542.29,17.6,73.0,7,0,14123,16954,0.0,0.0
8,00011dfc-31c1-4178-932a-fbeb3f341efb,ef6e098c-6c83-4752-8d00-ff793e476b8c,Fully Paid,10036,Short Term,,5 years,Rent,,Debt Consolidation,386.36,17.7,0.0,7,0,11970,16579,0.0,0.0
9,0001cb86-af28-4011-bb86-183786e473ae,4aae67bb-d54b-41ae-8bce-1d62022ed8dd,Fully Paid,2036,Short Term,733.0,,Home Mortgage,55985.0,Debt Consolidation,741.79,19.8,29.0,7,0,10926,15676,0.0,0.0


In [15]:
# The "$" and "," characters were removed in the cells above, so the strings now parse as floats.
df["Monthly Debt"] = df["Monthly Debt"].astype(float)

In [16]:
df["Monthly Debt"]

0          584.03
1         1106.04
2         1321.85
3          751.92
4          355.18
           ...   
256979    1706.58
256980    1376.47
256981     297.96
256982     297.96
256983    2525.82
Name: Monthly Debt, Length: 256984, dtype: float64

In [17]:
# Rewrite the "< 1 year" bucket before the digits are extracted in the next cell.
# NOTE(review): that extraction keeps only the first run of digits, so "0.5" ends up as 0,
# the same value later given to missing entries — confirm this is intended.
df['Years in current job'] = df['Years in current job'].str.replace('< 1 year','0.5')

In [18]:
# Keep only the first run of digits ("10+ years" -> "10", "4 years" -> "4");
# entries without digits become NaN.
# Raw string: "\d" inside a normal literal is an invalid escape sequence that
# newer Python versions warn about.
# expand=False returns a Series, matching the single column being assigned.
df["Years in current job"] = df["Years in current job"].str.extract(r"(\d+)", expand = False)

In [19]:
# Missing tenure becomes the string "0" so the column can be cast to int afterwards.
# Assigned back instead of fillna(inplace=True) on the selected column, which
# under pandas Copy-on-Write would leave df unchanged.
df["Years in current job"] = df["Years in current job"].fillna("0")

In [20]:
# After the extraction and fill above every entry is a digit string, so the column can be stored as integers.
df["Years in current job"] = df["Years in current job"].astype(int)

In [21]:
# Replace missing incomes with 0.
# NOTE(review): 0 is also the smallest real value in this column per the describe() output — confirm that a sentinel of 0 is acceptable.
# Assigned back instead of fillna(inplace=True) on the selected column, which
# under pandas Copy-on-Write would leave df unchanged.
df["Annual Income"] = df["Annual Income"].fillna(0)

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256984 entries, 0 to 256983
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Loan ID                       256984 non-null  object 
 1   Customer ID                   256984 non-null  object 
 2   Loan Status                   256984 non-null  object 
 3   Current Loan Amount           256984 non-null  int64  
 4   Term                          256984 non-null  object 
 5   Credit Score                  195308 non-null  float64
 6   Years in current job          256984 non-null  int32  
 7   Home Ownership                256984 non-null  object 
 8   Annual Income                 256984 non-null  float64
 9   Purpose                       256984 non-null  object 
 10  Monthly Debt                  256984 non-null  float64
 11  Years of Credit History       256984 non-null  float64
 12  Months since last delinquent  256984 non-nul

In [23]:
df.isnull().sum()

Loan ID                             0
Customer ID                         0
Loan Status                         0
Current Loan Amount                 0
Term                                0
Credit Score                    61676
Years in current job                0
Home Ownership                      0
Annual Income                       0
Purpose                             0
Monthly Debt                        0
Years of Credit History             0
Months since last delinquent        0
Number of Open Accounts             0
Number of Credit Problems           0
Current Credit Balance              0
Maximum Open Credit                 0
Bankruptcies                      529
Tax Liens                          23
dtype: int64

In [24]:
df["Maximum Open Credit"]

0         16056
1         19149
2         28335
3         43915
4         37081
          ...  
256979    44080
256980     9758
256981    20090
256982    20090
256983    62371
Name: Maximum Open Credit, Length: 256984, dtype: object

In [25]:
# Swap the literal "#VALUE!" entries for "0" so the column can be cast to int in a later cell.
# NOTE(review): "#VALUE!" looks like a spreadsheet error marker — confirm 0 is the right stand-in.
df['Maximum Open Credit'] = df['Maximum Open Credit'].replace('#VALUE!','0')

In [26]:
df["Maximum Open Credit"]

0         16056
1         19149
2         28335
3         43915
4         37081
          ...  
256979    44080
256980     9758
256981    20090
256982    20090
256983    62371
Name: Maximum Open Credit, Length: 256984, dtype: object

In [27]:
# The column was loaded with dtype object (see the output above); store it as integers.
df["Maximum Open Credit"] = df["Maximum Open Credit"].astype(int)

In [28]:
df.isnull().sum()

Loan ID                             0
Customer ID                         0
Loan Status                         0
Current Loan Amount                 0
Term                                0
Credit Score                    61676
Years in current job                0
Home Ownership                      0
Annual Income                       0
Purpose                             0
Monthly Debt                        0
Years of Credit History             0
Months since last delinquent        0
Number of Open Accounts             0
Number of Credit Problems           0
Current Credit Balance              0
Maximum Open Credit                 0
Bankruptcies                      529
Tax Liens                          23
dtype: int64

In [29]:
df.Bankruptcies.unique()

array([ 0.,  1.,  2., nan,  3.,  4.,  5.,  7.,  6.])

In [30]:
df["Tax Liens"].unique()

array([ 0.,  5., nan,  1.,  2.,  4.,  3.,  6.,  7.,  9.,  8., 10., 11.])

In [31]:
df[df.Bankruptcies.isnull()]["Tax Liens"].unique()

array([ 0., nan])

In [32]:
df[df["Tax Liens"].isnull()]["Bankruptcies"].unique()

array([nan])

In [33]:
# Treat a missing count as zero for both columns.
# Assigned back instead of fillna(inplace=True) on the selected column, which
# under pandas Copy-on-Write would leave df unchanged.
df["Bankruptcies"] = df["Bankruptcies"].fillna(0)
df["Tax Liens"] = df["Tax Liens"].fillna(0)

In [34]:
# Binary target: 0 for "Fully Paid", 1 for every other status.
# Only encode while the column still holds text; re-running the cell on the
# already encoded column would otherwise turn every label into 1.
if not pd.api.types.is_numeric_dtype(df["Loan Status"]):
    df["Loan Status"] = df["Loan Status"].map(lambda status: 0 if status == "Fully Paid" else 1)

In [35]:
df.head()

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Years in current job,Home Ownership,Annual Income,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,000025bb-5694-4cff-b17d-192b1a98ba44,5ebc8bb1-5eb9-4404-b11b-a6eebc401a19,0,11520,Short Term,741.0,10,Home Mortgage,33694.0,Debt Consolidation,584.03,12.3,41.0,10,0,6760,16056,0.0,0.0
1,00002c49-3a29-4bd4-8f67-c8f8fbc1048c,927b388d-2e01-423f-a8dc-f7e42d668f46,0,3441,Short Term,734.0,4,Home Mortgage,42269.0,other,1106.04,26.3,0.0,17,0,6262,19149,0.0,0.0
2,00002d89-27f3-409b-aa76-90834f359a65,defce609-c631-447d-aad6-1270615e89c4,0,21029,Short Term,747.0,10,Home Mortgage,90126.0,Debt Consolidation,1321.85,28.8,0.0,5,0,20967,28335,0.0,0.0
3,00005222-b4d8-45a4-ad8c-186057e24233,070bcecb-aae7-4485-a26a-e0403e7bb6c5,0,18743,Short Term,747.0,10,Own Home,38072.0,Debt Consolidation,751.92,26.2,0.0,9,0,22529,43915,0.0,0.0
4,0000757f-a121-41ed-b17b-162e76647c1f,dde79588-12f0-4811-bab0-e2b07f633fcd,0,11731,Short Term,746.0,4,Rent,50025.0,Debt Consolidation,355.18,11.5,0.0,12,0,17391,37081,0.0,0.0


In [36]:
# Rank the numeric columns by the absolute value of their correlation with the target.
# Numeric columns are selected explicitly: from pandas 2.0 on, corr() no longer
# drops text columns on its own and raises when it meets them.
df.select_dtypes(include = "number").corr()["Loan Status"].abs().sort_values(ascending = False)

Loan Status                     1.000000
Credit Score                    0.440307
Current Loan Amount             0.269804
Annual Income                   0.053360
Years of Credit History         0.035548
Number of Open Accounts         0.020787
Years in current job            0.020607
Monthly Debt                    0.016609
Tax Liens                       0.012590
Number of Credit Problems       0.010179
Current Credit Balance          0.008838
Maximum Open Credit             0.007923
Bankruptcies                    0.001107
Months since last delinquent    0.000343
Name: Loan Status, dtype: float64

In [37]:
# Replace missing credit scores with 0.
# NOTE(review): the describe() output above shows scores up to 7510, so some rows look mis-scaled — confirm before modelling.
# Assigned back instead of fillna(inplace=True) on the selected column, which
# under pandas Copy-on-Write would leave df unchanged.
df["Credit Score"] = df["Credit Score"].fillna(0)

In [38]:
# Features: every column except the target and the two identifier columns.
x = df.drop(columns = ["Loan Status", "Loan ID", "Customer ID"])
# Target as a 1-D Series; a one-column DataFrame makes scikit-learn estimators
# warn about a column-vector y when fitting.
y = df["Loan Status"]

In [39]:
# One-hot encode the remaining non-numeric columns; drop_first=True omits the first level of each encoded column.
x = pd.get_dummies(x, drop_first = True)

In [None]:
algo_test(x, y)