In [126]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [127]:
# Load the loan-application records; the path is relative to the directory
# the notebook kernel is running in.
df = pd.read_csv("./dataset/bankloan.csv")
# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [128]:
# Count the missing values in each column before any imputation.
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [129]:
# Impute missing Gender values with the most frequent category.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on the df['Gender'] selection is chained assignment,
# which warns in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])

In [130]:
# Impute missing Self_Employed values with the most frequent category.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on the df['Self_Employed'] selection is chained
# assignment, which warns in pandas 2.x and leaves df unchanged under
# Copy-on-Write.
df['Self_Employed'] = df['Self_Employed'].fillna(df['Self_Employed'].mode()[0])

In [131]:
# Impute missing LoanAmount values with the column median.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on the df['LoanAmount'] selection is chained
# assignment, which warns in pandas 2.x and leaves df unchanged under
# Copy-on-Write.
df['LoanAmount'] = df['LoanAmount'].fillna(df['LoanAmount'].median())

In [132]:
# Impute missing Dependents values with the most frequent value.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on the df['Dependents'] selection is chained
# assignment, which warns in pandas 2.x and leaves df unchanged under
# Copy-on-Write.
df['Dependents'] = df['Dependents'].fillna(df['Dependents'].mode()[0])

In [133]:
# Impute missing Married values with the most frequent category.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on the df['Married'] selection is chained
# assignment, which warns in pandas 2.x and leaves df unchanged under
# Copy-on-Write.
df['Married'] = df['Married'].fillna(df['Married'].mode()[0])

In [134]:
# Impute missing Loan_Amount_Term values with the most frequent term.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on the df['Loan_Amount_Term'] selection is chained
# assignment, which warns in pandas 2.x and leaves df unchanged under
# Copy-on-Write.
df['Loan_Amount_Term'] = df['Loan_Amount_Term'].fillna(df['Loan_Amount_Term'].mode()[0])

In [135]:
# Impute missing Credit_History values with the most frequent value.
# The filled Series is assigned back to the column: calling
# fillna(inplace=True) on the df['Credit_History'] selection is chained
# assignment, which warns in pandas 2.x and leaves df unchanged under
# Copy-on-Write.
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])

In [136]:
# Re-count missing values per column to confirm the imputation left none.
df.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [137]:
# Preview the first rows again now that missing values have been filled.
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [138]:
# Show each column's dtype; object-typed columns hold the string categories.
df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents           float64
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [139]:
# onehot encoder for categorical features
# Names of the columns to one-hot encode: every object-typed column except
# the target (Loan_Status) and the row identifier (Loan_ID).
cat_features = (
    df.select_dtypes(include=['object'])
    .columns
    .drop(['Loan_Status', 'Loan_ID'])
)
cat_features

Index(['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area'], dtype='object')

In [140]:
def one_hot_encoding(df, feat):
    """Return a copy of ``df`` with column ``feat`` replaced by dummy columns.

    The dummy columns are named ``<feat>_<level>`` and appended after the
    existing columns. The first level is dropped (``drop_first=True``), so a
    column with k distinct values yields k-1 indicator columns. The frame
    passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode.
    feat : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        New frame without ``feat`` and with its indicator columns appended.
    """
    indicator_cols = pd.get_dummies(df[feat], prefix=feat, drop_first=True)
    return pd.concat([df, indicator_cols], axis=1).drop(columns=feat)

# Replace each categorical column with its dummy columns, one column at a time.
# NOTE: this cell cannot be re-run as-is. After the first run the columns
# named in cat_features no longer exist in df, so df[feat] inside
# one_hot_encoding raises KeyError.
for feat in cat_features:
    df = one_hot_encoding(df, feat)

df.head()


Unnamed: 0,Loan_ID,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban
0,LP001002,0.0,5849,0.0,128.0,360.0,1.0,Y,1,0,0,0,0,1
1,LP001003,1.0,4583,1508.0,128.0,360.0,1.0,N,1,1,0,0,0,0
2,LP001005,0.0,3000,0.0,66.0,360.0,1.0,Y,1,1,0,1,0,1
3,LP001006,0.0,2583,2358.0,120.0,360.0,1.0,Y,1,1,1,0,0,1
4,LP001008,0.0,6000,0.0,141.0,360.0,1.0,Y,1,0,0,0,0,1


In [146]:
# Target: encode Loan_Status as integers ('Y' -> 1, 'N' -> 0).
# Series.map returns a new Series rather than modifying in place, so the
# mapped result has to be the value assigned to y.
y = df['Loan_Status'].map({'Y': 1, 'N': 0})
# Any label other than 'Y'/'N' would have been mapped to NaN; fail early
# with a clear message instead of inside the model fit.
assert y.notna().all(), "Loan_Status contains values other than 'Y'/'N'"

# Features: everything except the target and the row identifier.
X = df.drop(['Loan_Status', 'Loan_ID'], axis=1)
print(X.shape)
print(y.shape)

(614, 12)
(614,)


In [147]:
# modelling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [148]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Report the shape of each split in the order X_train, X_test, y_train, y_test.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(491, 12)
(123, 12)
(491,)
(123,)


In [150]:
# Fit the scaler on the training split only, then apply that same fitted
# transformation to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [167]:
# Gradient boosting with many depth-1 trees and a small learning rate;
# random_state fixes the seed so the fit is reproducible.
model = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=1,
    random_state=28,
)
model.fit(X_train_std, y_train)
# Accuracy on the held-out test split.
accuracy_score(y_test, model.predict(X_test_std))

0.7886178861788617