### Loading the data

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score,make_scorer
from sklearn import tree
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
Loan = pd.read_csv('Loan.csv')
Loan.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_status
0,LP001015,Male,Yes,0,Graduate,No,5720,0.0,110.0,360.0,1.0,Urban,Y
1,LP001022,Male,Yes,1,Graduate,No,3076,1500.0,126.0,360.0,1.0,Urban,N
2,LP001031,Male,Yes,2,Graduate,No,5000,1800.0,208.0,360.0,1.0,Urban,Y
3,LP001035,Male,Yes,2,Graduate,No,2340,2546.0,100.0,360.0,,Urban,Y
4,LP001051,Male,No,0,Not Graduate,No,3276,0.0,78.0,360.0,1.0,Urban,Y


In [3]:
Loan.shape

(981, 13)

In [4]:
Loan.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,981.0,981.0,954.0,961.0,902.0
mean,5179.795107,1601.91633,142.51153,342.201873,0.83592
std,5695.104533,2718.772806,77.421743,65.100602,0.370553
min,0.0,0.0,9.0,6.0,0.0
25%,2875.0,0.0,100.0,360.0,1.0
50%,3800.0,1110.0,126.0,360.0,1.0
75%,5516.0,2365.0,162.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


### Check missing values

In [5]:
Loan.isnull().sum()

Loan_ID               0
Gender               24
Married               3
Dependents           25
Education             0
Self_Employed        55
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_status           0
dtype: int64

### Fill all the missing value

In [6]:
Loan['Gender'].value_counts()

Male      775
Female    182
Name: Gender, dtype: int64

In [7]:
Loan.Gender = Loan.Gender.fillna('Male')

In [8]:
Loan['Married'].value_counts()

Yes    631
No     347
Name: Married, dtype: int64

In [9]:
Loan.Married = Loan.Married.fillna('Yes')

In [10]:
Loan['Dependents'].value_counts()

0     545
2     160
1     160
3+     91
Name: Dependents, dtype: int64

In [11]:
Loan.Dependents = Loan.Dependents.fillna('0')

In [12]:
Loan['Self_Employed'].value_counts()

No     807
Yes    119
Name: Self_Employed, dtype: int64

In [13]:
Loan.Self_Employed = Loan.Self_Employed.fillna('No')

In [14]:
Loan['LoanAmount'].value_counts()

120.0    29
110.0    27
100.0    24
187.0    21
150.0    19
125.0    18
130.0    18
160.0    17
90.0     15
128.0    14
113.0    14
135.0    14
108.0    13
96.0     12
70.0     12
104.0    12
95.0     12
80.0     12
116.0    10
200.0    10
185.0    10
115.0    10
138.0    10
132.0    10
180.0    10
140.0     9
112.0     9
131.0     9
122.0     9
158.0     9
         ..
312.0     1
47.0      1
320.0     1
17.0      1
189.0     1
279.0     1
265.0     1
550.0     1
79.0      1
336.0     1
311.0     1
190.0     1
570.0     1
349.0     1
267.0     1
257.0     1
496.0     1
292.0     1
36.0      1
9.0       1
49.0      1
26.0      1
460.0     1
324.0     1
68.0      1
196.0     1
254.0     1
400.0     1
250.0     1
405.0     1
Name: LoanAmount, Length: 232, dtype: int64

In [15]:
Loan.LoanAmount = Loan.LoanAmount.fillna(Loan.LoanAmount.mean())

In [16]:
Loan['Loan_Amount_Term'].value_counts()

360.0    823
180.0     66
480.0     23
300.0     20
240.0      8
84.0       7
120.0      4
36.0       3
60.0       3
12.0       2
350.0      1
6.0        1
Name: Loan_Amount_Term, dtype: int64

In [17]:
Loan.Loan_Amount_Term=Loan.Loan_Amount_Term.fillna('360')

In [18]:
Loan['Credit_History'].value_counts()

1.0    754
0.0    148
Name: Credit_History, dtype: int64

In [19]:
Loan.Credit_History = Loan.Credit_History.fillna('1.0')

In [20]:
Loan.isnull().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_status          0
dtype: int64

### Visualize the data

In [21]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [22]:
Loan.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_status
0,LP001015,Male,Yes,0,Graduate,No,5720,0.0,110.0,360,1.0,Urban,Y
1,LP001022,Male,Yes,1,Graduate,No,3076,1500.0,126.0,360,1.0,Urban,N
2,LP001031,Male,Yes,2,Graduate,No,5000,1800.0,208.0,360,1.0,Urban,Y
3,LP001035,Male,Yes,2,Graduate,No,2340,2546.0,100.0,360,1.0,Urban,Y
4,LP001051,Male,No,0,Not Graduate,No,3276,0.0,78.0,360,1.0,Urban,Y


### Loan status according to the Gender

In [23]:
# plt.figure(15,8)
# Question: How to select all the male candidates loan approved?

### Drop column

In [24]:
Loan = Loan.drop(['Loan_ID'],axis = 1)
Loan.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_status
0,Male,Yes,0,Graduate,No,5720,0.0,110.0,360,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,3076,1500.0,126.0,360,1.0,Urban,N
2,Male,Yes,2,Graduate,No,5000,1800.0,208.0,360,1.0,Urban,Y
3,Male,Yes,2,Graduate,No,2340,2546.0,100.0,360,1.0,Urban,Y
4,Male,No,0,Not Graduate,No,3276,0.0,78.0,360,1.0,Urban,Y


In [25]:
Loan['Property_Area'].value_counts()

Semiurban    349
Urban        342
Rural        290
Name: Property_Area, dtype: int64

### Replacing Values

In [26]:
Loan.Gender = Loan.Gender.replace({'Male':0,'Female':1})
Loan.Married=Loan.Married.replace({'Yes':1,'No':0})
Loan.Education = Loan.Education.replace({'Graduate':0,'Not Graduate':1})
Loan.Self_Employed = Loan.Self_Employed.replace({'No':0,'Yes':1})
Loan.Property_Area = Loan.Property_Area.replace({'Semiurban':0,'Urban':1,'Rural':2})
Loan.Loan_status = Loan.Loan_status.replace({'N':0,'Y':1})
Loan.Dependents = Loan.Dependents.replace({'3+':3})
Loan.Credit_History = Loan.Credit_History.replace({'1.0':1})

In [27]:
Loan.head(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_status
0,0,1,0,0,0,5720,0.0,110.0,360,1.0,1,1
1,0,1,1,0,0,3076,1500.0,126.0,360,1.0,1,0
2,0,1,2,0,0,5000,1800.0,208.0,360,1.0,1,1
3,0,1,2,0,0,2340,2546.0,100.0,360,1.0,1,1
4,0,0,0,1,0,3276,0.0,78.0,360,1.0,1,1


In [28]:
Loan.shape

(981, 12)

In [30]:
Loan.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_status
0,0,1,0,0,0,5720,0.0,110.0,360,1.0,1,1
1,0,1,1,0,0,3076,1500.0,126.0,360,1.0,1,0
2,0,1,2,0,0,5000,1800.0,208.0,360,1.0,1,1
3,0,1,2,0,0,2340,2546.0,100.0,360,1.0,1,1
4,0,0,0,1,0,3276,0.0,78.0,360,1.0,1,1


In [97]:
X=Loan.iloc[:,:-1].values
y=Loan.iloc[:,11].values

In [62]:
X

array([[0, 1, '0', ..., 360.0, 1.0, 1],
       [0, 1, '1', ..., 360.0, 1.0, 1],
       [0, 1, '2', ..., 360.0, 1.0, 1],
       ...,
       [0, 1, '1', ..., 360.0, 1.0, 1],
       [0, 1, '2', ..., 360.0, 1.0, 1],
       [1, 0, '0', ..., 360.0, 0.0, 0]], dtype=object)

In [63]:
y

array([1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1,

In [64]:
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,random_state=6)

In [70]:
print(f"Row in training set:{len(X_train)}\nRow in test set:{len(X_test)} ")

Row in training set:784
Row in test set:197 


In [71]:
Loan_model = DecisionTreeClassifier()
Loan_model.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [72]:
y_pred=Loan_model.predict(X_test)

In [75]:
# from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6598984771573604

In [76]:
parameters={'max_depth':[1,2,3,4,5],'min_samples_leaf':[1,2,3,4,5],'min_samples_split':[2,3,4,5],
           'criterion':['gini','entropy']}
scorer=make_scorer(f1_score)

In [77]:
grid_obj = GridSearchCV(Loan_model, parameters, scoring=scorer)

In [78]:
grid_fit = grid_obj.fit(X_train, y_train)



In [80]:
best_Loan_model = grid_fit.best_estimator_
best_Loan_model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [85]:
best_Loan_model.fit(X_train, y_train)
best_train_predictions = best_Loan_model.predict(X_train)
best_test_predictions = best_Loan_model.predict(X_test)

print('The training F1 Score is', f1_score(best_train_predictions, y_train)*100)
print('The testing F1 Score is', f1_score(best_test_predictions, y_test)*100)

The training F1 Score is 81.56329651656755
The testing F1 Score is 82.43243243243242


In [91]:
df=pd.DataFrame(y_test,columns=['y_test'])
df['y_pred']= y_pred
df.tail()

Unnamed: 0,y_test,y_pred
192,1,1
193,1,1
194,0,0
195,1,0
196,1,1


In [104]:
X_test[192]

array([0, 1, 3, 0, 0, 13518, 0.0, 390.0, 360.0, 1.0, 2], dtype=object)

In [105]:
ran_data_1 =[0, 1, 3, 0, 0, 13518, 0.0, 390.0, 360.0, 1.0, 2]
ran_data_arr=np.array(ran_data_1)
ran_data_num=ran_data_arr.reshape(1,-1)
pred_single_row=best_Loan_model.predict(ran_data_num)
round(float(pred_single_row))

1

In [106]:
y_test[192]

1

In [107]:
from sklearn.externals import joblib
joblib.dump(best_Loan_model,"DecisionTreeModel.pkl")



['DecisionTreeModel.pkl']