In [197]:
#import data and dependencies
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
from sklearn import tree

data_frame_train = pd.read_csv('train_data.csv')
data_frame_train


Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [None]:
#test_data
data_frame_test = pd.read_csv('test_data.csv')
data_frame_test

In [None]:
#copy train data
data_frame_train_original = data_frame_train.copy()
data_frame_train_original

In [None]:
#copy test data
data_frame_test_original = data_frame_test.copy()
data_frame_test_original


In [None]:
data_frame_test.head(3)

In [None]:
data_frame_train.head(3)

In [None]:
data_frame_train.info()

In [None]:
data_frame_test.info()

In [None]:
#Exploratory Data Analysis
data_frame_train['Loan_Status'].value_counts()

In [None]:
data_frame_train['Loan_Status'].value_counts().plot.bar(figsize=(10,6), title='Loan Status')

In [None]:
plt.figure(1)
plt.subplot(221)
data_frame_train['Gender'].value_counts(normalize=True).plot.bar(figsize=(20,10), title= 'Gender')
plt.subplot(222)
data_frame_train['Married'].value_counts(normalize=True).plot.bar(title= 'Married')
plt.subplot(223)
data_frame_train['Self_Employed'].value_counts(normalize=True).plot.bar(title= 'Self Employed')
plt.subplot(224)
data_frame_train['Credit_History'].value_counts(normalize=True).plot.bar(title= 'Credit_History')


In [None]:
plt.figure(1)
plt.subplot(131)
data_frame_train['Dependents'].value_counts(normalize=True).plot.bar(figsize=(24,6), title='Dependents')
plt.subplot(132)
data_frame_train['Education'].value_counts(normalize=True).plot.bar(title='Education')
plt.subplot(133)
data_frame_train['Property_Area'].value_counts(normalize=True).plot.bar(title='Property Area')

In [None]:
plt.figure(1)
plt.subplot(121)
sns.distplot(data_frame_train['ApplicantIncome'])
plt.subplot(122)
data_frame_train['ApplicantIncome'].plot.box(figsize=(16,5))

In [None]:
data_frame_train.boxplot(column= 'ApplicantIncome', by='Education')
plt.tight_layout()

In [None]:
data_frame_train.boxplot(column= 'ApplicantIncome', by='Gender')
plt.tight_layout()

In [None]:
Married = pd.crosstab(data_frame_train['Married'], data_frame_train['Loan_Status'])
Married.div(Married.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True, figsize=(6,4))
plt.legend(loc = 'best')

In [None]:
Credit_History = pd.crosstab(data_frame_train['Credit_History'], data_frame_train['Loan_Status'])
Credit_History.div(Credit_History.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True, figsize=(6,4))
plt.legend(loc = 'best')

In [None]:
Property_Area = pd.crosstab(data_frame_train['Property_Area'], data_frame_train['Loan_Status'])
Property_Area.div(Property_Area.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True, figsize=(6,4))
plt.legend(bbox_to_anchor=(1.05,1.0),loc='best')

In [None]:
data_frame_train['Total_Income'] = data_frame_train['ApplicantIncome'] + data_frame_train['CoapplicantIncome']
bins = [0,2500,4000,6000,81000] 
group= ['Low', 'Average', 'High', 'Very High']
data_frame_train['Total_Income_bin'] = pd.cut(data_frame_train['Total_Income'], bins, labels=group)
Total_Income_bin = pd.crosstab(data_frame_train['Total_Income_bin'], data_frame_train['Loan_Status'])
Total_Income_bin.div(Total_Income_bin.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
plt.xlabel('Total Income')
plt.ylabel('Percentage')
plt.legend(bbox_to_anchor=(1.05,1.0),loc='best')

In [None]:
data_frame_train.columns


In [None]:
data_frame_train['Dependents'].replace('3+', 3, inplace=True)
data_frame_test['Dependents'].replace('3+', 3, inplace=True)
data_frame_train['Loan_Status'].replace('N', 0, inplace=True)
data_frame_train['Loan_Status'].replace('Y', 1, inplace=True)

In [None]:
plt.figure(figsize=(12,6))
sns.heatmap(data_frame_train.corr(), vmax=0.8, square=True, cmap='BuPu')

In [None]:
data_frame_train.isnull().sum()

In [None]:
data_frame_train['Gender'].fillna(data_frame_train['Gender'].mode()[0], inplace=True)
data_frame_train['Married'].fillna(data_frame_train['Married'].mode()[0], inplace=True)
data_frame_train['Dependents'].fillna(data_frame_train['Dependents'].mode()[0], inplace=True)
data_frame_train['Self_Employed'].fillna(data_frame_train['Dependents'].mode()[0], inplace=True)
data_frame_train['Credit_History'].fillna(data_frame_train['Credit_History'].mode()[0], inplace=True)
data_frame_train['Loan_Amount_Term'].fillna(data_frame_train['Loan_Amount_Term'].mode()[0], inplace=True)
data_frame_train['LoanAmount'].fillna(data_frame_train['LoanAmount'].median(), inplace=True)

In [None]:
data_frame_train.isnull().sum()

In [None]:
data_frame_test.isnull().sum()

In [None]:
data_frame_test['Gender'].fillna(data_frame_test['Gender'].mode()[0], inplace=True)
data_frame_test['Married'].fillna(data_frame_test['Married'].mode()[0], inplace=True)
data_frame_test['Dependents'].fillna(data_frame_test['Dependents'].mode()[0], inplace=True)
data_frame_test['Self_Employed'].fillna(data_frame_test['Dependents'].mode()[0], inplace=True)
data_frame_test['Credit_History'].fillna(data_frame_test['Credit_History'].mode()[0], inplace=True)
data_frame_test['Loan_Amount_Term'].fillna(data_frame_test['Loan_Amount_Term'].mode()[0], inplace=True)
data_frame_test['LoanAmount'].fillna(data_frame_test['LoanAmount'].median(), inplace=True)

In [None]:
data_frame_test.isnull().sum()

In [None]:
 #describe
data_frame.describe()  


In [None]:
#values in array form
data_frame.values

In [None]:
#split data input data
input = data_frame.drop(columns='Loan_Status')
input

In [None]:
#split data output data
output = data_frame['Loan_Status']
output

In [None]:
input = pd.get_dummies(input)
data_frame_train = pd.get_dummies(data_frame_train)
data_frame_test = pd.get_dummies(data_frame_test)

In [None]:
#split data
input_train, input_test, output_train, output_test = train_test_split(input,output,test_size = 0.3)

In [198]:
 #Choose a model 
model = DecisionTreeClassifier()
#Train Data
model.fit(input_train,output_train)

#predict
predictions = model.predict(input_test)
predictions

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [199]:
#classification report
print(classification_report(output_test, predictions))

NameError: name 'predictions' is not defined

In [200]:
#accuracy
print(accuracy_score(output_test, predictions))

NameError: name 'predictions' is not defined

In [None]:
#model persistence
joblib.dump(model,'loan-approval-decision-tree.joblib')

In [201]:
#model
rfmodel = RandomForestClassifier(n_estimators=500)

In [202]:
#model training
rfmodel.fit(input_train, input_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [212]:
rfpredictions = rfmodel.predict(input_test)

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [213]:
print(accuracy_score(output_test, rfpredictions))

NameError: name 'rfpredictions' is not defined

In [215]:
print(classification_report(output_test, rfpredictions))

NameError: name 'rfpredictions' is not defined

In [None]:
#model persistence
joblib.dump(rfmodel,'loan-approval-random-forest.joblib')