In [1]:
#import required libraries
import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
%matplotlib inline

In [2]:
#load the train and test splits from CSV files in the working directory
train_csv, test_csv = map(pd.read_csv, ('loan-train.csv', 'loan-test.csv'))

In [3]:
#report the (rows, columns) shape of the train and test data
print("Shape of train dataset: ",train_csv.shape)
print("Shape of test dataset: ",test_csv.shape)

Shape of train dataset:  (614, 13)
Shape of test dataset:  (367, 12)


In [4]:
#preview the first rows of the train dataset (DataFrame.head defaults to 5 rows)
train_csv.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [None]:
#print a summary of the train dataset: column names, non-null counts and dtypes
train_csv.info()

In [None]:
#box plot of ApplicantIncome to see if there are any outliers
train_csv.boxplot(column='ApplicantIncome')

In [None]:
#histogram of ApplicantIncome using 10 bins
train_csv['ApplicantIncome'].hist(bins=10)

In [None]:
#box plot of LoanAmount to see if there are any outliers
train_csv.boxplot(column='LoanAmount')

In [None]:
#histogram of LoanAmount using 10 bins
train_csv['LoanAmount'].hist(bins=10)

In [None]:
#log-transform LoanAmount into a new column and plot its distribution (20 bins)
#rows where LoanAmount is missing stay missing in the new column
train_csv['LoanAmount_Normalized']=np.log(train_csv['LoanAmount'])
train_csv['LoanAmount_Normalized'].hist(bins=20)

In [None]:
#count missing values in each column of the train dataset
train_csv.isnull().sum()

In [None]:
#fill up missing values
#each result is assigned back to its column: calling fillna(inplace=True) on a
#column selection acts on a temporary object under pandas copy-on-write and
#would leave train_csv unchanged (pandas 2.x already warns about that pattern)

#categorical columns: fill missing places with the most frequent value (mode)
train_csv['Gender'] = train_csv['Gender'].fillna(train_csv['Gender'].mode()[0])
train_csv['Married'] = train_csv['Married'].fillna(train_csv['Married'].mode()[0])
train_csv['Dependents'] = train_csv['Dependents'].fillna(train_csv['Dependents'].mode()[0])
train_csv['Self_Employed'] = train_csv['Self_Employed'].fillna(train_csv['Self_Employed'].mode()[0])

#LoanAmount is numerical value so we will use mean instead of mode
#the log-transformed column is filled with its own mean (mean of the logs)
train_csv['LoanAmount'] = train_csv['LoanAmount'].fillna(train_csv['LoanAmount'].mean())
train_csv['LoanAmount_Normalized'] = train_csv['LoanAmount_Normalized'].fillna(train_csv['LoanAmount_Normalized'].mean())

#Loan_Amount_Term and Credit_History are treated as categorical values so use mode
train_csv['Loan_Amount_Term'] = train_csv['Loan_Amount_Term'].fillna(train_csv['Loan_Amount_Term'].mode()[0])
train_csv['Credit_History'] = train_csv['Credit_History'].fillna(train_csv['Credit_History'].mode()[0])

In [None]:
#re-count missing values per column to verify the imputation above
train_csv.isnull().sum()

In [None]:
#combine applicant and co-applicant income into one feature, then store its
#natural log in a second column and plot that column's distribution (20 bins)
train_csv['TotalIncome'] = train_csv['ApplicantIncome'].add(train_csv['CoapplicantIncome'])
train_csv['TotalIncome_Normalized'] = train_csv['TotalIncome'].pipe(np.log)
train_csv['TotalIncome_Normalized'].hist(bins=20)

In [None]:
#preview the train dataset again, now including the newly added columns
train_csv.head()

In [None]:
#divide dataset into dependent (target) and independent (feature) variables
#columns are selected by name instead of by position so the selection does not
#silently shift when columns are added to or removed from train_csv
#NOTE(review): TotalIncome_Normalized is computed earlier but the raw
#TotalIncome is what gets selected here - confirm that this is intended
feature_columns = ['Gender', 'Married', 'Dependents', 'Education',
                   'Loan_Amount_Term', 'Credit_History',
                   'LoanAmount_Normalized', 'TotalIncome']
X = train_csv[feature_columns].values
Y = train_csv['Loan_Status'].values

In [None]:
#display the feature matrix
X

In [None]:
#display the target values
Y

In [None]:
#divide dataset into train and test dataset
#20% of the rows are held out for evaluation; random_state fixes the shuffle
#so the same split is produced on every run
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [None]:
#display the training features before encoding
X_train

In [None]:
#create one LabelEncoder instance; later cells refit it for each column they encode
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [None]:
#convert text values into numerical values
#the encoder is fitted on the full column of X so that the train and test
#splits share one category -> integer mapping; refitting on each split
#separately can assign different codes to the same category
for i in range(0,5):
    label_encoder.fit(X[:,i])
    X_train[:,i] = label_encoder.transform(X_train[:,i])
    X_test[:,i] = label_encoder.transform(X_test[:,i])
#column 7 (TotalIncome under the feature selection above) is a continuous
#numeric value, so it is deliberately left as-is: label encoding it would
#replace each income by its rank among the values of whichever split it is in
#fit the target encoder on the training labels only and reuse it for the test labels
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [None]:
#display the training features after encoding
X_train

In [None]:
#display the encoded training labels
y_train

In [None]:
#display the held-out features after encoding
X_test

In [None]:
#display the encoded held-out labels
y_test

In [None]:
#scale data
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
#learn the per-feature mean and standard deviation from the training split only
X_train = ss.fit_transform(X_train)
#apply the training statistics to the held-out split; refitting here would put
#the two splits on different scales and leak held-out information into preprocessing
X_test = ss.transform(X_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier
#entropy-based decision tree with a fixed seed, trained on the scaled training split
tree_classifier = DecisionTreeClassifier(random_state = 0, criterion = 'entropy').fit(X_train, y_train)
#show the fitted estimator
tree_classifier

In [None]:
#predict labels for the held-out split with the decision tree and display them
y_pred = tree_classifier.predict(X_test)
y_pred

In [None]:
from sklearn import metrics
#accuracy of the current predictions on the held-out split
#accuracy_score expects the true labels first, then the predictions
print("Accuracy: ", metrics.accuracy_score(y_test,y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
#seed the forest (same seed as the split and the decision tree) so that the
#fitted model and the accuracy reported for it are reproducible across runs
forest_classifier = RandomForestClassifier(random_state = 0)
forest_classifier.fit(X_train,y_train)
#predict labels for the held-out split and display them
y_pred = forest_classifier.predict(X_test)
y_pred

In [None]:
#accuracy of the current predictions on the held-out split
#accuracy_score expects the true labels first, then the predictions
print("Accuracy: ", metrics.accuracy_score(y_test,y_pred))

In [None]:
from sklearn.naive_bayes import GaussianNB
#Gaussian naive Bayes trained on the scaled training split
naive_classifier = GaussianNB().fit(X_train, y_train)
#predict labels for the held-out split and display them
y_pred = naive_classifier.predict(X_test)
y_pred

In [None]:
from sklearn import metrics
#accuracy of the current predictions on the held-out split
#accuracy_score expects the true labels first, then the predictions
print("Accuracy: ", metrics.accuracy_score(y_test,y_pred))

In [None]:
#preview the first rows of the test dataset (DataFrame.head defaults to 5 rows)
test_csv.head()

In [None]:
#print a summary of the test dataset: column names, non-null counts and dtypes
test_csv.info()

In [None]:
#count missing values in each column of the test dataset
test_csv.isnull().sum()

In [None]:
#box plot of LoanAmount in the test dataset
test_csv.boxplot(column='LoanAmount')

In [None]:
#box plot of ApplicantIncome in the test dataset
test_csv.boxplot(column='ApplicantIncome')

In [None]:
#log-transform LoanAmount into a new column and plot its distribution (20 bins)
#rows where LoanAmount is missing stay missing in the new column
test_csv['LoanAmount_Normalized']=np.log(test_csv['LoanAmount'])
test_csv['LoanAmount_Normalized'].hist(bins=20)

In [None]:
#fill up missing values
#each result is assigned back to its column: calling fillna(inplace=True) on a
#column selection acts on a temporary object under pandas copy-on-write and
#would leave test_csv unchanged (pandas 2.x already warns about that pattern)

#categorical columns: fill missing places with the most frequent value (mode)
#Married is included for parity with the training imputation; fillna is a
#no-op for a column that has nothing missing
test_csv['Gender'] = test_csv['Gender'].fillna(test_csv['Gender'].mode()[0])
test_csv['Married'] = test_csv['Married'].fillna(test_csv['Married'].mode()[0])
test_csv['Dependents'] = test_csv['Dependents'].fillna(test_csv['Dependents'].mode()[0])
test_csv['Self_Employed'] = test_csv['Self_Employed'].fillna(test_csv['Self_Employed'].mode()[0])

#LoanAmount is numerical value so we will use mean instead of mode
#the log-transformed column is filled with its own mean (mean of the logs)
test_csv['LoanAmount'] = test_csv['LoanAmount'].fillna(test_csv['LoanAmount'].mean())
test_csv['LoanAmount_Normalized'] = test_csv['LoanAmount_Normalized'].fillna(test_csv['LoanAmount_Normalized'].mean())

#Loan_Amount_Term and Credit_History are treated as categorical values so use mode
test_csv['Loan_Amount_Term'] = test_csv['Loan_Amount_Term'].fillna(test_csv['Loan_Amount_Term'].mode()[0])
test_csv['Credit_History'] = test_csv['Credit_History'].fillna(test_csv['Credit_History'].mode()[0])

In [None]:
#re-count missing values per column to verify the imputation above
test_csv.isnull().sum()

In [None]:
#combine applicant and co-applicant income into one feature, then store its
#natural log in a second column and plot that column's distribution (20 bins)
test_csv['TotalIncome'] = test_csv['ApplicantIncome'].add(test_csv['CoapplicantIncome'])
test_csv['TotalIncome_Normalized'] = test_csv['TotalIncome'].pipe(np.log)
test_csv['TotalIncome_Normalized'].hist(bins=20)

In [None]:
#preview the test dataset again, now including the newly added columns
test_csv.head()

In [None]:
#build the prediction matrix from the same features, in the same order, that
#the classifiers were trained on. test_csv was loaded with one column fewer
#than train_csv (12 vs 13), so the positional slice used for the training data
#selects different columns here; selecting by name keeps the two aligned
test = test_csv[['Gender', 'Married', 'Dependents', 'Education',
                 'Loan_Amount_Term', 'Credit_History',
                 'LoanAmount_Normalized', 'TotalIncome']].values

In [None]:
#convert text values into numerical values
#NOTE(review): fit_transform refits label_encoder on each test column, so the
#integer codes come from the categories present in test_csv, not from the
#encoding applied to the training matrix - confirm the two mappings agree
for i in range(0,5):
    test[:,i] = label_encoder.fit_transform(test[:,i])
#NOTE(review): column 7 is label encoded as well; verify which feature sits at
#this position in test and that it is prepared the same way as for training
test[:,7] = label_encoder.fit_transform(test[:,7])

In [None]:
#display the encoded prediction matrix
test

In [None]:
#scale
#reuse the scaler that was already fitted earlier in the notebook instead of
#refitting it on the unlabeled data, so these rows are expressed on the scale
#the classifiers saw rather than on their own mean and standard deviation
test = ss.transform(test)

In [None]:
#predict labels for the unlabeled test data with the Gaussian naive Bayes
#classifier fitted earlier, then display the predicted (encoded) labels
pred = naive_classifier.predict(test)
pred