# Bank Loan Classification

In [389]:
#importing necessary libraries

import pandas as pd

In [390]:
# Load the raw bank-loan dataset from a CSV file (path is relative to the working directory).

df=pd.read_csv("Bank_loan_data.csv")

In [391]:
# Preview the first five rows to check the columns and spot obviously missing values.

df.head()

Unnamed: 0,ID,Age,Gender,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Home Ownership,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,10,34,M,9,180.0,93023,1,8.9,3,0,Home Owner,1,0,0,0.0,0
1,17,38,M,14,130.0,95010,4,4.7,3,134,Rent,1,0,0,0.0,0
2,19,46,M,21,193.0,91604,2,8.1,3,0,Rent,1,0,0,0.0,0
3,30,38,M,13,,94104,1,3.3,2,0,Rent,1,0,1,1.0,1
4,39,42,M,18,,94114,3,5.0,3,0,Rent,1,1,1,1.0,0


### Since some columns contain missing values, which would result in a weak machine learning model, we will fill those values according to the data in each column.

## Performing some EDA

In [392]:
# Element-wise null mask: True marks a missing cell, False a present one.

df.isnull()

Unnamed: 0,ID,Age,Gender,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Home Ownership,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
476,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
477,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
478,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False


`True` represents a missing value, while `False` represents a value that is present.

In [393]:
# Count the missing values in each column.

df.isnull().sum()

ID                     0
Age                    0
Gender                 0
Experience             0
Income                67
ZIP Code               0
Family                 0
CCAvg                  0
Education              0
Mortgage               0
Home Ownership         0
Personal Loan          0
Securities Account     0
CD Account             0
Online                40
CreditCard             0
dtype: int64

We got 67 null values in the Income column and 40 null values in the Online (Internet Banking) column.

### For filling missing values, we use the mean, median or mode, according to the data present in the column.

In [394]:
# Impute the gaps in the numeric Income column with the mean of its observed values.

income_mean = df['Income'].mean()
df['Income'] = df['Income'].fillna(income_mean)


In [395]:
# Confirm that no missing values remain in the Income column after imputation.

df['Income'].isnull().sum()

0

Now, we got no missing values in Income Column.

# Performing feature engineering

In [396]:
# Inspect the current dtype of the Income column.

df['Income'].dtype

dtype('float64')

In [397]:
# Cast Income to integer. Note: casting float to int drops the fractional part
# (it does not round), so the imputed mean loses its decimals here.

df['Income']=df['Income'].astype('int')

In [398]:
# Verify the cast: the Income column should now report an integer dtype.
df['Income'].dtypes

dtype('int64')

Now, we got income data type as int.

Since the Online (Internet Banking) column does not play an important role in building the ML model for loan acceptance, we remove this column.


We have some columns such as 'ID', 'ZIP Code', 'Age', 'Gender', etc. in the dataset which do not particularly affect building the model. So, we drop those columns.

In [399]:
# Remove the columns that are not used as model inputs.
columns_to_drop = [
    'ID', 'Age', 'Gender', 'Experience', 'ZIP Code', 'Family',
    'CCAvg', 'Mortgage', 'Home Ownership', 'Online',
]
df = df.drop(columns=columns_to_drop)

In [400]:
# List the columns that remain after the drop.
df.columns

Index(['Income', 'Education', 'Personal Loan', 'Securities Account',
       'CD Account', 'CreditCard'],
      dtype='object')

Now, we got proper data to train and build machine learning model.

In [401]:
# Export the cleaned data for the modelling stage.
# `index` expects a boolean; the original string 'True' only worked because any
# non-empty string is truthy (so 'False' would have written the index as well).
# The index is still written and appears as an 'Unnamed: 0' column when the file
# is read back; the modelling section drops that column after loading.

df.to_csv('Bank_loan_original_data.csv',index=True)

# Implementing Machine Learning Model

In [413]:
#importing required libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn import tree
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [414]:
# Load the cleaned dataset exported at the end of the EDA / feature-engineering stage.

df_original=pd.read_csv("Bank_loan_original_data.csv")

In [415]:
# Show the first 10 rows of the reloaded data.

df_original.head(10)

Unnamed: 0.1,Unnamed: 0,Income,Education,Personal Loan,Securities Account,CD Account,CreditCard
0,0,180,3,1.0,0,0,0
1,1,130,3,1.0,0,0,0
2,2,193,3,1.0,0,0,0
3,3,144,2,1.0,0,1,1
4,4,144,3,1.0,1,1,0
5,5,132,2,,0,0,0
6,6,194,3,1.0,1,1,1
7,7,190,3,1.0,0,0,0
8,8,131,3,1.0,0,0,0
9,9,135,2,1.0,0,1,1


In [416]:
# Drop the index column that to_csv wrote and read_csv loaded back as 'Unnamed: 0'.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.

df_original = df_original.drop(columns=['Unnamed: 0'], errors='ignore')

In [417]:
# Display the frame again to confirm the 'Unnamed: 0' column is gone.

df_original

Unnamed: 0,Income,Education,Personal Loan,Securities Account,CD Account,CreditCard
0,180,3,1,0,0,0
1,130,3,1,0,0,0
2,193,3,1,0,0,0
3,144,2,1,0,1,1
4,144,3,1,1,1,0
...,...,...,...,...,...,...
475,129,3,1,0,1,1
476,144,2,1,0,1,1
477,144,2,1,0,0,0
478,144,3,1,0,1,1


## Now, we start selecting the columns used to build the ML model.

In [418]:
# Split the frame into model inputs and the prediction target.

feature_columns = ['Income', 'Education', 'Securities Account', 'CD Account', 'CreditCard']
x = df_original[feature_columns]  # predictor variables the models are trained on
y = df_original['Personal Loan']  # target variable the models predict


In [419]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# random_state=42 fixes the shuffle so the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic Regression

In [421]:
# Build a logistic regression classifier with default settings and fit it
# on the training split (fit returns the fitted estimator itself).
model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out test split and compare against the true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the accuracy as a percentage.
print("Accuracy:", accuracy*100)

Accuracy: 100.0


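## Here, we got an accuracy of 100% on the test set, which suggests that the model predicts the held-out rows perfectly. Note that a perfect score should be treated with caution: every `Personal Loan` value in the rows displayed above is 1, so it is worth checking the class balance (e.g. `y.value_counts()`) before concluding that the model gives the best possible results.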

In [429]:
# Spot-check the logistic regression model on one hand-written customer.
# Feature order: [income, education, securities account, cd account, credit card]

sample_customer = [[130, 3, 0, 1, 1]]
y_pred = model.predict(sample_customer)
y_pred[0]

'1'

### Here, we got 1 as a result, which means 'YES' for loan acceptance. The model predicts that a customer with the above income, education, securities account, CD account and credit card values accepted the personal loan offered in the last campaign.

In [431]:
# Persist the trained logistic regression model so it can be reloaded later for prediction.
# The `with` block guarantees the file is closed even if pickle.dump raises.
# Only unpickle this file from a trusted location: pickle.load can execute arbitrary code.

import pickle
with open('LogisticRegression.pkl', 'wb') as LR_Model_pkl:
    pickle.dump(model, LR_Model_pkl)

## DecisionTree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree limited to depth 5; random_state makes the fit reproducible.
DecisionTree = DecisionTreeClassifier(criterion="entropy",random_state=2,max_depth=5)

DecisionTree.fit(X_train,y_train)
# Evaluate the decision tree itself. The original called model.predict here, which
# scored the logistic regression model from the previous section a second time.
y_pred = DecisionTree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

Accuracy: 100.0


Here also, we got an accuracy of 100%, which is the best possible accuracy and suggests that the model is working fine.


In [433]:
# Spot-check the decision tree on one hand-written customer.
# Feature order: [income, education, securities account, cd account, credit card]
# Queries DecisionTree rather than `model` (the logistic regression), which the
# original cell used by mistake.

y_pred=DecisionTree.predict([[167,1,1,0,0]])
y_pred[0]

'1'

### Here, we got 1 as a result, which means 'YES' for loan acceptance. The model predicts that a customer with the above income, education, securities account, CD account and credit card values accepted the personal loan offered in the last campaign.

In [434]:
# Persist the trained decision tree so it can be reloaded later for prediction.
# The `with` block guarantees the file is closed even if pickle.dump raises.
# Only unpickle this file from a trusted location: pickle.load can execute arbitrary code.

import pickle
with open('DecisionTree.pkl', 'wb') as decision_tree_file:
    pickle.dump(DecisionTree, decision_tree_file)

# GaussianNB

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier with default settings.
NaiveBayes = GaussianNB()

NaiveBayes.fit(X_train,y_train)
# Evaluate the Naive Bayes model itself. The original called model.predict here,
# which scored the logistic regression model instead of NaiveBayes.
y_pred = NaiveBayes.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

Accuracy: 100.0


Here also, we got an accuracy of 100%, which is the best possible accuracy and suggests that the model is working fine.


In [435]:
# Spot-check the Naive Bayes model on one hand-written customer.
# Feature order: [income, education, securities account, cd account, credit card]
# Queries NaiveBayes rather than `model` (the logistic regression), which the
# original cell used by mistake.

y_pred=NaiveBayes.predict([[96,1,0,0,0]])
y_pred[0]

'1'

Here, we got 1 as a result, which means 'YES' for loan acceptance. The model predicts that a customer with the above income, education, securities account, CD account and credit card values accepted the personal loan offered in the last campaign.

In [436]:
# Persist the trained Naive Bayes model so it can be reloaded later for prediction.
# The `with` block guarantees the file is closed even if pickle.dump raises.
# Only unpickle this file from a trusted location: pickle.load can execute arbitrary code.

import pickle
with open('GaussianNB.pkl', 'wb') as naive_bayes_file:
    pickle.dump(NaiveBayes, naive_bayes_file)

# With this, I have built machine learning models to predict whether or not a customer accepts the personal loan offered in the last campaign. Thank you.