# Bank Loan Classification

In [1]:
#importing necessary libraries

import pandas as pd

In [2]:
# Load the raw bank-loan records from the CSV in the working directory
RAW_DATA_FILE = "Bank_loan_data.csv"
df = pd.read_csv(RAW_DATA_FILE)

In [3]:
# Preview the first rows to check the column names and value formats

df.head()

Unnamed: 0,ID,Age,Gender,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Home Ownership,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,10,34,M,9,180.0,93023,1,8.9,3,0,Home Owner,1,0,0,0.0,0
1,17,38,M,14,130.0,95010,4,4.7,3,134,Rent,1,0,0,0.0,0
2,19,46,M,21,193.0,91604,2,8.1,3,0,Rent,1,0,0,0.0,0
3,30,38,M,13,,94104,1,3.3,2,0,Rent,1,0,1,1.0,1
4,39,42,M,18,,94114,3,5.0,3,0,Rent,1,1,1,1.0,0


### Some columns contain missing values (for example `Income` in the rows above), which would lead to a weak machine learning model. So we will fill those values using a statistic suited to each column.

## Performing some EDA

In [4]:
# Boolean mask of the whole frame: True wherever a value is missing

df.isnull()

Unnamed: 0,ID,Age,Gender,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Home Ownership,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
476,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
477,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False
478,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False


`True` represents a missing value, while `False` represents a value that is present.

In [5]:
# Count the missing values in each column

df.isnull().sum()

ID                     0
Age                    0
Gender                 0
Experience             0
Income                67
ZIP Code               0
Family                 0
CCAvg                  0
Education              0
Mortgage               0
Home Ownership         0
Personal Loan          0
Securities Account     0
CD Account             0
Online                40
CreditCard             0
dtype: int64

We found 67 null values in the `Income` column and 40 null values in the `Online` (internet banking) column.

### To fill missing values, we use the mean, median or mode, according to the kind of data in the column.

In [6]:
# Impute missing Income values with the column mean (Income is a numeric variable)
income_mean = df['Income'].mean()
df['Income'] = df['Income'].fillna(income_mean)


In [7]:
# Verify that no missing values remain in the Income column

df['Income'].isnull().sum()

0

Now there are no missing values left in the `Income` column.

# Performing feature engineering

In [8]:
# Check the current dtype of Income before casting it

df['Income'].dtype

dtype('float64')

In [9]:
# Cast Income from float64 to an integer dtype; astype drops any fractional part
df['Income'] = df['Income'].astype(int)

In [10]:
# Confirm the cast: Income should now report an integer dtype
df['Income'].dtypes

dtype('int64')

In [11]:
# Impute missing Online (internet banking) values with the most frequent value
most_common_online = df['Online'].mode()[0]
df['Online'] = df['Online'].fillna(most_common_online)

In [12]:
# Verify that no missing values remain in the Online (internet banking) column

df['Online'].isnull().sum()

0

## changing float values into int 

In [13]:
# Check the current dtype of Online before casting it
df['Online'].dtypes

dtype('float64')

In [14]:
# Online is stored as float after imputation; cast it to an integer dtype
df['Online'] = df['Online'].astype(int)

In [15]:
# Confirm the cast: Online should now report an integer dtype
df['Online'].dtypes

dtype('int64')

In [16]:
# Check the current dtype of CCAvg before casting it
df['CCAvg'].dtypes

dtype('float64')

In [17]:
# Cast CCAvg to an integer dtype. astype truncates the decimals (e.g. 8.9 becomes 8),
# so the fractional part of CCAvg is lost from here on.
df['CCAvg'] = df['CCAvg'].astype(int)

In [18]:
# Confirm the cast: CCAvg should now report an integer dtype
df['CCAvg'].dtypes

dtype('int64')

### Now there are no missing values in the `Income` and `Online` (internet banking) columns, and the `Income`, `CCAvg` and `Online` columns all have an integer data type.

## Columns such as "ID" and "ZIP Code" do not play an important role in building an ML model for loan acceptance, so we remove them. (The cell below also drops "Mortgage".)


In [19]:
# List the columns currently in the frame before dropping any
df.columns

Index(['ID', 'Age', 'Gender', 'Experience', 'Income', 'ZIP Code', 'Family',
       'CCAvg', 'Education', 'Mortgage', 'Home Ownership', 'Personal Loan',
       'Securities Account', 'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [20]:
# Remove the columns that will not be used as model features
columns_to_drop = ['ID', 'ZIP Code', 'Mortgage']
df = df.drop(columns=columns_to_drop)

In [21]:
# Confirm that ID, ZIP Code and Mortgage are no longer in the frame

df.columns

Index(['Age', 'Gender', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Home Ownership', 'Personal Loan', 'Securities Account', 'CD Account',
       'Online', 'CreditCard'],
      dtype='object')

## Encoding the string columns as integer codes (this is label/ordinal encoding rather than one-hot encoding: each category is mapped to a single number)

In [22]:
# Encode Gender as integer codes: M -> 3, F -> 2, O -> 1
gender_map = {
    'M': 3,
    'F': 2,
    'O': 1,
}
df['Gender'] = df['Gender'].map(gender_map)

3 represents male, 2 represents female and 1 represents others.

In [23]:
# Encode Home Ownership as integer codes.
# The keys must match the raw strings exactly; the 'Home Mortage' spelling
# presumably mirrors the CSV values - verify against the data before "fixing" it.
home_map = {
    'Home Owner': 3,
    'Home Mortage': 2,
    'Rent': 1,
}
df['Home Ownership'] = df['Home Ownership'].map(home_map)

3 represents `Home Owner`, 2 represents `Home Mortage` and 1 represents `Rent`.

In [24]:
# Check for nulls again after encoding (Series.map yields NaN for any value missing from the mapping)

df.isnull().sum()

Age                   0
Gender                0
Experience            0
Income                0
Family                0
CCAvg                 0
Education             0
Home Ownership        0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

# Detecting outliers present in the columns and removing them

In [25]:
# Summary statistics per column, used to spot implausible values (e.g. the min and max of Age)
df.describe()

Unnamed: 0,Age,Gender,Experience,Income,Family,CCAvg,Education,Home Ownership,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0,480.0
mean,52.04375,2.56875,19.84375,144.10625,2.6125,3.491667,2.233333,2.03125,0.997917,0.125,0.291667,0.641667,0.297917
std,74.024316,0.619332,11.582443,30.321987,1.115393,2.12155,0.753373,0.7375,0.045644,0.331064,0.455004,0.480011,0.45782
min,0.0,1.0,0.0,8.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,34.0,2.0,9.0,124.0,2.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0
50%,45.0,3.0,20.0,144.0,3.0,3.0,2.0,2.0,1.0,0.0,0.0,1.0,0.0
75%,55.0,3.0,30.0,169.25,4.0,5.0,3.0,3.0,1.0,0.0,1.0,1.0,1.0
max,978.0,3.0,41.0,203.0,4.0,10.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0


In [26]:
# Keep only customers aged 18 to 65 (both ends inclusive); ages outside that range are treated as outliers
age_in_range = df['Age'].between(18, 65)
df = df[age_in_range]

In [27]:
# Keep only rows with 1 to 10 years of experience (both ends inclusive).
# NOTE(review): describe() reports a median Experience of 20, so this filter removes
# most of the rows - confirm that such a narrow range is intended.
experience_in_range = df['Experience'].between(1, 10)
df = df[experience_in_range]

In [28]:
# Keep only rows whose CCAvg lies between 1 and 8 (both ends inclusive)
ccavg_in_range = df['CCAvg'].between(1, 8)
df = df[ccavg_in_range]

In [29]:
# Inspect the frame after the three range filters (the original row labels are kept)
df

Unnamed: 0,Age,Gender,Experience,Income,Family,CCAvg,Education,Home Ownership,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,34,3,9,180,1,8,3,3,1,0,0,0,0
5,32,3,7,132,4,1,2,2,0,0,0,1,0
9,31,3,7,135,4,3,2,1,1,0,1,1,1
16,29,3,3,148,3,4,1,1,1,0,0,1,0
35,26,3,2,60,2,3,1,3,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
468,29,2,4,184,4,2,3,3,1,0,0,1,0
471,35,2,10,135,3,4,2,2,1,0,0,1,0
472,34,2,8,144,1,7,3,2,1,0,0,0,0
477,28,2,4,144,2,1,2,2,1,0,0,1,0


## Now we have cleaned data that can be used to train and build a machine learning model.

In [30]:
# Export the cleaned data for the modelling section.
# `index` expects a boolean; the previous string 'True' only worked because any
# non-empty string is truthy. The row index is still written, so the file layout is
# unchanged (it comes back as the 'Unnamed: 0' column when the CSV is read again).
df.to_csv('finaldata.csv', index=True)

In [31]:
# Display the frame that was just exported
df

Unnamed: 0,Age,Gender,Experience,Income,Family,CCAvg,Education,Home Ownership,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,34,3,9,180,1,8,3,3,1,0,0,0,0
5,32,3,7,132,4,1,2,2,0,0,0,1,0
9,31,3,7,135,4,3,2,1,1,0,1,1,1
16,29,3,3,148,3,4,1,1,1,0,0,1,0
35,26,3,2,60,2,3,1,3,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
468,29,2,4,184,4,2,3,3,1,0,0,1,0
471,35,2,10,135,3,4,2,2,1,0,0,1,0
472,34,2,8,144,1,7,3,2,1,0,0,0,0
477,28,2,4,144,2,1,2,2,1,0,0,1,0


# Implementing Machine Learning Model

In [32]:
#importing required libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn import tree
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

In [33]:
# Load the cleaned dataset written by the EDA / feature-engineering section
CLEAN_DATA_FILE = "finaldata.csv"
df_original = pd.read_csv(CLEAN_DATA_FILE)

In [34]:
# Show the first 10 rows of the reloaded data

df_original.head(10)

Unnamed: 0.1,Unnamed: 0,Age,Gender,Experience,Income,Family,CCAvg,Education,Home Ownership,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,0,34,3,9,180,1,8,3,3,1,0,0,0,0
1,5,32,3,7,132,4,1,2,2,0,0,0,1,0
2,9,31,3,7,135,4,3,2,1,1,0,1,1,1
3,16,29,3,3,148,3,4,1,1,1,0,0,1,0
4,35,26,3,2,60,2,3,1,3,1,0,0,0,0
5,36,28,3,4,155,4,5,2,3,1,0,0,1,0
6,40,36,3,10,179,3,6,1,1,1,0,0,1,0
7,41,28,3,3,115,4,3,2,1,1,0,0,1,0
8,44,29,3,4,183,3,8,3,2,1,0,0,1,0
9,57,33,3,8,115,4,2,2,3,1,0,0,1,0


In [35]:
# 'Unnamed: 0' is the old row index that was saved into the CSV; it is not a feature
df_original = df_original.drop(columns=['Unnamed: 0'])

In [36]:
# Check the data again now that the saved index column has been removed

df_original

Unnamed: 0,Age,Gender,Experience,Income,Family,CCAvg,Education,Home Ownership,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,34,3,9,180,1,8,3,3,1,0,0,0,0
1,32,3,7,132,4,1,2,2,0,0,0,1,0
2,31,3,7,135,4,3,2,1,1,0,1,1,1
3,29,3,3,148,3,4,1,1,1,0,0,1,0
4,26,3,2,60,2,3,1,3,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,29,2,4,184,4,2,3,3,1,0,0,1,0
107,35,2,10,135,3,4,2,2,1,0,0,1,0
108,34,2,8,144,1,7,3,2,1,0,0,0,0
109,28,2,4,144,2,1,2,2,1,0,0,1,0


## Now,we start selecting columns to build ML model.

In [37]:
# List the columns available for feature selection
df_original.columns

Index(['Age', 'Gender', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Home Ownership', 'Personal Loan', 'Securities Account', 'CD Account',
       'Online', 'CreditCard'],
      dtype='object')

In [38]:
# Feature columns used as model inputs (every column except the target)
feature_columns = [
    'Age', 'Gender', 'Experience', 'Income', 'Family', 'CCAvg',
    'Education', 'Home Ownership', 'Securities Account', 'CD Account',
    'Online', 'CreditCard',
]
x = df_original[feature_columns]

# Target variable that the models will predict
y = df_original['Personal Loan']


In [39]:
# Hold out 20% of the rows for testing and train on the other 80%;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic Regression

In [40]:
# Build and train the logistic-regression classifier (fit returns the estimator itself)
model = LogisticRegression().fit(X_train, y_train)

# Predict the held-out rows and report accuracy as a percentage
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy * 100)

Accuracy: 100.0


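## Here we got 100% accuracy on the test set. Interpret this with caution: in the `df.describe()` output above the mean of `Personal Loan` is about 0.998, so almost every row is a 1, and a model that always predicts 1 would score nearly the same. Accuracy alone therefore does not show that the model has learned to separate the two classes.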

In [41]:
# Spot-check the logistic-regression model on one hand-written customer.
# The 12 values follow the training column order:
# ['Age','Gender','Experience','Income','Family','CCAvg','Education','Home Ownership',
#  'Securities Account','CD Account','Online','CreditCard']
# ('Mortgage' was dropped during cleaning and is not a model feature.)
# Wrapping the row in a DataFrame with the training columns makes the feature order explicit.
sample = pd.DataFrame([[38,3,7,132,4,1,2,2,0,0,1,0]], columns=X_train.columns)
y_pred = model.predict(sample)
y_pred[0]

1

### Here we got 1 as the result, which means 'YES' for loan acceptance: the model predicts that a customer with the input values above accepted the personal loan offered in the last campaign.

In [42]:
# Save the trained logistic-regression model to a pickle file so it can be reloaded
# and used for prediction later. The `with` block closes the file even if pickling raises.
import pickle

with open('LogisticRegression.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

## DecisionTree

In [43]:
from sklearn.tree import DecisionTreeClassifier

# Train a depth-limited decision tree that uses entropy as the split criterion
DecisionTree = DecisionTreeClassifier(criterion="entropy",random_state=2,max_depth=5)
DecisionTree.fit(X_train,y_train)

# Evaluate the decision tree itself. The previous code called model.predict here,
# which re-scored the logistic-regression model instead of this classifier.
y_pred = DecisionTree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

Accuracy: 100.0


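Here also we got 100% accuracy. Because almost every row has `Personal Loan` = 1 (mean of about 0.998 in the `df.describe()` output), a score this high is expected even from a trivial model, so it should not be read on its own as proof that the model works well.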


In [44]:
# Spot-check the decision tree on one hand-written customer.
# The 12 values follow the training column order:
# ['Age','Gender','Experience','Income','Family','CCAvg','Education','Home Ownership',
#  'Securities Account','CD Account','Online','CreditCard']
# ('Mortgage' was dropped during cleaning and is not a model feature.)
# Predict with DecisionTree; the previous code queried `model` (the logistic regression).
sample = pd.DataFrame([[47,3,21,144,4,3,3,2,0,0,0,0]], columns=X_train.columns)
y_pred = DecisionTree.predict(sample)
y_pred[0]

1

### Here we got 1 as the result, which means 'YES' for loan acceptance: the model predicts that a customer with the input values above accepted the personal loan offered in the last campaign.

In [45]:
# Save the trained decision tree to a pickle file so it can be reloaded and used
# for prediction later. The `with` block closes the file even if pickling raises.
import pickle

with open('DecisionTree.pkl', 'wb') as model_file:
    pickle.dump(DecisionTree, model_file)

# GaussianNB

In [46]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian naive Bayes classifier on the training split
NaiveBayes = GaussianNB()
NaiveBayes.fit(X_train,y_train)

# Evaluate the naive Bayes classifier itself. The previous code called model.predict here,
# which re-scored the logistic-regression model instead of this classifier.
y_pred = NaiveBayes.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy*100)

Accuracy: 100.0


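Here also we got 100% accuracy. Because almost every row has `Personal Loan` = 1 (mean of about 0.998 in the `df.describe()` output), a score this high is expected even from a trivial model, so it should not be read on its own as proof that the model works well.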


In [47]:
# Spot-check the naive Bayes classifier on one hand-written customer.
# The 12 values follow the training column order:
# ['Age','Gender','Experience','Income','Family','CCAvg','Education','Home Ownership',
#  'Securities Account','CD Account','Online','CreditCard']
# ('Mortgage' was dropped during cleaning and is not a model feature.)
# Predict with NaiveBayes; the previous code queried `model` (the logistic regression).
sample = pd.DataFrame([[34,3,9,123,1,1,2,1,0,0,1,0]], columns=X_train.columns)
y_pred = NaiveBayes.predict(sample)
y_pred[0]

1

## Here we got 1 as the result, which means 'YES' for loan acceptance: the model predicts that a customer with the input values above accepted the personal loan offered in the last campaign.

In [48]:
# Save the trained naive Bayes model to a pickle file so it can be reloaded and used
# for prediction later. The `with` block closes the file even if pickling raises.
import pickle

with open('GaussianNB.pkl', 'wb') as model_file:
    pickle.dump(NaiveBayes, model_file)

# With this, I have built machine learning models to predict whether a customer accepts the personal loan offered in the last campaign. Thank you.