In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import scale
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
bank = pd.read_csv('bank-full.csv')
bank

Unnamed: 0,job,marital,default,housing,loan,poutcome,y,age,education,balance,contact,day,month,duration,campaign,pdays,previous
0,management,married,no,yes,no,unknown,no,58,tertiary,2143,unknown,5,may,261,1,-1,0
1,technician,single,no,yes,no,unknown,no,44,secondary,29,unknown,5,may,151,1,-1,0
2,entrepreneur,married,no,yes,yes,unknown,no,33,secondary,2,unknown,5,may,76,1,-1,0
3,blue-collar,married,no,yes,no,unknown,no,47,unknown,1506,unknown,5,may,92,1,-1,0
4,unknown,single,no,no,no,unknown,no,33,unknown,1,unknown,5,may,198,1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,technician,married,no,no,no,unknown,yes,51,tertiary,825,cellular,17,nov,977,3,-1,0
45207,retired,divorced,no,no,no,unknown,yes,71,primary,1729,cellular,17,nov,456,2,-1,0
45208,retired,married,no,no,no,success,yes,72,secondary,5715,cellular,17,nov,1127,5,184,3
45209,blue-collar,married,no,no,no,unknown,no,57,secondary,668,telephone,17,nov,508,4,-1,0


In [3]:
bank.head()

Unnamed: 0,job,marital,default,housing,loan,poutcome,y,age,education,balance,contact,day,month,duration,campaign,pdays,previous
0,management,married,no,yes,no,unknown,no,58,tertiary,2143,unknown,5,may,261,1,-1,0
1,technician,single,no,yes,no,unknown,no,44,secondary,29,unknown,5,may,151,1,-1,0
2,entrepreneur,married,no,yes,yes,unknown,no,33,secondary,2,unknown,5,may,76,1,-1,0
3,blue-collar,married,no,yes,no,unknown,no,47,unknown,1506,unknown,5,may,92,1,-1,0
4,unknown,single,no,no,no,unknown,no,33,unknown,1,unknown,5,may,198,1,-1,0


In [4]:
# Droping columns which are not required

bank.drop(["age", "balance", "pdays", "campaign", "previous", "duration", "month", "education", "contact", "day"],inplace=True, axis = "columns")

In [5]:
bank

Unnamed: 0,job,marital,default,housing,loan,poutcome,y
0,management,married,no,yes,no,unknown,no
1,technician,single,no,yes,no,unknown,no
2,entrepreneur,married,no,yes,yes,unknown,no
3,blue-collar,married,no,yes,no,unknown,no
4,unknown,single,no,no,no,unknown,no
...,...,...,...,...,...,...,...
45206,technician,married,no,no,no,unknown,yes
45207,retired,divorced,no,no,no,unknown,yes
45208,retired,married,no,no,no,success,yes
45209,blue-collar,married,no,no,no,unknown,no


In [6]:
bank.head()

Unnamed: 0,job,marital,default,housing,loan,poutcome,y
0,management,married,no,yes,no,unknown,no
1,technician,single,no,yes,no,unknown,no
2,entrepreneur,married,no,yes,yes,unknown,no
3,blue-collar,married,no,yes,no,unknown,no
4,unknown,single,no,no,no,unknown,no


In [7]:
bank.shape

(45211, 7)

In [8]:
# Removing NA values in data set

bank = bank.dropna()
bank.shape

(45211, 7)

In [9]:
# Create one hot encoding of the categorical columns

bank = pd.get_dummies(bank, columns =['job', 'marital', 'default', 'housing', 'loan', 'poutcome'])

In [10]:
bank.head()

Unnamed: 0,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,...,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,no,0,0,0,0,1,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1
1,no,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1
2,no,0,0,1,0,0,0,0,0,0,...,1,0,0,1,0,1,0,0,0,1
3,no,0,1,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1
4,no,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,1


In [11]:
# List out column names
# For each possible value of the column we have created new column in the database with column name appended as a prefix

bank.columns

Index(['y', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'marital_divorced', 'marital_married', 'marital_single',
       'default_no', 'default_yes', 'housing_no', 'housing_yes', 'loan_no',
       'loan_yes', 'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown'],
      dtype='object')

In [12]:
# For each encoded field in original database, a list of columns is added in created database with all possible values that column takes in original database

bank

Unnamed: 0,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,...,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,no,0,0,0,0,1,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1
1,no,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1
2,no,0,0,1,0,0,0,0,0,0,...,1,0,0,1,0,1,0,0,0,1
3,no,0,1,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,1
4,no,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,yes,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,1
45207,yes,0,0,0,0,0,1,0,0,0,...,1,0,1,0,1,0,0,0,0,1
45208,yes,0,0,0,0,0,1,0,0,0,...,1,0,1,0,1,0,0,0,1,0
45209,no,0,1,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,1


In [13]:
# From above output => Let's consider first row => Customer has a job of 'management', also he has a 'housing' and no 'loan'

In [14]:
# Droping the columns with unknown values

bank.drop(bank.columns[[12, 25]], axis=1, inplace=True)

In [15]:
bank.columns

Index(['y', 'job_admin.', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'marital_divorced', 'marital_married', 'marital_single', 'default_no',
       'default_yes', 'housing_no', 'housing_yes', 'loan_no', 'loan_yes',
       'poutcome_failure', 'poutcome_other', 'poutcome_success'],
      dtype='object')

In [16]:
# Separate data into X and Y array
# X array contains data columns to be analyzed and Y array is a single dimensional array contains output of the prediction

X = bank.iloc[:,1:]
X.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,marital_single,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes,poutcome_failure,poutcome_other,poutcome_success
0,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,1,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,1,1,0,0,1,1,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,1,0,1,0,1,0,0,0,0


In [17]:
Y = bank.iloc[:,0]
Y.head()

0    no
1    no
2    no
3    no
4    no
Name: y, dtype: object

In [18]:
# X_train, Y_train for training the data and  X_test, Y_test for testing and validating data

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

In [19]:
# Building classifier

classifier = LogisticRegression(solver='lbfgs',random_state=0)

In [20]:
classifier.fit(X_train, Y_train)

LogisticRegression(random_state=0)

In [21]:
# Predicting test data

predicted_y = classifier.predict(X_test)
predicted_y

array(['no', 'no', 'no', ..., 'no', 'no', 'no'], dtype=object)

In [22]:
# Above output indicates => 'no' i.e customer has no potential to 'Deposit' 

In [28]:
print('Accuracy: {:.2f}'.format(classifier.score(X_test, Y_test)))

Accuracy: 0.89


In [29]:
# No further tuning is required as accuracy of model is 89%. Accuracy indicating high rate of success for potential customers in opening 'Deposit'