# Logistic Regression of bank data

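Logistic regression example using the bank data set: we fit a model that predicts the binary outcome `y` and then evaluate it on a separate test set.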

### Import the relevant libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to all subsequent matplotlib/seaborn plots.
sns.set()

### Load the data
Load the 'Bank-data.csv' dataset.

In [2]:
# Read the training set; the path is relative to the current working directory.
raw_data = pd.read_csv(filepath_or_buffer='Bank-data.csv')

# Preview the first five rows.
raw_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,3,4.12,0.0,0.0,0.0,0.0,686.0,yes
4,4,4.856,0.0,1.0,0.0,0.0,157.0,no


In [3]:
# Build the modelling frame in one pass from `raw_data` (which is left
# untouched, so this cell can be re-run safely):
#   * drop 'Unnamed: 0' -- it looks like a row index that was saved into the
#     CSV (it mirrors the index in the preview above; confirm against the file)
#   * encode the target as 1 ('yes') / 0 ('no')
data = (
    raw_data
    .drop(columns='Unnamed: 0')
    .assign(y=lambda frame: frame['y'].map({'yes': 1, 'no': 0}))
)

# Series.map turns every label that is missing from the dict into NaN without
# complaint; fail loudly here instead of handing NaN targets to the model.
assert data['y'].notna().all(), "column 'y' contains labels other than 'yes'/'no'"

data.head()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.334,0.0,1.0,0.0,0.0,117.0,0
1,0.767,0.0,0.0,2.0,1.0,274.0,1
2,4.858,0.0,1.0,0.0,0.0,167.0,0
3,4.12,0.0,0.0,0.0,0.0,686.0,1
4,4.856,0.0,1.0,0.0,0.0,157.0,0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric
# column -- a quick sanity check of the value ranges before modelling.
data.describe()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
count,518.0,518.0,518.0,518.0,518.0,518.0,518.0
mean,2.835776,0.034749,0.266409,0.388031,0.127413,382.177606,0.5
std,1.876903,0.183321,0.442508,0.814527,0.333758,344.29599,0.500483
min,0.635,0.0,0.0,0.0,0.0,9.0,0.0
25%,1.04275,0.0,0.0,0.0,0.0,155.0,0.0
50%,1.466,0.0,0.0,0.0,0.0,266.5,0.5
75%,4.9565,0.0,1.0,0.0,0.0,482.75,1.0
max,4.97,1.0,1.0,5.0,1.0,2653.0,1.0


### Declare the dependent and independent variables

In [5]:
# Dependent variable: the 0/1 encoded outcome.
y = data.loc[:, 'y']

# Independent variables fed to the model ('may' is left out).
feature_names = ['interest_rate', 'credit', 'march', 'previous', 'duration']
x1 = data[feature_names]

### Perform Logistic Regression

In [6]:
# Prepend an intercept column to the predictors.
x = sm.add_constant(x1, prepend=True)

# Set up the logit model (target first, design matrix second) and fit it;
# fit() prints the optimizer's convergence report.
reg_log = sm.Logit(endog=y, exog=x)
results_log = reg_log.fit()

# Coefficient table and goodness-of-fit statistics.
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.336664
         Iterations 7


0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,512.0
Method:,MLE,Df Model:,5.0
Date:,"Sun, 12 Mar 2023",Pseudo R-squ.:,0.5143
Time:,14:03:50,Log-Likelihood:,-174.39
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,1.211e-77

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0211,0.311,-0.068,0.946,-0.631,0.589
interest_rate,-0.8001,0.089,-8.943,0.000,-0.975,-0.625
credit,2.3585,1.088,2.169,0.030,0.227,4.490
march,-1.8322,0.330,-5.556,0.000,-2.478,-1.186
previous,1.5363,0.501,3.067,0.002,0.554,2.518
duration,0.0070,0.001,9.381,0.000,0.006,0.008


### Confusion Matrix

In [7]:
# Predicted probabilities for the training observations.
pred_values = results_log.predict(x)

# Bin edges shared by both axes: [0, 0.5) counts as class 0, [0.5, 1] as class 1.
bins = np.array([0, 0.5, 1])

# A 2-D histogram of (actual, predicted) over those bins is the 2x2 confusion
# matrix: rows are the actual class, columns the predicted class.
cm = np.histogram2d(y, pred_values, bins=bins)[0]

# Wrap the counts in a labelled frame for display.
cm_df = pd.DataFrame(
    cm,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,218.0,41.0
Actual 1,30.0,229.0


In [8]:
# Accuracy = correctly classified observations (the diagonal of the confusion
# matrix) divided by the total number of observations.
accuracy = np.trace(cm) / cm.sum()
accuracy

0.862934362934363

## Test the model
We test the model with the test data located in 'Bank-data-testing.csv'

### Load test data

In [10]:
# Read the test set; the path is relative to the current working directory.
test_data_raw = pd.read_csv(filepath_or_buffer='Bank-data-testing.csv')

# Preview the first five rows.
test_data_raw.head(n=5)

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.313,0.0,1.0,0.0,0.0,487.0,no
1,1,4.961,0.0,0.0,0.0,0.0,132.0,no
2,2,4.856,0.0,1.0,0.0,0.0,92.0,no
3,3,4.12,0.0,0.0,0.0,0.0,1468.0,yes
4,4,4.963,0.0,0.0,0.0,0.0,36.0,no


In [11]:
# Actual outcomes of the test set, encoded like the training target
# (1 = 'yes', 0 = 'no').
test_actual = test_data_raw['y'].map({'yes': 1, 'no': 0})

# Keep only the predictors the model was trained on: remove the target, the
# 'Unnamed: 0' column and 'may' (which is not part of the training features).
test_data = test_data_raw.drop(['y', 'Unnamed: 0', 'may'], axis=1)

# Force the intercept column to be added. With the default
# has_constant='skip', add_constant returns the data unchanged when it already
# contains a constant column -- which can happen by chance in a small test set
# -- and the model would then be fed a design matrix without an intercept.
test_data = sm.add_constant(test_data, has_constant='add')

# predict() relies on the columns being in the same order as during training,
# so reorder explicitly to the training design matrix `x` instead of relying
# on the column order of the CSV file (raises KeyError if a column is missing).
test_data = test_data[x.columns]

test_data.head()

Unnamed: 0,const,interest_rate,credit,march,previous,duration
0,1.0,1.313,0.0,1.0,0.0,487.0
1,1.0,4.961,0.0,0.0,0.0,132.0
2,1.0,4.856,0.0,1.0,0.0,92.0
3,1.0,4.12,0.0,0.0,0.0,1468.0
4,1.0,4.963,0.0,0.0,0.0,36.0


### Evaluate test data on the model

In [12]:
# Predicted probabilities for the test observations.
test_predicted = results_log.predict(test_data)

# Same construction as for the training data: a 2-D histogram of
# (actual, predicted) over the bin edges in `bins` gives the 2x2 confusion
# matrix, with rows = actual class and columns = predicted class.
cm_test = np.histogram2d(test_actual, test_predicted, bins=bins)[0]

# Wrap the counts in a labelled frame for display.
cm_test_df = pd.DataFrame(
    cm_test,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_test_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,93.0,18.0
Actual 1,13.0,98.0


In [13]:
# The off-diagonal cells of each confusion matrix are the wrong predictions.
test_wrong = cm_test[0, 1] + cm_test[1, 0]
train_wrong = cm[0, 1] + cm[1, 0]

# Share of wrong predictions on each data set.
print('Missclassification rate test data: ', test_wrong / cm_test.sum(), sep='')
print('Missclassification rate training data: ', train_wrong / cm.sum(), sep='')

Missclassification rate test data: 0.13963963963963963
Missclassification rate training data: 0.13706563706563707


The misclassification rate on the test data (≈ 0.140) is only slightly higher than on the training data (≈ 0.137), which indicates that the model generalizes well and does not overfit the training data.