# Bank marketing analysis

## Importing necessary libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()

from scipy import stats
# Define scipy.stats.chisqprob as the chi-square survival function (chi2.sf).
# NOTE(review): presumably a compatibility shim for statsmodels releases that still
# call this removed scipy function when building summaries -- confirm it is still needed.
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

## Load the data

In [4]:
# Load the dataset the model is fitted on; the raw frame is only copied, never modified, below.
raw_data = pd.read_csv('Bank-data.csv')

### Cleaning the data

In [5]:
# Work on a copy so raw_data stays untouched and this cell can be re-run safely.
data = raw_data.copy()
# 'Unnamed: 0' is presumably an index column saved into the CSV; it is not used as a feature.
data = data.drop(columns=['Unnamed: 0'])
# Encode the target as 1/0. Series.map turns every label missing from the dict into NaN,
# so fail loudly here instead of silently handing NaN targets to the model.
data['y'] = data['y'].map({'yes': 1, 'no': 0})
assert data['y'].notna().all(), "column 'y' contains labels other than 'yes'/'no'"
data.head()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.334,0.0,1.0,0.0,0.0,117.0,0
1,0.767,0.0,0.0,2.0,1.0,274.0,1
2,4.858,0.0,1.0,0.0,0.0,167.0,0
3,4.12,0.0,0.0,0.0,0.0,686.0,1
4,4.856,0.0,1.0,0.0,0.0,157.0,0


In [15]:
# Summary statistics for every column of the cleaned data (y is already encoded as 0/1).
data.describe()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
count,518.0,518.0,518.0,518.0,518.0,518.0,518.0
mean,2.835776,0.034749,0.266409,0.388031,0.127413,382.177606,0.5
std,1.876903,0.183321,0.442508,0.814527,0.333758,344.29599,0.500483
min,0.635,0.0,0.0,0.0,0.0,9.0,0.0
25%,1.04275,0.0,0.0,0.0,0.0,155.0,0.0
50%,1.466,0.0,0.0,0.0,0.0,266.5,0.5
75%,4.9565,0.0,1.0,0.0,0.0,482.75,1.0
max,4.97,1.0,1.0,5.0,1.0,2653.0,1.0


## Declare the dependent and independent variables

In [52]:
# Predictors used in the model; the 'may' column is present in the data but is not included.
estimators = [
    'interest_rate',
    'credit',
    'march',
    'previous',
    'duration',
]
# Independent variables (without the intercept) and the 0/1 target.
x1_all = data.loc[:, estimators]
y = data.loc[:, 'y']

## Multiple logistic regression

In [64]:
# Logit does not add an intercept by itself, so add a 'const' column to the design matrix first.
x_all = sm.add_constant(data=x1_all)
# Build the model (target first, design matrix second), fit it by MLE and show the summary table.
reg_log = sm.Logit(endog=y, exog=x_all)
results_log = reg_log.fit()
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.336664
         Iterations 7


0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,512.0
Method:,MLE,Df Model:,5.0
Date:,"Fri, 10 Jan 2025",Pseudo R-squ.:,0.5143
Time:,14:19:00,Log-Likelihood:,-174.39
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,1.211e-77

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0211,0.311,-0.068,0.946,-0.631,0.589
interest_rate,-0.8001,0.089,-8.943,0.000,-0.975,-0.625
credit,2.3585,1.088,2.169,0.030,0.227,4.490
march,-1.8322,0.330,-5.556,0.000,-2.478,-1.186
previous,1.5363,0.501,3.067,0.002,0.554,2.518
duration,0.0070,0.001,9.381,0.000,0.006,0.008


## Train confusion matrix and show the accuracy

In [65]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Build a 2x2 confusion matrix for a binary classifier and compute its accuracy.

    Parameters
    ----------
    data : array-like
        Inputs passed unchanged to ``model.predict``.
    actual_values : array-like
        True class labels, expected to be 0 or 1.
    model : object
        Anything exposing ``predict(data)`` that returns values in [0, 1]
        (e.g. predicted probabilities).
    threshold : float, default 0.5
        Cut-off applied to the predictions: values ``>= threshold`` count as
        class 1, values below it as class 0. Must lie strictly between 0 and 1.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        ``cm[i, j]`` is the number of observations whose actual class is ``i``
        and whose predicted class is ``j``.
    accuracy : float
        Share of observations on the main diagonal of ``cm``.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.

    Notes
    -----
    Actual or predicted values outside [0, 1] fall outside the histogram bins
    and are therefore not counted.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    pred_values = model.predict(data)
    # Actual labels are always split at 0.5 (0 -> first bin, 1 -> second bin);
    # only the predictions are split at the configurable threshold.
    actual_bins = np.array([0, 0.5, 1])
    pred_bins = np.array([0, threshold, 1])
    # First histogram axis = actual class (rows), second = predicted class (columns).
    cm = np.histogram2d(actual_values, pred_values, bins=[actual_bins, pred_bins])[0]
    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()
    return cm, accuracy

In [66]:
# Confusion matrix and accuracy on the same data the model was fitted on (in-sample).
confusion_matrix(x_all,y,results_log)

(array([[218.,  41.],
        [ 30., 229.]]),
 0.862934362934363)

## Test the model.

### Load new data

In [67]:
# Read the test file; the raw frame is kept as loaded.
raw_data2 = pd.read_csv('Bank-data-testing.csv')
# Same clean-up as for the training data: copy, then discard the 'Unnamed: 0' column.
data2 = raw_data2.copy().drop(columns=['Unnamed: 0'])
data2.head()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.313,0.0,1.0,0.0,0.0,487.0,no
1,4.961,0.0,0.0,0.0,0.0,132.0,no
2,4.856,0.0,1.0,0.0,0.0,92.0,no
3,4.12,0.0,0.0,0.0,0.0,1468.0,yes
4,4.963,0.0,0.0,0.0,0.0,36.0,no


In [68]:
# Encode the target from the untouched raw frame rather than from data2 itself:
# mapping data2['y'] onto itself would, on a second run of this cell, feed the
# already-encoded 0/1 values into the dict and turn the whole column into NaN.
# raw_data2 and data2 share the same row index, so the assignment aligns row by row.
data2['y'] = raw_data2['y'].map({'yes': 1, 'no': 0})
data2.head()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.313,0.0,1.0,0.0,0.0,487.0,0
1,4.961,0.0,0.0,0.0,0.0,132.0,0
2,4.856,0.0,1.0,0.0,0.0,92.0,0
3,4.12,0.0,0.0,0.0,0.0,1468.0,1
4,4.963,0.0,0.0,0.0,0.0,36.0,0


In [69]:
# Summary statistics of the test set, for comparison with the training data above.
data2.describe()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
count,222.0,222.0,222.0,222.0,222.0,222.0,222.0
mean,2.922095,0.031532,0.274775,0.346847,0.099099,398.86036,0.5
std,1.891766,0.175144,0.44741,0.75595,0.29947,410.565798,0.50113
min,0.639,0.0,0.0,0.0,0.0,6.0,0.0
25%,1.04925,0.0,0.0,0.0,0.0,144.75,0.0
50%,1.714,0.0,0.0,0.0,0.0,255.5,0.5
75%,4.96,0.0,1.0,0.0,0.0,525.25,1.0
max,4.968,1.0,1.0,4.0,1.0,3643.0,1.0


## Declaring the dependent and independent variables

In [70]:
# Target and predictors of the test set, using the same predictor list as for training.
y_test = data2['y']
x1_test = data2[estimators]
# has_constant='add' always adds the intercept column. With the default ('skip'),
# add_constant adds nothing when some test column happens to be constant, and the
# design matrix would then no longer match the columns the model was fitted on.
x_test = sm.add_constant(x1_test, has_constant='add')

## Test confusion matrix and accuracy, compared with the training results to assess the model

In [72]:
# Confusion matrix and accuracy on the separate test file, using the model fitted on the training data.
confusion_matrix(x_test, y_test, results_log)

(array([[93., 18.],
        [13., 98.]]),
 0.8603603603603603)

Looking at the test accuracy, we see a number that is a tiny bit lower: 86.04%, compared to 86.29% for the training accuracy.