# BP:
Goal: predict whether a client has subscribed to a term deposit.

Attribute information for the bank dataset

   Input variables:
   # bank client data:
   
   1 - age (numeric)
   
   2 - job : type of job (categorical: "admin.","unknown","unemployed","management","housemaid","entrepreneur","student",
                                       "blue-collar","self-employed","retired","technician","services") 
                                       
   3 - marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)
   
   4 - education (categorical: "unknown","secondary","primary","tertiary")
   
   5 - default: has credit in default? (binary: "yes","no")
   
   6 - balance: average yearly balance, in euros (numeric) 
   
   7 - housing: has housing loan? (binary: "yes","no")
   
   8 - loan: has personal loan? (binary: "yes","no")
   
   # related with the last contact of the current campaign:
   
   9 - contact: contact communication type (categorical: "unknown","telephone","cellular") 
   
  10 - day: last contact day of the month (numeric)
  
  11 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
  
  12 - duration: last contact duration, in seconds (numeric)

   # other attributes:

  13 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
  
  14 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
  
  15 - previous: number of contacts performed before this campaign and for this client (numeric)
  
  16 - poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")

  Output variable (desired target):
  17 - y - has the client subscribed a term deposit? (binary: "yes","no")

Missing attribute values: none

In [121]:
# Importing modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [134]:
# Read the semicolon-separated bank marketing file into a DataFrame
data = pd.read_csv("bank.csv", sep=";")

In [135]:
# Dimensions of the loaded frame as (rows, columns)
data.shape

(45211, 17)

In [136]:
# Show the last rows to sanity-check how the columns were parsed
data.tail()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no
45210,37,entrepreneur,married,secondary,no,2971,no,no,cellular,17,nov,361,2,188,11,other,no


In [137]:
# Column dtypes and non-null counts of the loaded frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [138]:
# Drop the four columns that describe the last contact of the campaign
# (contact, day, month, duration). Only columns are removed here; no rows
# are dropped, and missing values are merely counted in the following cell.
data = data.drop(columns=['contact', 'day', 'month', 'duration'])

In [139]:
# Number of missing values in each remaining column
data.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [149]:
# Cast the string-valued (object) columns to pandas' category dtype so that
# they can later be replaced by integer category codes.
categorical_cols = ['job', 'marital', 'education', 'default',
                    'housing', 'loan', 'poutcome', 'y']
data[categorical_cols] = data[categorical_cols].astype("category")

In [150]:
# Work on an independent copy. Plain assignment (`df = data`) only creates a
# second name for the same DataFrame, so the integer encoding written into
# `df` afterwards would also overwrite the category labels held in `data`.
df = data.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   age        45211 non-null  int64   
 1   job        45211 non-null  category
 2   marital    45211 non-null  category
 3   education  45211 non-null  category
 4   default    45211 non-null  category
 5   balance    45211 non-null  int64   
 6   housing    45211 non-null  category
 7   loan       45211 non-null  category
 8   campaign   45211 non-null  int64   
 9   pdays      45211 non-null  int64   
 10  previous   45211 non-null  int64   
 11  poutcome   45211 non-null  category
 12  y          45211 non-null  category
dtypes: category(8), int64(5)
memory usage: 2.1 MB


In [151]:
# Replace every categorical column (features and the target `y`) with its
# integer category codes.
# NOTE(review): the codes impose an arbitrary numeric order on nominal
# features such as job or marital; one-hot encoding may suit a linear model
# better - confirm before relying on the coefficients.
for col in ['job', 'marital', 'education', 'default',
            'housing', 'loan', 'poutcome', 'y']:
    df[col] = df[col].cat.codes

In [152]:
# Check the column dtypes after encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   age        45211 non-null  int64
 1   job        45211 non-null  int8 
 2   marital    45211 non-null  int8 
 3   education  45211 non-null  int8 
 4   default    45211 non-null  int8 
 5   balance    45211 non-null  int64
 6   housing    45211 non-null  int8 
 7   loan       45211 non-null  int8 
 8   campaign   45211 non-null  int64
 9   pdays      45211 non-null  int64
 10  previous   45211 non-null  int64
 11  poutcome   45211 non-null  int8 
 12  y          45211 non-null  int8 
dtypes: int64(5), int8(8)
memory usage: 2.1 MB


In [160]:
# Separate the target column from the feature matrix
y = df['y']
x = df.drop(columns=['y'])

In [166]:
# Logistic regression classifier using the liblinear solver with a fixed seed
model = LogisticRegression(
    solver='liblinear',
    random_state=0,
)

In [167]:
# Fit on every row of x / y (no train/test split is applied before this point)
model.fit(x, y)

LogisticRegression(random_state=0, solver='liblinear')

In [168]:
# Fitted intercept (bias) term of the decision function
model.intercept_

array([-1.78039995])

In [170]:
# Fitted coefficients, one per feature column of x
model.coef_

array([[-8.64562944e-04,  7.93078570e-03,  1.01906162e-01,
         1.52238602e-01, -2.51807304e-01,  2.26138209e-05,
        -1.04091069e+00, -6.13026921e-01, -1.36489831e-01,
         2.71025745e-03,  7.28118756e-02,  4.84595284e-02]])

In [187]:
 model.predict_proba(x)

array([[0.91518927, 0.08481073],
       [0.91872058, 0.08127942],
       [0.96032834, 0.03967166],
       ...,
       [0.78381168, 0.21618832],
       [0.87607655, 0.12392345],
       [0.59610404, 0.40389596]])

In [190]:
# Predicted class label for every row of x, i.e. the same rows the model was fitted on
y_pred = model.predict(x)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int8)

# Model Validation

In [None]:
# Confusion matrix (rows: actual class, columns: predicted class).
# NOTE: y_pred was produced on the same rows the model was fitted on, so this
# is an in-sample result and will look better than performance on unseen data.
confusion_matrix(y, y_pred)
