In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
# The `plt` alias must point at the pyplot submodule: plotting helpers such as
# `plt.subplots` / `plt.plot` do not exist on the top-level `matplotlib` package.
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw dataset from the CSV file in the notebook's working directory.
df = pd.read_csv("bank-full.csv")

In [3]:
# Number of rows in the raw dataset.
df.shape[0]

45211

In [4]:
# Preview the first rows, transposed so that each column reads as one line.
df.head().transpose()

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
default,no,no,no,no,no
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
loan,no,no,yes,no,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5


## Initial data preparation

In [5]:
# Keep only the columns used in the rest of the notebook (features plus the
# target `y`).
# `.copy()` turns the subset into an independent frame, so later column
# assignments on `df` do not raise SettingWithCopyWarning.
df = df[
    [
        "age", "job", "marital", "education", "balance", "housing", "contact",
        "day", "month", "duration", "campaign", "pdays", "previous",
        "poutcome", "y",
    ]
].copy()
df

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown,no


In [6]:
# Count missing values per column.
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

## Question 1

In [7]:
# Most frequent value of the `education` column.
df["education"].mode()

0    secondary
Name: education, dtype: object

## EDA

In [8]:
# Inspect the dtype of every column (integer vs. object/string columns).
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [9]:
# Integer-valued feature columns.
numerical = [
    "age",
    "balance",
    "day",
    "duration",
    "campaign",
    "pdays",
    "previous",
]

In [10]:
# String-valued (object dtype) feature columns; the target `y` is excluded.
categorical = [
    'job',
    'marital',
    'education',
    'housing',
    'contact',
    'month',
    'poutcome',
]

In [11]:
# List the columns currently present in `df`.
df.columns

Index(['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
       'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
       'y'],
      dtype='object')

In [12]:
# Number of distinct values in each numerical column.
df[numerical].nunique()

age           77
balance     7168
day           31
duration    1573
campaign      48
pdays        559
previous      41
dtype: int64

## Question 2

In [51]:
# Pairwise Pearson correlation between the numerical features.
correlation_matrix = df[numerical].corr(method="pearson")
    

In [52]:
## Flatten the correlation matrix into (feature, feature) -> value pairs,
## ordered from the largest correlation to the smallest.
max_corr = (
    correlation_matrix
    .unstack()
    .sort_values(ascending=False)
)

In [53]:
## Remove self-correlations and report the most strongly correlated feature pair.
# Self-pairs are identified by their index labels (feature paired with itself)
# rather than by `value < 1`: a diagonal entry is not guaranteed to be exactly
# 1.0 in floating point, and a value test would also discard two *different*
# features that happen to be perfectly correlated.
is_self_pair = max_corr.index.get_level_values(0) == max_corr.index.get_level_values(1)
max_corr = max_corr[~is_self_pair]
top_features = max_corr.idxmax()
print(f"The two features with the highest correlation are: {top_features}")

The two features with the highest correlation are: ('previous', 'pdays')


## Target encoding

In [15]:
# Encode the target as 1 for "yes" and 0 for anything else.
# The dtype guard keeps this cell idempotent: once `y` is numeric, comparing it
# with "yes" again would silently turn every label into 0.
# `assign` builds a new frame instead of writing into a possibly-sliced one.
if not pd.api.types.is_numeric_dtype(df["y"]):
    df = df.assign(y=(df["y"] == "yes").astype(int))

## Validation framework

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the rows as the test set; the seed makes the split repeatable.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Take 25% of the remaining 80% for validation (0.25 * 0.8 = 20% of all rows),
# leaving 60% of all rows for training.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [19]:
# Give each split a fresh 0..n-1 index, discarding the shuffled original one.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [20]:
# Pull the target column out of each split as a plain array.
y_train, y_val, y_test = (
    part.y.values for part in (df_train, df_val, df_test)
)

In [21]:
# Remove the target from the feature frames so it cannot leak into the model
# inputs. `df_full_train` keeps its `y` column.
for split in (df_train, df_val, df_test):
    del split["y"]

## Mutual information

In [22]:
from sklearn.metrics import mutual_info_score

In [23]:
def mutual_info_y_score(series):
    """Return the mutual information between `series` and the target `y`.

    The target is read from the notebook-level `df_full_train`, so `series`
    is expected to be a column of that same frame.
    """
    target = df_full_train.y
    return mutual_info_score(series, target)

In [57]:
# Score every categorical feature against the target on the full training split.
# NOTE: the trailing `.index` keeps only the feature names and discards the
# computed scores, so `mi_score` is an Index of column names, not of values.
mi_score =  df_full_train[categorical].apply(mutual_info_y_score).index

## Question 3

In [58]:
# Name of the categorical feature with the highest mutual information with `y`.
# The scores are recomputed here because `mi_score` holds only feature names;
# calling `.max()` on it returns the alphabetically last name, not the
# best-scoring feature. `idxmax()` on the score Series returns the right label.
df_full_train[categorical].apply(mutual_info_y_score).idxmax()

'poutcome'

## One-hot encoding

In [25]:
from sklearn.feature_extraction import DictVectorizer

In [26]:
# Convert the training features into one dict per row, the input format
# expected by DictVectorizer.
train_dicts = df_train[categorical + numerical].to_dict(orient = "records")

In [27]:
# sparse=False makes the vectorizer return a dense array instead of a sparse matrix.
dv = DictVectorizer(sparse=False)

In [28]:
# Learn the feature mapping from the training records and encode them.
X_train = dv.fit_transform(train_dicts)

In [29]:
# Same row-dict conversion for the validation features.
val_dicts = df_val[categorical + numerical].to_dict(orient = "records")

In [30]:
# Encode validation rows with the mapping learned on training data (no re-fit).
X_val = dv.transform(val_dicts)

## Logistic regression

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
# Baseline classifier: liblinear solver, C=1.0, and a fixed seed so the fit is repeatable.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [33]:
# Fit the baseline model on the encoded training features.
model.fit(X_train, y_train)

In [34]:
# Intercept (bias term) of the fitted model.
model.intercept_[0]

-0.9075237913978677

In [35]:
# Fitted coefficients (first row of `coef_`), rounded to 3 decimals for display.
model.coef_[0].round(3)

array([-2.000e-03,  0.000e+00, -8.000e-02,  2.470e-01,  7.100e-02,
       -1.226e+00,  6.000e-03,  4.000e-03, -4.180e-01, -2.310e-01,
       -7.200e-02, -1.860e-01, -1.020e-01, -8.050e-01,  7.400e-02,
       -2.410e-01, -2.400e-01, -2.790e-01, -9.200e-02,  3.360e-01,
       -2.730e-01, -1.530e-01,  2.420e-01, -1.630e-01,  1.600e-02,
       -1.330e-01, -2.960e-01, -4.470e-01, -1.640e-01,  7.600e-02,
       -6.730e-01,  3.180e-01, -3.140e-01, -9.450e-01, -9.480e-01,
        2.620e-01,  1.244e+00, -4.730e-01, -9.000e-01,  7.360e-01,
        7.110e-01, -1.000e-03, -7.640e-01, -5.450e-01,  1.487e+00,
       -1.085e+00,  3.000e-03])

In [36]:
# Keep column 1 of the predicted probabilities; with `y` encoded as 0/1 this
# should be the probability of the positive class.
y_pred = model.predict_proba(X_val)[:, 1]

In [37]:
# Turn probabilities into hard decisions using a 0.5 threshold (boolean array).
y_decision = (y_pred >= 0.5)

## Question 4

In [38]:
# Baseline validation accuracy with all features: the share of rows where the
# thresholded prediction equals the true label.
original_accuracy = np.mean(y_val == y_decision)
original_accuracy

0.9011280690112807

In [39]:
# Show the boolean decisions as 0/1 integers.
y_decision.astype(int)

array([0, 0, 0, ..., 0, 0, 0])

## Question 5

Without age

In [40]:
# Ablation feature set: every model feature except `age`.
base_without_age = [
    'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day',
    'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
]

# Build row dicts for both splits, then re-fit the shared vectorizer on the
# training rows only and reuse that mapping for validation.
train_dicts = df_train[base_without_age].to_dict(orient="records")
val_dicts = df_val[base_without_age].to_dict(orient="records")
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [41]:
# Refit with the same hyperparameters on the current feature matrix and measure
# how far validation accuracy moves from the full-feature baseline.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
y_decision = y_pred >= 0.5
accuracy_without_age = np.mean(y_val == y_decision)
diff = abs(original_accuracy - accuracy_without_age)
diff


0.0

Without balance

In [42]:
# Ablation feature set: every model feature except `balance`.
base_without_balance = [
    'age', 'job', 'marital', 'education', 'housing', 'contact', 'day',
    'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
]

# Build row dicts for both splits, then re-fit the shared vectorizer on the
# training rows only and reuse that mapping for validation.
train_dicts = df_train[base_without_balance].to_dict(orient="records")
val_dicts = df_val[base_without_balance].to_dict(orient="records")
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [43]:
# Refit with the same hyperparameters on the current feature matrix and measure
# how far validation accuracy moves from the full-feature baseline.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
y_decision = y_pred >= 0.5
accuracy_without_balance = np.mean(y_val == y_decision)
diff = abs(original_accuracy - accuracy_without_balance)
diff

0.0

Without marital

In [44]:
# Ablation feature set: every model feature except `marital`.
base_without_marital = [
    'age', 'job', 'education', 'balance', 'housing', 'contact', 'day',
    'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
]

# Build row dicts for both splits, then re-fit the shared vectorizer on the
# training rows only and reuse that mapping for validation.
train_dicts = df_train[base_without_marital].to_dict(orient="records")
val_dicts = df_val[base_without_marital].to_dict(orient="records")
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [45]:
# Refit with the same hyperparameters on the current feature matrix and measure
# how far validation accuracy moves from the full-feature baseline.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
y_decision = y_pred >= 0.5
accuracy_without_marital = np.mean(y_val == y_decision)
diff = abs(original_accuracy - accuracy_without_marital)
diff

0.000774165007741745

Without previous

In [46]:
# Ablation feature set: every model feature except `previous`.
base_without_previous = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
    'day', 'month', 'duration', 'campaign', 'pdays', 'poutcome',
]

# Build row dicts for both splits, then re-fit the shared vectorizer on the
# training rows only and reuse that mapping for validation.
train_dicts = df_train[base_without_previous].to_dict(orient="records")
val_dicts = df_val[base_without_previous].to_dict(orient="records")
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [47]:
# Refit with the same hyperparameters on the current feature matrix and measure
# how far validation accuracy moves from the full-feature baseline.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict_proba(X_val)[:, 1]
y_decision = y_pred >= 0.5
accuracy_without_previous = np.mean(y_val == y_decision)
diff = abs(original_accuracy - accuracy_without_previous)
diff

0.00044238000442387015

## Question 6

In [48]:
# Restore the full feature set: the shared vectorizer was last fitted on a
# reduced set, so it is re-fitted on all numerical and categorical columns.
train_dicts = df_train[numerical + categorical].to_dict(orient="records")
val_dicts = df_val[numerical + categorical].to_dict(orient="records")
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [49]:
# Sweep the regularization parameter C and print the validation accuracy
# obtained with each value.
for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(
        solver='liblinear',
        C=c,
        max_iter=1000,
        random_state=42,
    ).fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    y_decision = y_pred >= 0.5

    print("regularization_parameter %6s :" % c, np.mean(y_val == y_decision))

regularization_parameter   0.01 : 0.898363193983632
regularization_parameter    0.1 : 0.9014598540145985
regularization_parameter      1 : 0.9011280690112807
regularization_parameter     10 : 0.9015704490157045
regularization_parameter    100 : 0.9007962840079629
