## Imports

In [3]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression


%matplotlib inline

## Exploratory data analysis

In [5]:
# Read the semicolon-delimited dataset, report its dimensions,
# and preview the first five rows.
df = pd.read_csv('bank-full.csv', delimiter=';')
print(df.shape)
df.head(5)

(45211, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [6]:
# Drop the columns that are not used in this analysis.
# Reassigning (instead of inplace=True) and ignoring already-missing labels
# keeps this cell idempotent: re-running it no longer raises KeyError.
df = df.drop(columns=['default', 'loan'], errors='ignore')

# Count missing values per remaining column.
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [7]:
# Most frequent value(s) of the `education` column.
df['education'].mode()

0    secondary
Name: education, dtype: object

In [13]:
# Partition the columns by dtype: object-dtype columns are treated as
# categorical, everything else as numerical. Both lists are numpy arrays
# of column names.
is_object_column = (df.dtypes == np.object_).values
categorical_features = df.columns[is_object_column].values
numerical_features = df.columns[~is_object_column].values

# Pairwise correlation between the numerical columns.
df[numerical_features].corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [14]:
# Encode the target as 1 ('yes') / 0 ('no') and separate it from the features.
# `df` itself is left unmodified so that this cell can be re-run safely
# (the previous remap-then-pop sequence failed on a second execution).
target = df['y'].map({'yes': 1, 'no': 0})
features = df.drop(columns='y')

# Hold out 20% as the test set, then take 25% of the remainder as the
# validation set (60/20/20 overall). Both splits use a fixed seed.
X_train_val, X_test, y_train_val, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

# Remove the target from the categorical feature list. A boolean mask is a
# no-op when 'y' is absent, whereas np.delete(..., np.argmax(mask)) would
# silently delete the first feature in that case (argmax of all-False is 0).
categorical_features = categorical_features[categorical_features != 'y']

In [19]:
# Display the training feature frame (target column already separated out).
X_train

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome
20326,32,technician,single,tertiary,1100,yes,cellular,11,aug,67,1,-1,0,unknown
24301,38,entrepreneur,married,secondary,0,yes,cellular,17,nov,258,1,-1,0,unknown
38618,49,blue-collar,married,secondary,3309,yes,cellular,15,may,349,2,-1,0,unknown
18909,37,housemaid,married,primary,2410,no,cellular,4,aug,315,1,-1,0,unknown
23081,31,self-employed,married,tertiary,3220,no,cellular,26,aug,74,4,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13264,27,services,single,secondary,167,no,cellular,8,jul,606,2,-1,0,unknown
28829,40,technician,single,tertiary,693,no,cellular,30,jan,427,1,-1,0,unknown
3844,54,technician,divorced,secondary,0,yes,unknown,16,may,161,1,-1,0,unknown
15597,25,services,single,secondary,2311,no,cellular,21,jul,1105,2,-1,0,unknown


In [20]:
# Mutual information between each categorical feature and the target,
# computed on the training split only. The target column name is skipped
# if it is still present in the feature list.
for feat in categorical_features:
    if feat != 'y':
        score = mutual_info_score(X_train[feat], y_train)
        print(f'MI of {feat} = {round(score, 3)}')

MI of job = 0.007
MI of marital = 0.002
MI of education = 0.003
MI of housing = 0.01
MI of contact = 0.013
MI of month = 0.025
MI of poutcome = 0.03


One-hot encode the categorical features and build the feature matrices

In [30]:
# Turn the categorical columns of each split into a list of per-row dicts,
# which is the input format DictVectorizer expects.
df_train_dict, df_val_dict, df_test_dict = (
    split[categorical_features].to_dict('records')
    for split in (X_train, X_val, X_test)
)

# Fit the one-hot encoder on the training split only, then apply the same
# fitted encoding to the validation and test splits.
dv = DictVectorizer(sparse=False)
X_train_cat = dv.fit_transform(df_train_dict)
X_val_cat, X_test_cat = dv.transform(df_val_dict), dv.transform(df_test_dict)

# Final design matrices: numerical columns first, one-hot columns after.
X_train_full = np.hstack((X_train[numerical_features].values, X_train_cat))
X_val_full = np.hstack((X_val[numerical_features].values, X_val_cat))
X_test_full = np.hstack((X_test[numerical_features].values, X_test_cat))

Train model

In [31]:
# Baseline logistic regression on the full feature matrix.
model = LogisticRegression(
    C=1.0,
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)
model.fit(X_train_full, y_train)

Get validation score

In [37]:
# Second column of predict_proba for every validation row.
val_probabilities = model.predict_proba(X_val_full)
y_pred = val_probabilities[:, 1]

# Score of the fitted model on the validation split.
val_accuracy = model.score(X_val_full, y_val)
print(f'Validation Accuracy: {val_accuracy:.3f}')


Validation Accuracy: 0.901


Eliminate least significant features

In [40]:
# Sanity check: removing one column from the full matrix leaves one fewer
# feature. The reduced matrix is computed here rather than read from
# `X_train_reduced`, which is only defined by the ablation loop in the next
# cell and would raise NameError on a fresh Restart & Run All.
np.delete(X_train_full, 0, axis=1).shape, X_train_full.shape

((27126, 46), (27126, 47))

In [41]:
# Leave-one-column-out ablation: refit the model with each column of the
# encoded matrix removed in turn, and record how much validation accuracy
# changes relative to the model trained on all columns.
original_accuracy = val_accuracy
accuracy_diffs = {}

# Column order matches the hstack layout: numerical columns first,
# then the DictVectorizer's one-hot columns.
all_feature_names = list(numerical_features) + dv.feature_names_

for i, feat in enumerate(all_feature_names):
    X_train_reduced = np.delete(X_train_full, i, axis=1)
    X_val_reduced = np.delete(X_val_full, i, axis=1)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_reduced, y_train)
    reduced_accuracy = model.score(X_val_reduced, y_val)

    # A negative difference means accuracy went up without the column.
    accuracy_diffs[feat] = original_accuracy - reduced_accuracy
    print(f'Feature: {feat}, Accuracy difference: {accuracy_diffs[feat]:.4f}')

# The column with the smallest (most negative) difference.
least_useful_feature = min(accuracy_diffs, key=lambda name: accuracy_diffs[name])
print(f'Least useful feature: {least_useful_feature}')

Feature: age, Accuracy difference: 0.0001
Feature: balance, Accuracy difference: 0.0001
Feature: day, Accuracy difference: -0.0002
Feature: duration, Accuracy difference: 0.0113
Feature: campaign, Accuracy difference: 0.0006
Feature: pdays, Accuracy difference: 0.0000
Feature: previous, Accuracy difference: -0.0003
Feature: contact=cellular, Accuracy difference: -0.0001
Feature: contact=telephone, Accuracy difference: 0.0002
Feature: contact=unknown, Accuracy difference: -0.0002
Feature: education=primary, Accuracy difference: 0.0002
Feature: education=secondary, Accuracy difference: 0.0000
Feature: education=tertiary, Accuracy difference: 0.0001
Feature: education=unknown, Accuracy difference: 0.0001
Feature: housing=no, Accuracy difference: -0.0004
Feature: housing=yes, Accuracy difference: -0.0003
Feature: job=admin., Accuracy difference: -0.0003
Feature: job=blue-collar, Accuracy difference: 0.0000
Feature: job=entrepreneur, Accuracy difference: 0.0004
Feature: job=housemaid, Accur

Regularized LogReg

In [42]:
# Sweep the inverse regularisation strength C and record the validation
# accuracy (rounded to three decimals) for each value.
C_values = [0.01, 0.1, 1, 10, 100]
accuracy_results = {}

for C in C_values:
    model = LogisticRegression(
        solver='liblinear',
        C=C,
        max_iter=1000,
        random_state=42,
    )
    model.fit(X_train_full, y_train)

    val_accuracy = model.score(X_val_full, y_val)
    accuracy_results[C] = round(val_accuracy, 3)

    print(f'C={C}, Validation Accuracy: {accuracy_results[C]}')

print(accuracy_results)


C=0.01, Validation Accuracy: 0.898
C=0.1, Validation Accuracy: 0.901
C=1, Validation Accuracy: 0.901
C=10, Validation Accuracy: 0.902
C=100, Validation Accuracy: 0.901
{0.01: 0.898, 0.1: 0.901, 1: 0.901, 10: 0.902, 100: 0.901}
