In [4]:
!wget -O https://archive.ics.uci.edu/static/public/222/bank+marketing.zip "../data/bank+marketing.zip"
!unzip -o ../data/bank+marketing.zip -d ../data

zsh:1: command not found: wget
unzip:  cannot find or open ../data/bank+marketing.zip, ../data/bank+marketing.zip.zip or ../data/bank+marketing.zip.ZIP.


In [176]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression


In [76]:
df = pd.read_csv('../data/bank-full.csv', sep=';')
df = df[['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']]
df.isna().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [77]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [78]:
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
categorical = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

In [79]:
df.head(5)


Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [81]:
global_deposit_rate = round((df.y == 'yes').astype(int).mean(), 2)
global_deposit_rate

0.12

### Question 1

In [84]:
df.education.value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

### Question 2

In [85]:
correlation_matrix = df[numerical].corr()
print(correlation_matrix)

               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000


In [86]:
df.y = (df.y == 'yes').astype(int)

In [147]:
y = df.y
X = df.drop('y', axis=1)

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=len(X) * 0.2 / len(X_train_full), random_state=42)

len(X_train), len(X_val), len(X_test)

(27125, 9043, 9043)

In [148]:
X_train_full[numerical].corrwith(y_train_full).abs().sort_values(ascending=False)

duration    0.393402
pdays       0.105742
previous    0.092051
campaign    0.072571
balance     0.052518
age         0.026684
day         0.025887
dtype: float64

### Question 3

In [112]:
def mutual_info_deposit_score(series: pd.Series):
    return round(mutual_info_score(series, y_train), 2)

mutual_infos = X_train[categorical].apply(mutual_info_deposit_score)
mutual_infos.sort_values(ascending=False)

month        0.03
poutcome     0.03
job          0.01
housing      0.01
contact      0.01
marital      0.00
education    0.00
dtype: float64

### Question 4

In [169]:
train_dict = X_train.to_dict(orient='records')
train_dict[0]

{'age': 38,
 'job': 'entrepreneur',
 'marital': 'married',
 'education': 'secondary',
 'balance': 0,
 'housing': 'yes',
 'contact': 'cellular',
 'day': 17,
 'month': 'nov',
 'duration': 258,
 'campaign': 1,
 'pdays': -1,
 'previous': 0,
 'poutcome': 'unknown'}

In [170]:
dict_vectorizer = DictVectorizer(sparse=False)
X_train_vect = dict_vectorizer.fit_transform(train_dict)
print(X_train_vect[0])
list(zip(X_train_vect[0], dict_vectorizer.feature_names_))

[ 38.   0.   1.   1.   0.   0.  17. 258.   0.   1.   0.   0.   0.   1.
   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.  -1.
   0.   0.   0.   1.   0.]


[(38.0, 'age'),
 (0.0, 'balance'),
 (1.0, 'campaign'),
 (1.0, 'contact=cellular'),
 (0.0, 'contact=telephone'),
 (0.0, 'contact=unknown'),
 (17.0, 'day'),
 (258.0, 'duration'),
 (0.0, 'education=primary'),
 (1.0, 'education=secondary'),
 (0.0, 'education=tertiary'),
 (0.0, 'education=unknown'),
 (0.0, 'housing=no'),
 (1.0, 'housing=yes'),
 (0.0, 'job=admin.'),
 (0.0, 'job=blue-collar'),
 (1.0, 'job=entrepreneur'),
 (0.0, 'job=housemaid'),
 (0.0, 'job=management'),
 (0.0, 'job=retired'),
 (0.0, 'job=self-employed'),
 (0.0, 'job=services'),
 (0.0, 'job=student'),
 (0.0, 'job=technician'),
 (0.0, 'job=unemployed'),
 (0.0, 'job=unknown'),
 (0.0, 'marital=divorced'),
 (1.0, 'marital=married'),
 (0.0, 'marital=single'),
 (0.0, 'month=apr'),
 (0.0, 'month=aug'),
 (0.0, 'month=dec'),
 (0.0, 'month=feb'),
 (0.0, 'month=jan'),
 (0.0, 'month=jul'),
 (0.0, 'month=jun'),
 (0.0, 'month=mar'),
 (0.0, 'month=may'),
 (1.0, 'month=nov'),
 (0.0, 'month=oct'),
 (0.0, 'month=sep'),
 (-1.0, 'pdays'),
 (0.0,

In [171]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_vect, y_train)

In [152]:
val_dict = X_val.to_dict(orient='records')
X_val_vect = dict_vectorizer.transform(val_dict)

val_predict = model.predict_proba(X_val_vect)
val_predict

array([[0.98678555, 0.01321445],
       [0.99017113, 0.00982887],
       [0.84000744, 0.15999256],
       ...,
       [0.99082813, 0.00917187],
       [0.71942805, 0.28057195],
       [0.96826863, 0.03173137]])

In [153]:
y_val_predict = (val_predict[:, 1] > 0.5).astype(int)

global_accuracy = round((y_val_predict == y_val).mean(), 2)
global_accuracy

0.9

### Question 5

In [180]:
diff_dict = {}

dict_vectorizer = DictVectorizer(sparse=False)

def get_trained_model(train_serie: pd.Series, reg: float = 1.0):
    train_serie_dict = train_serie.to_dict(orient='records')
    X_train_serie = dict_vectorizer.fit_transform(train_serie_dict)
    
    tmp_model = LogisticRegression(solver='liblinear', C=reg, max_iter=1000, random_state=42)
    tmp_model.fit(X_train_serie, y_train)
    
    return tmp_model

def predict_model(tmp_model, serie: pd.Series):    
    serie_dict = serie.to_dict(orient='records')
    X_serie = dict_vectorizer.fit_transform(serie_dict)

    serie_predict = tmp_model.predict_proba(X_serie)
    y_serie_predict = (serie_predict[:, 1] > 0.5).astype(int)
    
    return y_serie_predict

def vectorize_predict(train_serie: pd.Series, serie: pd.Series, exclude_col: str):
    train_model = get_trained_model(train_serie)
    y_predict = predict_model(train_model, serie)
    
    serie_accuracy = (y_predict == y_val).mean()
    
    print(f'Without {exclude_col} -> Accuracy: {serie_accuracy} - Global: {global_accuracy} - Diff: {global_accuracy - serie_accuracy}')
    diff_dict[exclude_col] = global_accuracy - serie_accuracy
    
for col in X_val.columns:
    X_train_col = X_train.copy().drop(col, axis=1)
    X_val_col = X_val.copy().drop(col, axis=1)
    
    vectorize_predict(X_train_col, X_val_col, col)
    
sorted_dict = sorted(diff_dict.items(), key=lambda item: abs(item[1]))
sorted_dict

Without age -> Accuracy: 0.900807254229791 - Global: 0.9 - Diff: -0.000807254229790999
Without job -> Accuracy: 0.9006966714585868 - Global: 0.9 - Diff: -0.0006966714585867617
Without marital -> Accuracy: 0.9011390025434037 - Global: 0.9 - Diff: -0.001139002543403711
Without education -> Accuracy: 0.9000331748313612 - Global: 0.9 - Diff: -3.3174831361226786e-05
Without balance -> Accuracy: 0.9010284197721995 - Global: 0.9 - Diff: -0.0010284197721994737
Without housing -> Accuracy: 0.9010284197721995 - Global: 0.9 - Diff: -0.0010284197721994737
Without contact -> Accuracy: 0.9009178370009953 - Global: 0.9 - Diff: -0.0009178370009952364
Without day -> Accuracy: 0.9004755059161783 - Global: 0.9 - Diff: -0.0004755059161782871
Without month -> Accuracy: 0.899922592060157 - Global: 0.9 - Diff: 7.740793984301053e-05
Without duration -> Accuracy: 0.8897489771093664 - Global: 0.9 - Diff: 0.010251022890633621
Without campaign -> Accuracy: 0.8993696782041358 - Global: 0.9 - Diff: 0.00063032179586

[('education', -3.3174831361226786e-05),
 ('month', 7.740793984301053e-05),
 ('day', -0.0004755059161782871),
 ('campaign', 0.0006303217958641971),
 ('job', -0.0006966714585867617),
 ('pdays', -0.0006966714585867617),
 ('age', -0.000807254229790999),
 ('previous', -0.000807254229790999),
 ('contact', -0.0009178370009952364),
 ('balance', -0.0010284197721994737),
 ('housing', -0.0010284197721994737),
 ('marital', -0.001139002543403711),
 ('poutcome', 0.006933539754506279),
 ('duration', 0.010251022890633621)]

### Question 6

In [182]:
C = [0.01, 0.1, 1, 10, 100]

for reg in C:
    model = get_trained_model(X_train, reg)
    y_predict = predict_model(model, X_val)
    
    accuracy = round((y_predict == y_val).mean(), 3)
    print(f'C: {reg} - Accuracy: {accuracy}')

C: 0.01 - Accuracy: 0.898
C: 0.1 - Accuracy: 0.901
C: 1 - Accuracy: 0.901
C: 10 - Accuracy: 0.901
C: 100 - Accuracy: 0.901
