# Import libs

In [58]:
import pandas as pd
import numpy as np

# Read data

In [59]:
# Load the semicolon-delimited CSV into a DataFrame.
df = pd.read_csv('bank-full.csv', sep=';')

In [60]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [61]:
# Encode the target as 0/1 ('no' -> 0, anything else -> 1).
# The guard makes re-running this cell a no-op: without it, a second run
# compares the already-numeric column to 'no', matches nothing, and silently
# turns every label into 1.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = np.where(df['y'] == 'no', 0, 1)

In [62]:
# Mode (most frequent value) of the `education` column (1 point)

In [63]:
# mode() returns a Series (ties would yield several rows); here there is a single mode.
df['education'].mode()

0    secondary
Name: education, dtype: object

In [64]:
# Pairwise correlation of every non-object column (the encoded target `y` included).
df.select_dtypes(exclude='object').corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288,0.025155
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674,0.052838
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171,-0.028348
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203,0.394521
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855,-0.073172
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482,0.103621
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0,0.093236
y,0.025155,0.052838,-0.028348,0.394521,-0.073172,0.103621,0.093236,1.0


In [65]:
from sklearn.model_selection import train_test_split

In [66]:
# Hold out 20% of all rows as the test set; the seed is fixed for reproducibility.
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [67]:
# Split the remaining 80% again: 0.2 of it goes to validation, so overall the
# proportions are roughly 64% train / 16% validation / 20% test.
# NOTE(review): if a 60/20/20 split was intended, test_size here should be 0.25 - confirm.
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.2,
    random_state=42,
)

In [68]:
# Pull the target out as plain NumPy arrays before it is removed from the frames.
y_train = df_train['y'].values
y_val = df_val['y'].values

In [69]:
# Remove the target from the feature frames so it is not fed to the model as a feature.
# drop(..., errors='ignore') rebinds to a new frame instead of mutating in place,
# so re-running this cell is harmless (the original `del` raised KeyError on a re-run).
df_train = df_train.drop(columns='y', errors='ignore')
df_val = df_val.drop(columns='y', errors='ignore')

In [70]:
from sklearn.metrics import mutual_info_score

In [71]:
def calculate_mi(series, target=None):
    """Return the mutual information between ``series`` and a label vector.

    Parameters
    ----------
    series : array-like
        Discrete values of one feature, one entry per row.
    target : array-like, optional
        Labels aligned with ``series``. When omitted, the notebook-level
        ``y_train`` is used, so ``DataFrame.apply(calculate_mi)`` keeps
        working unchanged.

    Returns
    -------
    float
        The value of ``sklearn.metrics.mutual_info_score(series, target)``.
    """
    if target is None:
        # Looked up at call time, so the current y_train is always the one used.
        target = y_train
    return mutual_info_score(series, target)

In [72]:
# Names of the object-dtype columns of df, with the target `y` excluded up front.
categorical = (
    df.drop(columns='y')
    .select_dtypes(include='object')
    .columns
)

In [73]:
# Mutual information of each categorical training column with y_train.
# The result is a Series indexed by column name (despite the `df_` prefix).
df_mi = df_train.loc[:, categorical].apply(calculate_mi)

In [74]:
# Rounded for display only; df_mi itself keeps full precision.
df_mi.round(2)

job          0.01
marital      0.00
education    0.00
default      0.00
housing      0.01
loan         0.00
contact      0.01
month        0.02
poutcome     0.03
dtype: float64

In [75]:
from sklearn.feature_extraction import DictVectorizer

In [76]:
# One {column: value} dict per training row (a list of records, despite the name);
# this is the input passed to DictVectorizer below.
train_dict = df_train.to_dict(orient='records')

In [77]:
# Show the first record; its keys are the feature columns only (no `y`).
train_dict[0]

{'age': 36,
 'job': 'management',
 'marital': 'married',
 'education': 'tertiary',
 'default': 'yes',
 'balance': -485,
 'housing': 'yes',
 'loan': 'no',
 'contact': 'cellular',
 'day': 19,
 'month': 'nov',
 'duration': 278,
 'campaign': 1,
 'pdays': -1,
 'previous': 0,
 'poutcome': 'unknown'}

In [78]:
from sklearn.feature_extraction import DictVectorizer
# sparse=False: transform() returns a dense NumPy array rather than a SciPy sparse matrix.
dv = DictVectorizer(sparse=False)
# Fit on the training records only, so the encoded columns are defined by the training data.
dv.fit(train_dict)

In [79]:
# Encode the training records into the numeric feature matrix used for fitting.
X_train = dv.transform(train_dict)

In [80]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier; random_state is fixed so repeated runs give the same fit.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)

In [81]:
# Train on the encoded training set.
model.fit(X_train, y_train)

In [82]:
# Encode the validation rows with the vectorizer already fitted on the training
# records (transform only, no refit), so X_val has the same columns as X_train.
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [83]:
# One row per validation example; columns follow model.classes_, i.e. P(y=0), P(y=1).
model.predict_proba(X_val)

array([[0.98580176, 0.01419824],
       [0.9900552 , 0.0099448 ],
       [0.8319817 , 0.1680183 ],
       ...,
       [0.33129748, 0.66870252],
       [0.96407977, 0.03592023],
       [0.14236777, 0.85763223]])

In [84]:
# Keep only column 1: the predicted probability of the positive class (y == 1).
y_pred = model.predict_proba(X_val)[:, 1]

In [85]:
# Display the positive-class probabilities.
y_pred

array([0.01419824, 0.0099448 , 0.1680183 , ..., 0.66870252, 0.03592023,
       0.85763223])

In [86]:
# Hard decision at a 0.5 probability threshold (boolean array).
# NOTE(review): the name `churn` does not match this dataset's target `y`;
# consider renaming it together with its later uses.
churn = y_pred > 0.5

In [87]:
# Accuracy: share of validation rows where the thresholded prediction equals the label.
(y_val == churn).mean()

0.9011611833010782

In [90]:
# Pair every encoded feature name with its fitted coefficient, rounded to 3 decimals.
{
    name: weight
    for name, weight in zip(dv.get_feature_names_out(), model.coef_[0].round(3))
}

{'age': 0.0,
 'balance': 0.0,
 'campaign': -0.084,
 'contact=cellular': 0.313,
 'contact=telephone': 0.187,
 'contact=unknown': -1.263,
 'day': 0.009,
 'default=no': -0.456,
 'default=yes': -0.307,
 'duration': 0.004,
 'education=primary': -0.396,
 'education=secondary': -0.192,
 'education=tertiary': 0.013,
 'education=unknown': -0.188,
 'housing=no': -0.032,
 'housing=yes': -0.731,
 'job=admin.': 0.13,
 'job=blue-collar': -0.18,
 'job=entrepreneur': -0.249,
 'job=housemaid': -0.371,
 'job=management': -0.066,
 'job=retired': 0.255,
 'job=self-employed': -0.188,
 'job=services': -0.094,
 'job=student': 0.34,
 'job=technician': -0.136,
 'job=unemployed': -0.03,
 'job=unknown': -0.175,
 'loan=no': -0.151,
 'loan=yes': -0.612,
 'marital=divorced': -0.25,
 'marital=married': -0.389,
 'marital=single': -0.123,
 'month=apr': -0.02,
 'month=aug': -0.746,
 'month=dec': 0.429,
 'month=feb': -0.289,
 'month=jan': -1.215,
 'month=jul': -0.925,
 'month=jun': 0.344,
 'month=mar': 1.482,
 'month=ma

In [91]:
# Sweep the regularisation parameter C and print the validation accuracy for
# each value, in the order listed.
# NOTE(review): every pass rebinds the notebook-level `model`, `y_pred` and
# `churn`; after the loop they belong to the last C (100), not to the C=1.0 fit.
for i in (0.01, 0.1, 1, 10, 100):
    model = LogisticRegression(
        solver='liblinear', C=i, max_iter=1000, random_state=42
    ).fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    churn = y_pred > 0.5
    print((y_val == churn).mean())

0.8993641139065524
0.9011611833010782
0.9011611833010782
0.9006082388719934
0.9011611833010782
