In [2]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

In [4]:
df = pd.read_csv("/content/bank-additional-full.csv", sep=";")

In [5]:
df.y.value_counts()

y
no     36548
yes     4640
Name: count, dtype: int64

In [6]:
y = df["y"].map({"no":0, "yes":1})
X = df.drop("y", axis=1)

In [7]:
X.drop("duration", inplace=True, axis=1)

In [8]:
X.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
dtype: object

In [9]:
num_features = ["age", "campaign", "pdays", "previous", "emp.var.rate",
                "cons.price.idx", "cons.conf.idx","euribor3m", "nr.employed"]

cat_features = ["job", "marital", "education","default", "housing", "loan",
                "contact", "month", "day_of_week", "poutcome"]

In [10]:
preprocessor = ColumnTransformer([("numerical", "passthrough", num_features),
                                  ("categorical", OneHotEncoder(sparse=False, handle_unknown="ignore"),
                                   cat_features)])

In [11]:
# Logistic Regression
lr_model = Pipeline([("preprocessor", preprocessor),
                     ("model", LogisticRegression(class_weight="balanced", solver="liblinear", random_state=42))])

# Decision Tree
dt_model = Pipeline([("preprocessor", preprocessor),
                     ("model", DecisionTreeClassifier(class_weight="balanced"))])

# Random Forest
rf_model = Pipeline([("preprocessor", preprocessor),
                     ("model", RandomForestClassifier(class_weight="balanced", n_estimators=100, n_jobs=-1))])

# XGBoost
xgb_model = Pipeline([("preprocessor", preprocessor),
                      # Add a scale_pos_weight to make it balanced
                      ("model", XGBClassifier(scale_pos_weight=(1 - y.mean()), n_jobs=-1))])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.3, random_state=42)

In [13]:
gs = GridSearchCV(lr_model, {"model__C": [1, 1.3, 1.5]}, n_jobs=-1, cv=5, scoring="accuracy")
gs.fit(X_train, y_train)



In [14]:
print(gs.best_params_)
print(gs.best_score_)

{'model__C': 1.3}
0.8276507327775018


In [15]:
lr_model.set_params(**gs.best_params_)

In [16]:
lr_model.get_params("model")

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('numerical', 'passthrough',
                                    ['age', 'campaign', 'pdays', 'previous',
                                     'emp.var.rate', 'cons.price.idx',
                                     'cons.conf.idx', 'euribor3m',
                                     'nr.employed']),
                                   ('categorical',
                                    OneHotEncoder(handle_unknown='ignore',
                                                  sparse=False),
                                    ['job', 'marital', 'education', 'default',
                                     'housing', 'loan', 'contact', 'month',
                                     'day_of_week', 'poutcome'])])),
  ('model',
   LogisticRegression(C=1.3, class_weight='balanced', random_state=42,
                      solver='liblinear'))],
 'verbose': False,
 'preprocessor': ColumnTransformer(transformers=[('numerical

In [17]:

lr_model.fit(X_train, y_train)



In [18]:
y_pred = lr_model.predict(X_test)

In [19]:
accuracy_score(y_test, y_pred)

0.8323217609452133

In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.86      0.90     10965
           1       0.36      0.64      0.46      1392

    accuracy                           0.83     12357
   macro avg       0.66      0.75      0.68     12357
weighted avg       0.88      0.83      0.85     12357



In [22]:
!pip install eli5
import eli5
eli5.show_weights(lr_model.named_steps["model"])

Collecting eli5
  Downloading eli5-0.13.0.tar.gz (216 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/216.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m204.8/216.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.2/216.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: eli5
  Building wheel for eli5 (setup.py) ... [?25l[?25hdone
  Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107720 sha256=e88e37b2c6799f2d868d5af60cc5e0eb1c5d3784d7d1e703e9a80483e77ed3fd
  Stored in directory: /root/.cache/pip/wheels/b8/58/ef/2cf4c306898c2338d51540e0922c8e0d6028e07007085c0004
Successfully built eli5
Installing collected packages: eli5
Successfully installed eli5-0.13.0


Weight?,Feature
+1.031,x49
+0.699,x7
+0.604,x5
+0.522,x29
+0.401,x24
+0.372,x14
+0.318,x46
+0.276,x45
+0.243,x42
+0.225,x61


In [23]:
preprocessor = lr_model.named_steps["preprocessor"]

In [24]:
ohe_categories = preprocessor.named_transformers_["categorical"].categories_

In [25]:
new_ohe_features = [f"{col}__{val}" for col, vals in zip(cat_features, ohe_categories) for val in vals]

In [26]:
all_features = num_features + new_ohe_features

In [27]:
pd.DataFrame(lr_model.named_steps["preprocessor"].transform(X_train), columns=all_features).head()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job__admin.,...,month__oct,month__sep,day_of_week__fri,day_of_week__mon,day_of_week__thu,day_of_week__tue,day_of_week__wed,poutcome__failure,poutcome__nonexistent,poutcome__success
0,50.0,2.0,999.0,0.0,1.1,93.994,-36.4,4.86,5191.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,51.0,5.0,999.0,0.0,1.1,93.994,-36.4,4.858,5191.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,46.0,2.0,999.0,0.0,-1.8,92.893,-46.2,1.244,5099.1,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,46.0,1.0,999.0,0.0,1.4,94.465,-41.8,4.961,5228.1,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,25.0,5.0,999.0,0.0,-1.8,92.893,-46.2,1.266,5099.1,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [28]:
eli5.show_weights(lr_model.named_steps["model"], feature_names=all_features)

Weight?,Feature
+1.031,month__mar
+0.699,euribor3m
+0.604,cons.price.idx
+0.522,education__illiterate
+0.401,marital__unknown
+0.372,job__retired
+0.318,month__dec
+0.276,month__aug
+0.243,contact__cellular
+0.225,poutcome__success


In [29]:
i = 4
X_test.iloc[[i]]

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
39993,27,unknown,single,university.degree,no,yes,no,cellular,jun,wed,4,3,2,success,-1.7,94.055,-39.8,0.767,4991.6


In [30]:
y_test.iloc[i]

1

In [31]:
eli5.show_prediction(lr_model.named_steps["model"],
                     lr_model.named_steps["preprocessor"].transform(X_test)[i],
                     feature_names=all_features, show_feature_values=True)

Contribution?,Feature,Value
56.8,cons.price.idx,94.055
1.508,emp.var.rate,-1.7
0.536,euribor3m,0.767
0.286,cons.conf.idx,-39.8
0.243,contact__cellular,1.0
0.225,poutcome__success,1.0
0.122,day_of_week__wed,1.0
0.113,default__no,1.0
0.065,job__unknown,1.0
-0.004,pdays,3.0


In [32]:
gs = GridSearchCV(dt_model, {"model__max_depth": [3, 5, 7],
                             "model__min_samples_split": [2, 5]},
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_train, y_train)

  pid = os.fork()


In [33]:
print(gs.best_params_)
print(gs.best_score_)

{'model__max_depth': 5, 'model__min_samples_split': 2}
0.8509929442344253


In [34]:
dt_model.set_params(**gs.best_params_)

In [35]:
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)



In [36]:
accuracy_score(y_test, y_pred)

0.8553856113943514

In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92     10965
           1       0.41      0.62      0.49      1392

    accuracy                           0.86     12357
   macro avg       0.68      0.75      0.70     12357
weighted avg       0.89      0.86      0.87     12357



In [38]:
eli5.show_weights(dt_model.named_steps["model"], feature_names=all_features)

Weight,Feature
0.7088,nr.employed
0.1340,cons.conf.idx
0.0488,cons.price.idx
0.0338,pdays
0.0211,month__oct
0.0194,euribor3m
0.0125,default__unknown
0.0081,poutcome__failure
0.0045,contact__telephone
0.0042,campaign


In [39]:
eli5.show_prediction(dt_model.named_steps["model"],
                     dt_model.named_steps["preprocessor"].transform(X_test)[i],
                     feature_names=all_features, show_feature_values=True)

Contribution?,Feature,Value
0.5,<BIAS>,1.0
0.369,nr.employed,4991.6
0.083,pdays,3.0
0.008,day_of_week__mon,0.0
0.0,campaign,4.0


In [40]:
gs = GridSearchCV(rf_model, {"model__max_depth": [10, 15],
                             "model__min_samples_split": [5, 10]},
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_train, y_train)



In [41]:
print(gs.best_params_)
print(gs.best_score_)

{'model__max_depth': 15, 'model__min_samples_split': 5}
0.8753077676333841


In [42]:
rf_model.set_params(**gs.best_params_)

In [43]:
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)



In [44]:
accuracy_score(y_test, y_pred)

0.8793396455450352

In [45]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93     10965
           1       0.47      0.57      0.52      1392

    accuracy                           0.88     12357
   macro avg       0.71      0.75      0.72     12357
weighted avg       0.89      0.88      0.88     12357



In [46]:
eli5.show_weights(rf_model.named_steps["model"],
                  feature_names=all_features)

Weight,Feature
0.1309  ± 0.2456,euribor3m
0.1263  ± 0.2665,nr.employed
0.0909  ± 0.2327,emp.var.rate
0.0694  ± 0.0290,age
0.0495  ± 0.1114,cons.conf.idx
0.0441  ± 0.1371,pdays
0.0432  ± 0.0982,cons.price.idx
0.0405  ± 0.0147,campaign
0.0197  ± 0.0821,poutcome__success
0.0196  ± 0.0560,contact__cellular


In [47]:
gs = GridSearchCV(xgb_model, {"model__max_depth": [5, 10],
                              "model__min_child_weight": [5, 10],
                              "model__n_estimators": [25]},
                  n_jobs=-1, cv=5, scoring="accuracy")

gs.fit(X_train, y_train)



In [48]:
print(gs.best_params_)
print(gs.best_score_)
xgb_model.set_params(**gs.best_params_)
xgb_model.fit(X_train, y_train)

{'model__max_depth': 5, 'model__min_child_weight': 5, 'model__n_estimators': 25}
0.9002461587725588




In [49]:
y_pred = xgb_model.predict(X_test)

In [50]:
accuracy_score(y_test, y_pred)

0.9015133122926277

In [51]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95     10965
           1       0.69      0.23      0.34      1392

    accuracy                           0.90     12357
   macro avg       0.80      0.61      0.65     12357
weighted avg       0.88      0.90      0.88     12357



In [53]:
!pip install lime
from lime.lime_tabular import LimeTabularExplainer

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m174.1/275.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283835 sha256=8d0cdf89f5ce60a96024f4065ef3a7619acc6e7c5cab579fd701dbcd18e70e9c
  Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [54]:
categorical_names = {}
for col in cat_features:
    categorical_names[X_train.columns.get_loc(col)] = [new_col.split("__")[1]
                                                       for new_col in new_ohe_features
                                                       if new_col.split("__")[0] == col]

In [55]:
categorical_names

{1: ['admin.',
  'blue-collar',
  'entrepreneur',
  'housemaid',
  'management',
  'retired',
  'self-employed',
  'services',
  'student',
  'technician',
  'unemployed',
  'unknown'],
 2: ['divorced', 'married', 'single', 'unknown'],
 3: ['basic.4y',
  'basic.6y',
  'basic.9y',
  'high.school',
  'illiterate',
  'professional.course',
  'university.degree',
  'unknown'],
 4: ['no', 'unknown', 'yes'],
 5: ['no', 'unknown', 'yes'],
 6: ['no', 'unknown', 'yes'],
 7: ['cellular', 'telephone'],
 8: ['apr', 'aug', 'dec', 'jul', 'jun', 'mar', 'may', 'nov', 'oct', 'sep'],
 9: ['fri', 'mon', 'thu', 'tue', 'wed'],
 13: ['failure', 'nonexistent', 'success']}

In [58]:
i = 2
X_observation = X_test.iloc[[i], :]
X_observation

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
12077,35,technician,single,professional.course,no,no,no,telephone,jun,fri,1,999,0,nonexistent,1.4,94.465,-41.8,4.947,5228.1


In [59]:
print(f"""\
* True label: {y_test.iloc[i]}
* LR: {lr_model.predict_proba(X_observation)[0]}
* DT: {dt_model.predict_proba(X_observation)[0]}
* RF: {rf_model.predict_proba(X_observation)[0]}
* XGB: {xgb_model.predict_proba(X_observation)[0]}""")

* True label: 0
* LR: [0.71202722 0.28797278]
* DT: [0.75848014 0.24151986]
* RF: [0.8087215 0.1912785]
* XGB: [0.9572216  0.04277838]


In [63]:
from functools import partial

def custom_predict_proba(X, model):
    X_str = convert_to_lime_format(X, categorical_names, col_names=X_train.columns, invert=True)
    return model.predict_proba(X_str)

lr_predict_proba = partial(custom_predict_proba, model=lr_model)
dt_predict_proba = partial(custom_predict_proba, model=dt_model)
rf_predict_proba = partial(custom_predict_proba, model=rf_model)
xgb_predict_proba = partial(custom_predict_proba, model=xgb_model)