In [13]:
import numpy as np
import pandas as pd

In [28]:
train_df = pd.read_csv("data/bank-full.csv")
test_df = pd.read_csv("data/bank.csv")
train_df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [15]:
X_train = train_df.drop(columns=["y"])
y_train = train_df["y"]

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pp_obj = ColumnTransformer(
    transformers=[
        ("cat_cols", OneHotEncoder(), ["job", "marital", "education", "default", "housing", "loan", "contact", "day", "month", "poutcome"]),
        ("quant_cols", StandardScaler(), ["age", "balance", "day", "campaign", "pdays", "previous"])
    ],
    remainder = "passthrough"
)

pl_obj = Pipeline([
    ("preprocessor", pp_obj),
    ("classifier", RandomForestClassifier(criterion="entropy"))
])

hp_dict = {
    "classifier__max_depth": [2, 3, 4, 5],
    "classifier__min_samples_split": [9, 10, 11, 12]
}

gscv_obj = GridSearchCV(pl_obj, hp_dict, cv=5)
gscv_obj.fit(X_train, y_train)

In [22]:
gscv_obj.best_params_
# [4, 5, 6], [5, 6, 7] leads to 4 and 7
# [3, 4, 5], [6, 7, 8] leads to 4 and 8
# [3, 4, 5], [8, 9, 10] leads to 3 and 10
# [2, 3, 4, 5], [9, 10, 11, 12] leads to 3 and 12

{'classifier__max_depth': 3, 'classifier__min_samples_split': 12}

In [24]:
def to_binary(val):
    if val == "yes":
        return 1
    return 0

y_train_binary = y_train.apply(to_binary)

In [79]:
from xgboost import XGBClassifier

pp_obj2 = ColumnTransformer(
    transformers=[
        ("cat_cols", OneHotEncoder(), ["job", "marital", "education", "default", "housing", "loan", "contact", "day", "month", "poutcome"]),
        ("quant_cols", StandardScaler(), ["age", "balance", "day", "campaign", "pdays", "previous"])
    ],
    remainder = "passthrough"
)

pl_obj2 = Pipeline([
    ("preprocessor", pp_obj2),
    ("classifier", XGBClassifier())
])

hp_dict2 = {
    "classifier__max_depth": [2, 3, 4],
    "classifier__subsample": [0.5, 0.7, 0.9, 1]
}

gscv_obj2 = GridSearchCV(pl_obj2, hp_dict2, cv=5)
gscv_obj2.fit(X_train, y_train_binary)

In [80]:
gscv_obj2.best_params_

{'classifier__max_depth': 2, 'classifier__subsample': 0.9}

In [81]:
from sklearn.metrics import recall_score
X_test = test_df.drop(columns=["y"])
y_test = test_df["y"]

recall_score(y_test, gscv_obj.predict(X_test), average='macro')

0.5

In [82]:
y_test_binary = y_test.apply(to_binary)
recall_score(y_test_binary, pd.Series(gscv_obj2.predict(X_test)), average='macro')

0.6920839731285988