In [17]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [18]:
full_df = pd.read_csv("data/bank-full.csv") #test_df = pd.read_csv("data/bank.csv")
train_df, test_df = train_test_split(full_df, test_size=0.2)
train_df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
21599,33,management,married,tertiary,no,801,no,no,cellular,19,aug,133,4,-1,0,unknown,no
42724,38,unemployed,married,primary,no,7005,yes,no,cellular,25,jan,696,1,202,2,success,yes
17991,35,services,married,secondary,no,5,no,no,cellular,30,jul,153,2,-1,0,unknown,no
30876,49,blue-collar,married,primary,no,3103,no,no,cellular,9,feb,26,5,-1,0,unknown,no
17535,33,technician,married,tertiary,no,-482,yes,no,cellular,29,jul,90,3,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16810,26,services,married,secondary,no,199,no,no,cellular,24,jul,139,6,-1,0,unknown,no
16630,30,technician,married,secondary,no,13,yes,yes,cellular,24,jul,205,1,-1,0,unknown,no
18846,42,technician,married,secondary,no,0,no,no,cellular,4,aug,240,1,-1,0,unknown,no
25955,55,unemployed,married,secondary,no,4733,yes,no,cellular,19,nov,70,1,174,2,other,no


In [19]:
X_train = train_df.drop(columns=["y"])
y_train = train_df["y"]

X_test = test_df.drop(columns=["y"])
y_test = test_df["y"]

In [20]:
def to_binary(val):
    if val == "yes":
        return 1
    return 0

In [21]:
# Baseline Models
# logrc
# knn
# dtc
# rfc
# xgbc

In [22]:
pp_obj = ColumnTransformer(
    transformers=[
        ("cat_cols", OneHotEncoder(), ["job", "marital", "education", "default", "housing", "loan", "contact", "day", "month", "poutcome"]),
        ("quant_cols", StandardScaler(), ["age", "balance", "duration", "campaign", "pdays", "previous"])
    ],
    remainder = "passthrough"
)

pl_obj = Pipeline([
    ("preprocessor", pp_obj),
    ("classifier", LogisticRegression())
])

y_train_binary = y_train.apply(to_binary)
pl_obj.fit(X_train, y_train_binary)

train_recall_1 = recall_score(y_train_binary, pl_obj.predict(X_train))
y_test_binary = y_test.apply(to_binary)
test_recall_1 = recall_score(y_test_binary, pl_obj.predict(X_test))
[train_recall_1, test_recall_1]

[0.36658061487913635, 0.33949416342412453]

In [23]:
pp_obj = ColumnTransformer(
    transformers=[
        ("cat_cols", OneHotEncoder(), ["job", "marital", "education", "default", "housing", "loan", "contact", "day", "month", "poutcome"]),
        ("quant_cols", StandardScaler(), ["age", "balance", "duration", "campaign", "pdays", "previous"])
    ],
    remainder = "passthrough"
)

pl_obj = Pipeline([
    ("preprocessor", pp_obj),
    ("classifier", KNeighborsClassifier())
])

y_train_binary = y_train.apply(to_binary)
pl_obj.fit(X_train, y_train_binary)

train_recall_2 = recall_score(y_train_binary, pl_obj.predict(X_train))
y_test_binary = y_test.apply(to_binary)
test_recall_2 = recall_score(y_test_binary, pl_obj.predict(X_test))
[train_recall_2, test_recall_2]

[0.49002581553625907, 0.3433852140077821]

In [24]:
pp_obj = ColumnTransformer(
    transformers=[
        ("cat_cols", OneHotEncoder(), ["job", "marital", "education", "default", "housing", "loan", "contact", "day", "month", "poutcome"]),
        ("quant_cols", StandardScaler(), ["age", "balance", "duration", "campaign", "pdays", "previous"])
    ],
    remainder = "passthrough"
)

pl_obj = Pipeline([
    ("preprocessor", pp_obj),
    ("classifier", DecisionTreeClassifier())
])

y_train_binary = y_train.apply(to_binary)
pl_obj.fit(X_train, y_train_binary)

train_recall_3 = recall_score(y_train_binary, pl_obj.predict(X_train))
y_test_binary = y_test.apply(to_binary)
test_recall_3 = recall_score(y_test_binary, pl_obj.predict(X_test))
[train_recall_3, test_recall_3]

[1.0, 0.4805447470817121]

In [25]:
pp_obj = ColumnTransformer(
    transformers=[
        ("cat_cols", OneHotEncoder(), ["job", "marital", "education", "default", "housing", "loan", "contact", "day", "month", "poutcome"]),
        ("quant_cols", StandardScaler(), ["age", "balance", "duration", "campaign", "pdays", "previous"])
    ],
    remainder = "passthrough"
)

pl_obj = Pipeline([
    ("preprocessor", pp_obj),
    ("classifier", RandomForestClassifier())
])

y_train_binary = y_train.apply(to_binary)
pl_obj.fit(X_train, y_train_binary)

train_recall_4 = recall_score(y_train_binary, pl_obj.predict(X_train))
y_test_binary = y_test.apply(to_binary)
test_recall_4 = recall_score(y_test_binary, pl_obj.predict(X_test))
[train_recall_4, test_recall_4]
# In datasets with imbalanced class distributions, Random Forests may be biased toward the majority class (the negative one in our case), impacting the predictive performance for minority classes (the positive one).

[1.0, 0.3784046692607004]

In [26]:
pp_obj = ColumnTransformer(
    transformers=[
        ("cat_cols", OneHotEncoder(), ["job", "marital", "education", "default", "housing", "loan", "contact", "day", "month", "poutcome"]),
        ("quant_cols", StandardScaler(), ["age", "balance", "duration", "campaign", "pdays", "previous"])
    ],
    remainder = "passthrough"
)

pl_obj = Pipeline([
    ("preprocessor", pp_obj),
    ("classifier", XGBClassifier())
])

y_train_binary = y_train.apply(to_binary)
pl_obj.fit(X_train, y_train_binary)

train_recall_5 = recall_score(y_train_binary, pl_obj.predict(X_train))
y_test_binary = y_test.apply(to_binary)
test_recall_5 = recall_score(y_test_binary, pl_obj.predict(X_test))
[train_recall_5, test_recall_5]

[0.7033560197136822, 0.5]

In [32]:
recalls_df = pd.DataFrame(np.array([[train_recall_1, test_recall_1],
                                    [train_recall_2, test_recall_2],
                                    [train_recall_3, test_recall_3],
                                    [train_recall_4, test_recall_4],
                                    [train_recall_5, test_recall_5]]),
                          columns=["Training Recall", "Testing Recall"],
                          index=["Logistic Regression",
                                 "K-Nearest Neighbors",
                                 "Decision Tree",
                                 "Random Forest",
                                 "XGBoost"])
recalls_df

Unnamed: 0,Training Recall,Testing Recall
Logistic Regression,0.366581,0.339494
K-Nearest Neighbors,0.490026,0.343385
Decision Tree,1.0,0.480545
Random Forest,1.0,0.378405
XGBoost,0.703356,0.5


In [30]:
recalls_df = pd.DataFrame(np.array([[train_recall_1, test_recall_1],
                                    [train_recall_2, test_recall_2],
                                    [train_recall_3, test_recall_3],
                                    [train_recall_4, test_recall_4],
                                    [train_recall_5, test_recall_5]]),
                          columns=["training recall", "testing recall"],
                          index=["Logarithmic Regression",
                                 "K-Nearest Neighbors",
                                 "Decision Tree",
                                 "Random Forest",
                                 "XGBoost"])
recalls_df

Unnamed: 0,training recall,testing recall
Logarithmic Regression,0.366581,0.339494
K-Nearest Neighbors,0.490026,0.343385
Decision Tree,1.0,0.480545
Random Forest,1.0,0.378405
XGBoost,0.703356,0.5


In [10]:
# the above are baselines
# the below are model options

In [62]:
pp_obj = ColumnTransformer(
    transformers=[
        ("cat_cols", OneHotEncoder(), ["job", "marital", "education", "default", "housing", "loan", "contact", "day", "month", "poutcome"]),
        ("quant_cols", StandardScaler(), ["age", "balance", "duration", "campaign", "pdays", "previous"])
    ],
    remainder = "passthrough"
)

pl_obj = Pipeline([
    ("preprocessor", pp_obj),
    ("classifier", XGBClassifier(max_depth = 6))
])

y_train_binary = y_train.apply(to_binary)
pl_obj.fit(X_train, y_train_binary)

train_recall_5 = recall_score(y_train_binary, pl_obj.predict(X_train))
y_test_binary = y_test.apply(to_binary)
test_recall_5 = recall_score(y_test_binary, pl_obj.predict(X_test))
[train_recall_5, test_recall_5] # sps=8: 0.8132, 

[0.7033560197136822, 0.5]

In [80]:
pp_obj = ColumnTransformer(
    transformers=[
        ("cat_cols", OneHotEncoder(), ["job", "marital", "education", "default", "housing", "loan", "contact", "day", "month", "poutcome"]),
        ("quant_cols", StandardScaler(), ["age", "balance", "duration", "campaign", "pdays", "previous"])
    ],
    remainder = "passthrough"
)

pl_obj = Pipeline([
    ("preprocessor", pp_obj),
    ("classifier", XGBClassifier())
])

hp_dict = {
    "classifier__scale_pos_weight": [8, 9, 10],
    "classifier__max_depth": [4, 5, 6]
}

gscv_obj = GridSearchCV(pl_obj, hp_dict, cv=5)

def to_binary(val):
    if val == "yes":
        return 1
    return 0

y_train_binary = y_train.apply(to_binary)
gscv_obj.fit(X_train, y_train_binary)

train_recall = recall_score(y_train_binary, gscv_obj.predict(X_train))
y_test_binary = y_test.apply(to_binary)
test_recall = recall_score(y_test_binary, gscv_obj.predict(X_test))
[train_recall, test_recall]

[0.9767660173668153, 0.8132295719844358]

In [81]:
gscv_obj.best_params_

{'classifier__max_depth': 6, 'classifier__scale_pos_weight': 8}

In [14]:
# [0.7177650429799427, 0.4904632152588556]
# [0.983046800382044, 0.7965485921889192]
# [0.9756446991404012, 0.8201634877384196]

In [None]:
    "classifier__eta": [0.1],
    "classifier__gamma": [0.5],
    "classifier__max_depth": [10],
    "classifier__subsample": [0.5],

In [76]:
pp_obj = ColumnTransformer(
    transformers=[
        ("cat_cols", OneHotEncoder(), ["job", "marital", "education", "default", "housing", "loan", "contact", "day", "month", "poutcome"]),
    ],
    remainder = "passthrough"
)

pl_obj = Pipeline([
    ("preprocessor", pp_obj),
    ("classifier", DecisionTreeClassifier())
])

hp_dict = {
    "classifier__max_depth": [15],
}

gscv_obj = GridSearchCV(pl_obj, hp_dict, cv=5)

def to_binary(val):
    if val == "yes":
        return 1
    return 0

y_train_binary = y_train.apply(to_binary)
gscv_obj.fit(X_train, y_train_binary)

train_recall = recall_score(y_train_binary, gscv_obj.predict(X_train))
y_test_binary = y_test.apply(to_binary)
test_recall = recall_score(y_test_binary, gscv_obj.predict(X_test))
[train_recall, test_recall]

[0.7498813478879924, 0.5060465116279069]

In [77]:
gscv_obj.best_params_

{'classifier__max_depth': 15}