In [52]:
import pandas as pd
# pre-processing pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split



In [53]:
df = pd.read_csv("data/bank-full.csv", sep=";")

In [54]:
## df.shape
## df.columns
df.head

<bound method NDFrame.head of        age           job   marital  education default  balance housing loan  \
0       58    management   married   tertiary      no     2143     yes   no   
1       44    technician    single  secondary      no       29     yes   no   
2       33  entrepreneur   married  secondary      no        2     yes  yes   
3       47   blue-collar   married    unknown      no     1506     yes   no   
4       33       unknown    single    unknown      no        1      no   no   
...    ...           ...       ...        ...     ...      ...     ...  ...   
45206   51    technician   married   tertiary      no      825      no   no   
45207   71       retired  divorced    primary      no     1729      no   no   
45208   72       retired   married  secondary      no     5715      no   no   
45209   57   blue-collar   married  secondary      no      668      no   no   
45210   37  entrepreneur   married  secondary      no     2971      no   no   

         contact  day

In [55]:
df['y'].value_counts()

y
no     39922
yes     5289
Name: count, dtype: int64

In [66]:
## Step 3 : Encoding the Target

# Run this code only once. Or run the whole code from beginning. 
# Or else the encoding fails as yes and no are already substituted as 1 and 0 and re-running maps them to NaN
# Which effectively lends to fail the train-test split data

df['y'] = df['y'].map({'yes': 1, 'no': 0})

In [57]:
df['y'].value_counts()

y
0    39922
1     5289
Name: count, dtype: int64

In [58]:
# Feature Groups
numeric_features = [
    'age', 'balance', 'day', 'duration',
    'campaign', 'pdays', 'previous'
]

categorical_features = [
    'job', 'marital', 'education', 'default',
    'housing', 'loan', 'contact', 'month', 'poutcome'
]

In [59]:
# Building Pre-processing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

In [60]:
df['y'].isna().sum()

np.int64(0)

In [61]:
# Train-Test Split

X = df.drop('y', axis=1)
y = df['y']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [63]:
# Baseline Model - Logistic Regression
from sklearn.linear_model import LogisticRegression

logreg_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000))
    ]
)

In [64]:
logreg_pipeline.fit(X_train, y_train)

In [67]:
# Evaluate
from sklearn.metrics import (
    accuracy_score, roc_auc_score,
    precision_score, recall_score,
    f1_score, matthews_corrcoef
)

y_pred = logreg_pipeline.predict(X_test)
y_prob = logreg_pipeline.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_prob))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("MCC:", matthews_corrcoef(y_test, y_pred))

Accuracy: 0.901249585314608
AUC: 0.9055740146044154
Precision: 0.6444833625218914
Recall: 0.34782608695652173
F1: 0.4518109269490485
MCC: 0.42605817794513523


In [69]:
# Create re-usable Evaluate function

from sklearn.metrics import (
    accuracy_score, roc_auc_score,
    precision_score, recall_score,
    f1_score, matthews_corrcoef
)

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred)
    }

In [70]:
# Logistic regression

evaluate_model(logreg_pipeline, X_test, y_test)

{'Accuracy': 0.901249585314608,
 'AUC': np.float64(0.9055740146044154),
 'Precision': 0.6444833625218914,
 'Recall': 0.34782608695652173,
 'F1': 0.4518109269490485,
 'MCC': np.float64(0.42605817794513523)}

In [73]:
# Decision Tree

# Step - 1 Build the decision tree pipeline
from sklearn.tree import DecisionTreeClassifier

dt_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(
            random_state=42
        ))
    ]
)

# Step - 2 Train the model
dt_pipeline.fit(X_train, y_train)

# Step - 3 Evaluate
dt_results = evaluate_model(dt_pipeline, X_test, y_test)
dt_results

{'Accuracy': 0.8728298131151166,
 'AUC': np.float64(0.7008652802454508),
 'Precision': 0.4581818181818182,
 'Recall': 0.4763705103969754,
 'F1': 0.46709916589434664,
 'MCC': np.float64(0.39502678202261815)}