In [21]:
from joblib import load
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
import xgboost as xgb
from pathlib import Path
import pandas as pd

TRAINING_DATA_PATH = Path('dataset', 'bank_06.pkl')
raw_data = pd.read_pickle(TRAINING_DATA_PATH)
RANDOM_STATE_ID = 100577770


In [24]:
# create identical pipeline from project 1

num_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'previous', 'pdays_duration', 'prev_contacted']
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

class PdaysTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.median_pdays = None
    
    def fit(self, X, y=None):
        known_pdays = X.loc[X['pdays'] != -1, 'pdays']
        self.median_pdays = known_pdays.median() if len(known_pdays) > 0 else 0
        return self
    
    def transform(self, X):
        X = X.copy()
        X['prev_contacted'] = (X['pdays'] != -1).astype(int)
        X['pdays_duration'] = X['pdays'].replace(-1, self.median_pdays)
        X.drop('pdays', axis=1, inplace=True)
        return X

# Column transformer for scaling and encoding
column_processor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    ]
)

best_params = {
    'max_depth' : 7, 
    'learning_rate': 0.0683962886025413,
    'n_estimators' : 181
}

pipeline = [
    ('pdays_transform', PdaysTransformer),
    ('preprocessor', column_processor),
    ('classifier', xgb.XGBClassifier(
        max_depth=best_params['max_depth'],
        learning_rate=best_params['learning_rate'],
        n_estimators=best_params['n_estimators'],
        random_state=RANDOM_STATE_ID
    ))
]

Define our full pipeline without feature selection, identital to assignment #1. Containing our preprocessing in PdaysTransformer and column_processor, and our model XGBoost in xgb.XGBClassifier.

Next we create our feature selection transformers using F-score and Mutual Information methods, then it to the pipeline.

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
import copy

# create feature selection estimators
f_score_selector = SelectKBest(score_func = f_classif)
mutual_info_selector = SelectKBest(score_func = mutual_info_classif)

# create deep copies for each selector type
pipeline_f_score = copy.deepcopy(pipeline)
pipeline_f_score.insert(2, ('f_classif', f_score_selector))
f_score_pipe = Pipeline(pipeline_f_score)

pipeline_mutual_info = copy.deepcopy(pipeline)
pipeline_mutual_info.insert(2, ('mutual_info_classif', mutual_info_selector))
mutual_info_pipe = Pipeline(pipeline_mutual_info)

In [17]:
len(raw_data.columns)

17

In [26]:
# use grid search to find optimal k-features to select
from sklearn.model_selection import GridSearchCV

k_range = {"k": list(range(1,10))}

f_score_grid = GridSearchCV(
    f_score_pipe,
    k_range,
    n_jobs=1,
    verbose=1
)

# split data to be trained
X = raw_data.drop("deposit", axis=1)
y = raw_data["deposit"]

f_score_grid.fit(X, y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


TypeError: BaseEstimator.get_params() missing 1 required positional argument: 'self'