In [None]:
from joblib import load
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
import xgboost as xgb

# Fixed seed; passed as `random_state` to the XGBoost classifier in the pipeline below.
RANDOM_STATE_ID = 100577770


In [16]:
# Rebuild the same pipeline that was used in project 1.

# Columns routed to the numeric branch of the column transformer.
# 'pdays_duration' and 'prev_contacted' are not raw inputs: they are produced
# by PdaysTransformer from the original 'pdays' column.
num_cols = [
    'age',
    'balance',
    'day',
    'duration',
    'campaign',
    'previous',
    'pdays_duration',
    'prev_contacted',
]

# Columns routed to the categorical (one-hot) branch of the column transformer.
cat_cols = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
]

class PdaysTransformer(BaseEstimator, TransformerMixin):
    """Replace the raw ``pdays`` column with two derived features.

    ``pdays`` uses ``-1`` as a sentinel value. The transformer turns it into:

    * ``prev_contacted`` -- ``1`` where ``pdays != -1``, otherwise ``0``.
    * ``pdays_duration`` -- ``pdays`` with every ``-1`` replaced by the median
      of the non-sentinel values seen during ``fit`` (``0`` if there were
      none).

    The original ``pdays`` column is dropped from the output.
    """

    def __init__(self):
        # Learned in fit(); None means "not fitted yet".
        # NOTE(review): the name deliberately has no trailing underscore so
        # that instances pickled with the earlier definition still load --
        # presumably relevant because the file imports joblib.load; confirm.
        self.median_pdays = None

    def fit(self, X, y=None):
        """Learn the median of ``pdays`` over rows where it is not ``-1``.

        Parameters
        ----------
        X : DataFrame-like with a ``pdays`` column (indexed via ``.loc``).
        y : ignored; accepted for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        known_pdays = X.loc[X['pdays'] != -1, 'pdays']
        # Fall back to 0 when every row carries the sentinel.
        self.median_pdays = 0 if known_pdays.empty else known_pdays.median()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``pdays`` replaced by derived columns.

        Raises
        ------
        NotFittedError
            If called before ``fit``. Without this guard the sentinel would be
            replaced with ``None``, which pandas handles in a
            version-dependent way instead of failing loudly.
        """
        if getattr(self, 'median_pdays', None) is None:
            # Local import keeps this block self-contained; sklearn is already
            # a dependency of this file.
            from sklearn.exceptions import NotFittedError

            raise NotFittedError(
                "This PdaysTransformer instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )

        X = X.copy()  # never mutate the caller's frame
        X['prev_contacted'] = (X['pdays'] != -1).astype(int)
        X['pdays_duration'] = X['pdays'].replace(-1, self.median_pdays)
        return X.drop(columns='pdays')

# Transformer instance used as the first pipeline step.
pdaysTransformer = PdaysTransformer()

# Scale the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' is set on the encoder; sparse_output=False asks for
# a dense array rather than a sparse matrix.
column_processor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    (
        'cat',
        OneHotEncoder(handle_unknown='ignore', sparse_output=False),
        cat_cols,
    ),
])

# Classifier hyperparameters.
# NOTE(review): presumably the tuned values carried over from project 1 -- confirm.
best_params = dict(
    max_depth=7,
    learning_rate=0.0683962886025413,
    n_estimators=181,
)

# End-to-end pipeline: derive the pdays features, preprocess the columns,
# then classify with XGBoost using the fixed seed.
full_pipeline = Pipeline(steps=[
    ('pdays_transform', pdaysTransformer),
    ('preprocessor', column_processor),
    ('classifier', xgb.XGBClassifier(random_state=RANDOM_STATE_ID, **best_params)),
])