This is a minimal example showing how to save a final model, plus additional information, so that it can be loaded later by a Streamlit app.

In [12]:

# This cell imports the dependencies and loads the training data for the final model, which is meant to be trained with the entire dataset (X, y)
# In a more complex case (like that of the assignment), the final model is an entire pipeline (with preprocessing) rather than a bare estimator
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from pathlib import Path
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
import pickle
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb

# Seed passed as `random_state` to the classifier so the model is reproducible.
RANDOM_STATE_ID = 100577770

# Pickled training data, located relative to the current working directory.
TRAINING_DATA_PATH = Path('dataset') / 'bank_06.pkl'

# NOTE: unpickling can execute arbitrary code — only load this file from a trusted source.
raw_data = pd.read_pickle(TRAINING_DATA_PATH)

In [15]:
# Numeric features handed to the scaler. 'pdays_duration' and 'prev_contacted' are not
# raw columns: PdaysTransformer derives them from 'pdays' earlier in the pipeline.
num_cols = [
    'age',
    'balance',
    'day',
    'duration',
    'campaign',
    'previous',
    'pdays_duration',
    'prev_contacted',
]

# Categorical features handed to the one-hot encoder.
cat_cols = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
]

class PdaysTransformer(BaseEstimator, TransformerMixin):
    """Replace the raw 'pdays' column with two derived features.

    The value -1 in 'pdays' is treated as "no known value". The transformer
    produces:

    * 'prev_contacted': 1 where 'pdays' != -1, otherwise 0.
    * 'pdays_duration': 'pdays' with every -1 replaced by the median of the
      known (!= -1) values seen during ``fit``.

    The original 'pdays' column is removed from the output.

    Attributes
    ----------
    median_pdays : float or None
        Median of the known 'pdays' values learned by ``fit`` (0 when the
        training data contains none). ``None`` until ``fit`` has been called.
    """

    def __init__(self):
        self.median_pdays = None

    def fit(self, X, y=None):
        """Learn the median of the known 'pdays' values.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; must contain a 'pdays' column.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        PdaysTransformer
            The fitted transformer (``self``).
        """
        known_pdays = X.loc[X['pdays'] != -1, 'pdays']
        # The median of an empty selection would be NaN, so fall back to 0.
        self.median_pdays = known_pdays.median() if len(known_pdays) > 0 else 0
        return self

    def transform(self, X):
        """Return a copy of ``X`` with 'pdays' replaced by the derived features.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to transform; must contain a 'pdays' column. It is not
            modified.

        Returns
        -------
        pandas.DataFrame
            New frame with 'prev_contacted' and 'pdays_duration' added and
            'pdays' dropped.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called yet.
        """
        if self.median_pdays is None:
            # Without this guard, replace(-1, None) silently produces
            # pandas-version-dependent results instead of failing.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This PdaysTransformer instance is not fitted yet. "
                "Call 'fit' before using 'transform'."
            )

        pdays = X['pdays']
        # assign() returns a new frame, so the caller's DataFrame is left untouched.
        return X.assign(
            prev_contacted=(pdays != -1).astype(int),
            pdays_duration=pdays.replace(-1, self.median_pdays),
        ).drop(columns='pdays')

# Preprocessing stage: standardise the numeric columns and one-hot encode the
# categorical ones. Categories unseen at fit time are ignored at transform time,
# and the encoder returns a dense array rather than a sparse matrix.
column_processor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols),
])

# Hyperparameters used for the final XGBoost classifier.
# NOTE(review): presumably the outcome of an earlier hyperparameter search that is
# not shown in this notebook — confirm their provenance.
best_params = dict(
    max_depth=7,
    learning_rate=0.0683962886025413,
    n_estimators=181,
)




In [16]:
# Final model: derive the pdays features, scale/encode the columns, then classify
# with XGBoost using the selected hyperparameters and the fixed seed.
# NOTE(review): this cell only constructs the pipeline. No `.fit(...)` call is visible
# in this notebook, so confirm the pipeline is trained before it is saved and used
# for prediction.
full_pipeline = Pipeline(steps=[
    ('pdays_transform', PdaysTransformer()),
    ('preprocessor', column_processor),
    ('classifier', xgb.XGBClassifier(
        random_state=RANDOM_STATE_ID,
        # Forward exactly these three entries of best_params as keyword arguments.
        **{name: best_params[name] for name in ('max_depth', 'learning_rate', 'n_estimators')},
    )),
])

In [17]:
# Here, we pack what will be saved into a joblib file: the final model, plus
# information about the numerical and categorical features.
# The feature lists are stored as copies so the pack does not alias the notebook's
# own lists. Note that num_cols holds the names produced by PdaysTransformer
# ('pdays_duration', 'prev_contacted'), not the raw 'pdays' column.
pack = {
    "pipeline": full_pipeline,
    "num_cols": list(num_cols),
    "cat_cols": list(cat_cols),
}

In [18]:
from joblib import dump

# Serialise the pack into the current working directory. The call is left as the
# cell's last expression so its return value (the list of written file names) is
# displayed as the cell output.
dump(value=pack, filename="pack_for_streamlit.joblib")


['pack_for_streamlit.joblib']