This is a minimal code to see how to save a final model, plus additional information, that can be used later by Streamlit

In [9]:

# This cell creates the final model, trained with the entire dataset (X, y)
# In a more complex case (like that of the assignment), the final model would be an entire pipeline (with preprocessing)
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import xgboost as xgb

TRAINING_DATA_PATH = Path('dataset', 'bank_06.pkl')
raw_data = pd.read_pickle(TRAINING_DATA_PATH)
RANDOM_STATE_ID = 100577770

First, we create the steps necessary for the preprocessing pipeline. Then we assemble the pipeline and fit our final model on the entire training dataset.

In [2]:
num_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'previous', 'pdays_duration', 'prev_contacted']
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

class PdaysTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.median_pdays = None
    
    def fit(self, X, y=None):
        known_pdays = X.loc[X['pdays'] != -1, 'pdays']
        self.median_pdays = known_pdays.median() if len(known_pdays) > 0 else 0
        return self
    
    def transform(self, X):
        X = X.copy()
        X['prev_contacted'] = (X['pdays'] != -1).astype(int)
        X['pdays_duration'] = X['pdays'].replace(-1, self.median_pdays)
        X.drop('pdays', axis=1, inplace=True)
        return X

# Column transformer for scaling and encoding
column_processor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    ]
)

best_params = {
    'max_depth' : 7, 
    'learning_rate': 0.0683962886025413,
    'n_estimators' : 181
}




In [10]:
full_pipeline = Pipeline([
    ('pdays_transform', PdaysTransformer()),
    ('preprocessor', column_processor),
    ('classifier', xgb.XGBClassifier(
        max_depth=best_params['max_depth'],
        learning_rate=best_params['learning_rate'],
        n_estimators=best_params['n_estimators'],
        random_state=RANDOM_STATE_ID
    ))
])

X = raw_data.drop("deposit", axis=1)
y = raw_data["deposit"].map({"yes": 1, "no": 0})

final_model = full_pipeline.fit(X, y)

In [11]:
# Compute additional data for the streamlit pack

num_cols=raw_data.select_dtypes(include=["number"]).columns.tolist()
num_summary = {}
for c in num_cols:
    s = raw_data[c].dropna()
    num_summary[c] = {
        "p1": float(np.percentile(s, 1)),
        "median": float(np.percentile(s, 50)),
        "p99": float(np.percentile(s, 99))
    }
print (num_summary)

cat_cols=raw_data.select_dtypes(exclude=["number"]).columns.tolist()
cat_options = {col: sorted(list(raw_data[col].dropna().astype(str).str.strip().unique()))
               for col in cat_cols}
print (cat_options)


{'age': {'p1': 22.0, 'median': 39.0, 'p99': 77.0}, 'balance': {'p1': -522.0, 'median': 549.5, 'p99': 13157.82000000004}, 'day': {'p1': 1.0, 'median': 15.0, 'p99': 31.0}, 'duration': {'p1': 14.0, 'median': 255.0, 'p99': 1576.0300000000007}, 'campaign': {'p1': 1.0, 'median': 2.0, 'p99': 13.010000000000218}, 'pdays': {'p1': -1.0, 'median': -1.0, 'p99': 426.0}, 'previous': {'p1': 0.0, 'median': 0.0, 'p99': 10.0}}
{'job': ['admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed', 'unknown'], 'marital': ['divorced', 'married', 'single'], 'education': ['primary', 'secondary', 'tertiary', 'unknown'], 'default': ['no', 'yes'], 'housing': ['no', 'yes'], 'loan': ['no', 'yes'], 'contact': ['cellular', 'telephone', 'unknown'], 'month': ['apr', 'aug', 'dec', 'feb', 'jan', 'jul', 'jun', 'mar', 'may', 'nov', 'oct', 'sep'], 'poutcome': ['failure', 'other', 'success', 'unknown'], 'deposit': ['no', 'yes']}


In [12]:
# Here, we pack and save into a joblib file, the final model, plus information about the numerical and categorical features
pack = {
      "model": final_model,
    "num_cols": num_cols,
    "cat_cols": cat_cols,
    "num_summary": num_summary,
    "cat_options": cat_options,
    "classes_": final_model.classes_.tolist(),
}

In [13]:
from joblib import dump
dump(pack, "pack_for_streamlit.joblib")


['pack_for_streamlit.joblib']