### Import Libraries

In [6]:
import pandas as pd
import joblib
import numpy as np

from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer

### Import Data

In [7]:
# Load the training set, then split it into the target series and the feature frame.
train = pd.read_csv('../data/readmissions_train.csv')

# Pop the target out of `train` first (this removes the column from `train`),
# then build the feature frame from the remaining columns minus `diag_1_desc`.
y = train.pop('readmitted')
X = train.drop(columns=['diag_1_desc'])

### Define Preprocessing step per type of column

In [3]:
#Preprocessing for numerical features
# Select every numeric dtype (integers and floats), not only int64. Columns that
# appear in neither feature list are dropped by ColumnTransformer's default
# remainder='drop', so float columns would otherwise be silently discarded.
numeric_features = list(X.select_dtypes(include='number').columns)
numeric_transformer = Pipeline(steps=[
    # Fill missing numeric values with the column median, then standardize.
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

#Preprocessing for categorical features
categorical_features = list(X.select_dtypes(include='object').columns)
categorical_transformer = Pipeline(steps=[
    # Missing categories become the literal level 'missing' before encoding.
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # handle_unknown='ignore' encodes categories unseen at fit time as all zeros
    # instead of raising at scoring time.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

#Preprocessor with all of the steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

### Fit the Preprocessing Pipeline

In [4]:
# Preprocessing-only pipeline; the classifier is trained separately further down.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the pipeline on the training features and push the same features through it.
preprocessed_matrix = pipeline.fit(X, y).transform(X)

# The model could also be trained on the sparse matrix directly. It is converted to a
# sparse pandas DataFrame because the hook function in custom.py expects a pandas
# DataFrame for scoring.
preprocessed = pd.DataFrame.sparse.from_spmatrix(preprocessed_matrix)

### Train XGBoost Classifier
Normally, the XGBoost classifier could be part of the final scikit-learn pipeline. I am opting to keep them separate in order to create a more complicated example with different pkl files for preprocessing and scoring.

In [7]:
# Train an XGBoost classifier with default hyperparameters on the preprocessed features.
model = XGBClassifier()
model.fit(X=preprocessed, y=y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

### Save Custom Model files

In [8]:
# Persist the fitted preprocessing pipeline and the trained model as separate
# pickle files inside the custom_model directory.
preprocessing_path = 'custom_model/preprocessing.pkl'
model_path = 'custom_model/model.pkl'

joblib.dump(pipeline, preprocessing_path)
joblib.dump(model, model_path)

['custom_model/model.pkl']

In [9]:
# Run the DRUM `validation` command against the code directory ./custom_model
# (where the two .pkl files were saved above), using the test CSV as input and
# declaring a binary target with class labels True / False.
!drum validation --code-dir ./custom_model --input ../data/readmissions_test.csv --target-type binary --positive-class-label True --negative-class-label False

         True     False
0    0.637696  0.362304
1    0.721781  0.278219
2    0.607439  0.392561
3    0.592136  0.407864
4    0.732156  0.267844
..        ...       ...
495  0.667187  0.332813
496  0.602306  0.397694
497  0.642599  0.357401
498  0.706146  0.293854
499  0.519434  0.480566

[500 rows x 2 columns]
         True     False
0    0.607502  0.392498
1    0.721781  0.278219
2    0.607439  0.392561
3    0.591732  0.408268
4    0.728045  0.271955
..        ...       ...
495  0.698019  0.301981
496  0.602306  0.397694
497  0.629780  0.370220
498  0.705740  0.294260
499  0.513288  0.486712

[500 rows x 2 columns]
         True     False
0    0.634049  0.365951
1    0.721781  0.278219
2    0.607439  0.392561
3    0.609400  0.390600
4    0.724545  0.275455
..        ...       ...
495  0.639094  0.360906
496  0.553947  0.446053
497  0.633498  0.366502
498  0.705740  0.294260
499  0.521550  0.478450

[500 rows x 2 columns]
         True     False
0    0.678945  0.321055
1    0.714485  0

### Validate model can work as `Custom Training Model`

In [16]:
# Run the DRUM `fit` command with the code in ./custom_model on the training CSV,
# using `readmitted` as the binary target with class labels True / False.
!drum fit --code-dir ./custom_model --input ../data/readmissions_train.csv --target-type binary --target readmitted --positive-class-label True --negative-class-label False

Files were overwritten: {'/var/folders/v3/1cwf1zz90_nczrfzhvxwyjfr0000gp/T/tmpee8j62jd/preprocessing.pkl', '/var/folders/v3/1cwf1zz90_nczrfzhvxwyjfr0000gp/T/tmpee8j62jd/model.pkl'}
Validation Complete 🎉 Your model can be fit to your data,  and predictions can be made on the fit model! 
 You're ready to add it to DataRobot. 
