### Import Libraries

In [1]:
import pandas as pd
import joblib
import numpy as np

from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.impute import SimpleImputer

### Import Data

In [2]:
# Read the training set from disk
train = pd.read_csv('../data/readmissions_train.csv')

# Pull the target out of the frame first (pop removes the column from `train`),
# then build the feature matrix without the free-text diagnosis description.
y = train.pop('readmitted')
X = train.drop(columns=['diag_1_desc'])

### Define Preprocessing step per type of column

In [3]:
# Preprocessing for numerical features.
# Select every numeric dtype rather than only int64: ColumnTransformer drops
# any column that is not assigned to a transformer (remainder='drop' is the
# default), so float columns -- including integer columns that pandas reads as
# float64 because they contain missing values -- would otherwise be silently
# excluded from the model input and never reach the median imputer.
numeric_features = list(X.select_dtypes(include='number').columns)
numeric_transformer = Pipeline(steps=[
    # Fill missing values with the column median
    ('imputer', SimpleImputer(strategy='median')),
    # Standardize each feature
    ('scaler', StandardScaler())])

# Preprocessing for categorical (string) features
categorical_features = list(X.select_dtypes('object').columns)
categorical_transformer = Pipeline(steps=[
    # Treat missing values as their own explicit 'missing' category
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    # handle_unknown='ignore' lets scoring data contain categories unseen during fit
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Preprocessor that routes each column group to its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

### Fit the Preprocessing Pipeline

In [4]:
# Wrap the preprocessor in a single-step Pipeline (the full preprocessing pipeline)
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the pipeline on the training data, then transform that same data
preprocessed_sparse = pipeline.fit(X, y).transform(X)

# The model could also be trained on the sparse matrix directly. It is converted
# to a pandas DataFrame because the hook function in custom.py expects a pandas
# DataFrame to be used for scoring.
preprocessed = pd.DataFrame.sparse.from_spmatrix(preprocessed_sparse)

### Train XGBoost Classifier
Normally, the XGBoost classifier could be part of the final scikit-learn pipeline. I am opting to keep them separate in order to create a more complicated example with different pkl files for preprocessing and scoring.

In [5]:
# Train an XGBoost classifier with default hyperparameters on the preprocessed features
model = XGBClassifier()
model.fit(X=preprocessed, y=y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

### Save Custom Model files

In [6]:
# Persist the fitted preprocessing pipeline and the trained classifier as two
# separate pickle files inside the custom model directory.
joblib.dump(value=pipeline, filename='custom_model/preprocessing.pkl')
joblib.dump(value=model, filename='custom_model/model.pkl')

['custom_model/model.pkl']

In [7]:
# Run DRUM's `validation` command against the code in ./custom_model, using the
# test CSV as input and treating the problem as binary classification with
# class labels True (positive) and False (negative).
# NOTE(review): the repeated prediction tables in the output suggest the model is
# scored several times on modified copies of the input -- confirm against DRUM docs.
!drum validation --code-dir ./custom_model --input ../data/readmissions_test.csv --target-type binary --positive-class-label True --negative-class-label False

         True     False
0    0.632026  0.367974
1    0.800197  0.199803
2    0.457199  0.542801
3    0.540906  0.459094
4    0.820947  0.179053
..        ...       ...
495  0.691010  0.308990
496  0.561247  0.438753
497  0.527519  0.472481
498  0.508507  0.491493
499  0.281188  0.718812

[500 rows x 2 columns]
         True     False
0    0.571638  0.428362
1    0.800197  0.199803
2    0.457199  0.542801
3    0.540100  0.459900
4    0.812726  0.187274
..        ...       ...
495  0.752673  0.247327
496  0.561247  0.438753
497  0.501880  0.498120
498  0.507695  0.492305
499  0.268896  0.731104

[500 rows x 2 columns]
         True     False
0    0.624732  0.375268
1    0.800197  0.199803
2    0.457199  0.542801
3    0.575435  0.424564
4    0.805724  0.194276
..        ...       ...
495  0.634823  0.365177
496  0.464528  0.535472
497  0.509316  0.490684
498  0.507695  0.492305
499  0.285420  0.714580

[500 rows x 2 columns]
         True     False
0    0.714525  0.285475
1    0.785605  0

### Validate that the model can work as a `Custom Training Model`

In [8]:
# Run DRUM's `fit` command against the code in ./custom_model, using the
# training CSV as input with `readmitted` as the binary target and class labels
# True (positive) and False (negative).
!drum fit --code-dir ./custom_model --input ../data/readmissions_train.csv --target-type binary --target readmitted --positive-class-label True --negative-class-label False

Files were overwritten: {'/var/folders/v3/1cwf1zz90_nczrfzhvxwyjfr0000gp/T/tmpwz02c5nm/model.pkl', '/var/folders/v3/1cwf1zz90_nczrfzhvxwyjfr0000gp/T/tmpwz02c5nm/preprocessing.pkl'}
Validation Complete 🎉 Your model can be fit to your data,  and predictions can be made on the fit model! 
 You're ready to add it to DataRobot. 
