### Import Libraries

In [1]:
import pandas as pd
from catboost import CatBoostClassifier
import pickle

### Preprocessing

I will move the preprocessing steps below into `custom.py`.

In [2]:
# Load the training data; the relative path is resolved against the notebook's working directory.
train = pd.read_csv('../data/readmissions_train.csv')

#Preprocessing steps
def find_diabetes_text(txt):
    """Return 1 if ``txt`` mentions diabetes (case-insensitive), else 0.

    Parameters
    ----------
    txt : object
        A cell value. Only ``str`` values are searched; anything else
        (``None``, ``NaN`` for a missing value, numbers, ...) counts as
        "no mention".

    Returns
    -------
    int
        1 when the substring ``'diabetes'`` occurs in ``txt.lower()``,
        otherwise 0. The result is always an int, never ``None``.
    """
    # Missing values are not strings, so return 0 for them explicitly.
    # The previous version fell into a bare ``except`` that evaluated ``0``
    # without returning it, so those rows silently produced None.
    if not isinstance(txt, str):
        return 0
    return 1 if 'diabetes' in txt.lower() else 0

# Flag rows whose `diag_1_desc` text mentions "diabetes" (any letter case).
train['diabetes'] = train['diag_1_desc'].apply(find_diabetes_text)

# Object-dtype columns: replace missing values with the literal 'unknown'.
# `train.dtypes` is evaluated once, before any column is reassigned.
for col_name, col_dtype in train.dtypes.items():
    if col_dtype == object:
        train[col_name] = train[col_name].fillna('unknown')

# The free-text description is no longer needed once the flag exists; drop it,
# then fill every remaining missing value with 0.
train = train.drop(columns='diag_1_desc').fillna(0)

### Split Data into X,y

In [3]:
# Target is the 'readmitted' column; the features are every other column.
y = train['readmitted']
X = train.drop(columns=['readmitted'])

### Initiate Modeling

In [4]:
# Columns that CatBoost is told to treat as categorical.
cat_features = [
    'race', 'gender', 'age', 'weight',
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    'payer_code', 'medical_specialty',
    'diag_1', 'diag_2', 'diag_3',
    'max_glu_serum', 'A1Cresult',
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
    'tolazamide', 'examide', 'citoglipton', 'insulin',
    'glyburide_metformin', 'glipizide_metformin', 'glimepiride_pioglitazone',
    'metformin_rosiglitazone', 'metformin_pioglitazone',
    'change', 'diabetesMed',
]

# Small model: 2 boosting iterations, tree depth 2, log-loss objective.
model = CatBoostClassifier(
    iterations=2,
    depth=2,
    learning_rate=1,
    loss_function='Logloss',
    verbose=True,
)

# Train the model; as the last expression of the cell, the fitted model is displayed.
model.fit(X, y, cat_features=cat_features)

0:	learn: 0.6453099	total: 60.5ms	remaining: 60.5ms
1:	learn: 0.6333235	total: 70ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7ff5c0522e80>

### Save Model

In [5]:
# Serialize the fitted model to custom_model/model.pkl. The context manager
# makes sure the file handle is flushed and closed even if pickling raises;
# the previous inline open() call never closed it explicitly.
with open('custom_model/model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

### Verify custom model integrity with DRUM

In [6]:
# Run `drum validation` on the code in ./custom_model, using the test CSV as input
# and declaring a binary target with class labels True / False.
!drum validation --code-dir ./custom_model --input ../data/readmissions_test.csv --target-type binary --positive-class-label True --negative-class-label False

         True     False
0    0.735228  0.264772
1    0.735228  0.264772
2    0.419437  0.580563
3    0.591693  0.408307
4    0.735228  0.264772
..        ...       ...
495  0.591693  0.408307
496  0.591693  0.408307
497  0.486772  0.513228
498  0.419437  0.580563
499  0.419437  0.580563

[500 rows x 2 columns]
         True     False
0    0.735228  0.264772
1    0.735228  0.264772
2    0.419437  0.580563
3    0.591693  0.408307
4    0.735228  0.264772
..        ...       ...
495  0.591693  0.408307
496  0.591693  0.408307
497  0.486772  0.513228
498  0.419437  0.580563
499  0.419437  0.580563

[500 rows x 2 columns]
         True     False
0    0.735228  0.264772
1    0.735228  0.264772
2    0.419437  0.580563
3    0.591693  0.408307
4    0.735228  0.264772
..        ...       ...
495  0.591693  0.408307
496  0.591693  0.408307
497  0.486772  0.513228
498  0.419437  0.580563
499  0.419437  0.580563

[500 rows x 2 columns]
         True     False
0    0.539075  0.460925
1    0.539075  0

### Verify that the model can also be trained using DRUM
Alternatively, I could fit the model with DRUM and export the resulting model file.

In [8]:
# Run `drum fit` on the code in ./custom_model, training on the train CSV with
# `readmitted` as the binary target (class labels True / False).
!drum fit --code-dir ./custom_model --input ../data/readmissions_train.csv --target-type binary --target readmitted --positive-class-label True --negative-class-label False

0:	learn: 0.6455684	total: 69.1ms	remaining: 69.1ms
1:	learn: 0.6328794	total: 79.1ms	remaining: 0us
Files were overwritten: {'/var/folders/v3/1cwf1zz90_nczrfzhvxwyjfr0000gp/T/tmpzdjv_dob/model.pkl'}
Validation Complete 🎉 Your model can be fit to your data,  and predictions can be made on the fit model! 
 You're ready to add it to DataRobot. 
