### Setup

In [None]:
# One-time environment setup: uncomment to install the packages below.
# !pip install kaggle --upgrade
# !pip install catboost
# !pip install shap

In [None]:
from setup import *

In [None]:
# Project root; the data files are read from paths built on top of it.
wkdir = '.../DATASCI-WiDS'

### Read data

In [None]:
# Labelled training set; the CSV's 'Unnamed: 0' column is dropped right after loading.
training = pd.read_csv(f"{wkdir}/data/TrainingWiDS2021.csv").drop(columns=['Unnamed: 0'])
# Data dictionary that ships alongside the data files.
data_dictionary = pd.read_csv(f"{wkdir}/data/DataDictionaryWiDS2021.csv")

# Unlabelled set to predict on, loaded the same way.
unlabeled = pd.read_csv(f"{wkdir}/data/UnlabeledWiDS2021.csv").drop(columns=['Unnamed: 0'])

### Data cleaning

In [None]:
# Let dabl infer a type for each column; the resulting table is used below to
# pick out the 'categorical', 'continuous' and 'useless' columns.
data_types = dabl.detect_types(training)

In [None]:
# Names of the columns flagged as categorical in `data_types`.
categoricals = data_types.loc[data_types['categorical'].eq(True)].index.tolist()
print("**categorical features**", categoricals, sep="\n")

In [None]:
# Names of the columns flagged as continuous in `data_types`; only a preview
# of the first ten is printed, followed by the total count.
continuous = data_types.loc[data_types['continuous'].eq(True)].index.tolist()
print("**first 10 continuous features**", continuous[:10], sep="\n")
print("**total continuous features**", len(continuous), sep="\n")

In [None]:
# Names of the columns flagged as useless in `data_types`, plus their count.
useless = data_types.loc[data_types['useless'].eq(True)].index.tolist()
print("**useless features**", useless, sep="\n")
print("**total useless features**", len(useless), sep="\n")

### Train-test-validate split of labelled data for parameter tuning

**define target** 

In [None]:
# Name of the label column that the model is trained to predict.
target = 'diabetes_mellitus'

**split data: train-validate-test**

In [None]:
# first split: hold out a stratified 20% of the labelled rows as the test set
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=random_state)
# n_splits=1, so the generator yields exactly one (train, test) pair of
# *positional* row indices -- hence .iloc rather than the label-based .loc.
train_index, test_index = next(split.split(training, training[target]))
strat_train_set = training.iloc[train_index]
strat_test_set = training.iloc[test_index]
# Flag the held-out rows on the full frame (matched by encounter_id).
training['test'] = training['encounter_id'].isin(strat_test_set['encounter_id'])
# .copy() makes each split an independent frame, so imputing missing values
# later neither writes through to `training` nor operates on a slice.
non_test = training.loc[~training['test']].copy()
test = training.loc[training['test']].copy()

# second split: carve a stratified 20% validation set out of the remaining rows
train, validate = (
    part.copy()
    for part in train_test_split(non_test, test_size=0.2, stratify=non_test[target], random_state=random_state)
)

In [None]:
# Report the (rows, columns) shape of each split.
for split_name, split_frame in (('train', train), ('test', test), ('validate', validate)):
    print(f"{split_name} : {split_frame.shape}")

**handle missing values separately within each split above**

In [None]:
%%time

## fill NA of numerics with median value
# Each split is imputed with its own per-column medians. The continuous columns
# are filled and assigned back in one step: fillna with a Series of medians
# matches on column names, and nothing relies on a chained in-place fillna
# (which is unreliable on frames derived from another DataFrame).
# The validate split previously received the frame-wide `validate.median()`
# as the fill value for a single column, which aligns on row labels and
# therefore filled nothing.
train[continuous] = train[continuous].fillna(train[continuous].median())
test[continuous] = test[continuous].fillna(test[continuous].median())
validate[continuous] = validate[continuous].fillna(validate[continuous].median())

In [None]:
## fill NAs of categoricals with most common values
categorical_nas = ['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source']
# The mode is computed on just these four columns (not on every column of the
# frame); .iloc[0] takes the first mode of each column, and fillna matches the
# resulting Series to the columns by name.
train[categorical_nas] = train[categorical_nas].fillna(train[categorical_nas].mode().iloc[0])
test[categorical_nas] = test[categorical_nas].fillna(test[categorical_nas].mode().iloc[0])
validate[categorical_nas] = validate[categorical_nas].fillna(validate[categorical_nas].mode().iloc[0])

In [None]:
# Show the distinct values seen in the training split for every categorical column.
print('unique values')
for cat_col in categoricals:
    print(f"{cat_col} : {train[cat_col].unique()}")

### Features

In [None]:
# Take the label out of the feature list. Rebuilding the list (rather than
# calling list.remove) keeps this cell safe to re-run: a second .remove()
# would raise ValueError because the target is already gone.
categoricals = [name for name in categoricals if name != target]
categoricals

In [None]:
# 'encounter_id' identifies rows (it is used to match rows and to key the
# submission), so it is excluded from the features. Rebuilding the list keeps
# the cell re-runnable, whereas a repeated list.remove would raise ValueError.
continuous = [name for name in continuous if name != 'encounter_id']

In [None]:
# Feature columns fed to the model: continuous columns first, then categoricals.
model_features = continuous + categoricals

## Model

In [None]:
# Feature matrix and label vector for each of the three splits.
X_train, y_train = train[model_features], train[target]
X_test, y_test = test[model_features], test[target]
X_validate, y_validate = validate[model_features], validate[target]
for feature_frame in (X_train, X_test, X_validate):
    print(feature_frame.shape)

### Hyperparameter tuning

#### Random search

In [None]:
# Search space for the random search: one list of candidate values per parameter.
grid = {
    'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.07, 0.1, 0.2, 0.3],
    'depth': list(range(4, 11)),
    'l2_leaf_reg': [3, 4, 5],
    'iterations': list(range(400, 701, 100)),
    'early_stopping_rounds': list(range(30, 61, 10)),
    'custom_metric': ['Logloss', 'AUC', 'Precision', 'Recall', 'F1', 'BalancedAccuracy'],
}

In [None]:
%%time

model = CatBoostClassifier()

# The search runs on the train + test rows; the validate split is left out of the pool.
pool_ds = pd.concat([X_train, X_test])
label_ds = pd.concat([y_train, y_test])
rs_pool = Pool(data=pool_ds, label=label_ds, cat_features=categoricals)

# Options for the random search, gathered in one place.
search_options = dict(
    y=None,  # labels are already supplied through rs_pool
    cv=3,
    n_iter=10,
    partition_random_seed=0,
    calc_cv_statistics=True,
    search_by_train_test_split=True,
    refit=True,  # refit best model
    shuffle=True,
    stratified=True,
    train_size=0.8,
    verbose=True,
)

model.randomized_search(grid, rs_pool, **search_options)

In [None]:
# get best params
# Parameters of the model as left by the random search above (run with
# refit=True); they are reused further down when retraining on all labelled data.
rs_best_params = model.get_params()
rs_best_params

In [None]:
def plot_feat_imp(model):
    """Draw a bar chart of the model's feature importances.

    `model` must provide ``get_feature_importance(prettified=True)``; its
    result is turned into a DataFrame whose "Importances" column supplies the
    bar lengths (x) and whose "Feature Id" column supplies the bar labels (y).
    The chart is drawn on a new 10x30 matplotlib figure; nothing is returned.
    """
    importances = pd.DataFrame(model.get_feature_importance(prettified=True))
    plt.figure(figsize=(10, 30))
    sns.barplot(data=importances, y="Feature Id", x="Importances")
    plt.title('CatBoost features importance:', fontsize=16, weight="bold")


plot_feat_imp(model)

### Check model's performance on unseen data

https://github.com/catboost/tutorials/blob/master/classification/classification_tutorial.ipynb

In [None]:
# Create the axes explicitly and pass them to the plotting call: without an
# `ax`, the ROC helper opens a figure of its own, leaving the 10x10 figure
# empty and the curve at the default size.
fig, ax = plt.subplots(figsize=(10, 10))
# NOTE(review): `metrics` is presumably sklearn.metrics (it comes from `setup`).
# plot_roc_curve was removed in scikit-learn 1.2, so use its replacement,
# RocCurveDisplay.from_estimator, whenever the installed version offers it.
roc_display_cls = getattr(metrics, 'RocCurveDisplay', None)
if hasattr(roc_display_cls, 'from_estimator'):
    roc_display_cls.from_estimator(model, X_validate, y_validate, ax=ax)
else:
    metrics.plot_roc_curve(model, X_validate, y_validate, ax=ax)
ax.set_title('ROC curve catBoost RS model', fontsize=14, weight="bold")
plt.show()

In [None]:
# Class predictions for the validation split, scored against its true labels.
y_pred = pd.DataFrame(model.predict(X_validate[model_features]))
y_true = y_validate
print(classification_report(y_true, y_pred))

### Retrain on all data

In [None]:
# NOTE(review): presumably the parameters selected by an earlier random-search
# run, kept here for reference only -- confirm.
# {'depth': 6,
#  'od_wait': 50,
#  'l2_leaf_reg': 4,
#  'iterations': 700,
#  'learning_rate': 0.05,
#  'custom_metric': 'BalancedAccuracy'}

In [None]:
%%time

## retrain on all data
# Stack the three splits back together so the final model is fit on every labelled row.
pool_ds = pd.concat([X_train, X_test, X_validate])
label_ds = pd.concat([y_train, y_test, y_validate])

# Fresh classifier configured with the parameters captured in rs_best_params.
model = CatBoostClassifier(**rs_best_params)

model.fit(pool_ds,
          label_ds,
          cat_features = categoricals,
          plot=True)

### Predict on unlabelled data

In [None]:
## fill NA of numerics with median value
# One vectorised pass per feature group, restricted to the columns the model
# uses. The earlier per-column loops never used their loop variable and re-ran
# the same whole-frame fillna on every iteration; a frame-wide median() also
# reaches non-numeric columns, which raises on recent pandas versions.
unlabeled[continuous] = unlabeled[continuous].fillna(unlabeled[continuous].median())
## fill NA of categoricals with mode
# .iloc[0] takes the first mode of each column; fillna matches it to columns by name.
unlabeled[categoricals] = unlabeled[categoricals].fillna(unlabeled[categoricals].mode().iloc[0])

In [None]:
# Second column of predict_proba's output (presumably the positive-class probability -- confirm).
predicted_probs = pd.DataFrame(model.predict_proba(unlabeled[model_features])[:, 1])
# Class labels predicted for the same rows.
predicted_labels = pd.DataFrame(model.predict(unlabeled[model_features]))

In [None]:
# Pair ids and probabilities by position. Assigning the predictions DataFrame
# directly as a column aligns on index labels and would silently produce NaN
# wherever the two indexes differ; building from plain arrays also raises if
# the lengths do not match. The result has a fresh 0..n-1 index.
result = pd.DataFrame({
    'encounter_id': unlabeled['encounter_id'].to_numpy(),
    target: predicted_probs.iloc[:, 0].to_numpy(),
})

In [None]:
# Write the submission file (encounter ids plus predicted values) without the index column.
result.to_csv('.../submissions/submission_catboost_RS_180221.csv', index=False)

### Submit to kaggle

In [None]:
# Upload the submission CSV through the kaggle command-line tool.
# NOTE(review): the CSV is written under a 'submissions/' directory, while the
# path given here has no such component -- confirm both point to the same file.
!kaggle competitions submit -c widsdatathon2021 -f '.../submission_catboost_RS_180221.csv' -m "catboost RS submission 180221"

In [None]:
from IPython.display import HTML

# Render a "code" button that shows/hides the notebook's input cells.
# The script toggles every `div.input` element through jQuery (`$`), and runs
# the toggle once when the document is ready, so inputs start out hidden.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="code"></form>''')