### Setup

In [None]:
# !pip install kaggle --upgrade
# !pip install lightgbm
# !pip install catboost
# !pip install dabl
# !pip install plotly
# !pip install shap
# !pip install hyperopt

In [None]:
from tools.setup import *

In [None]:
import os

# Project root that contains the data/ and submissions/ folders. The default is
# the original location; set the WIDS_WKDIR environment variable to run the
# notebook from a different checkout without editing the code.
wkdir = os.environ.get('WIDS_WKDIR', '/home/jovyan/work/analysis/DATASCI-WiDS')
# Seed used by the stratified data splits further down.
random_state = 33

### Read data

In [None]:
# Labelled rows used for fitting and evaluation. 'Unnamed: 0' is dropped right
# after loading (presumably a row index saved into the CSV -- confirm).
training = (
    pd.read_csv(wkdir + "/data/TrainingWiDS2021.csv")
    .drop(columns=['Unnamed: 0'])
)
# Description of the columns; loaded for reference.
data_dictionary = pd.read_csv(wkdir + "/data/DataDictionaryWiDS2021.csv")

# Rows without a label; these are scored at the end to build the submission.
unlabeled = (
    pd.read_csv(wkdir + "/data/UnlabeledWiDS2021.csv")
    .drop(columns=['Unnamed: 0'])
)

In [None]:
# Run a garbage-collection pass now that the CSV files have been loaded.
gc.collect()

### Data cleaning

In [None]:
# Let dabl infer a type for every column of the training frame. The cells below
# read the 'categorical', 'continuous' and 'useless' flag columns of the result
# and use its index as the list of column names.
data_types = dabl.detect_types(training)

In [None]:
# Column names that dabl flagged as categorical.
categoricals = data_types.loc[data_types['categorical'].eq(True)].index.tolist()
print("**categorical features**", categoricals, sep="\n")

In [None]:
# Column names that dabl flagged as continuous; only a preview and the count
# are printed.
continuous = data_types.loc[data_types['continuous'].eq(True)].index.tolist()
print("**first 10 continuous features**", continuous[:10], sep="\n")
print("**total continuous features**", len(continuous), sep="\n")

In [None]:
# Column names that dabl flagged as useless, with their count.
useless = data_types.loc[data_types['useless'].eq(True)].index.tolist()
print("**useless features**", useless, sep="\n")
print("**total useless features**", len(useless), sep="\n")

### Train-test-validate split of labelled data for parameter tuning

**define target** 

In [None]:
# Name of the label column: used to stratify the splits and as the model target.
target = 'diabetes_mellitus'

**split data: train-validate-test**

In [None]:
# first split: hold out a stratified 20% of the labelled rows as the test set
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=random_state)
# split() yields *positional* row numbers, so they are looked up with .iloc;
# .loc only gives the same rows while the index is the default 0..n-1 range.
train_index, test_index = next(split.split(training, training[target]))
strat_train_set = training.iloc[train_index]
strat_test_set = training.iloc[test_index]

# Flag the held-out rows on the full frame by matching encounter_id.
training['test'] = training['encounter_id'].isin(strat_test_set['encounter_id'])
# Explicit copies, so that later in-place edits of these frames (imputation)
# work on independent data rather than on slices of `training`.
non_test = training.loc[~training['test']].copy()
test = training.loc[training['test']].copy()

# second split: carve a stratified 20% of the remaining rows out as validation
train, validate = train_test_split(non_test, test_size=0.2, stratify=non_test[target], random_state=random_state)

In [None]:
# Report (rows, columns) of each split, in the order train / test / validate.
for split_name, frame in (('train', train), ('test', test), ('validate', validate)):
    print(split_name + ' : ' + str(frame.shape))

**handle missing values separately within each split created above**

In [None]:
## fill NA of numerics with median value
# Each split is imputed with its own per-column medians. The previous version
# filled `validate` with validate.median() -- a Series indexed by column name --
# which does not line up with the row index of a single column, so the NaNs in
# `validate` were left in place. It also relied on chained `inplace=True`, which
# does not reliably write back to the parent frame.
train, test, validate = train.copy(), test.copy(), validate.copy()
for frame in (train, test, validate):
    medians = frame[continuous].median()  # one median per continuous column
    frame[continuous] = frame[continuous].fillna(medians)

In [None]:
## fill NAs of categoricals with most common values
categorical_nas = ['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source']
for frame in (train, test, validate):
    # Only the four listed columns are needed, so the mode is computed on those
    # instead of on every column of the frame. mode() may return several rows
    # when values tie; row 0 holds the first mode of each column.
    most_common = frame[categorical_nas].mode().iloc[0]
    frame[categorical_nas] = frame[categorical_nas].fillna(most_common)

# Show the levels of every categorical column after imputation (train split).
print('unique values')
for col in categoricals:
    print(col + ' : ' + str(train[col].unique()))

### Features

In [None]:
# Take the label out of the categorical feature list. The list is rebuilt
# instead of calling list.remove(), so the cell can be re-run safely: a second
# remove() raises ValueError once the target is already gone.
categoricals = [col for col in categoricals if col != target]
categoricals

In [None]:
# The encounter identifier is not a predictor. Rebuilding the list (rather than
# list.remove()) keeps the cell re-runnable: remove() raises ValueError when the
# entry has already been taken out.
continuous = [col for col in continuous if col != 'encounter_id']

In [None]:
# Full candidate feature list (continuous first, then categorical).
# NOTE(review): the model cells below select `continuous` only; within this
# notebook model_features is referenced solely from commented-out code.
model_features = continuous + categoricals

## Model

In [None]:
# Feature matrix (continuous columns only) and label vector for each split.
X_train, y_train = train[continuous], train[target]
X_test, y_test = test[continuous], test[target]
X_validate, y_validate = validate[continuous], validate[target]

# For every split: its name, the feature-matrix shape and the class balance.
for split_name, features, labels in (
    ('train', X_train, y_train),
    ('test', X_test, y_test),
    ('validate', X_validate, y_validate),
):
    print(split_name)
    print(features.shape)
    print(labels.value_counts())

### Hyperparameter tuning - hyperopt

In [None]:
import lightgbm as lgb

In [None]:
# Search space handed to hyperopt: the estimator class under 'model' and, under
# 'param', either a fixed value or a sampling distribution for each constructor
# argument.
grid_params = {'model': lgb.LGBMClassifier, 'param': {
    'class_weight': {0: 1, 1: hp.uniform('class_weight_1', 90, 450)},
    'min_sum_hessian_in_leaf': hp.uniform('min_sum_hessian_in_leaf', 0.0, 1.0),
    'max_bin': hp.choice('max_bin', np.arange(50, 750, 25, dtype=int)),
    'num_leaves': hp.choice('num_leaves', np.arange(4, 256, dtype=int)),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'subsample_for_bin': hp.choice('subsample_for_bin', np.arange(10000, X_train.shape[0], dtype=int)),
    'min_child_samples': hp.choice('min_child_samples', np.arange(20, 500, 5, dtype=int)),
    'is_unbalance': hp.choice('is_unbalance', np.array([True, False], dtype=bool)),
    # LightGBM documents 'colsample_bytree' as an alias of 'feature_fraction',
    # so sampling both made the search tune one extra dimension that never
    # reached the model. Only 'feature_fraction' is searched.
    'feature_fraction': hp.uniform('feature_fraction', 1 / X_train.shape[1], 1.0),
    'max_depth': hp.choice('max_depth', np.arange(5, 12, 1, dtype=int)),
    'lambda_l1': hp.uniform('lambda_l1', 0.0, 10.0),
    'lambda_l2': hp.uniform('lambda_l2', 0.0, 10.0),
    'bagging_fraction': hp.uniform('bagging_fraction', 1 / X_train.shape[0] * 10, 1.0),
    'bagging_freq': hp.choice('bagging_freq', np.arange(1, 11, 1, dtype=int)),
    'objective': 'binary',
    'boost_from_average': False,
    'boosting_type': hp.choice('boosting_type', np.array(['gbdt', 'dart'], dtype=str)),
    'n_estimators': hp.choice('n_estimators', np.arange(200, 5000, 50, dtype=int)),
    # Seed the estimator so that bagging / feature subsampling give the same
    # model for the same sampled configuration.
    'random_state': random_state,
}}

# One candidate model for now; further entries can be added to this list.
tested_models = [grid_params]

hp_space = hp.choice('classifier', tested_models)

In [None]:
# Trials object passed to fmin below, and the number of configurations to
# evaluate (passed as max_evals).
trials = Trials()
max_trials = 10

In [None]:
# Bookkeeping filled in by objective(): sampled configurations keyed by their
# loss, and the loss of every evaluation in call order.
hyperparameter_set = {}
loss_list = []

def objective(params):
    """Fit one sampled configuration and report its loss.

    ``params['model']`` is the estimator class and ``params['param']`` the
    keyword arguments it is built with. The estimator is fitted on
    ``X_train``/``y_train`` and scored with F1 on hard class predictions for
    ``X_test``/``y_test``. To optimise another metric (e.g. precision, or
    ROC AUC on ``predict_proba`` output) replace the scoring call.

    Side effects: stores ``params`` in ``hyperparameter_set`` under the loss
    (an equal loss from another call overwrites the entry), appends the loss to
    ``loss_list`` and prints it.

    Returns a dict with ``loss`` (1 minus the F1 score rounded to six
    decimals), the received ``params`` and ``status`` set to ``STATUS_OK``.
    """
    estimator = params['model'](**params['param'])
    estimator.fit(X_train, y_train)

    # hard class labels, as required by f1_score
    predicted_labels = estimator.predict(X_test)
    f1 = f1_score(y_true=y_test, y_pred=predicted_labels)

    loss = 1 - np.round(f1, decimals=6)
    loss_list.append(loss)
    hyperparameter_set[loss] = params
    print('Loss = ' + str(loss) + '\n')

    return {'loss': loss, 'params': params, 'status': STATUS_OK}

In [None]:
%%time
# Minimise objective() over hp_space with the TPE algorithm for max_trials
# evaluations; every evaluation is recorded in `trials`.
# NOTE(review): no random state is passed to fmin here, so the sampled
# configurations are presumably not reproducible between runs -- confirm.
best = fmin(fn = objective, 
            space = hp_space, 
            algo = tpe.suggest, 
            max_evals = max_trials, 
            trials = trials)

### Load the best model with the best hyperparameters

In [None]:
from hyperopt import space_eval
# Evaluate hp_space at the point returned by fmin and keep the 'param' entry,
# i.e. the keyword arguments of the best configuration found.
best_params_hyperopt = space_eval(hp_space, best)['param']
best_params_hyperopt

### Fit the model on train+test data 

In [None]:
# Stack the train and test splits (features and labels in the same row order)
# for the refit below.
X_train_test = pd.concat([X_train, X_test])
y_train_test = pd.concat([y_train, y_test])

In [None]:
%%time
model = lgb.LGBMClassifier( 
  bagging_fraction = 0.9278437351065486,
  bagging_freq = 9,
  boost_from_average = False,
  boosting_type =  'gbdt' ,
  class_weight = {0:1, 1:415.5324798320063},
  colsample_bytree = 0.6616924071455909,
  feature_fraction = 0.5057202614187002,
  is_unbalance = False,
  lambda_l1 = 1.1035610361541048,
  lambda_l2 = 9.287119306850947,
  learning_rate = 0.1984964279262592,
  max_bin = 350,
  max_depth = 10,
  min_child_samples = 480,
  min_sum_hessian_in_leaf = 0.9322887572635592,
  n_estimators = 3150,
  num_leaves = 55,
  objective =  'binary',
  subsample_for_bin = 48216)

model.fit(X_train_test, np.ravel(y_train_test), eval_set = (X_validate, np.ravel(y_validate)))

### Model evaluation on X_train_test/X_validate 

In [None]:
# Probability of the second class (column 1 of predict_proba) on the data the
# model was fitted on and on the validation split.
predictions_train =  model.predict_proba(X_train_test)[:, 1]
predictions_valid =  model.predict_proba(X_validate)[:, 1]

In [None]:
# Probability of the second class on the test split. X_test is part of
# X_train_test, which the model was fitted on, so these are in-sample scores.
predictions_test =  model.predict_proba(X_test)[:, 1]

**Confusion matrix**

In [None]:
gc.collect()
sns.set(rc={'figure.figsize': (20, 10)})

# Tick labels for classes 0 and 1, in that order.
CLASS_LABELS = ['No diabetes', 'Diabetes mellitus']


def _draw_confusion_matrix(ax, y_true, y_pred, title, cmap):
    """Draw the raw-count confusion matrix of y_true vs. y_pred on `ax`.

    Counts are annotated as whole numbers. Returns the matrix so the caller
    can inspect it.
    """
    matrix = confusion_matrix(y_true, y_pred)
    sns.heatmap(matrix, annot=True, ax=ax, fmt='.0f', cmap=cmap)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title, size=14)
    ax.xaxis.set_ticklabels(CLASS_LABELS)
    ax.yaxis.set_ticklabels(CLASS_LABELS)
    return matrix


fig, (ax_fitted, ax_validate) = plt.subplots(1, 2)

## cm train+test set (the data the model was fitted on) -- left panel
y_pred_train = model.predict(X_train_test)
_draw_confusion_matrix(ax_fitted, y_train_test, y_pred_train,
                       'Confusion Matrix Training set', 'viridis')

## cm validation set -- right panel
y_pred_validate = model.predict(X_validate)
cm = _draw_confusion_matrix(ax_validate, y_validate, y_pred_validate,
                            'Confusion Matrix Validation set', 'magma')

plt.show()

In [None]:
plt.style.use('ggplot')
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2; its
# replacement is RocCurveDisplay.from_estimator. Use the new API when it is
# available and fall back to the old function on older installations.
_roc_display = getattr(metrics, 'RocCurveDisplay', None)
if _roc_display is not None and hasattr(_roc_display, 'from_estimator'):
    _roc_display.from_estimator(model, X_validate, y_validate)
else:
    metrics.plot_roc_curve(model, X_validate, y_validate)
plt.title('ROC curve lightGBM model', fontsize=14, weight="bold")
plt.show()

In [None]:
from sklearn.metrics import classification_report
# Per-class text report of the hard class predictions on the validation split.
print(classification_report(y_validate, model.predict(X_validate)))

In [None]:
# Feature-importance chart of the fitted model; the figure is made large
# (30 x 30) because every feature gets its own bar.
lgb.plot_importance(model, figsize=(30, 30))
plt.show()

### Retrain on all data

In [None]:
# All labelled rows (train, test and validate, in that order) for the final fit.
X = pd.concat([X_train, X_test, X_validate])
y = pd.concat([y_train, y_test, y_validate])

In [None]:
%%time

# Refit the same estimator object (same hyperparameters) on every labelled row.
model.fit(X, y)

### Predict on unlabelled data

In [None]:
# Preview the model's input columns for the unlabelled rows, as loaded.
unlabeled[continuous].head()

In [None]:
# The model was fitted on median-imputed features, so the unlabelled rows are
# imputed too before scoring: remaining gaps are filled with the per-column
# medians of the fitted feature matrix X. Scoring the raw rows would feed the
# model NaNs it never saw while fitting.
unlabeled_features = unlabeled[continuous].fillna(X.median())
# Probability of the second class (column 1), one row per unlabelled encounter.
predicted_probs = pd.DataFrame(model.predict_proba(unlabeled_features)[:, 1])
# predicted_labels = pd.DataFrame(model.predict(unlabeled[model_features]))

In [None]:
# Submission frame: one row per unlabelled encounter.
result = unlabeled['encounter_id'].to_frame()
# predicted_probs has one row per row of `unlabeled`, in the same order, so the
# values are attached by position. Assigning the DataFrame itself would align
# on index labels and silently produce NaN if the two indexes ever differed.
result[target] = predicted_probs.iloc[:, 0].to_numpy()

In [None]:
# Replace the index with a fresh 0..n-1 range (the old index is discarded).
result = result.reset_index(drop=True)

In [None]:
# Display the submission frame for a visual check before writing it out.
result

In [None]:
# Write the submission under the same project root that the data was read from
# (wkdir) instead of repeating the absolute path; the index is not written.
result.to_csv(wkdir + '/submissions/submission_lgb_hyperopt_19022021.csv', index=False)

### Submit to kaggle

In [None]:
# !pip install kaggle --upgrade

In [None]:
!kaggle competitions submit -c widsdatathon2021 -f '/home/jovyan/work/analysis/DATASCI-WiDS/submissions/submission_lgb_hyperopt_19022021.csv' -m "lightGBM hyperopt nums"

In [None]:
from IPython.display import HTML

# Render a small script and a "code." button into the notebook output. The
# script defines code_toggle(), which hides or shows the elements matched by
# $('div.input') and flips the code_show flag; it is called once when the
# document is ready and again on every button press.
# NOTE(review): relies on `$` (presumably jQuery) being available in the page
# that displays the notebook -- confirm for the target front end.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="code."></form>''')