# Import

In [1]:
TARGET = 'death_3year'

In [2]:
import numpy as np 
import pandas as pd 

import os
import yaml

import lightgbm as lgb

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Load

In [3]:
df_raw = pd.read_csv("dataset/processed_data.csv", dtype={'date_procedure_9': 'str', 'date_procedure_10': 'str'})
df_raw.head()

Unnamed: 0,sex,dob,age,race,education_level,zipcode,patient_city,patient_state,underlying_heart_disease,heart_disease,...,n_procedure_180d,n_procedure_1year,classe_meds_cardio_qtde,meds_cardiovasc_qtde,meds_antimicrobianos,nyha_basal_group,cied_final_group_1,procedure_type_new,hospital_stay,split
0,1,1966-01-23,44.6,1.0,2.0,3343010,SAO PAULO,35.0,1.0,0.0,...,0,0,3.0,6.0,3.0,1.0,1,2,1,train
1,1,1961-09-29,51.1,1.0,4.0,71505275,BRASILIA,53.0,1.0,0.0,...,0,0,,0.0,4.0,1.0,1,1,2,train
2,0,1945-08-09,60.1,1.0,4.0,71015068,BRASILIA,53.0,1.0,0.0,...,0,0,,0.0,0.0,1.0,1,2,1,train
3,1,1948-01-25,61.0,1.0,4.0,70753010,BRASILIA,53.0,2.0,0.0,...,0,0,,0.0,8.0,1.0,1,1,2,train
4,1,1934-10-06,69.1,1.0,999.0,70386110,BRASILIA,53.0,,,...,0,0,,,,,1,1,8,train


In [4]:
with open("auxiliar/significant_columns/categorical_{}.yaml".format(TARGET), "r") as stream:
    try:
        initial_cat_features = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)
        
with open("auxiliar/significant_columns/numerical_{}.yaml".format(TARGET), "r") as stream:
    try:
        initial_num_features = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        print(exc)
        
initial_features = initial_cat_features + initial_num_features

# Split

In [5]:
X_df = df_raw[initial_features]
y_df = df_raw[TARGET]

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df,
                                                    test_size = 0.3,
                                                    random_state = 0,
                                                    stratify = y_df)

In [6]:
print("Training features shape: ", X_train.shape)
print("Testing features shape: ", X_test.shape)

Training features shape:  (11036, 89)
Testing features shape:  (4730, 89)


# Baseline

In [10]:
N_FOLDS = 5
MAX_EVALS = 5

In [11]:
train_set = lgb.Dataset(data=X_train, label = y_train, free_raw_data = False)
test_set = lgb.Dataset(data = X_test, label = y_test, free_raw_data = False)

In [18]:
# Get default hyperparameters
model = lgb.LGBMClassifier()
default_params = model.get_params()

# Remove the number of estimators because we set this to 10000 in the cv call
del default_params['n_estimators']
del default_params['importance_type']
del default_params['silent']

# Cross validation with early stopping
cv_results = lgb.cv(default_params, train_set, num_boost_round = 10000,
                    callbacks=[lgb.early_stopping(stopping_rounds = 200)], 
                    metrics = 'auc', nfold = N_FOLDS, seed = 314)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4085
[LightGBM] [Info] Number of data points in the train set: 8828, number of used features: 87
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4085
[LightGBM] [Info] Number of data points in the train set: 8829, number of used features: 87
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4085
[LightGBM] [Info] Number of data points in the train set: 8829, number of used features: 87
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4085
[LightGBM] [Info] Number of data points in the train set: 8829, number of used features: 87
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [In

In [19]:
print('The maximum validation ROC AUC was: {:.5f} with a standard deviation of {:.5f}.'.format(cv_results['auc-mean'][-1], cv_results['auc-stdv'][-1]))
print('The optimal number of boosting rounds (estimators) was {}.'.format(len(cv_results['auc-mean'])))

The maximum validation ROC AUC was: 0.78203 with a standard deviation of 0.01587.
The optimal number of boosting rounds (estimators) was 23.


In [20]:
from sklearn.metrics import roc_auc_score

# Optimal number of esimators found in cv
model.n_estimators = len(cv_results['auc-mean'])
# model.n_estimators = 80

# Train and make predicions with model
model.fit(X_train, y_train)
preds = model.predict_proba(X_test)[:, 1]
baseline_auc = roc_auc_score(y_test, preds)

print('The baseline model scores {:.5f} ROC AUC on the test set.'.format(baseline_auc))

The baseline model scores 0.80835 ROC AUC on the test set.


# Hyperparameter Tuning

In [21]:
def objective(hyperparameters, iteration):
    """Objective function for grid and random search. Returns
       the cross validation score from a set of hyperparameters."""
    
    # Number of estimators will be found using early stopping
    if 'n_estimators' in hyperparameters.keys():
        del hyperparameters['n_estimators']
    
     # Perform n_folds cross validation
    cv_results = lgb.cv(hyperparameters, train_set, num_boost_round = 10000, nfold = N_FOLDS, 
                        callbacks=[lgb.early_stopping(stopping_rounds = 200)], metrics = 'auc', seed = 42)
    
    # results to retun
    score = cv_results['auc-mean'][-1]
    estimators = len(cv_results['auc-mean'])
    hyperparameters['n_estimators'] = estimators 
    
    return [score, hyperparameters, iteration]

In [42]:
import itertools
import random

def random_search(param_grid, max_evals = MAX_EVALS):
    """Random search for hyperparameter optimization"""
    
    # Dataframe for results
    results = pd.DataFrame(columns = ['score', 'hyperparameters', 'iteration'],
                                  index = list(range(MAX_EVALS)))
    
    # Keep searching until reach max evaluations
    for i in range(MAX_EVALS):
        
        # Choose random hyperparameters
        hyperparameters = {k: random.sample(v, 1)[0] for k, v in param_grid.items()}
        hyperparameters['subsample'] = 1.0 if hyperparameters['boosting_type'] == 'goss' else hyperparameters['subsample']

        # Evaluate randomly selected hyperparameters
        eval_results = objective(hyperparameters, i)
        
        results.loc[i, :] = eval_results
    
    # Sort with best score on top
    results.sort_values('score', ascending = False, inplace = True)
    results.reset_index(inplace = True)
    return results 

In [43]:
# Hyperparameter grid
param_grid = {
    'boosting_type': ['gbdt', 'goss', 'dart'],
    'num_leaves': list(range(20, 150)),
    'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5), base = 10, num = 1000)),
    'subsample_for_bin': list(range(20000, 300000, 20000)),
    'min_child_samples': list(range(20, 500, 5)),
    'reg_alpha': list(np.linspace(0, 1)),
    'reg_lambda': list(np.linspace(0, 1)),
    'colsample_bytree': list(np.linspace(0.6, 1, 10)),
    'subsample': list(np.linspace(0.5, 1, 100)),
    'is_unbalance': [True, False]
}

In [44]:
random_results = random_search(param_grid)

print('The best validation score was {:.5f}'.format(random_results.loc[0, 'score']))
print('\nThe best hyperparameters were:')

import pprint
pprint.pprint(random_results.loc[0, 'params'])

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4081
[LightGBM] [Info] Number of data points in the train set: 8828, number of used features: 85
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4081
[LightGBM] [Info] Number of data points in the train set: 8829, number of used features: 85
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4081
[LightGBM] [Info] Number of data points in the train set: 8829, number of used features: 85
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4081
[LightGBM] [Info] Number of data points in the train set: 8829, number of used features: 85
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4081
[LightGBM] [Info] Number of data points in the train set: 8829, number of used features: 85
[LightGBM] [Info























Early stopping, best iteration is:
[163]	cv_agg's auc: 0.795213 + 0.0226753
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4081
[LightGBM] [Info] Number of data points in the train set: 8828, number of used features: 85
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4081
[LightGBM] [Info] Number of data points in the train set: 8829, number of used features: 85
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4081
[LightGBM] [Info] Number of data points in the train set: 8829, number of used features: 85
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4081
[LightGBM] [Info] Number of data points in the train set: 8829, number of used features: 85
You can set `force_col_wise=true` to remove the overhead.
[L



























Early stopping, best iteration is:
[135]	cv_agg's auc: 0.794368 + 0.0213528
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4083
[LightGBM] [Info] Number of data points in the train set: 8828, number of used features: 86
[LightGBM] [Info] Using GOSS
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4083
[LightGBM] [Info] Number of data points in the train set: 8829, number of used features: 86
[LightGBM] [Info] Using GOSS
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4083
[LightGBM] [Info] Number of data points in the train set: 8829, number of used features: 86
[LightGBM] [Info] Using GOSS
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4083
[LightGBM] [Info] Number of data points in the train set: 8829, number of used features: 86
[LightGBM] [Info] Using GOSS
You can 























Early stopping, best iteration is:
[175]	cv_agg's auc: 0.796887 + 0.0194921
The best validation score was 0.79689

The best hyperparameters were:


KeyError: 'params'

In [None]:
# Get the best parameters
random_search_params = random_results.loc[0, 'hyperparameters']

# Create, train, test model
model = lgb.LGBMClassifier(**random_search_params, random_state = 42)
model.fit(X_train, y_train)

preds = model.predict_proba(X_test)[:, 1]

print('The best model from random search scores {:.5f} ROC AUC on the test set.'.format(roc_auc_score(y_test, preds)))

In [None]:
pd.options.display.max_colwidth = 100
random_results['hyperparameters'].values

In [45]:
def evaluate(results, name):
    """Evaluate model on test data using hyperparameters in results
       Return dataframe of hyperparameters"""
        
    # Sort with best values on top
    results = results.sort_values('score', ascending = False).reset_index(drop = True)
    
    # Print out cross validation high score
    print('The highest cross validation score from {} was {:.5f} found on iteration {}.'.format(name, results.loc[0, 'score'], results.loc[0, 'iteration']))
    
    # Use best hyperparameters to create a model
    hyperparameters = results.loc[0, 'hyperparameters']
    model = lgb.LGBMClassifier(**hyperparameters)
    
    # Train and make predictions
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_test)[:, 1]
    
    print('ROC AUC from {} on test data = {:.5f}.'.format(name, roc_auc_score(y_test, preds)))
    
    # Create dataframe of hyperparameters
    hyp_df = pd.DataFrame(columns = list(results.loc[0, 'hyperparameters'].keys()))

    # Iterate through each set of hyperparameters that were evaluated
    for i, hyp in enumerate(results['hyperparameters']):
        hyp_df = hyp_df.append(pd.DataFrame(hyp, index = [0]), 
                               ignore_index = True)
        
    # Put the iteration and score in the hyperparameter dataframe
    hyp_df['iteration'] = results['iteration']
    hyp_df['score'] = results['score']
    
    return hyp_df

In [46]:
random_hyp = evaluate(random_results, name = 'random search')

The highest cross validation score from random search was 0.79689 found on iteration 4.
ROC AUC from random search on test data = 0.81471.
