In [None]:
import pandas as pd
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
import numpy as np
%matplotlib inline

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve, auc

In [None]:
from category_encoders import OrdinalEncoder as oe
from catboost import CatBoostClassifier

In [None]:
from catboost import CatBoostClassifier
from catboost import Pool, cv
import optuna
import pandas_profiling as pp

In [None]:
# Feature table, indexed by respondent id.
train = pd.read_csv(
    'data/training_set_features.csv',
    index_col='respondent_id',
)

In [None]:
# Target table, indexed by respondent id (same index column as the features).
labels = pd.read_csv(
    'data/training_set_labels.csv',
    index_col='respondent_id',
)

In [None]:
# Column labels of every numeric feature in `train`.
num_cols = train.select_dtypes(include='number').columns

In [None]:
# Features handled as categorical columns.
cat_cols = [
    'race',
    'sex',
    'marital_status',
    'rent_or_own',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
]

In [None]:
# Features listed separately as ordinal columns.
ord_cols = [
    'age_group',
    'education',
    'income_poverty',
    'employment_status',
]

In [None]:
# Missing answers in the categorical/ordinal columns become the literal
# string 'None', i.e. their own explicit level.
for name in [*cat_cols, *ord_cols]:
    train[name] = train[name].fillna('None')

In [None]:
# Missing numeric answers are replaced by the sentinel value -1.
for numeric_name in num_cols:
    train[numeric_name] = train[numeric_name].fillna(-1)

In [None]:
# Hold out part of the data for evaluation; fixed seed for the shuffle.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    train,
    labels,
    random_state=10,
)

In [None]:
# Positional indices of every column whose dtype is not float64; these are
# handed to CatBoost as categorical features.
# `np.float` (an alias of the builtin float) was deprecated in NumPy 1.20 and
# removed in 1.24; `np.float64` performs the same dtype comparison.
categorical_features_indices = np.where(X_train.dtypes != np.float64)[0]
categorical_features_indices

In [None]:
# CatBoost Pool for the h1n1_vaccine target, built from the training split.
train_dataset = Pool(
    data=X_train,
    label=y_train['h1n1_vaccine'],
    cat_features=categorical_features_indices,
)

In [None]:
def objective(trial):
    """Optuna objective for the h1n1_vaccine CatBoost model.

    Draws one hyper-parameter configuration from ``trial``, runs 5-fold
    CatBoost cross-validation on ``train_dataset`` and scores the
    configuration by its best mean test AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameter values.

    Returns
    -------
    float
        Maximum of the ``test-AUC-mean`` column of the CV results.
    """
    param = {
        'iterations':trial.suggest_categorical('iterations', [100,200,300,500,1000,1200,1500]),
        'learning_rate':trial.suggest_float("learning_rate", 0.001, 0.3),
        'random_strength':trial.suggest_int("random_strength", 1,10),
        'bagging_temperature':trial.suggest_int("bagging_temperature", 0,10),
        'max_bin':trial.suggest_categorical('max_bin', [4,5,6,8,10,20,30]),
        'grow_policy':trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        # NOTE(review): CatBoost documents min_data_in_leaf as applying to the
        # Depthwise/Lossguide policies only - confirm behaviour with SymmetricTree.
        'min_data_in_leaf':trial.suggest_int("min_data_in_leaf", 1,10),
        'od_type' : "Iter",
        'od_wait' : 100,
        # The trial-side name is "max_depth" (not "depth"). The best trial's
        # params are unpacked directly into CatBoostClassifier later in the
        # notebook, so the sampled name is deliberately left unchanged.
        "depth": trial.suggest_int("max_depth", 2,10),
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform; the sampled distribution is unchanged.
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100, log=True),
        'one_hot_max_size':trial.suggest_categorical('one_hot_max_size', [5,10,12,100,500,1024]),
        'custom_metric' : ['AUC'],
        "loss_function": "Logloss",
        'auto_class_weights':trial.suggest_categorical('auto_class_weights', ['Balanced', 'SqrtBalanced']),
        }

    # NOTE(review): both od_wait=100 (in param) and early_stopping_rounds=10 are
    # supplied - confirm which patience value CatBoost actually applies.
    scores = cv(train_dataset,
            param,
            fold_count=5,
            early_stopping_rounds=10,
            plot=False, verbose=False)

    return scores['test-AUC-mean'].max()

In [None]:
# Seeded TPE sampler; the study maximises the value returned by `objective`.
sampler = optuna.samplers.TPESampler(seed=68)
study = optuna.create_study(
    sampler=sampler,
    direction="maximize",
)
study.optimize(objective, n_trials=5)

In [None]:
# Summarise the h1n1 study; `trial` (the best trial) is reused further down
# to configure the final model.
trial = study.best_trial
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
print("  Value: {}".format(trial.value))
print("  Params: ")
for param_name, param_value in trial.params.items():
    print("    {}={},".format(param_name, param_value))

In [None]:
# Classifier configured with the hyper-parameters sampled by the best trial.
final_model = CatBoostClassifier(
    verbose=False,
    cat_features=categorical_features_indices,
    **trial.params,
)

In [None]:
# Train on the training split against the h1n1_vaccine target.
final_model.fit(X_train, y=y_train['h1n1_vaccine'])

In [None]:
# Per-class probabilities for the held-out rows.
pred_h1n1 = final_model.predict_proba(
    X_test
)

In [None]:
# Keep only column 1 of the probability matrix, as an (n, 1) column vector.
# The name is rebound, so running this cell twice in a row fails on the
# second pass (the reshaped array no longer has a column 1).
pred_h1n1 = np.reshape(pred_h1n1[:, 1], (-1, 1))

In [None]:
# Hold-out ROC AUC for the h1n1 model.
roc_auc_score(
    y_true=y_test['h1n1_vaccine'],
    y_score=pred_h1n1,
)

In [None]:
# Confusion matrix of the h1n1 model on the held-out rows.
plot_confusion_matrix(
    final_model,
    X_test,
    y_test['h1n1_vaccine'],
)
plt.show();

In [None]:
# CatBoost Pool for the seasonal_vaccine target, built from the training split.
train_dataset_se = Pool(
    data=X_train,
    label=y_train['seasonal_vaccine'],
    cat_features=categorical_features_indices,
)

In [None]:
def objective2(trial):
    """Optuna objective for the seasonal_vaccine CatBoost model.

    Draws one hyper-parameter configuration from ``trial``, runs 5-fold
    CatBoost cross-validation on ``train_dataset_se`` and scores the
    configuration by its best mean test AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameter values.

    Returns
    -------
    float
        Maximum of the ``test-AUC-mean`` column of the CV results.
    """
    param = {
        'iterations':trial.suggest_categorical('iterations', [100,200,300,500,1000,1200,1500]),
        'learning_rate':trial.suggest_float("learning_rate", 0.001, 0.3),
        'random_strength':trial.suggest_int("random_strength", 1,10),
        'bagging_temperature':trial.suggest_int("bagging_temperature", 0,10),
        'max_bin':trial.suggest_categorical('max_bin', [4,5,6,8,10,20,30]),
        'grow_policy':trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        # NOTE(review): CatBoost documents min_data_in_leaf as applying to the
        # Depthwise/Lossguide policies only - confirm behaviour with SymmetricTree.
        'min_data_in_leaf':trial.suggest_int("min_data_in_leaf", 1,10),
        'od_type' : "Iter",
        'od_wait' : 100,
        # The trial-side name is "max_depth" (not "depth"). The best trial's
        # params are unpacked directly into CatBoostClassifier later in the
        # notebook, so the sampled name is deliberately left unchanged.
        "depth": trial.suggest_int("max_depth", 2,10),
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform; the sampled distribution is unchanged.
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100, log=True),
        'one_hot_max_size':trial.suggest_categorical('one_hot_max_size', [5,10,12,100,500,1024]),
        'custom_metric' : ['AUC'],
        "loss_function": "Logloss",
        'auto_class_weights':trial.suggest_categorical('auto_class_weights', ['Balanced', 'SqrtBalanced']),
        }

    # NOTE(review): both od_wait=100 (in param) and early_stopping_rounds=10 are
    # supplied - confirm which patience value CatBoost actually applies.
    scores = cv(train_dataset_se,
            param,
            fold_count=5,
            early_stopping_rounds=10,
            plot=False, verbose=False)

    return scores['test-AUC-mean'].max()

In [None]:
# Seeded TPE sampler; the study maximises the value returned by `objective2`.
sampler2 = optuna.samplers.TPESampler(seed=70)
study2 = optuna.create_study(
    sampler=sampler2,
    direction="maximize",
)
study2.optimize(objective2, n_trials=5)

In [None]:
# Summarise the seasonal_vaccine study (`study2`).
# This cell previously re-printed the h1n1 study and rebound `trial`, leaving
# `trial2` undefined even though the seasonal final model is built from
# `trial2.params`. The best seasonal trial is now stored under that name.
print("Number of finished trials: {}".format(len(study2.trials)))
print("Best trial:")
trial2 = study2.best_trial
print("  Value: {}".format(trial2.value))
print("  Params: ")
for key, value in trial2.params.items():
    print("    {}={},".format(key, value))

In [None]:
# Seasonal classifier configured from the hyper-parameters in `trial2`.
final_model_se = CatBoostClassifier(
    verbose=False,
    cat_features=categorical_features_indices,
    **trial2.params,
)

In [None]:
# Train on the training split against the seasonal_vaccine target.
final_model_se.fit(X_train, y=y_train['seasonal_vaccine'])

In [None]:
# Per-class probabilities for the held-out rows (seasonal model).
predictions_se = final_model_se.predict_proba(
    X_test
)

In [None]:
# Keep only column 1 of the probability matrix, as an (n, 1) column vector.
# The name is rebound, so running this cell twice in a row fails on the
# second pass (the reshaped array no longer has a column 1).
predictions_se = np.reshape(predictions_se[:, 1], (-1, 1))

In [None]:
# Hold-out ROC AUC for the seasonal model.
roc_auc_score(
    y_true=y_test['seasonal_vaccine'],
    y_score=predictions_se,
)

In [None]:
# Confusion matrix of the seasonal model on the held-out rows.
# The estimator must be `final_model_se`: the cell previously passed
# `final_model` (fitted on h1n1_vaccine) while scoring against the
# seasonal_vaccine labels, so the plot mixed two different targets.
plot_confusion_matrix(final_model_se, X_test, y_test.seasonal_vaccine)
plt.show();