<h1>Flu Shot Learning: Predict H1N1 and Seasonal Flu Vaccines</h1>

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import sweetviz as sv
import ydata_profiling as yp
%matplotlib inline

In [2]:
pip install --upgrade ydata_profiling

Requirement already up-to-date: ydata_profiling in /config/.local/lib/python3.8/site-packages (4.6.4)
Note: you may need to restart the kernel to use updated packages.


## Load data

In [3]:
# Training features; respondent_id becomes the row index.
train = pd.read_csv(
    '/config/workspace/dataset/training_set_features.csv',
    index_col='respondent_id',
)

In [4]:
# Training labels; respondent_id becomes the row index.
labels = pd.read_csv(
    '/config/workspace/dataset/training_set_labels.csv',
    index_col='respondent_id',
)

In [5]:
# Unlabelled test features; respondent_id becomes the row index.
test = pd.read_csv(
    '/config/workspace/dataset/test_set_features.csv',
    index_col='respondent_id',
)

### Imputation strategy

In [6]:
# Names of the numeric columns, detected by dtype on the training frame.
num_cols = train.select_dtypes(include='number').columns

In [7]:
# Columns the notebook treats as categorical features.
cat_cols = [
    'race',
    'sex',
    'marital_status',
    'rent_or_own',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
]

In [8]:
# Columns kept in a separate "ordinal" list; the imputation cells handle them
# together with cat_cols.
ord_cols = [
    'age_group',
    'education',
    'income_poverty',
    'employment_status',
]

In [9]:
# Sanity check: the three column lists together have as many entries as
# `train` has columns (a count check only, not a name-by-name comparison).
assert train.shape[1] == len(num_cols) + len(cat_cols) + len(ord_cols)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from category_encoders import OrdinalEncoder as oe
from catboost import CatBoostClassifier
from catboost import Pool, cv
from sklearn.metrics import roc_curve, roc_auc_score
import optuna

#### Impute train

In [11]:
# Missing values in the categorical and ordinal columns of `train` become the
# literal string 'None'.
train[cat_cols + ord_cols] = train[cat_cols + ord_cols].fillna(value='None')

In [12]:
# Missing values in the numeric columns of `train` are replaced by the
# sentinel value -1.
train[num_cols] = train[num_cols].fillna(value=-1)

#### Impute test

In [13]:
# Same treatment as the training frame: missing categorical/ordinal values in
# `test` become the literal string 'None'.
test[cat_cols + ord_cols] = test[cat_cols + ord_cols].fillna(value='None')

In [14]:
# Same treatment as the training frame: missing numeric values in `test` are
# replaced by -1 (num_cols was derived from `train`).
test[num_cols] = test[num_cols].fillna(value=-1)

In [15]:
### Train test split

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 30% of the labelled rows for evaluation; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    labels,
    test_size=0.3,
    random_state=68,
)

In [18]:
# Positional indices of the non-numeric columns, which are passed to CatBoost
# as categorical features.
# Selecting "not numeric" (mirroring how num_cols is built with select_dtypes)
# instead of "dtype != float" keeps an integer-typed numeric column from being
# flagged as categorical. The indices come back in column order.
categorical_features_indices = X_train.columns.get_indexer(
    X_train.select_dtypes(exclude='number').columns
)

In [19]:
# CatBoost dataset for the H1N1 target, with the categorical column positions
# declared explicitly.
train_dataset = Pool(
    data=X_train,
    label=y_train['h1n1_vaccine'],
    cat_features=categorical_features_indices,
)

In [20]:
def objective(trial):
    """Optuna objective for the H1N1 model.

    Samples one CatBoost configuration from ``trial``, evaluates it with
    5-fold ``cv`` on ``train_dataset`` and returns the best mean test AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Maximum of the ``'test-AUC-mean'`` column returned by ``cv``.
    """
    # NOTE: the order of the suggest_* calls is kept as-is so the sequence of
    # draws requested from the seeded sampler does not change.
    param = {
        'iterations': trial.suggest_categorical('iterations', [100, 200, 300, 500, 1000, 1200, 1500]),
        'learning_rate': trial.suggest_float("learning_rate", 0.001, 0.3),
        'random_strength': trial.suggest_int("random_strength", 1, 10),
        'bagging_temperature': trial.suggest_int("bagging_temperature", 0, 10),
        'max_bin': trial.suggest_categorical('max_bin', [4, 5, 6, 8, 10, 20, 30]),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'min_data_in_leaf': trial.suggest_int("min_data_in_leaf", 1, 10),
        'od_type': "Iter",
        'od_wait': 100,
        # Registered with Optuna under the name "max_depth", so that is the
        # key that shows up in trial.params; cv() receives it as "depth".
        "depth": trial.suggest_int("max_depth", 2, 10),
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform and samples from the same log-uniform range.
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100, log=True),
        'one_hot_max_size': trial.suggest_categorical('one_hot_max_size', [5, 10, 12, 100, 500, 1024]),
        'custom_metric': ['AUC'],
        "loss_function": "Logloss",
        'auto_class_weights': trial.suggest_categorical('auto_class_weights', ['Balanced', 'SqrtBalanced']),
    }

    # NOTE(review): 'od_wait': 100 above and early_stopping_rounds=10 here both
    # configure early-stopping patience; confirm which one cv() applies.
    scores = cv(train_dataset,
                param,
                fold_count=5,
                early_stopping_rounds=10,
                plot=False, verbose=False)

    return scores['test-AUC-mean'].max()

In [21]:
# Seeded TPE sampler so the sampler behaves deterministically.
sampler = optuna.samplers.TPESampler(seed=68)
study = optuna.create_study(sampler=sampler, direction="maximize")
# Search the H1N1 objective for 100 trials.
study.optimize(objective, n_trials=100)

[32m[I 2024-02-23 02:55:59,226][0m A new study created in memory with name: no-name-7b9a0f52-d209-4943-8036-66af6d57c93f[0m


Training on fold [0/5]

bestTest = 0.4315445253
bestIteration = 299

Training on fold [1/5]

bestTest = 0.4274361446
bestIteration = 299

Training on fold [2/5]

bestTest = 0.4384665258
bestIteration = 299

Training on fold [3/5]

bestTest = 0.4322242789
bestIteration = 299

Training on fold [4/5]


[32m[I 2024-02-23 02:59:10,905][0m Trial 0 finished with value: 0.863497321698375 and parameters: {'iterations': 300, 'learning_rate': 0.013964954297408176, 'random_strength': 1, 'bagging_temperature': 8, 'max_bin': 10, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 4, 'max_depth': 5, 'l2_leaf_reg': 21.328495943450676, 'one_hot_max_size': 100, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 0 with value: 0.863497321698375.[0m



bestTest = 0.4350673967
bestIteration = 299

Training on fold [0/5]

bestTest = 0.4249302434
bestIteration = 110

Training on fold [1/5]

bestTest = 0.4211924471
bestIteration = 128

Training on fold [2/5]

bestTest = 0.4320824886
bestIteration = 125

Training on fold [3/5]

bestTest = 0.4278454245
bestIteration = 116

Training on fold [4/5]


[32m[I 2024-02-23 03:00:31,489][0m Trial 1 finished with value: 0.863269013814264 and parameters: {'iterations': 1200, 'learning_rate': 0.11477165079768124, 'random_strength': 9, 'bagging_temperature': 6, 'max_bin': 5, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 5, 'max_depth': 7, 'l2_leaf_reg': 0.5714362138520529, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 0 with value: 0.863497321698375.[0m



bestTest = 0.4300049989
bestIteration = 128

Training on fold [0/5]

bestTest = 0.4753784322
bestIteration = 129

Training on fold [1/5]

bestTest = 0.4516162867
bestIteration = 235

Training on fold [2/5]

bestTest = 0.4585822194
bestIteration = 318

Training on fold [3/5]

bestTest = 0.4613918582
bestIteration = 215

Training on fold [4/5]


[32m[I 2024-02-23 03:02:15,575][0m Trial 2 finished with value: 0.8626100061884463 and parameters: {'iterations': 1500, 'learning_rate': 0.06018181691194878, 'random_strength': 6, 'bagging_temperature': 4, 'max_bin': 10, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 6, 'max_depth': 4, 'l2_leaf_reg': 7.937278880272323e-06, 'one_hot_max_size': 12, 'auto_class_weights': 'Balanced'}. Best is trial 0 with value: 0.863497321698375.[0m



bestTest = 0.4639801323
bestIteration = 212

Training on fold [0/5]

bestTest = 0.4331103639
bestIteration = 99

Training on fold [1/5]

bestTest = 0.4282282172
bestIteration = 99

Training on fold [2/5]

bestTest = 0.4367371853
bestIteration = 97

Training on fold [3/5]

bestTest = 0.4338402326
bestIteration = 99

Training on fold [4/5]


[32m[I 2024-02-23 03:03:19,397][0m Trial 3 finished with value: 0.8616760026909407 and parameters: {'iterations': 100, 'learning_rate': 0.04917655399574397, 'random_strength': 1, 'bagging_temperature': 5, 'max_bin': 30, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 3, 'max_depth': 6, 'l2_leaf_reg': 1.253388039132331e-06, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 0 with value: 0.863497321698375.[0m



bestTest = 0.4323481015
bestIteration = 99

Training on fold [0/5]

bestTest = 0.429245159
bestIteration = 135

Training on fold [1/5]

bestTest = 0.4226120359
bestIteration = 124

Training on fold [2/5]

bestTest = 0.4317775965
bestIteration = 132

Training on fold [3/5]

bestTest = 0.4236917955
bestIteration = 145

Training on fold [4/5]


[32m[I 2024-02-23 03:03:53,577][0m Trial 4 finished with value: 0.8640523372521614 and parameters: {'iterations': 1500, 'learning_rate': 0.20567786125414012, 'random_strength': 8, 'bagging_temperature': 7, 'max_bin': 20, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 7, 'max_depth': 2, 'l2_leaf_reg': 0.0011659140576640084, 'one_hot_max_size': 12, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 4 with value: 0.8640523372521614.[0m



bestTest = 0.4284532832
bestIteration = 130

Training on fold [0/5]

bestTest = 0.4314479907
bestIteration = 46

Training on fold [1/5]

bestTest = 0.4298937944
bestIteration = 64

Training on fold [2/5]

bestTest = 0.4391929406
bestIteration = 56

Training on fold [3/5]

bestTest = 0.4317307431
bestIteration = 55

Training on fold [4/5]


[32m[I 2024-02-23 03:04:19,182][0m Trial 5 finished with value: 0.8597554565769834 and parameters: {'iterations': 100, 'learning_rate': 0.21103536986772822, 'random_strength': 4, 'bagging_temperature': 4, 'max_bin': 6, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 6, 'max_depth': 4, 'l2_leaf_reg': 1.5248267732768012e-08, 'one_hot_max_size': 100, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 4 with value: 0.8640523372521614.[0m



bestTest = 0.4347186712
bestIteration = 51

Training on fold [0/5]

bestTest = 0.462311859
bestIteration = 352

Training on fold [1/5]

bestTest = 0.4520983715
bestIteration = 449

Training on fold [2/5]

bestTest = 0.4674704404
bestIteration = 373

Training on fold [3/5]

bestTest = 0.4597967289
bestIteration = 364

Training on fold [4/5]


[32m[I 2024-02-23 03:06:25,125][0m Trial 6 finished with value: 0.8632808058136264 and parameters: {'iterations': 1000, 'learning_rate': 0.039222305770230614, 'random_strength': 5, 'bagging_temperature': 3, 'max_bin': 20, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 10, 'max_depth': 4, 'l2_leaf_reg': 3.544948380552023e-06, 'one_hot_max_size': 10, 'auto_class_weights': 'Balanced'}. Best is trial 4 with value: 0.8640523372521614.[0m



bestTest = 0.4660473039
bestIteration = 322

Training on fold [0/5]

bestTest = 0.4711987083
bestIteration = 214

Training on fold [1/5]


In [None]:
# Summarise the H1N1 search; `trial` (the best trial) is reused when the
# final model is built.
print(f"Number of finished trials: {len(study.trials)}")
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}={value},")


In [None]:
### Check the model

In [None]:
# H1N1 classifier configured from the best trial's sampled parameters.
# Only values drawn via trial.suggest_* are in trial.params; the fixed entries
# of the objective's dict (od_type, od_wait, custom_metric, loss_function)
# are not passed here.
final_model = CatBoostClassifier(
    verbose=False,
    cat_features=categorical_features_indices,
    **trial.params,
)

In [None]:
# Fit the H1N1 model on the training part of the split.
final_model.fit(X_train, y_train['h1n1_vaccine'])

In [None]:
# Class probabilities of the H1N1 model for the held-out rows (X_test).
predictions_h1 = final_model.predict_proba(X_test)

In [None]:
# Keep the positive-class probability as an (n, 1) column.
# Indexing the *last* column gives the same values as column 1 on the
# two-column predict_proba output, and leaves an already-reduced (n, 1) array
# unchanged, so re-running this cell no longer raises IndexError.
predictions_h1 = predictions_h1[:, [-1]]

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
def plot_roc(y_true, y_score, label_name, ax):
    """Draw a ROC curve on ``ax`` and show the AUC in the title.

    Parameters
    ----------
    y_true : array-like
        True labels, passed to ``roc_curve`` and ``roc_auc_score``.
    y_score : array-like
        Predicted scores, passed to ``roc_curve`` and ``roc_auc_score``.
    label_name : str
        Text placed before the AUC value in the axes title.
    ax : matplotlib.axes.Axes
        Axes to draw on.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_score)
    ax.plot(false_pos_rate, true_pos_rate)
    # Dashed grey diagonal from (0, 0) to (1, 1) as a reference line.
    ax.plot([0, 1], [0, 1], color='grey', linestyle='--')
    ax.set_ylabel('TPR')
    ax.set_xlabel('FPR')
    auc_value = roc_auc_score(y_true, y_score)
    ax.set_title(f"{label_name}: AUC = {auc_value:.4f}")

In [None]:
# ROC curve of the H1N1 model on the held-out split.
fig, ax = plt.subplots(figsize=(10, 8))
plot_roc(y_test['h1n1_vaccine'], predictions_h1, 'h1n1_vaccine', ax=ax)

In [None]:
# Hold-out AUC of the H1N1 model (displayed as the cell output).
roc_auc_score(y_test['h1n1_vaccine'], predictions_h1)

In [None]:
# CatBoost dataset for the seasonal-vaccine target; same features and
# categorical column positions as the H1N1 pool.
train_dataset_se = Pool(
    data=X_train,
    label=y_train['seasonal_vaccine'],
    cat_features=categorical_features_indices,
)

In [None]:
def objective2(trial):
    """Optuna objective for the seasonal-vaccine model.

    Samples one CatBoost configuration from ``trial``, evaluates it with
    5-fold ``cv`` on ``train_dataset_se`` and returns the best mean test AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Maximum of the ``'test-AUC-mean'`` column returned by ``cv``.
    """
    # NOTE: the order of the suggest_* calls is kept as-is so the sequence of
    # draws requested from the seeded sampler does not change.
    param = {
        'iterations': trial.suggest_categorical('iterations', [100, 200, 300, 500, 1000, 1200, 1500]),
        'learning_rate': trial.suggest_float("learning_rate", 0.001, 0.3),
        'random_strength': trial.suggest_int("random_strength", 1, 10),
        'bagging_temperature': trial.suggest_int("bagging_temperature", 0, 10),
        'max_bin': trial.suggest_categorical('max_bin', [4, 5, 6, 8, 10, 20, 30]),
        'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'min_data_in_leaf': trial.suggest_int("min_data_in_leaf", 1, 10),
        'od_type': "Iter",
        'od_wait': 100,
        # Registered with Optuna under the name "max_depth", so that is the
        # key that shows up in trial.params; cv() receives it as "depth".
        "depth": trial.suggest_int("max_depth", 2, 10),
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform and samples from the same log-uniform range.
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 100, log=True),
        'one_hot_max_size': trial.suggest_categorical('one_hot_max_size', [5, 10, 12, 100, 500, 1024]),
        'custom_metric': ['AUC'],
        "loss_function": "Logloss",
        'auto_class_weights': trial.suggest_categorical('auto_class_weights', ['Balanced', 'SqrtBalanced']),
    }

    # NOTE(review): 'od_wait': 100 above and early_stopping_rounds=10 here both
    # configure early-stopping patience; confirm which one cv() applies.
    scores = cv(train_dataset_se,
                param,
                fold_count=5,
                early_stopping_rounds=10,
                plot=False, verbose=False)

    return scores['test-AUC-mean'].max()

In [None]:
# Fresh seeded TPE sampler so the seasonal search is deterministic as well.
sampler = optuna.samplers.TPESampler(seed=68)
study2 = optuna.create_study(sampler=sampler, direction="maximize")
# Search the seasonal-vaccine objective for 100 trials.
study2.optimize(objective2, n_trials=100)

In [None]:
# Summarise the seasonal search; `trial2` (the best trial) is reused when the
# final seasonal model is built.
print(f"Number of finished trials: {len(study2.trials)}")
print("Best trial:")
trial2 = study2.best_trial
print(f"  Value: {trial2.value}")
print("  Params: ")
for key, value in trial2.params.items():
    print(f"    {key}={value},")


In [None]:
# Seasonal-vaccine classifier configured from the best trial's sampled
# parameters (only values drawn via trial.suggest_* are in trial2.params).
final_model_se = CatBoostClassifier(
    verbose=False,
    cat_features=categorical_features_indices,
    **trial2.params,
)

In [None]:
# Fit the seasonal model on the training part of the split.
final_model_se.fit(X_train, y_train['seasonal_vaccine'])

In [None]:
# Class probabilities of the seasonal model for the held-out rows (X_test).
predictions_se = final_model_se.predict_proba(X_test)

In [None]:
# Keep the positive-class probability as an (n, 1) column.
# Indexing the *last* column gives the same values as column 1 on the
# two-column predict_proba output, and leaves an already-reduced (n, 1) array
# unchanged, so re-running this cell no longer raises IndexError.
predictions_se = predictions_se[:, [-1]]

In [None]:
# ROC curve of the seasonal model on the held-out split.
fig, ax = plt.subplots(figsize=(10, 8))
plot_roc(y_test['seasonal_vaccine'], predictions_se, 'seasonal_vaccine', ax=ax)

In [None]:
# Hold-out AUC of the seasonal model (displayed as the cell output).
roc_auc_score(y_test['seasonal_vaccine'], predictions_se)

## Combined score

#### Seasonal

In [None]:
# Refit the seasonal model on the full labelled training set before scoring
# the competition test set.
final_model_se.fit(train, labels['seasonal_vaccine'])

In [None]:
# Class probabilities of the refitted seasonal model for the test-set features.
final_se = final_model_se.predict_proba(test)

In [None]:
# Keep the positive-class probability as an (n, 1) column.
# Indexing the *last* column gives the same values as column 1 on the
# two-column predict_proba output, and leaves an already-reduced (n, 1) array
# unchanged, so re-running this cell no longer raises IndexError.
final_se = final_se[:, [-1]]

#### H1N1

In [None]:
# Refit the H1N1 model on the full labelled training set before scoring the
# competition test set.
final_model.fit(train, labels['h1n1_vaccine'])

In [None]:
# Class probabilities of the refitted H1N1 model for the test-set features.
final_h1 = final_model.predict_proba(test)

In [None]:
# Keep the positive-class probability as an (n, 1) column.
# Indexing the *last* column gives the same values as column 1 on the
# two-column predict_proba output, and leaves an already-reduced (n, 1) array
# unchanged, so re-running this cell no longer raises IndexError.
final_h1 = final_h1[:, [-1]]

## Make submission

In [None]:
# Submission template, indexed by respondent_id; its prediction columns are
# overwritten further down.
# NOTE(review): read from the working directory, unlike the dataset CSVs,
# which are loaded from /config/workspace/dataset/ — confirm the location.
submission_df = pd.read_csv("./submission_format.csv", index_col="respondent_id")

In [None]:
# Make sure the submission template and the test features list the same
# respondents in the same order, so predictions can be assigned positionally.
np.testing.assert_array_equal(test.index.values,
                              submission_df.index.values)

# Save predictions to the submission data frame.
# The probabilities are flattened to 1-D first: a column holds one value per
# row, and this avoids relying on pandas accepting an (n, 1) array for a
# single-column assignment.
submission_df["h1n1_vaccine"] = np.ravel(final_h1)
submission_df["seasonal_vaccine"] = np.ravel(final_se)

submission_df.head()

In [None]:
from pathlib import Path

# Timestamp prefix (minute resolution) to tell output files from different
# runs apart.
date = pd.Timestamp.now().strftime(format='%Y-%m-%d_%H-%M_')
# to_csv does not create missing directories, so make sure the target folder
# exists before writing.
Path('predictions').mkdir(parents=True, exist_ok=True)
submission_df.to_csv(f'predictions/{date}submssion_catboost_optunacvi.csv', index=True)