In [50]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
import pickle
import os
import numpy as np
%matplotlib inline

In [51]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve, auc

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier 
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

In [54]:
from category_encoders import OrdinalEncoder as oe
from catboost import CatBoostClassifier

In [56]:
y = pd.read_csv('training_set_labels.csv').drop('respondent_id', axis = 1)

X = pd.read_csv('training_set_features.csv').drop('respondent_id', axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [102]:
X['income_poverty'].value_counts()

<= $75,000, Above Poverty    12777
> $75,000                     6810
Below Poverty                 2697
Name: income_poverty, dtype: int64

In [57]:
numericals = []
non_numericals = []

for column in X_train.columns:
    if X_train[column].dtype == 'float64':
        numericals.append(column)
    if X_train[column].dtype == 'object':
        non_numericals.append(column)

In [61]:
non_numericals

['age_group',
 'education',
 'race',
 'sex',
 'income_poverty',
 'marital_status',
 'rent_or_own',
 'employment_status',
 'hhs_geo_region',
 'census_msa',
 'employment_industry',
 'employment_occupation']

In [155]:
numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='constant',fill_value= 0, add_indicator = True)),
                               ('scaler', StandardScaler())])

categorical_transformer = Pipeline([('cat_imputer', SimpleImputer(strategy='most_frequent', add_indicator = True)),
                                    ('encoder', OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numericals),
            ("cat", categorical_transformer, non_numericals),
        ]
    )

In [6]:
cat_cols = ['race', 'sex', 
       'marital_status', 'rent_or_own',  'hhs_geo_region',
       'census_msa', 'employment_industry', 'employment_occupation']

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import CategoricalNB

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.svm import SVC
from sklearn.linear_model import PassiveAggressiveClassifier

In [18]:
categorical_features_indices = np.where(X_train.dtypes != float)[0]
categorical_features_indices

array([21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34], dtype=int64)

In [64]:
def model_evaluation(X_train, X_test, y_train, y_test,
                         baseline_models, 
                         preprocessor,
                         folder_name = None, 
                         ):
    
    # Create a summary dictionary
    summary_dict = {}
    
    for name, model in baseline_models.items():
        
        # transform the features    
        processor = model['preprocessor']
        X_train_processed = processor.fit_transform(X_train)
        X_test_processed = processor.transform(X_test)
    
        # Cross validation
        model['train_accuracy_score'] = np.mean(cross_val_score(model['regressor'], 
                                                        X_train_processed, y_train.values.ravel(), 
                                                        scoring="accuracy", cv=5))
    
        train_accuracy_score = model['train_accuracy_score']
    
        # fit the new model and make predictions
        new_model = model['regressor']
        new_model.fit(X_train_processed, y_train.values.ravel())
        preds = new_model.predict(X_test_processed)
        y_score = new_model.predict_proba(X_test_processed)

        # get our scoring metrics
        model['test_accuracy_score'] = accuracy_score(y_test, preds)
        test_accuracy_score = model['test_accuracy_score']
        
        model['auc_score'] = roc_auc_score(y_test, y_score[:,1])
        auc_score = model['auc_score']
        
        model['recall_score'] = recall_score(y_test, preds)
        model['precision_score'] = f1_score(y_test, preds)
        model['f1_score'] = precision_score(y_test, preds)
        
        recall = recall_score(y_test, preds)
        f1 = f1_score(y_test, preds)
        precision = precision_score(y_test, preds)
        
        # Visualisations
        fpr, tpr, thresholds = roc_curve(y_test, y_score[:,1])
        model['fpr'] = fpr
        model['tpr'] = tpr
        model['thresholds'] = thresholds
    
        # Saving the model
        if folder_name == None:
            pass
        else:
            os.makedirs(f'models/{name}/{folder_name}') 
            filepath = f'models/{name}/{folder_name}/baseline_model.pickl'
            pickle.dump(new_model, open(filepath, 'wb'))
        
        #Place everything into a dictionary and place that into the summary list
        summary_dict.update({name: {
                                   'train_score': train_accuracy_score, 'test_score': test_accuracy_score,
                                   'recall': recall, 'precision': precision, 'f1': f1,
                                   'auc': auc_score, 'tpr': tpr, 'fpr': fpr
                                   }})

    return summary_dict

In [156]:
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [70]:
X_train_processed_df = pd.DataFrame(data=X_train_processed)

In [157]:
cbc = CatBoostClassifier(iterations=100,
        learning_rate=0.1,
        random_strength=10,
        bagging_temperature=10,
        max_bin=30,
        grow_policy='Lossguide',
        min_data_in_leaf=10,
        od_type='Iter',
        od_wait=100,
        depth=10,
        l2_leaf_reg=100,
        one_hot_max_size=5,
        custom_metric='AUC',
        loss_function='Logloss',
        auto_class_weights='Balanced',
        verbose=False)

In [127]:
X_train['age_group']

25194    18 - 34 Years
14006    45 - 54 Years
11285    45 - 54 Years
2900     55 - 64 Years
19083    18 - 34 Years
             ...      
21575    55 - 64 Years
5390     55 - 64 Years
860      55 - 64 Years
15795    35 - 44 Years
23654    18 - 34 Years
Name: age_group, Length: 20030, dtype: object

In [158]:
cbc.fit(X_train_processed_df, y_train.h1n1_vaccine)

<catboost.core.CatBoostClassifier at 0x2723a423130>

In [159]:
cbc_pred = cbc.predict(X_train_processed)

In [160]:
cbc_pred.shape

(20030,)

In [161]:
roc_auc_score(y_train.h1n1_vaccine, cbc_pred)

0.7814026136402172

In [162]:
y_test.shape

(6677, 2)

In [163]:
pred = cbc.predict(X_test_processed)

In [164]:
accuracy_score(y_test.h1n1_vaccine, pred)

0.7946682641905047

In [165]:
recall_score(y_test.h1n1_vaccine, pred)

0.744530698659139

In [166]:
f1_score(y_test.h1n1_vaccine, pred)

0.606147658718759

In [167]:
precision_score(y_test.h1n1_vaccine, pred)

0.5111434108527132

In [168]:
cbc.fit(X_train_processed_df, y_train.seasonal_vaccine)

<catboost.core.CatBoostClassifier at 0x2723a423130>

In [169]:
roc_auc_score(y_train.seasonal_vaccine, cbc_pred)

0.6518529548214063

In [170]:
accuracy_score(y_test.seasonal_vaccine, pred)

0.6610753332334881

In [134]:
from catboost import Pool, cv
train_dataset = Pool(X_train_processed,
                     label=y_train.h1n1_vaccine)

In [171]:
def objective(trial):
    param = {
        'iterations':trial.suggest_categorical('iterations', [100,200,300,500,1000,1200,1500]),
        'learning_rate':trial.suggest_float("learning_rate", 0.001, 0.3),
        'random_strength':trial.suggest_int("random_strength", 1,10),
        'bagging_temperature':trial.suggest_int("bagging_temperature", 0,10),
        'max_bin':trial.suggest_categorical('max_bin', [4,5,6,8,10,20,30]),
        'grow_policy':trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
        'min_data_in_leaf':trial.suggest_int("min_data_in_leaf", 1,10),
        'od_type' : "Iter",
        'od_wait' : 100,
        "depth": trial.suggest_int("max_depth", 2,10),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-8, 100),
         'one_hot_max_size':trial.suggest_categorical('one_hot_max_size', [5,10,12,100,500,1024]),
        'custom_metric' : ['AUC'],
        "loss_function": "Logloss",
        'auto_class_weights':trial.suggest_categorical('auto_class_weights', ['Balanced', 'SqrtBalanced']),
        }

    scores = cv(train_dataset,
            param,
            fold_count=5, 
            early_stopping_rounds=10,         
            plot=False, verbose=False)

    return scores['test-AUC-mean'].max()

In [172]:
import optuna
sampler = optuna.samplers.TPESampler(seed=68)  # Make the sampler behave in a deterministic way.
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=20)

[32m[I 2022-01-25 17:29:53,154][0m A new study created in memory with name: no-name-602511eb-a740-4557-a582-d8c86d46ffb4[0m


Training on fold [0/5]

bestTest = 0.4169302285
bestIteration = 554

Training on fold [1/5]

bestTest = 0.4320167812
bestIteration = 554

Training on fold [2/5]

bestTest = 0.4320521886
bestIteration = 620

Training on fold [3/5]

bestTest = 0.4137110671
bestIteration = 512

Training on fold [4/5]


[32m[I 2022-01-25 17:30:11,395][0m Trial 0 finished with value: 0.8646284729540925 and parameters: {'iterations': 1500, 'learning_rate': 0.029356482739949695, 'random_strength': 8, 'bagging_temperature': 10, 'max_bin': 6, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 1, 'max_depth': 4, 'l2_leaf_reg': 0.001991194871120998, 'one_hot_max_size': 100, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 0 with value: 0.8646284729540925.[0m



bestTest = 0.4404139222
bestIteration = 531

Training on fold [0/5]

bestTest = 0.4122171118
bestIteration = 153

Training on fold [1/5]

bestTest = 0.4295098882
bestIteration = 148

Training on fold [2/5]

bestTest = 0.431511785
bestIteration = 140

Training on fold [3/5]

bestTest = 0.4122076105
bestIteration = 155

Training on fold [4/5]


[32m[I 2022-01-25 17:30:15,338][0m Trial 1 finished with value: 0.8651912924685078 and parameters: {'iterations': 200, 'learning_rate': 0.1464067066361795, 'random_strength': 10, 'bagging_temperature': 3, 'max_bin': 10, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 1, 'max_depth': 3, 'l2_leaf_reg': 0.028402775147703313, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.4418847921
bestIteration = 132

Training on fold [0/5]

bestTest = 0.4198312772
bestIteration = 66

Training on fold [1/5]

bestTest = 0.4338020226
bestIteration = 51

Training on fold [2/5]

bestTest = 0.4392519813
bestIteration = 47

Training on fold [3/5]

bestTest = 0.423011524
bestIteration = 52

Training on fold [4/5]


[32m[I 2022-01-25 17:30:19,342][0m Trial 2 finished with value: 0.8604684339560922 and parameters: {'iterations': 200, 'learning_rate': 0.27287829596201946, 'random_strength': 8, 'bagging_temperature': 8, 'max_bin': 10, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 1, 'max_depth': 5, 'l2_leaf_reg': 0.027330135035255495, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.4450773829
bestIteration = 54

Training on fold [0/5]

bestTest = 0.4178642463
bestIteration = 474

Training on fold [1/5]

bestTest = 0.4347398981
bestIteration = 424

Training on fold [2/5]

bestTest = 0.4337643232
bestIteration = 540

Training on fold [3/5]

bestTest = 0.4153126011
bestIteration = 382

Training on fold [4/5]


[32m[I 2022-01-25 17:30:27,869][0m Trial 3 finished with value: 0.86340320707297 and parameters: {'iterations': 1200, 'learning_rate': 0.0603209284932487, 'random_strength': 3, 'bagging_temperature': 7, 'max_bin': 4, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 2, 'max_depth': 2, 'l2_leaf_reg': 1.300471404766049e-07, 'one_hot_max_size': 10, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.4431121835
bestIteration = 364

Training on fold [0/5]

bestTest = 0.416787541
bestIteration = 88

Training on fold [1/5]

bestTest = 0.4382255685
bestIteration = 65

Training on fold [2/5]

bestTest = 0.4374459833
bestIteration = 78

Training on fold [3/5]

bestTest = 0.4207838483
bestIteration = 73

Training on fold [4/5]


[32m[I 2022-01-25 17:30:30,530][0m Trial 4 finished with value: 0.8588732376008072 and parameters: {'iterations': 300, 'learning_rate': 0.22423670437233847, 'random_strength': 6, 'bagging_temperature': 2, 'max_bin': 30, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 4, 'max_depth': 4, 'l2_leaf_reg': 0.00010293033487726667, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.458253497
bestIteration = 35

Training on fold [0/5]

bestTest = 0.4820225261
bestIteration = 99

Training on fold [1/5]

bestTest = 0.4949513849
bestIteration = 99

Training on fold [2/5]

bestTest = 0.4927225443
bestIteration = 99

Training on fold [3/5]

bestTest = 0.4800932551
bestIteration = 99

Training on fold [4/5]


[32m[I 2022-01-25 17:30:33,082][0m Trial 5 finished with value: 0.8514272932659803 and parameters: {'iterations': 100, 'learning_rate': 0.06628011038512191, 'random_strength': 4, 'bagging_temperature': 4, 'max_bin': 20, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 3, 'max_depth': 2, 'l2_leaf_reg': 13.751833235431702, 'one_hot_max_size': 100, 'auto_class_weights': 'Balanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.5011670429
bestIteration = 99

Training on fold [0/5]

bestTest = 0.4194642491
bestIteration = 126

Training on fold [1/5]

bestTest = 0.4460610622
bestIteration = 68

Training on fold [2/5]

bestTest = 0.4375303515
bestIteration = 92

Training on fold [3/5]

bestTest = 0.4253720716
bestIteration = 89

Training on fold [4/5]


[32m[I 2022-01-25 17:30:45,841][0m Trial 6 finished with value: 0.8583759931549235 and parameters: {'iterations': 1200, 'learning_rate': 0.09658215406978513, 'random_strength': 8, 'bagging_temperature': 2, 'max_bin': 30, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 6, 'max_depth': 10, 'l2_leaf_reg': 2.6558249848041764, 'one_hot_max_size': 5, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.4464435425
bestIteration = 81

Training on fold [0/5]

bestTest = 0.4519628868
bestIteration = 96

Training on fold [1/5]

bestTest = 0.4656011196
bestIteration = 93

Training on fold [2/5]

bestTest = 0.4731202274
bestIteration = 111

Training on fold [3/5]

bestTest = 0.4431378536
bestIteration = 107

Training on fold [4/5]


[32m[I 2022-01-25 17:30:48,363][0m Trial 7 finished with value: 0.863268734916975 and parameters: {'iterations': 500, 'learning_rate': 0.2714096381817127, 'random_strength': 4, 'bagging_temperature': 6, 'max_bin': 8, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 1, 'max_depth': 2, 'l2_leaf_reg': 4.9369231964322795, 'one_hot_max_size': 1024, 'auto_class_weights': 'Balanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.4743531603
bestIteration = 92

Training on fold [0/5]

bestTest = 0.4182557831
bestIteration = 103

Training on fold [1/5]

bestTest = 0.4330939869
bestIteration = 91

Training on fold [2/5]

bestTest = 0.4306942257
bestIteration = 107

Training on fold [3/5]

bestTest = 0.4132711653
bestIteration = 83

Training on fold [4/5]


[32m[I 2022-01-25 17:30:51,091][0m Trial 8 finished with value: 0.8628349525838763 and parameters: {'iterations': 1500, 'learning_rate': 0.2053434310118264, 'random_strength': 8, 'bagging_temperature': 0, 'max_bin': 20, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 10, 'max_depth': 3, 'l2_leaf_reg': 9.501510078266123e-06, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.4444497564
bestIteration = 81

Training on fold [0/5]

bestTest = 0.4468518985
bestIteration = 16

Training on fold [1/5]

bestTest = 0.4464740123
bestIteration = 20

Training on fold [2/5]

bestTest = 0.4449233945
bestIteration = 30

Training on fold [3/5]

bestTest = 0.4367403759
bestIteration = 18

Training on fold [4/5]


[32m[I 2022-01-25 17:30:53,634][0m Trial 9 finished with value: 0.849725480961643 and parameters: {'iterations': 100, 'learning_rate': 0.25900665720714294, 'random_strength': 3, 'bagging_temperature': 0, 'max_bin': 6, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 8, 'max_depth': 7, 'l2_leaf_reg': 1.1694576328936887e-07, 'one_hot_max_size': 12, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.460238864
bestIteration = 40

Training on fold [0/5]

bestTest = 0.463580193
bestIteration = 82

Training on fold [1/5]

bestTest = 0.491848518
bestIteration = 64

Training on fold [2/5]

bestTest = 0.4904919368
bestIteration = 41

Training on fold [3/5]

bestTest = 0.4731387746
bestIteration = 74

Training on fold [4/5]


[32m[I 2022-01-25 17:30:58,198][0m Trial 10 finished with value: 0.8501888132064981 and parameters: {'iterations': 200, 'learning_rate': 0.15034104921866256, 'random_strength': 10, 'bagging_temperature': 4, 'max_bin': 5, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 6, 'max_depth': 7, 'l2_leaf_reg': 0.06866979073979589, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.4895294901
bestIteration = 54

Training on fold [0/5]

bestTest = 0.4210802771
bestIteration = 1499

Training on fold [1/5]

bestTest = 0.4355994753
bestIteration = 1499

Training on fold [2/5]

bestTest = 0.4354660049
bestIteration = 1499

Training on fold [3/5]

bestTest = 0.4198432
bestIteration = 1499

Training on fold [4/5]


[32m[I 2022-01-25 17:32:20,200][0m Trial 11 finished with value: 0.8626007200419238 and parameters: {'iterations': 1500, 'learning_rate': 0.008172454094813715, 'random_strength': 10, 'bagging_temperature': 9, 'max_bin': 6, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 4, 'max_depth': 5, 'l2_leaf_reg': 0.004159650728547698, 'one_hot_max_size': 100, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.4440428936
bestIteration = 1499

Training on fold [0/5]

bestTest = 0.4152194359
bestIteration = 101

Training on fold [1/5]

bestTest = 0.4344109053
bestIteration = 89

Training on fold [2/5]

bestTest = 0.4334137952
bestIteration = 99

Training on fold [3/5]

bestTest = 0.4189940948
bestIteration = 107

Training on fold [4/5]


[32m[I 2022-01-25 17:32:23,739][0m Trial 12 finished with value: 0.8625164587073423 and parameters: {'iterations': 1000, 'learning_rate': 0.15199867028113, 'random_strength': 7, 'bagging_temperature': 10, 'max_bin': 10, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 1, 'max_depth': 4, 'l2_leaf_reg': 0.00013028549097931553, 'one_hot_max_size': 100, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.4406147389
bestIteration = 89

Training on fold [0/5]

bestTest = 0.4327069573
bestIteration = 199

Training on fold [1/5]

bestTest = 0.4441346609
bestIteration = 199

Training on fold [2/5]

bestTest = 0.4439378042
bestIteration = 199

Training on fold [3/5]

bestTest = 0.4282226108
bestIteration = 199

Training on fold [4/5]


[32m[I 2022-01-25 17:32:39,034][0m Trial 13 finished with value: 0.8596106853026626 and parameters: {'iterations': 200, 'learning_rate': 0.015312092326589866, 'random_strength': 1, 'bagging_temperature': 5, 'max_bin': 6, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 3, 'max_depth': 6, 'l2_leaf_reg': 0.20127448967652853, 'one_hot_max_size': 1024, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.450678261
bestIteration = 199

Training on fold [0/5]

bestTest = 0.4633250379
bestIteration = 89

Training on fold [1/5]

bestTest = 0.4798457975
bestIteration = 92

Training on fold [2/5]

bestTest = 0.4970905181
bestIteration = 43

Training on fold [3/5]

bestTest = 0.4630943367
bestIteration = 90

Training on fold [4/5]


[32m[I 2022-01-25 17:32:44,930][0m Trial 14 finished with value: 0.8527818406106465 and parameters: {'iterations': 1500, 'learning_rate': 0.14690216116501453, 'random_strength': 10, 'bagging_temperature': 2, 'max_bin': 10, 'grow_policy': 'Lossguide', 'min_data_in_leaf': 3, 'max_depth': 9, 'l2_leaf_reg': 0.0012700604558864655, 'one_hot_max_size': 5, 'auto_class_weights': 'Balanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.4881113768
bestIteration = 88

Training on fold [0/5]

bestTest = 0.4144923871
bestIteration = 151

Training on fold [1/5]

bestTest = 0.4351198193
bestIteration = 135

Training on fold [2/5]

bestTest = 0.4380786963
bestIteration = 149

Training on fold [3/5]

bestTest = 0.4220213145
bestIteration = 125

Training on fold [4/5]


[32m[I 2022-01-25 17:32:49,995][0m Trial 15 finished with value: 0.8609241246295614 and parameters: {'iterations': 500, 'learning_rate': 0.10902072034609422, 'random_strength': 9, 'bagging_temperature': 10, 'max_bin': 4, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 5, 'max_depth': 4, 'l2_leaf_reg': 9.46314667205651e-06, 'one_hot_max_size': 12, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.4421279212
bestIteration = 138

Training on fold [0/5]

bestTest = 0.4164598089
bestIteration = 117

Training on fold [1/5]

bestTest = 0.4316936218
bestIteration = 95

Training on fold [2/5]

bestTest = 0.4328478872
bestIteration = 129

Training on fold [3/5]

bestTest = 0.4151045199
bestIteration = 96

Training on fold [4/5]


[32m[I 2022-01-25 17:32:53,002][0m Trial 16 finished with value: 0.8637084217832557 and parameters: {'iterations': 300, 'learning_rate': 0.19373131951584033, 'random_strength': 6, 'bagging_temperature': 4, 'max_bin': 8, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 8, 'max_depth': 3, 'l2_leaf_reg': 0.17933449466684065, 'one_hot_max_size': 500, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.4410023887
bestIteration = 118

Training on fold [0/5]

bestTest = 0.4198496509
bestIteration = 270

Training on fold [1/5]

bestTest = 0.4340202546
bestIteration = 293

Training on fold [2/5]

bestTest = 0.4424190968
bestIteration = 193

Training on fold [3/5]

bestTest = 0.4177517892
bestIteration = 297

Training on fold [4/5]


[32m[I 2022-01-25 17:33:12,467][0m Trial 17 finished with value: 0.8615975747025792 and parameters: {'iterations': 1000, 'learning_rate': 0.049633505636177605, 'random_strength': 9, 'bagging_temperature': 7, 'max_bin': 5, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 2, 'max_depth': 6, 'l2_leaf_reg': 0.007808292782556029, 'one_hot_max_size': 100, 'auto_class_weights': 'SqrtBalanced'}. Best is trial 1 with value: 0.8651912924685078.[0m



bestTest = 0.4389945698
bestIteration = 279

Training on fold [0/5]

bestTest = 0.4448853147
bestIteration = 250

Training on fold [1/5]

bestTest = 0.4641279581
bestIteration = 160

Training on fold [2/5]

bestTest = 0.4672899176
bestIteration = 217

Training on fold [3/5]

bestTest = 0.4426935548
bestIteration = 256

Training on fold [4/5]


[32m[I 2022-01-25 17:33:18,753][0m Trial 18 finished with value: 0.8658380772528973 and parameters: {'iterations': 1500, 'learning_rate': 0.10620743554253874, 'random_strength': 7, 'bagging_temperature': 3, 'max_bin': 6, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 2, 'max_depth': 3, 'l2_leaf_reg': 77.18909314164286, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 18 with value: 0.8658380772528973.[0m



bestTest = 0.4696789675
bestIteration = 199

Training on fold [0/5]

bestTest = 0.445803794
bestIteration = 199

Training on fold [1/5]

bestTest = 0.4617681462
bestIteration = 179

Training on fold [2/5]

bestTest = 0.4674646175
bestIteration = 151

Training on fold [3/5]

bestTest = 0.4445566233
bestIteration = 178

Training on fold [4/5]


[32m[I 2022-01-25 17:33:24,123][0m Trial 19 finished with value: 0.8659247618189543 and parameters: {'iterations': 200, 'learning_rate': 0.11785297050011456, 'random_strength': 5, 'bagging_temperature': 3, 'max_bin': 10, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 2, 'max_depth': 3, 'l2_leaf_reg': 99.11695455168547, 'one_hot_max_size': 500, 'auto_class_weights': 'Balanced'}. Best is trial 19 with value: 0.8659247618189543.[0m



bestTest = 0.4695991366
bestIteration = 175



In [141]:
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}={},".format(key, value))


Number of finished trials: 20
Best trial:
  Value: 0.8659247618189543
  Params: 
    iterations=200,
    learning_rate=0.11785297050011456,
    random_strength=5,
    bagging_temperature=3,
    max_bin=10,
    grow_policy=Depthwise,
    min_data_in_leaf=2,
    max_depth=3,
    l2_leaf_reg=99.11695455168547,
    one_hot_max_size=500,
    auto_class_weights=Balanced,


In [143]:
final_model = CatBoostClassifier(verbose=False,**trial.params)

In [145]:
final_model.fit(X_train_processed, y_train.h1n1_vaccine)

<catboost.core.CatBoostClassifier at 0x2723a3c19d0>

In [149]:
predictions_h1 = final_model.predict(X_test_processed)

In [150]:
roc_auc_score(y_test.h1n1_vaccine, predictions_h1)

0.7940792951423642

In [154]:
final_model.fit(X, y.h1n1_vaccine)

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=21]="55 - 64 Years": Cannot convert 'b'55 - 64 Years'' to float