In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
import numpy as np
%matplotlib inline

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve, auc

In [None]:
from category_encoders import OrdinalEncoder as oe

In [None]:
from catboost import CatBoostClassifier
from catboost import Pool, cv
import optuna
import pandas_profiling as pp

In [None]:
# Read the feature table from disk; the respondent_id column becomes the index.
train = pd.read_csv(
    'data/training_set_features.csv',
    index_col='respondent_id',
)

In [None]:
# Read the label table from disk, indexed by respondent_id like the features.
labels = pd.read_csv(
    'data/training_set_labels.csv',
    index_col='respondent_id',
)

In [None]:
# Names of all numeric-dtype columns in the feature table, captured before any
# missing-value filling is applied.
num_cols = train.select_dtypes(include='number').columns

In [None]:
# Columns handled as categorical features (one name per line for easier diffs).
cat_cols = [
    'race',
    'sex',
    'marital_status',
    'rent_or_own',
    'hhs_geo_region',
    'census_msa',
    'employment_industry',
    'employment_occupation',
]

In [None]:
# Second list of string-valued columns, kept separate from cat_cols.
# NOTE(review): presumably these are the ordinal features -- confirm intent.
ord_cols = [
    'age_group',
    'education',
    'income_poverty',
    'employment_status',
]

In [None]:
# Replace missing values in every cat_cols / ord_cols column with the literal
# string 'None', so that "missing" becomes its own category level.
# Done as one column-subset assignment instead of a per-column loop.
train[cat_cols + ord_cols] = train[cat_cols + ord_cols].fillna(value='None')

In [None]:
# Replace missing values in every numeric column with the sentinel -1.
# Done as one column-subset assignment instead of a per-column loop.
train[num_cols] = train[num_cols].fillna(value=-1)

In [None]:
# Split features and labels into train / hold-out parts with a fixed seed.
# No test_size is given, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    labels,
    random_state=10,
)

In [None]:
# Positional indices of every column whose dtype is not float64; these are the
# columns passed to CatBoost as categorical via params['cat_features'].
#
# `np.float` was only an alias of the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
# Comparing against the builtin gives the same result on every NumPy version.
categorical_features_indices = np.where(X_train.dtypes != float)[0]
categorical_features_indices

In [None]:
# Hyper-parameters forwarded verbatim as keyword arguments to
# CatBoostClassifier. Key order and values are kept exactly as before; only
# the layout is normalised.
params = {
    'iterations': 1500,
    'learning_rate': 0.3,
    'cat_features': categorical_features_indices,
    'random_strength': 3,
    'depth': 4,
    'max_bin': 6,
    'grow_policy': 'SymmetricTree',
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'od_wait': 100,
    'l2_leaf_reg': 10,
    'bootstrap_type': 'Bayesian',
    'bagging_temperature': 10,
    'loss_function': 'Logloss',
    'auto_class_weights': 'Balanced',
    'verbose': False,
}

In [None]:
# Instantiate the CatBoost classifier, unpacking the `params` dict as keyword
# arguments. The same estimator object is reused by every fit call below.
model_classifier = CatBoostClassifier(**params)

In [None]:
# Fit on the h1n1_vaccine target. The hold-out split is supplied as eval_set
# and use_best_model=True is requested; plot=True asks CatBoost for its
# training plot. The trailing ';' suppresses the cell's repr output.
model_classifier.fit(
    X_train,
    y_train['h1n1_vaccine'],
    eval_set=(X_test, y_test['h1n1_vaccine']),
    use_best_model=True,
    plot=True,
);

In [None]:
# Fit the same estimator object on the seasonal_vaccine target, with the
# hold-out split as eval_set and use_best_model=True. Because the object is
# reused, model_classifier reflects this fit from here on.
model_classifier.fit(
    X_train,
    y_train['seasonal_vaccine'],
    eval_set=(X_test, y_test['seasonal_vaccine']),
    use_best_model=True,
    plot=True,
);

In [None]:
# Pair each feature column name with the importance reported by
# model_classifier (for whichever target it was fitted on most recently),
# then rank the pairs from most to least important and preview the top rows.
feat_importance = list(zip(train.columns, model_classifier.get_feature_importance()))
feat_importance_df = (
    pd.DataFrame(data=feat_importance, columns=['Feature', 'VarImp'])
    .sort_values('VarImp', ascending=False)
)
feat_importance_df.head()