# Imports

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier, Pool

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.utils.multiclass import unique_labels


# Load dataset

In [2]:
# Read the training features and labels; the respondent_id column is dropped from both.
X = pd.read_csv("../../data/training_set_features.csv").drop(columns=["respondent_id"])
Y = pd.read_csv("../../data/training_set_labels.csv").drop(columns=["respondent_id"])

# Column names of the feature frame and of the label frame.
features = X.columns.tolist()
targets = Y.columns.tolist()

# First label column.
target = targets[0]

In [3]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Every column is treated as categorical and identified by its position in `features`.
cat_features = list(range(len(features)))

# Object-dtype columns are the "nominal" ones; every other column position is "ordinal".
nominal_features = X_train.select_dtypes(object).columns.tolist()
nominal_features_idx = [features.index(name) for name in nominal_features]
ordinal_features_idx = [idx for idx in cat_features if idx not in nominal_features_idx]

# Preprocessing

In [4]:
def to_str(x):
    """Return ``x`` with every element cast to ``str`` via ``x.astype(str)``."""
    as_text = x.astype(str)
    return as_text

def to_df(x):
    """Wrap ``x`` in a new ``pandas.DataFrame`` with default index and column labels."""
    frame = pd.DataFrame(data=x)
    return frame

In [5]:
# Preprocessing stages, in order:
#   1. replace missing values with the literal string "nan",
#   2. cast every value to str,
#   3. one-hot encode the nominal columns and ordinal-encode the remaining ones,
#   4. wrap the encoded output in a DataFrame.
column_encoder = ColumnTransformer(
    [
        ('nominal', OneHotEncoder(), nominal_features_idx),
        ('ordinal', OrdinalEncoder(), ordinal_features_idx),
    ],
    remainder='passthrough',
)
preprocessing_steps = [
    ('fillna', SimpleImputer(strategy='constant', fill_value="nan", copy=False)),
    ('to_str', FunctionTransformer(to_str)),
    ('encoder', column_encoder),
    ('to_df', FunctionTransformer(to_df)),
]
pipeline = Pipeline(steps=[('preprocessing', Pipeline(steps=preprocessing_steps))])

# Fit on the training split only, then apply the fitted transform to both splits.
pipeline.fit(X_train)
X_train_processed = pipeline.transform(X_train)
X_test_processed = pipeline.transform(X_test)

# Training

In [6]:
class GradientBoostingClassifierTuned(BaseEstimator):
    """Classifier that tunes its own hyper-parameters with a randomized search.

    Every call to ``fit`` runs a ``RandomizedSearchCV`` (scored by ROC AUC) over
    ``parameters_grid`` and keeps the best refitted model. Prediction methods
    delegate to that model.

    :param estimator: sklearn classifier used as the template for the search.
        The default instance is shared between objects, which is safe because
        the search only ever fits clones of it.
    :param parameters_grid: dict mapping parameter names to candidate values
        (or distributions) to sample from. Required by ``fit``.
    :param n_iter: number of parameter settings sampled by the search.
    :param random_state: seed forwarded to ``RandomizedSearchCV``; ``None``
        leaves the sampling unseeded.

    Fitted attributes:
        classes_   -- labels seen in ``y`` during ``fit``.
        estimator_ -- best estimator found by the search, refitted on all data.
    """

    def __init__(
        self,
        estimator=GradientBoostingClassifier(),
        parameters_grid=None,
        n_iter=10,
        random_state=None
    ):
        # Constructor arguments are stored untouched so get_params()/clone() work.
        self.estimator = estimator
        self.parameters_grid = parameters_grid
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self, X, y=None, **kwargs):
        """Run the randomized search on ``(X, y)`` and keep the best model.

        ``**kwargs`` is accepted for call compatibility and ignored.

        :raises ValueError: if ``parameters_grid`` is missing or empty.
        :return: self
        """
        if not self.parameters_grid:
            raise ValueError(
                "parameters_grid must be a non-empty dict of parameter candidates"
            )

        self.classes_ = unique_labels(y)
        random_search = RandomizedSearchCV(self.estimator,
                                           self.parameters_grid,
                                           n_jobs=-1,
                                           n_iter=self.n_iter,
                                           scoring="roc_auc",
                                           random_state=self.random_state)
        random_search.fit(X, y)
        # Keep the tuned model in a fitted attribute instead of overwriting the
        # `estimator` constructor parameter, so the template stays unchanged.
        self.estimator_ = random_search.best_estimator_
        return self

    def _fitted_estimator(self):
        """Return the tuned model, or the template estimator if ``fit`` has not run."""
        return getattr(self, "estimator_", self.estimator)

    def predict(self, X, y=None):
        """Predict class labels with the tuned model (``y`` is ignored)."""
        return self._fitted_estimator().predict(X)

    def predict_proba(self, X):
        """Predict class probabilities with the tuned model."""
        return self._fitted_estimator().predict_proba(X)

    def score(self, X, y):
        """Return the tuned model's own ``score`` on ``(X, y)``."""
        return self._fitted_estimator().score(X, y)


class ClassifierChainEnsemble(BaseEstimator):
    """Ensemble of ``ClassifierChain`` models, one per label ordering.

    Each chain is fitted on the same data with a different label order; the
    ensemble's probabilities are the element-wise mean of the chains'
    probabilities.

    :param base_learner: sklearn classifier used inside every chain. Each
        ``ClassifierChain`` fits clones of it, so the shared default instance
        is never mutated.
    :param orders: iterable of label orders, one per chain (each entry is
        passed as ``ClassifierChain(order=...)``). ``None`` builds a single
        chain with ``order=None``.

    Fitted attributes:
        classes_ -- result of ``unique_labels(y)`` on the training labels.
        chains_  -- list of fitted ``ClassifierChain`` objects.
    """

    def __init__(
        self,
        base_learner=GradientBoostingClassifier(),
        orders=None
    ):
        # Constructor arguments are stored untouched so get_params()/clone()/repr
        # work; the chains themselves are built in fit().
        self.base_learner = base_learner
        self.orders = orders

    def fit(self, X, y=None, **kwargs):
        """Build and fit one chain per entry of ``orders``.

        ``**kwargs`` is accepted for call compatibility and ignored.

        :raises ValueError: if ``orders`` is an empty collection.
        :return: self
        """
        orders = [None] if self.orders is None else list(self.orders)
        if not orders:
            raise ValueError("orders must contain at least one label order")

        self.classes_ = unique_labels(y)
        self.chains_ = [
            ClassifierChain(self.base_learner, order=order, random_state=42).fit(X, y)
            for order in orders
        ]
        return self

    def predict(self, X, y=None):
        """Predict binary labels by thresholding the averaged probabilities at 0.5.

        ``y`` is ignored.
        """
        return (self.predict_proba(X) >= 0.5).astype(int)

    def predict_proba(self, X, y=None):
        """Return the mean of the chains' ``predict_proba`` outputs (``y`` is ignored)."""
        chain_probas = np.array([chain.predict_proba(X) for chain in self.chains_])
        # Axis 0 indexes the chains; averaging over it keeps one row per sample.
        return chain_probas.mean(axis=0)

    def score(self, X, y):
        """Return the ROC AUC of the averaged probabilities against ``y``."""
        return roc_auc_score(y, self.predict_proba(X))

In [7]:
# Candidate hyper-parameter values handed to GradientBoostingClassifierTuned.
parameters_grid = {
    'learning_rate': [0.25, 0.1, 0.03],
    'n_estimators': [100, 500, 1000],
    # Depths 1..32 as plain ints: np.linspace would yield floats (1.0 ... 32.0),
    # and max_depth must be an integer.
    'max_depth': list(range(1, 33)),
    # Float values for the two settings below are fractions of the sample count.
    'min_samples_split': np.linspace(0.01, 0.5, 10, endpoint=True),
    'min_samples_leaf': np.linspace(0.01, 0.5, 10, endpoint=True)
}

In [8]:
# One chain per ordering of the two label columns.
orders = [[0, 1], [1, 0]]
cce = ClassifierChainEnsemble(
    base_learner=GradientBoostingClassifierTuned(parameters_grid=parameters_grid),
    orders=orders,
)
cce.fit(X_train_processed, y_train)

# Ensemble probabilities on the held-out split; the cell output is the ensemble's score.
Y_pred_ensemble = cce.predict_proba(X_test_processed, y_test)
cce.score(X_test_processed, y_test)

0.8637286709992643

In [14]:
orders = [[0, 1], [1, 0]]
base_learner = GradientBoostingClassifierTuned(parameters_grid=parameters_grid)

# Preprocessing stages (impute -> cast to str -> encode -> DataFrame) followed by
# the chain ensemble, so raw feature frames can be passed straight to fit/predict.
full_column_encoder = ColumnTransformer(
    [
        ('nominal', OneHotEncoder(), nominal_features_idx),
        ('ordinal', OrdinalEncoder(), ordinal_features_idx),
    ],
    remainder='passthrough',
)
full_preprocessing = Pipeline(steps=[
    ('fillna', SimpleImputer(strategy='constant', fill_value="nan", copy=False)),
    ('to_str', FunctionTransformer(to_str)),
    ('encoder', full_column_encoder),
    ('to_df', FunctionTransformer(to_df)),
])
full_pipeline = Pipeline(steps=[
    ('preprocessing', full_preprocessing),
    ('model', ClassifierChainEnsemble(base_learner=base_learner, orders=orders)),
])

full_pipeline.fit(X_train, y_train)
train_predictions = full_pipeline.predict_proba(X_train)
test_predictions = full_pipeline.predict_proba(X_test)

# ROC AUC of the predicted probabilities on the held-out split.
test_auc = roc_auc_score(y_test, test_predictions)
print(f"AUC: {test_auc}")

AUC: 0.8634278808194351


# Feature Importance

# Submission

In [82]:
# Load the holdout features; respondent_id stays on X_holdout and is only removed
# from the frame passed through the fitted preprocessing pipeline.
X_holdout = pd.read_csv("../../data/test_set_features.csv")
holdout_features = X_holdout.drop(columns=["respondent_id"])
X_holdout_processed = pipeline.transform(holdout_features)

# Ensemble probabilities for the holdout rows.
holdout_predictions = cce.predict_proba(X_holdout_processed)

In [83]:
# One probability column per target, plus the respondent id taken from the holdout frame.
prediction_columns = ['h1n1_vaccine', 'seasonal_vaccine']
submission_df = pd.DataFrame(holdout_predictions, columns=prediction_columns)
submission_df['respondent_id'] = X_holdout['respondent_id']

# Write the file with respondent_id as the first column and no index column.
submission_columns = ['respondent_id', *prediction_columns]
submission_df[submission_columns].to_csv("../submissions/classifier_chain_gb.csv", index=False)