## GridSearchCV on Sklearn's MultiOutputClassifier ##

Compare the run-times of this solution, where we use the Sklearn MultiOutputClassifier and apply GridSearchCV to it, with the run-times of the solution above, where we transform the multiclass-labels to four categories, train a pipeline containing a gridsearch:

Training: this = 0.8s, above = 1m53,3s

Retraining on full set: this = 1.2s , above = 2m20.5s

Maybe the gridsearch in the pipeline is time consuming and it should rather be a pipeline in the gridsearch?

In [1]:
import xgboost as xgb
from xgboost import XGBClassifier 
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, auc

from sklearn.svm import SVC

%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

import auxiliaries as aux

  from pandas import MultiIndex, Int64Index


#### Read and Train-Test Split Data ####

In [None]:
# read the csv-files and take the respondent_id column as index:

X_train_df = pd.read_csv("training_set_features.csv", index_col="respondent_id")
y_train_df = pd.read_csv("training_set_labels.csv", index_col="respondent_id")
X_test_df = pd.read_csv("test_set_features.csv", index_col="respondent_id")

X_train_df.shape, X_test_df.shape
# Output:
# ((26707, 36), (26708, 36))


In [None]:
random_seed = 11
test_size = 0.2
X_train, X_eval, y_train, y_eval = train_test_split(X_train_df, y_train_df, test_size=test_size, random_state=random_seed)

#### Pre-processing Pipeline ####

In [None]:
num_cols = X_train_df.columns[X_train_df.dtypes != "object"].values
non_numeric_columns = X_train_df.columns[X_train_df.dtypes == "object"].values

In [None]:
# prepro pipeline for the numeric columns (we drop the non-numeric columns):

from sklearn.model_selection import GridSearchCV

# impute the most frequent value:
impute_most_frequent_numeric = Pipeline([
    ("most_frequent_imputer", SimpleImputer(strategy="most_frequent"))
    ])


# column transformer with only numerical columns:

preprocessor = ColumnTransformer(
    transformers = [
        ("numeric", impute_most_frequent_numeric, num_cols),
        ("ohe_num", OneHotEncoder(), num_cols),
        ("ohe_non_num", OneHotEncoder(), non_numeric_columns)
    ],
    remainder="drop" # drop all non-numeric columns from the data
)


In [None]:
# grid-search for the multioutput-classifier as a whole:



model = XGBClassifier(use_label_encoder=False, eval_metric="error")
multioutput_model = MultiOutputClassifier(model)

multi_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("multioutput_clf", multioutput_model),
])

max_depth = [2, 4, 8]
n_estimators = [100, 200, 400, 800]


gridcv_params = dict(max_depth=max_depth, n_estimators=n_estimators)

grid_search = GridSearchCV(multi_pipeline, param_grid=gridcv_params, scoring="roc_auc", n_jobs=-1)

#### Train Model and Plot Results ####


In [None]:
grid_result = grid_search.fit(X_train, y_train)

#print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")

In [None]:
y_preds = multi_pipeline.predict_proba(X_eval)

# probabilities for the two categories:
y_preds[0]

In [None]:
# peek at data-structure:

y_eval.head(1)

In [None]:
y_preds_true = true_preds(y_preds, y_eval.index)

In [None]:
aux.plot_roc_h1n1_and_seasonal(y_eval, y_preds_true)

#### Save Submission-File ####

In [None]:
# save to "my_submission_file":

my_submission_file = "my_submission_MultiOutputClassifier_xgboost.csv"

aux.train_fullset_and_save(multi_pipeline, X_test_df, X_train_df, y_train_df, my_submission_file)

Documentation of dmlc XGBoost / py-xgboost(?)

https://xgboost.readthedocs.io/en/stable/python/python_intro.html#setting-parameters

In [None]:
model.get_params()

# output:
#{'objective': 'multi:softprob',
# 'use_label_encoder': False,
# 'base_score': 0.5,
# 'booster': 'gbtree',
# 'colsample_bylevel': 1,
# 'colsample_bynode': 1,
# 'colsample_bytree': 1,
# 'enable_categorical': False,
# 'gamma': 0,
# 'gpu_id': -1,
# 'importance_type': None,
# 'interaction_constraints': '',
# 'learning_rate': 0.300000012,
# 'max_delta_step': 0,
# 'max_depth': 6,
# 'min_child_weight': 1,
# 'missing': nan,
# 'monotone_constraints': '()',
# 'n_estimators': 100,
# 'n_jobs': 24,
# 'num_parallel_tree': 1,
# 'predictor': 'auto',
# 'random_state': 0,
# 'reg_alpha': 0,
# 'reg_lambda': 1,
# 'scale_pos_weight': None,
# 'subsample': 1,
# 'tree_method': 'exact',
# 'validate_parameters': 1,
# 'verbosity': None}