In [60]:
import xgboost as xgb
from xgboost import XGBClassifier 
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score

from sklearn.svm import SVC

%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [61]:
# Load the three competition tables; `respondent_id` becomes the row index of each frame.
X_train_df, y_train_df, X_test_df = (
    pd.read_csv(csv_name, index_col="respondent_id")
    for csv_name in (
        "training_set_features.csv",
        "training_set_labels.csv",
        "test_set_features.csv",
    )
)

X_train_df.shape, X_test_df.shape
# Output (respondent_id is the index, so it is not counted as a column):
# ((26707, 35), (26708, 35))


((26707, 35), (26708, 35))

In [62]:
# Hold out part of the labelled data for evaluation; the fixed seed keeps the split reproducible.
random_seed = 11
test_size = 0.2
X_train, X_eval, y_train, y_eval = train_test_split(
    X_train_df,
    y_train_df,
    test_size=test_size,
    random_state=random_seed,
)

In [63]:
# Transform the two independent binary labels into a single categorical output.

# Human-readable name of each category number.
label_number_to_categories = {
    0: "not vaccinated",
    1: "only seasonal",
    2: "only h1n1",
    3: "seasonal and h1n1",
}
# Pair of binary labels (h1n1 flag first, seasonal flag second) encoded by each category number.
label_number_to_multilabel = {0: [0, 0], 1: [0, 1], 2: [1, 0], 3: [1, 1]}


def multiLableTocategory(l):
    """Map a pair of binary labels to its category number.

    The mapping is the inverse of ``label_number_to_multilabel``, so the two
    cannot drift apart.

    Parameters
    ----------
    l : array-like of length 2
        One row of binary labels, e.g. ``[0, 1]``.

    Returns
    -------
    int
        The category number (0-3) whose multilabel equals ``l``.

    Raises
    ------
    ValueError
        If ``l`` is not one of the known label pairs (for example it contains
        NaN or a value other than 0/1). Previously such rows silently became
        ``None`` and only failed later, inside the classifier.
    """
    for label_number, multilabel in label_number_to_multilabel.items():
        if np.array_equal(l, multilabel):
            return label_number
    raise ValueError(
        f"Unknown multilabel {l!r}; expected one of "
        f"{list(label_number_to_multilabel.values())}"
    )

# Collapse every label row of the train / eval split into its single category number.
y_train_cat = list(map(multiLableTocategory, y_train.values))
y_eval_cat = list(map(multiLableTocategory, y_eval.values))

In [64]:
# Split the feature columns by dtype. Testing for "is numeric" directly (instead of
# "is not object") keeps string-dtype, category and datetime columns out of the
# numeric group, where they would otherwise be routed through the numeric pipeline.
_numeric_mask = X_train_df.dtypes.map(pd.api.types.is_numeric_dtype).to_numpy(dtype=bool)
num_cols = X_train_df.columns[_numeric_mask].values
non_numeric_columns = X_train_df.columns[~_numeric_mask].values

In [73]:
# Preprocessing pipeline.

# Numeric columns: the imputer fills missing values with NaN, i.e. it leaves them
# as NaN (the classifier below reports `missing=nan` in its parameters).
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
numeric_preprocessing_pipeline = Pipeline([
    ("simple_imputer", SimpleImputer(strategy="constant", fill_value=np.nan))
])


# Column transformer: numeric columns are passed through the numeric pipeline AND
# one-hot encoded; non-numeric columns are one-hot encoded. Any other column is dropped.
# `handle_unknown="ignore"` encodes a category that was not seen during `fit` as
# all zeros instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers = [
        ("numeric", numeric_preprocessing_pipeline, num_cols),
        ("ohe_num", OneHotEncoder(handle_unknown="ignore"), num_cols),
        ("ohe_non_num", OneHotEncoder(handle_unknown="ignore"), non_numeric_columns)
    ],
    remainder="drop"
)

xbg_clf = XGBClassifier(use_label_encoder=False)
#multi_estimator_rdf= MultiOutputClassifier(estimator=xbg_clf)

# End-to-end model: preprocessing followed by the XGBoost classifier.
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("xgb_classifier", xbg_clf),
])



In [74]:
# Fit preprocessing and classifier on the training split, using the 4-class target.
full_pipeline.fit(X=X_train, y=y_train_cat)



Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('simple_imputer',
                                                                   SimpleImputer(fill_value=nan,
                                                                                 strategy='constant'))]),
                                                  array(['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask',
       'behavioral_wash_hands', 'behavioral_large_gatherings',
       'behavioral_outside_h...
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=24, num_parallel_tree=1,
                               objective='multi:softprob

In [75]:
# Hard class predictions for the hold-out split (open question: use predict_proba instead?),
# scored by plain accuracy against the categorical evaluation labels.
y_preds = full_pipeline.predict(X_eval)
accuracy_score(y_true=y_eval_cat, y_pred=y_preds)

0.6988019468363909

In [79]:
# Pull the classifier step back out of the pipeline by its step name.
xgb_clf = full_pipeline.named_steps["xgb_classifier"]
type(xgb_clf)

xgboost.sklearn.XGBClassifier

Documentation of the XGBoost Python package (dmlc/xgboost), section "Setting Parameters":

https://xgboost.readthedocs.io/en/stable/python/python_intro.html#setting-parameters

In [None]:
# Show the hyper-parameters the classifier is currently configured with.
xgb_clf.get_params(deep=True)

# output:
#{'objective': 'multi:softprob',
# 'use_label_encoder': False,
# 'base_score': 0.5,
# 'booster': 'gbtree',
# 'colsample_bylevel': 1,
# 'colsample_bynode': 1,
# 'colsample_bytree': 1,
# 'enable_categorical': False,
# 'gamma': 0,
# 'gpu_id': -1,
# 'importance_type': None,
# 'interaction_constraints': '',
# 'learning_rate': 0.300000012,
# 'max_delta_step': 0,
# 'max_depth': 6,
# 'min_child_weight': 1,
# 'missing': nan,
# 'monotone_constraints': '()',
# 'n_estimators': 100,
# 'n_jobs': 24,
# 'num_parallel_tree': 1,
# 'predictor': 'auto',
# 'random_state': 0,
# 'reg_alpha': 0,
# 'reg_lambda': 1,
# 'scale_pos_weight': None,
# 'subsample': 1,
# 'tree_method': 'exact',
# 'validate_parameters': 1,
# 'verbosity': None}