In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pf
%matplotlib inline

In [158]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import MultiOutputRegressor

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

RANDOM_SEED = 6    # Set a random seed for reproducibility!

In [159]:
from sklearn.svm import SVR
from sklearn.svm import SVC

In [160]:
train = pd.read_csv('./training_set_features.csv', index_col='respondent_id')
test = pd.read_csv('./test_set_features.csv', index_col='respondent_id')
labels = pd.read_csv('./training_set_labels.csv', index_col='respondent_id')

In [161]:
numeric_cols = train.columns[train.dtypes != "object"].values
# chain preprocessing into a Pipeline object
# each step is a tuple of (name you chose, sklearn transformer)
numeric_preprocessing_steps = Pipeline([
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='median'))
])

# create the preprocessor stage of final pipeline
# each entry in the transformer list is a tuple of
# (name you choose, sklearn transformer, list of columns)
preprocessor = ColumnTransformer(
    transformers = [
        ("numeric", numeric_preprocessing_steps, numeric_cols)
    ],
    remainder = "drop"
)

In [162]:
train.shape[0]

26707

In [163]:
gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=0.01, max_features=2, max_depth=2, random_state=0)

In [164]:
estimators = MultiOutputClassifier(gb_clf)

In [165]:
#estimators = MultiOutputClassifier(
#   estimator=SVR(kernel='rbf', C=1)
#)

In [166]:

full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimators),
])

In [167]:
X_train, X_eval, y_train, y_eval = train_test_split(
    train,
    labels,
    test_size=0.33,
    shuffle=True,
    stratify=labels,
    random_state=RANDOM_SEED
)

In [168]:
%%time

# Train model
full_pipeline.fit(X_train, y_train)



Wall time: 223 ms


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('standard_scaler',
                                                                   StandardScaler()),
                                                                  ('simple_imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  array(['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask',
       'behavioral_wash_hands', 'behavioral_large_gatherings',
       '...
       'health_insurance', 'opinion_h1n1_vacc_effective',
       'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
       'opinion_seas_vacc_effective', 'opinion_seas_risk',
       'opinion_seas_sick_from_vacc', 'household_adults',
       'household_children'], dtype=object))])),
            

In [169]:
# Predict on evaluation set
# This competition wants probabilities, not labels
preds = full_pipeline.predict_proba(X_eval)
preds

[array([[0.77949513, 0.22050487],
        [0.8008863 , 0.1991137 ],
        [0.79901097, 0.20098903],
        ...,
        [0.79953727, 0.20046273],
        [0.80229775, 0.19770225],
        [0.7969706 , 0.2030294 ]]),
 array([[0.53774574, 0.46225426],
        [0.55169248, 0.44830752],
        [0.54429131, 0.45570869],
        ...,
        [0.55764068, 0.44235932],
        [0.55929355, 0.44070645],
        [0.51970105, 0.48029895]])]

In [170]:
print("test_probas[0].shape", preds[0].shape)

test_probas[0].shape (8814, 2)


In [171]:
y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": preds[0][:, 1],
        "seasonal_vaccine": preds[1][:, 1],
    },
    index = y_eval.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (8814, 2)


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6728,0.220505,0.462254
16516,0.199114,0.448308
3106,0.200989,0.455709
16981,0.227379,0.492759
19111,0.217124,0.47835


In [172]:
roc_auc_score(y_eval, y_preds)

0.8056575558931736

In [173]:
full_pipeline.fit(train, labels)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('standard_scaler',
                                                                   StandardScaler()),
                                                                  ('simple_imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  array(['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask',
       'behavioral_wash_hands', 'behavioral_large_gatherings',
       '...
       'health_insurance', 'opinion_h1n1_vacc_effective',
       'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
       'opinion_seas_vacc_effective', 'opinion_seas_risk',
       'opinion_seas_sick_from_vacc', 'household_adults',
       'household_children'], dtype=object))])),
            

In [174]:
# Predict on evaluation set
# This competition wants probabilities, not labels
preds = full_pipeline.predict(X_eval)
preds

array([[0, 0],
       [0, 0],
       [0, 0],
       ...,
       [0, 0],
       [0, 0],
       [0, 0]], dtype=int64)

In [175]:
y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": preds[:, 1],
    },
    index = y_eval.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (8814, 1)


Unnamed: 0_level_0,h1n1_vaccine
respondent_id,Unnamed: 1_level_1
6728,0
16516,0
3106,0
16981,0
19111,0


In [176]:
roc_auc_score(y_eval, y_preds)

0.5467924709497743

In [177]:
print("test_probas[0].shape", preds.shape)

test_probas[0].shape (8814, 2)


In [178]:
test_probas = full_pipeline.predict(test)

In [179]:
submission_df = pd.read_csv("submission_format.csv", 
                            index_col="respondent_id")
submission_df.head()

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.5,0.7
26708,0.5,0.7
26709,0.5,0.7
26710,0.5,0.7
26711,0.5,0.7


In [180]:
np.testing.assert_array_equal(test.index.values, 
                              submission_df.index.values)

# Save predictions to submission data frame
submission_df["h1n1_vaccine"] = test_probas[0][:, 1]
submission_df["seasonal_vaccine"] = test_probas[1][:, 1]

submission_df.head()

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [None]:
submission_df.to_csv('my_submission.csv', index=True)
!head my_submission.csv

'head' is not recognized as an internal or external command,
operable program or batch file.
