In [1]:
import pandas as pd
import numpy as np
# Show up to 40 columns when a frame is rendered.
pd.set_option("display.max_columns", 40)

# Both training files are indexed on their respondent_id column.
features_df = pd.read_csv("training_set_features.csv", index_col="respondent_id")
labels_df = pd.read_csv("training_set_labels.csv", index_col="respondent_id")

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

# Seed passed as random_state to train_test_split so the train/eval split is repeatable.
RANDOM_SEED = 6

In [3]:
# Partition the feature columns by dtype: every column whose dtype is not
# "object" goes to the numeric pipeline, object columns to the categorical one.
numeric_cols = features_df.columns[features_df.dtypes != "object"].values

categorical_features = features_df.select_dtypes(include=['object']).columns

# Only the last expression of a cell is rendered, so a single display line is kept.
categorical_features

Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
       'census_msa', 'employment_industry', 'employment_occupation'],
      dtype='object')

In [4]:

# Numeric branch: standardise, then fill what is still missing with the median.
# StandardScaler disregards NaNs while fitting and leaves them in place when
# transforming, so the median imputed afterwards is that of the scaled column.
numeric_preprocessing_steps = Pipeline(steps=[
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='median')),
])

# Categorical branch: replace missing entries with SimpleImputer's default
# constant for object data ("missing_value"), then map categories to integers.
# Categories not seen during fit are encoded as -1 instead of raising at
# predict time, which matters because the pipeline is fit on one set of rows
# and then applied to others (the evaluation split, the test file).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Route each column group through its own branch; columns in neither list are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_preprocessing_steps, numeric_cols),
        ('categorical', categorical_transformer, categorical_features),
    ],
    remainder="drop",
)

In [18]:
# Wrap an L2-penalised logistic regression so that one copy is fit per label column.
estimators = MultiOutputClassifier(
    estimator=LogisticRegression(
        penalty="l2",
        C=1,
        max_iter=200,
    )
)

In [19]:
# Chain preprocessing and the multi-output classifier into a single estimator,
# so preprocessing is fit only on whatever rows are passed to .fit().
full_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("estimators", estimators),
    ]
)

In [20]:
# Hold out 33% of the rows for evaluation. Stratifying on the whole label frame
# treats each combination of the label columns as its own stratum.
X_train, X_eval, y_train, y_eval = train_test_split(
    features_df,
    labels_df,
    test_size=0.33,
    random_state=RANDOM_SEED,
    shuffle=True,
    stratify=labels_df,
)

In [21]:
%%time

# Train model
# (fits the preprocessing steps and the classifiers on the training split only)
full_pipeline.fit(X_train, y_train)

# Predict on evaluation set
# This competition wants probabilities, not labels
# predict_proba returns a list with one (n_rows, 2) array per label column.
preds = full_pipeline.predict_proba(X_eval)
preds

CPU times: user 1.33 s, sys: 30.5 ms, total: 1.36 s
Wall time: 439 ms


[array([[0.68123215, 0.31876785],
        [0.86866047, 0.13133953],
        [0.77032177, 0.22967823],
        ...,
        [0.9274836 , 0.0725164 ],
        [0.92230142, 0.07769858],
        [0.93357794, 0.06642206]]),
 array([[0.61421728, 0.38578272],
        [0.4991771 , 0.5008229 ],
        [0.23994877, 0.76005123],
        ...,
        [0.85553417, 0.14446583],
        [0.90263973, 0.09736027],
        [0.13714155, 0.86285845]])]

In [22]:
# Shapes of the evaluation-split probability arrays (one array per label column).
# The printed labels name the variable that is actually inspected here.
print("preds[0].shape", preds[0].shape)
print("preds[1].shape", preds[1].shape)

test_probas[0].shape (8814, 2)
test_probas[1].shape (8814, 2)


In [23]:
# Keep only column 1 of each probability array and line the rows up with y_eval.
# NOTE(review): assumes preds[0] / preds[1] follow the label-column order
# h1n1_vaccine, seasonal_vaccine and that column 1 is the positive class - confirm.
y_preds = pd.DataFrame(
    data={
        "h1n1_vaccine": preds[0][:, 1],
        "seasonal_vaccine": preds[1][:, 1],
    },
    index=y_eval.index,
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (8814, 2)


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6728,0.318768,0.385783
16516,0.13134,0.500823
3106,0.229678,0.760051
16981,0.74354,0.909441
19111,0.230699,0.691115


In [24]:
# Score the held-out predictions. With a multi-column target and default
# arguments, roc_auc_score averages the per-column AUCs (average="macro").
roc_auc_score(y_eval, y_preds)

0.8396461099877355

In [25]:


# Refit the same pipeline on every labelled row (not just the training split);
# the trailing semicolon suppresses the fitted-estimator repr.
full_pipeline.fit(features_df, labels_df);



In [26]:
# Unlabelled test features, indexed on respondent_id like the training files.
test_features_df = pd.read_csv(
    "test_set_features.csv",
    index_col="respondent_id",
)


In [27]:
test_probas = full_pipeline.predict_proba(test_features_df)
test_probas

submission_df = pd.read_csv("submission_format.csv", 
                            index_col="respondent_id")

np.testing.assert_array_equal(test_features_df.index.values, 
                              submission_df.index.values)

# Save predictions to submission data frame
submission_df["h1n1_vaccine"] = test_probas[0][:, 1]
submission_df["seasonal_vaccine"] = test_probas[1][:, 1]

submission_df.head()

submission_df.to_csv('my_submission.csv', index=True)
!head my_submission.csv

respondent_id,h1n1_vaccine,seasonal_vaccine
26707,0.09254276599659124,0.29810539119905677
26708,0.04347660806948814,0.04534987443282374
26709,0.42812219486590447,0.6781070729351514
26710,0.45831561648353003,0.8684481297634488
26711,0.19350916437181787,0.45527302514710427
26712,0.44298084267123305,0.9260404659600844
26713,0.3843064156313638,0.535098198818608
26714,0.15498926307311353,0.316482696513783
26715,0.04123712259137317,0.18709634837734243
