In [243]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, roc_auc_score

RANDOM_SEED = 6    # Set a random seed for reproducibility!


from pathlib import Path
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 100)

In [244]:
DATA_PATH = Path.cwd()

features_df = pd.read_csv(DATA_PATH / "training_set_features.csv", index_col="respondent_id")
labels_df = pd.read_csv(DATA_PATH / "training_set_labels.csv", index_col="respondent_id")

test_features_df = pd.read_csv(DATA_PATH / "test_set_features.csv", index_col="respondent_id")

In [245]:
features_df

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,55 - 64 Years,< 12 Years,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,4.0,4.0,4.0,2.0,4.0,35 - 44 Years,12 Years,White,Male,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,,1.0,0.0,0.0,,3.0,1.0,1.0,4.0,1.0,2.0,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,,3.0,3.0,5.0,5.0,4.0,1.0,65+ Years,12 Years,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,3.0,3.0,2.0,3.0,1.0,4.0,45 - 54 Years,Some College,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,3.0,1.0,1.0,5.0,2.0,2.0,65+ Years,Some College,White,Female,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,4.0,2.0,2.0,5.0,1.0,1.0,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,4.0,4.0,2.0,5.0,4.0,2.0,55 - 64 Years,Some College,White,Female,,Not Married,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,2.0,2.0,1.0,2.0,18 - 34 Years,Some College,Hispanic,Female,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg


In [246]:
labels_df.head(5)

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,0
1,0,1
2,0,0
3,0,1
4,0,0


In [247]:
features_df.dtypes

h1n1_concern                   float64
h1n1_knowledge                 float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_h1n1               float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_h1n1_vacc_effective    float64
opinion_h1n1_risk              float64
opinion_h1n1_sick_from_vacc    float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                             object
income_poverty           

In [253]:
from sklearn.compose import make_column_selector as selector

categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(features_df)
categorical_columns

['age_group',
 'education',
 'race',
 'sex',
 'income_poverty',
 'marital_status',
 'rent_or_own',
 'employment_status',
 'hhs_geo_region',
 'census_msa',
 'employment_industry',
 'employment_occupation']

In [249]:
features_df[categorical_columns].head(5)

Unnamed: 0_level_0,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,employment_industry,employment_occupation
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,55 - 64 Years,< 12 Years,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,,
1,35 - 44 Years,12 Years,White,Male,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",pxcmvdjn,xgwztkwe
2,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",rucpziij,xtkaffoo
3,65+ Years,12 Years,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",,
4,45 - 54 Years,Some College,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",wxleyezf,emcorrxb


In [250]:
features_df[categorical_columns].shape

(26707, 12)

In [254]:
# from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder(handle_unknown='error')

encoded_data = enc.fit_transform(features_df[categorical_columns])

test_encoded_data = enc.fit_transform(test_features_df[categorical_columns])

In [255]:
encoded_data

array([[ 3.,  1.,  3., ...,  2., nan, nan],
       [ 1.,  0.,  3., ...,  0., 12., 19.],
       [ 0.,  2.,  3., ...,  0., 14., 21.],
       ...,
       [ 3.,  3.,  3., ...,  0., nan, nan],
       [ 0.,  3.,  1., ...,  2.,  4.,  6.],
       [ 4.,  3.,  3., ...,  1., nan, nan]])

In [256]:
encoded_data = pd.DataFrame(encoded_data, columns=categorical_columns, index=features_df.index)
encoded_data = pd.concat([encoded_data, features_df.drop(columns=categorical_columns)], axis=1)

test_encoded_data = pd.DataFrame(test_encoded_data, columns=categorical_columns, index=test_features_df.index)
test_encoded_data = pd.concat([test_encoded_data, test_features_df.drop(columns=categorical_columns)], axis=1)

In [257]:
encoded_data

Unnamed: 0_level_0,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,employment_industry,employment_occupation,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
0,3.0,1.0,3.0,0.0,2.0,1.0,0.0,1.0,8.0,2.0,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,0.0,0.0
1,1.0,0.0,3.0,1.0,2.0,1.0,1.0,0.0,1.0,0.0,12.0,19.0,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,4.0,4.0,4.0,2.0,4.0,0.0,0.0
2,0.0,2.0,3.0,1.0,0.0,1.0,0.0,0.0,9.0,0.0,14.0,21.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,,1.0,0.0,0.0,,3.0,1.0,1.0,4.0,1.0,2.0,2.0,0.0
3,4.0,0.0,3.0,0.0,2.0,1.0,1.0,1.0,5.0,1.0,,,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,,3.0,3.0,5.0,5.0,4.0,1.0,0.0,0.0
4,2.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,18.0,5.0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,3.0,3.0,2.0,3.0,1.0,4.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,4.0,3.0,3.0,0.0,0.0,1.0,0.0,1.0,9.0,2.0,,,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,3.0,1.0,1.0,5.0,2.0,2.0,0.0,0.0
26703,0.0,2.0,3.0,1.0,0.0,1.0,1.0,0.0,6.0,1.0,4.0,2.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,4.0,2.0,2.0,5.0,1.0,1.0,1.0,0.0
26704,3.0,3.0,3.0,0.0,,1.0,0.0,,6.0,0.0,,,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,4.0,4.0,2.0,5.0,4.0,2.0,0.0,0.0
26705,0.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,5.0,2.0,4.0,6.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,2.0,2.0,1.0,2.0,1.0,0.0


In [314]:
# import sys
# from impyute.imputation.cs import fast_knn
# sys.setrecursionlimit(100000) #Increase the recursion limit of the OS

# # start the KNN training
# imputed_training = fast_knn(concatenated_data.values, k=30)
# concatenated_data = pd.DataFrame(imputed_training, columns=concatenated_data.columns)

from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

# imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp = KNNImputer(n_neighbors=200)

imputed_training_data = imp.fit_transform(encoded_data)
imputed_test_data = imp.fit_transform(test_encoded_data)

concatenated_data = pd.DataFrame(imputed_training_data, columns=encoded_data.columns, index=features_df.index)
concatenated_test_data = pd.DataFrame(imputed_test_data, columns=test_encoded_data.columns, index=test_features_df.index)

In [315]:
X_train, X_eval, y_train, y_eval = train_test_split(
    concatenated_data,
    labels_df,
    test_size=0.33,
    shuffle=True,
    stratify=labels_df,
    random_state=RANDOM_SEED
)

In [316]:
X_train

Unnamed: 0_level_0,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,employment_industry,employment_occupation,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
16069,0.0,2.000,3.0,0.0,0.00,0.00,1.000,1.00,1.0,0.0,9.440,10.975,2.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.000,0.000,0.0,0.0,1.0,1.000,2.0,2.0,2.0,4.0,4.0,2.0,1.0,0.0
15318,2.0,2.000,3.0,1.0,1.00,0.00,0.000,2.00,0.0,0.0,8.950,10.565,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.000,0.000,0.0,0.0,0.0,1.000,5.0,2.0,2.0,5.0,1.0,1.0,1.0,0.0
21253,4.0,2.000,3.0,1.0,1.00,0.00,0.000,1.00,8.0,2.0,8.775,12.440,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.000,0.000,1.0,0.0,0.0,1.000,5.0,1.0,1.0,5.0,2.0,1.0,1.0,0.0
24458,0.0,1.945,3.0,0.0,0.61,0.26,0.195,0.28,9.0,2.0,9.895,9.270,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.325,0.405,0.0,0.0,0.0,0.885,4.0,4.0,2.0,4.0,4.0,2.0,1.0,2.0
17301,4.0,0.000,0.0,1.0,0.00,1.00,1.000,1.00,7.0,1.0,10.640,12.840,2.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.000,0.000,0.0,0.0,0.0,1.000,4.0,4.0,2.0,4.0,4.0,4.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22851,1.0,2.000,1.0,0.0,0.00,1.00,1.000,0.00,6.0,1.0,0.000,22.000,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.000,0.0,0.0,0.0,0.800,3.0,1.0,2.0,4.0,1.0,2.0,1.0,1.0
10492,1.0,2.000,2.0,1.0,1.00,0.00,0.000,0.00,1.0,1.0,4.000,2.000,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000,1.000,1.0,0.0,1.0,1.000,4.0,2.0,1.0,4.0,4.0,4.0,1.0,1.0
12016,0.0,0.000,0.0,0.0,2.00,1.00,1.000,1.00,3.0,1.0,10.790,13.160,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.000,1.000,1.0,1.0,0.0,0.615,3.0,5.0,5.0,2.0,2.0,4.0,0.0,3.0
8855,4.0,2.000,3.0,1.0,1.00,0.00,0.000,2.00,6.0,0.0,10.215,13.265,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.000,1.000,0.0,0.0,0.0,1.000,4.0,2.0,1.0,4.0,2.0,2.0,1.0,0.0


In [335]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SequentialFeatureSelector

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# classifier = LogisticRegression(random_state=0, penalty="l2", C=1, max_iter=1000)
# classifier = LogisticRegressionCV(cv=5, random_state=0, max_iter=1000)
# classifier = PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)
# classifier = SGDClassifier(max_iter=1000, tol=1e-3, loss="modified_huber")
# classifier = KNeighborsClassifier(n_neighbors=50)
# classifier = XGBClassifier()

classifier = CatBoostClassifier(
    verbose=False,
    iterations=1000
)


pipe = make_pipeline(StandardScaler(), MultiOutputClassifier(classifier))

pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('multioutputclassifier',
                 MultiOutputClassifier(estimator=<catboost.core.CatBoostClassifier object at 0x000001D0669FD2E0>))])

In [336]:
y_eval_pred=pipe.predict_proba(X_eval)
# y_eval_pred
y_eval_pred = pd.DataFrame(
    {
        "h1n1_vaccine": y_eval_pred[0][:, 1],
        "seasonal_vaccine": y_eval_pred[1][:, 1],
    },
    index = y_eval.index
)
print("y_eval_pred.shape:", y_eval_pred.shape)
y_eval_pred.head()

y_eval_pred.shape: (8814, 2)


Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6728,0.560056,0.496444
16516,0.056728,0.459835
3106,0.050778,0.568094
16981,0.973527,0.953922
19111,0.062218,0.666868


In [337]:
y_eval_pred

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6728,0.560056,0.496444
16516,0.056728,0.459835
3106,0.050778,0.568094
16981,0.973527,0.953922
19111,0.062218,0.666868
...,...,...
6876,0.162510,0.268224
5834,0.815259,0.792058
13478,0.028111,0.161290
18399,0.105529,0.063559


In [338]:
roc_auc_score(y_eval, y_eval_pred)

0.863775444686177

# GENERATING THE PREDICTIONS FOR THE TEST SET

In [321]:
pipe.fit(concatenated_data, labels_df)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('multioutputclassifier',
                 MultiOutputClassifier(estimator=<catboost.core.CatBoostClassifier object at 0x000001D0636A0CD0>))])

In [322]:
concatenated_test_data

Unnamed: 0_level_0,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,employment_industry,employment_occupation,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
26707,1.0,2.00,1.0,0.0,1.00,1.0,1.000,0.000,7.0,0.0,1.000,7.000,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.000,5.0,1.0,1.0,5.0,1.0,1.0,1.0,0.0
26708,0.0,0.00,3.0,1.0,2.00,1.0,1.000,0.000,1.0,2.0,1.000,20.000,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,4.0,1.0,1.0,4.0,1.0,1.0,3.0,0.0
26709,3.0,2.00,3.0,1.0,1.00,0.0,0.000,0.000,5.0,2.0,10.000,12.000,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.955,5.0,4.0,2.0,5.0,4.0,4.0,1.0,0.0
26710,4.0,0.00,3.0,0.0,0.00,0.0,0.000,1.000,5.0,0.0,10.285,12.575,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.000,4.0,2.0,2.0,4.0,4.0,2.0,1.0,0.0
26711,1.0,0.00,0.0,0.0,0.00,1.0,0.000,0.000,6.0,2.0,4.000,10.000,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.000,5.0,2.0,4.0,4.0,4.0,2.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53410,1.0,1.87,3.0,0.0,0.63,0.3,0.255,0.225,2.0,1.0,8.955,12.130,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.950,4.0,2.0,2.0,4.0,2.0,1.0,1.0,1.0
53411,0.0,0.00,3.0,1.0,2.00,0.0,1.000,0.000,9.0,2.0,4.000,18.000,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.000,4.0,1.0,1.0,5.0,2.0,2.0,1.0,3.0
53412,0.0,3.00,3.0,0.0,2.00,1.0,1.000,1.000,9.0,0.0,9.730,11.405,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000,4.0,3.0,1.0,4.0,3.0,1.0,1.0,0.0
53413,3.0,3.00,3.0,0.0,0.00,0.0,0.000,1.000,1.0,0.0,9.655,11.840,3.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.945,2.0,3.0,4.0,4.0,3.0,2.0,1.0,0.0


In [323]:
test_probas = pipe.predict_proba(concatenated_test_data)
test_probas

[array([[0.84673312, 0.15326688],
        [0.972976  , 0.027024  ],
        [0.82534479, 0.17465521],
        ...,
        [0.84758747, 0.15241253],
        [0.98734558, 0.01265442],
        [0.51549048, 0.48450952]]),
 array([[0.67804146, 0.32195854],
        [0.98683398, 0.01316602],
        [0.20826537, 0.79173463],
        ...,
        [0.81179713, 0.18820287],
        [0.63982648, 0.36017352],
        [0.26409113, 0.73590887]])]

In [324]:
submission_df = pd.read_csv(DATA_PATH / "submission_format.csv", index_col="respondent_id")
submission_df.head()

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.5,0.7
26708,0.5,0.7
26709,0.5,0.7
26710,0.5,0.7
26711,0.5,0.7


In [325]:
# Make sure we have the rows in the same order
np.testing.assert_array_equal(test_features_df.index.values, 
                              submission_df.index.values)

# Save predictions to submission data frame
submission_df["h1n1_vaccine"] = test_probas[0][:, 1]
submission_df["seasonal_vaccine"] = test_probas[1][:, 1]

submission_df.head()

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.153267,0.321959
26708,0.027024,0.013166
26709,0.174655,0.791735
26710,0.696905,0.90089
26711,0.280597,0.423716


In [326]:
submission_df.to_csv('submission.csv', index=True)

In [59]:
# !pip install catboost

Collecting catboost
  Downloading catboost-1.1.1-cp39-none-win_amd64.whl (74.0 MB)
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.1.1 graphviz-0.20.1


In [207]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=100000, n_features=100, n_informative=20, random_state=0)
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=0)

In [208]:
len(train_X)

75000