In [30]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.impute import MissingIndicator, SimpleImputer, KNNImputer 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel

from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve



In [31]:
# Load the survey features and the vaccination labels.
features_df = pd.read_csv('../Data/training_set_features.csv')
labels_df = pd.read_csv('../Data/training_set_labels.csv')

# Check that the rows of the features and the labels match up.
# Neither file is read with an index column, so both frames carry a default
# 0..n-1 index and comparing the indexes would only compare row counts.
# Compare the respondent_id key column instead.
np.testing.assert_array_equal(features_df['respondent_id'].values,
                              labels_df['respondent_id'].values)

# Merge features_df and labels_df on the respondent key
df = labels_df.merge(features_df, how='inner', on='respondent_id')

# Drop fully duplicated rows
df = df.drop_duplicates()

In [32]:
# (rows, columns) of the merged, de-duplicated frame
df.shape

(26707, 38)

In [33]:
# Dtype and non-null count for every column of the merged frame
df.info()

# Columns 17 (health_insurance), 36 (employment_industry) and
# 37 (employment_occupation) are each missing in about 50% of the rows.


<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_vaccine                 26707 non-null  int64  
 2   seasonal_vaccine             26707 non-null  int64  
 3   h1n1_concern                 26615 non-null  float64
 4   h1n1_knowledge               26591 non-null  float64
 5   behavioral_antiviral_meds    26636 non-null  float64
 6   behavioral_avoidance         26499 non-null  float64
 7   behavioral_face_mask         26688 non-null  float64
 8   behavioral_wash_hands        26665 non-null  float64
 9   behavioral_large_gatherings  26620 non-null  float64
 10  behavioral_outside_home      26625 non-null  float64
 11  behavioral_touch_face        26579 non-null  float64
 12  doctor_recc_h1n1             24547 non-null  float64
 13  doctor_recc_seas

In [34]:
# Check balance of target variables

# Each print emits a heading, the tally, and (except for the last report)
# an empty string that yields the blank separator line.
print("H1N1 Vaccination (counts)",
      df["h1n1_vaccine"].value_counts(),
      "", sep="\n")
print("Seasonal Flu Vaccination (counts)",
      df["seasonal_vaccine"].value_counts(),
      "", sep="\n")
print("H1N1 Vaccination (percentages)",
      df["h1n1_vaccine"].value_counts(normalize=True),
      "", sep="\n")
print("Seasonal Flu Vaccination (percentages)",
      df["seasonal_vaccine"].value_counts(normalize=True),
      sep="\n")

# Only about 21% of respondents received the H1N1 vaccine, so that target is
# highly imbalanced. For model tuning, use SMOTE or another rebalancing method.



H1N1 Vaccination (counts)
0    21033
1     5674
Name: h1n1_vaccine, dtype: int64

Seasonal Flu Vaccination (counts)
0    14272
1    12435
Name: seasonal_vaccine, dtype: int64

H1N1 Vaccination (percentages)
0    0.787546
1    0.212454
Name: h1n1_vaccine, dtype: float64

Seasonal Flu Vaccination (percentages)
0    0.534392
1    0.465608
Name: seasonal_vaccine, dtype: float64


### Missing Values

In [35]:
# respondent_id is kept: the drop below is disabled, and the train/test split
# cell still selects the column.
#df.drop('respondent_id', axis=1, inplace=True)

# Report the number of missing values in each column
df.isna().sum()

respondent_id                      0
h1n1_vaccine                       0
seasonal_vaccine                   0
h1n1_concern                      92
h1n1_knowledge                   116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
opinion_h1n1_sick_from_vacc      395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
s

#### Data imputation

In [36]:
# Drop the three variables whose values are strings of random characters.
# The frame is modified in place, so this cell cannot be re-run without
# reloading df first.
df.drop(
    columns=['employment_industry', 'employment_occupation', 'hhs_geo_region'],
    inplace=True,
)



In [37]:
# For the socioeconomic variables, drop every row that is missing any of them
# (in place, so the row count of df shrinks here).
df.dropna(
    subset=[
        'health_worker',
        'education',
        'income_poverty',
        'marital_status',
        'rent_or_own',
        'employment_status',
        'household_adults',
        'household_children',
    ],
    inplace=True,
)


In [38]:
#features_df = pd.read_csv('../Data/training_set_features.csv')
#x_df = features_df

# Opinion questions: the same three topics asked for each vaccine.
survey_col = ['opinion_' + vaccine + '_' + topic
              for vaccine in ('h1n1', 'seas')
              for topic in ('vacc_effective', 'risk', 'sick_from_vacc')]

# Behavioural columns, split into two groups because the imputation cell
# fills each group with a different constant.
behavior_col = ['behavioral_' + habit
                for habit in ('antiviral_meds', 'face_mask',
                              'large_gatherings', 'outside_home')]

behavior_col_2 = ['behavioral_' + habit
                  for habit in ('avoidance', 'wash_hands', 'touch_face')]

# Doctor recommendation flags, one per vaccine.
doc_rec = ['doctor_recc_' + vaccine for vaccine in ('h1n1', 'seasonal')]

def impute_missing_data(dataframe, column_list, fillvalue):
    """Fill missing values in the given columns with a constant, in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify; the listed columns are overwritten on this object.
    column_list : list of str
        Names of the columns whose missing entries are replaced.
    fillvalue : scalar
        Value written wherever a listed column is missing.

    Returns
    -------
    None
        The frame is updated as a side effect.
    """
    for column in column_list:
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on dataframe[column]: that form fills an
        # intermediate Series, which emits a FutureWarning in pandas 2.x and
        # leaves the frame unchanged once copy-on-write is enabled.
        dataframe[column] = dataframe[column].fillna(fillvalue)
      
    
# Constant written into each group of columns where the answer is missing,
# applied in the order listed.
for _impute_cols, _impute_value in (
    (survey_col, 3),
    (['h1n1_concern'], 2),
    (['h1n1_knowledge'], 0),
    (behavior_col, 0),
    (behavior_col_2, 1),
    (doc_rec, 0),
    (['chronic_med_condition'], 0),
    (['child_under_6_months'], 0),
):
    impute_missing_data(df, _impute_cols, _impute_value)


In [39]:
# Re-count missing values per column after the row drops and constant fills
df.isnull().sum()

respondent_id                     0
h1n1_vaccine                      0
seasonal_vaccine                  0
h1n1_concern                      0
h1n1_knowledge                    0
behavioral_antiviral_meds         0
behavioral_avoidance              0
behavioral_face_mask              0
behavioral_wash_hands             0
behavioral_large_gatherings       0
behavioral_outside_home           0
behavioral_touch_face             0
doctor_recc_h1n1                  0
doctor_recc_seasonal              0
chronic_med_condition             0
child_under_6_months              0
health_worker                     0
health_insurance               9400
opinion_h1n1_vacc_effective       0
opinion_h1n1_risk                 0
opinion_h1n1_sick_from_vacc       0
opinion_seas_vacc_effective       0
opinion_seas_risk                 0
opinion_seas_sick_from_vacc       0
age_group                         0
education                         0
race                              0
sex                         

#### train_test_split

In [40]:
# Define X and y, then hold out 30% of the rows for testing (70% training).
# respondent_id stays in X and is also carried in y, next to the two targets.

X = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
y = df.loc[:, ['h1n1_vaccine', 'seasonal_vaccine', 'respondent_id']]

# Train test split (fixed random_state so the split is repeatable)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# X_train 15304 rows

#### Imputing Health Insurance 

In [41]:
# Create dummies for the object (string) columns so KNNImputer can use them

X_train_obj = X_train.select_dtypes(include=['object'])

# One-hot encode, dropping the first level of each variable.
# NOTE(review): `sparse=` and `get_feature_names()` are deprecated in newer
# scikit-learn releases (replaced by `sparse_output=` and
# `get_feature_names_out()`); switching would likely change the generated
# column names (x0_..., x1_...) — confirm before upgrading.
ohe = OneHotEncoder(drop='first', sparse=False)

dums = ohe.fit_transform(X_train_obj)

dums_df = pd.DataFrame(
    data=dums,
    index=X_train_obj.index,
    columns=ohe.get_feature_names(),
)

dums_df.shape  # 15304 rows



(15304, 19)

In [44]:
# Health insurance breakdown
# (value_counts skips missing values by default, so these shares are among
# the training rows that have an answer)
X_train.health_insurance.value_counts(normalize=True)


1.0    0.882751
0.0    0.117249
Name: health_insurance, dtype: float64

In [45]:
# Impute health_insurance from the one-hot socio-economic features

# health_insurance goes first, followed by the dummy columns
dums_df_insurance = pd.concat(
    [X_train['health_insurance'], dums_df],
    axis='columns',
)

# Fit the imputer on the training rows only; the same fitted object is
# applied to the test rows later.
knnimpute = KNNImputer()
dums_df_insurance_imputed = pd.DataFrame(
    data=knnimpute.fit_transform(dums_df_insurance),
    index=dums_df_insurance.index,
    columns=dums_df_insurance.columns,
)

dums_df_insurance_imputed  # 15304 rows


Unnamed: 0,health_insurance,x0_35 - 44 Years,x0_45 - 54 Years,x0_55 - 64 Years,x0_65+ Years,x1_< 12 Years,x1_College Graduate,x1_Some College,x2_Hispanic,x2_Other or Multiple,x2_White,x3_Male,"x4_> $75,000",x4_Below Poverty,x5_Not Married,x6_Rent,x7_Not in Labor Force,x7_Unemployed,"x8_MSA, Principle City",x8_Non-MSA
26356,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
1925,0.8,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2668,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5325,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
841,0.8,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14593,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
26345,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
6555,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1035,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [15]:
# Distribution after KNN imputation. Imputed entries are averages over the
# neighbours (KNNImputer() is used with its defaults), so fractional values
# appear alongside the observed 0.0 / 1.0.
dums_df_insurance_imputed.health_insurance.value_counts()

1.0    12105
0.8     1278
0.0     1046
0.6      582
0.4      216
0.2       77
Name: health_insurance, dtype: int64

In [47]:
# Convert back to a 0/1 value: anything strictly above 0.5 becomes 1

dums_df_insurance_imputed['health_insurance'] = (
    dums_df_insurance_imputed['health_insurance'].gt(0.5).astype(int)
)

dums_df_insurance_imputed['health_insurance'].value_counts(normalize=True)


1    0.912507
0    0.087493
Name: health_insurance, dtype: float64

In [17]:
# Inspect the frame now that health_insurance has been converted to 0/1
dums_df_insurance_imputed


Unnamed: 0,health_insurance,x0_35 - 44 Years,x0_45 - 54 Years,x0_55 - 64 Years,x0_65+ Years,x1_< 12 Years,x1_College Graduate,x1_Some College,x2_Hispanic,x2_Other or Multiple,x2_White,x3_Male,"x4_> $75,000",x4_Below Poverty,x5_Not Married,x6_Rent,x7_Not in Labor Force,x7_Unemployed,"x8_MSA, Principle City",x8_Non-MSA
26356,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
1925,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2668,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5325,1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
841,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14593,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
26345,1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
6555,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1035,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [18]:
# Remove the original object columns and the un-imputed health_insurance
# column from X_train, then append the imputed/encoded frame.

X_train_1 = X_train.drop(columns=[*X_train_obj.columns, 'health_insurance'])

X_train_imputed = pd.concat(
    [X_train_1, dums_df_insurance_imputed],
    axis='columns',
)

X_train_imputed


Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,x2_White,x3_Male,"x4_> $75,000",x4_Below Poverty,x5_Not Married,x6_Rent,x7_Not in Labor Force,x7_Unemployed,"x8_MSA, Principle City",x8_Non-MSA
26356,26356,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
1925,1925,2.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2668,2668,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5325,5325,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
841,841,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14593,14593,2.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
26345,26345,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
6555,6555,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1035,1035,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [19]:
import os
# Save the imputed training features into the current working directory.
cwd = os.getcwd()
path = cwd + "/X_train_imputed.csv"
# index=False leaves the row index out of the file; respondent_id is still
# written because it is a regular column of X_train_imputed.
X_train_imputed.to_csv(path, index=False)

In [20]:
# Do the same imputation for X_test

# Object (string) columns of the test split
X_test_obj = X_test.select_dtypes(include=['object'])

# One hot encoding: transform only, reusing the encoder fitted on X_train
dums_t = ohe.transform(X_test_obj)

dums_t_df = pd.DataFrame(
    data=dums_t,
    index=X_test_obj.index,
    columns=ohe.get_feature_names(),
)

dums_t_df.shape  # 6559 rows


(6559, 19)

In [21]:
# KNN imputation using the other socio-economic features

# Put the health insurance column in front of the test dummies
dums_t_df_insurance = pd.concat(
    [X_test['health_insurance'], dums_t_df],
    axis='columns',
)

# transform only: the imputer was fitted on the training rows
dums_t_df_insurance_imputed = pd.DataFrame(
    data=knnimpute.transform(dums_t_df_insurance),
    index=dums_t_df_insurance.index,
    columns=dums_t_df_insurance.columns,
)

dums_t_df_insurance_imputed  # 6559 rows

Unnamed: 0,health_insurance,x0_35 - 44 Years,x0_45 - 54 Years,x0_55 - 64 Years,x0_65+ Years,x1_< 12 Years,x1_College Graduate,x1_Some College,x2_Hispanic,x2_Other or Multiple,x2_White,x3_Male,"x4_> $75,000",x4_Below Poverty,x5_Not Married,x6_Rent,x7_Not in Labor Force,x7_Unemployed,"x8_MSA, Principle City",x8_Non-MSA
15045,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
14123,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25010,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
24771,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23611,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
20174,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19888,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
22054,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [22]:
# Distribution of health_insurance in the test split after KNN imputation;
# fractional values are imputed entries that have not been thresholded yet.
dums_t_df_insurance_imputed.health_insurance.value_counts()

1.0    5151
0.8     564
0.0     450
0.6     251
0.4     101
0.2      42
Name: health_insurance, dtype: int64

In [23]:
# Convert back to a 0/1 value: anything strictly above 0.5 becomes 1

dums_t_df_insurance_imputed['health_insurance'] = (
    dums_t_df_insurance_imputed['health_insurance'].gt(0.5).astype(int)
)

dums_t_df_insurance_imputed['health_insurance'].value_counts()


1    5966
0     593
Name: health_insurance, dtype: int64

In [24]:
# Remove the original object columns and the un-imputed health_insurance
# column from X_test, then append the imputed/encoded frame.

X_test_1 = X_test.drop(columns=[*X_test_obj.columns, 'health_insurance'])

X_test_imputed = pd.concat(
    [X_test_1, dums_t_df_insurance_imputed],
    axis='columns',
)

X_test_imputed  # 6559 rows


Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,x2_White,x3_Male,"x4_> $75,000",x4_Below Poverty,x5_Not Married,x6_Rent,x7_Not in Labor Force,x7_Unemployed,"x8_MSA, Principle City",x8_Non-MSA
15045,15045,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
14123,14123,3.0,2.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25010,25010,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
24771,24771,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23611,23611,2.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,7735,2.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
20174,20174,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19888,19888,2.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
22054,22054,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [25]:
# Save X_test_imputed into the current working directory.
# Relies on `os` having been imported by the cell that saves X_train_imputed.
cwd = os.getcwd()
path = cwd + "/X_test_imputed.csv"
# index=False leaves the row index out; respondent_id is written as a column.
X_test_imputed.to_csv(path, index=False)

In [26]:
# Save y_test and y_train into the current working directory.
# Both frames include respondent_id as a column, so index=False does not
# lose the respondent key.

# y_train
cwd = os.getcwd()
path = cwd + "/y_train.csv"
y_train.to_csv(path, index=False)

# y_test
cwd = os.getcwd()
path = cwd + "/y_test.csv"
y_test.to_csv(path, index=False)




In [27]:
#pd.read_csv('X_train_imputed')

In [28]:
# Inspect the held-out targets (two vaccine labels plus respondent_id)
y_test

Unnamed: 0,h1n1_vaccine,seasonal_vaccine,respondent_id
15045,0,0,15045
14123,1,1,14123
25010,1,1,25010
24771,0,0,24771
23611,1,1,23611
...,...,...,...
7735,0,1,7735
20174,1,1,20174
19888,1,1,19888
22054,0,1,22054


In [29]:
# Inspect the imputed, one-hot encoded test features
X_test_imputed

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,x2_White,x3_Male,"x4_> $75,000",x4_Below Poverty,x5_Not Married,x6_Rent,x7_Not in Labor Force,x7_Unemployed,"x8_MSA, Principle City",x8_Non-MSA
15045,15045,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
14123,14123,3.0,2.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25010,25010,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
24771,24771,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23611,23611,2.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,7735,2.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
20174,20174,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19888,19888,2.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
22054,22054,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [48]:

# Numeric survey-scale columns that the next cell one-hot encodes:
# the two H1N1 awareness columns plus the three opinion topics per vaccine.
nb_list_for_ohe = ['h1n1_concern', 'h1n1_knowledge'] + [
    'opinion_' + vaccine + '_' + topic
    for vaccine in ('h1n1', 'seas')
    for topic in ('vacc_effective', 'risk', 'sick_from_vacc')
]


In [52]:

# One-hot encode the survey-scale columns of the imputed training features.
# NOTE(review): this rebinds `ohe`, `dums` and `dums_df`, which earlier cells
# used for the encoder fitted on the object columns; re-running those earlier
# cells after this one would pick up this encoder instead.
ohe = OneHotEncoder(drop='first', sparse=False)
dums = ohe.fit_transform(X_train_imputed.loc[:, nb_list_for_ohe])
dums_df = pd.DataFrame(
    data=dums,
    index=X_train_imputed.index,
    columns=ohe.get_feature_names(),
)
# Disabled draft of the follow-up step (drop the encoded columns, then concat):
#df_cat_dropped = X_dataframe.drop(cat_col_list, axis = 1)
#dums_df_concated = pd.concat([df_cat_dropped, dums_df], axis=1)

In [53]:
# Inspect the most recently built dummy frame
dums_df

Unnamed: 0,x0_1.0,x0_2.0,x0_3.0,x1_1.0,x1_2.0,x2_2.0,x2_3.0,x2_4.0,x2_5.0,x3_2.0,...,x5_4.0,x5_5.0,x6_2.0,x6_3.0,x6_4.0,x6_5.0,x7_2.0,x7_3.0,x7_4.0,x7_5.0
26356,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1925,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2668,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5325,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
841,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14593,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
26345,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6555,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1035,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
