In [23]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.impute import MissingIndicator, SimpleImputer, KNNImputer 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel

from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve



In [24]:
# Load the survey features and the vaccination labels from the shared Data folder.
features_df = pd.read_csv('../Data/training_set_features.csv')
labels_df = pd.read_csv('../Data/training_set_labels.csv')

# Check that the rows of the features and the labels match up.
# Compare the respondent_id columns rather than the frames' indexes: read_csv
# gives both frames a default 0..n-1 index, so comparing .index only verifies
# that the two files have the same number of rows.
np.testing.assert_array_equal(
    features_df['respondent_id'].values,
    labels_df['respondent_id'].values,
)

# Merge labels and features on respondent_id (label columns come first).
df = labels_df.merge(features_df, how='inner', on='respondent_id')

# Drop fully duplicated rows (rebinding instead of mutating in place).
df = df.drop_duplicates()

In [25]:
# (rows, columns) of the merged labels + features frame
df.shape

(26707, 38)

In [26]:
# Print each column's dtype and non-null count to spot sparsely populated columns.
df.info()

# Columns 17 (health_insurance), 36 (employment_industry) and
# 37 (employment_occupation) are each missing in about 50% of the rows.


<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_vaccine                 26707 non-null  int64  
 2   seasonal_vaccine             26707 non-null  int64  
 3   h1n1_concern                 26615 non-null  float64
 4   h1n1_knowledge               26591 non-null  float64
 5   behavioral_antiviral_meds    26636 non-null  float64
 6   behavioral_avoidance         26499 non-null  float64
 7   behavioral_face_mask         26688 non-null  float64
 8   behavioral_wash_hands        26665 non-null  float64
 9   behavioral_large_gatherings  26620 non-null  float64
 10  behavioral_outside_home      26625 non-null  float64
 11  behavioral_touch_face        26579 non-null  float64
 12  doctor_recc_h1n1             24547 non-null  float64
 13  doctor_recc_seas

In [5]:
# Check balance of target variables

# A single print call: every item lands on its own line, and the empty strings
# produce the blank separator lines between the four sections.
print(
    "H1N1 Vaccination (counts)",
    df["h1n1_vaccine"].value_counts(),
    "",
    "Seasonal Flu Vaccination (counts)",
    df["seasonal_vaccine"].value_counts(),
    "",
    "H1N1 Vaccination (percentages)",
    df["h1n1_vaccine"].value_counts(normalize=True),
    "",
    "Seasonal Flu Vaccination (percentages)",
    df["seasonal_vaccine"].value_counts(normalize=True),
    sep="\n",
)

# Only about 21% of respondents received the H1N1 vaccine, so that target is
# highly imbalanced. For model tuning, use SMOTE or another method that
# addresses class imbalance.



H1N1 Vaccination (counts)
0    21033
1     5674
Name: h1n1_vaccine, dtype: int64

Seasonal Flu Vaccination (counts)
0    14272
1    12435
Name: seasonal_vaccine, dtype: int64

H1N1 Vaccination (percentages)
0    0.787546
1    0.212454
Name: h1n1_vaccine, dtype: float64

Seasonal Flu Vaccination (percentages)
0    0.534392
1    0.465608
Name: seasonal_vaccine, dtype: float64


### Missing Values

In [27]:
# Drop respondent_id (the key used for the merge).
# Rebinding `df` with errors='ignore' keeps this cell re-runnable: the previous
# in-place drop raised KeyError when the cell was executed a second time,
# because the column was already gone.
df = df.drop(columns='respondent_id', errors='ignore')

# Report the number of missing values per column
df.isna().sum()

h1n1_vaccine                       0
seasonal_vaccine                   0
h1n1_concern                      92
h1n1_knowledge                   116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
opinion_h1n1_sick_from_vacc      395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
i

#### Data imputation

In [28]:
# Drop the variables whose values are random-character strings.
# errors='ignore' plus rebinding keeps the cell re-runnable (an in-place drop
# raises KeyError on a second execution once the columns are gone).
# NOTE(review): errors='ignore' also hides a misspelled column name - keep the
# list below in sync with the data.

df = df.drop(
    columns=['employment_industry', 'employment_occupation', 'hhs_geo_region'],
    errors='ignore',
)



In [29]:
# For the socioeconomic / household variables listed below, drop every row
# that is missing a value in any of them.

df.dropna(
    axis=0,
    how='any',
    subset=[
        'health_worker',
        'education',
        'income_poverty',
        'marital_status',
        'rent_or_own',
        'employment_status',
        'household_adults',
        'household_children',
    ],
    inplace=True,
)


In [30]:
#features_df = pd.read_csv('../Data/training_set_features.csv')
#x_df = features_df

# Column groups used by the constant-value imputation calls further down in
# this cell; each group is filled with a single value.

# Opinion questions about the H1N1 and seasonal vaccines
survey_col = [
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
]

# Behavioral columns that are filled with 0
behavior_col = [
    'behavioral_antiviral_meds',
    'behavioral_face_mask',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
]

# Behavioral columns that are filled with 1
behavior_col_2 = [
    'behavioral_avoidance',
    'behavioral_wash_hands',
    'behavioral_touch_face',
]

# Doctor recommendation columns
doc_rec = [
    'doctor_recc_h1n1',
    'doctor_recc_seasonal',
]

def impute_missing_data(dataframe, column_list, fillvalue):
    """Fill missing values in selected columns of ``dataframe`` with a constant.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are updated.
    column_list : iterable of str
        Names of the columns whose missing entries are replaced.
    fillvalue : scalar
        Value written wherever one of the listed columns is missing.

    Raises
    ------
    KeyError
        If a name in ``column_list`` is not a column of ``dataframe``.
    """
    for column in column_list:
        # Assign the filled column back instead of calling
        # dataframe[column].fillna(..., inplace=True): that chained in-place
        # call acts on an intermediate Series and does not update the frame
        # when pandas Copy-on-Write is active.
        dataframe[column] = dataframe[column].fillna(fillvalue)
      
    
# (columns, fill value) pairs, applied in the order listed.
# NOTE(review): the fill values are presumably neutral / default answers for
# each question type - confirm against the survey codebook.
for _cols, _fill in (
    (survey_col, 3),
    (['h1n1_concern'], 2),
    (['h1n1_knowledge'], 0),
    (behavior_col, 0),
    (behavior_col_2, 1),
    (doc_rec, 0),
    (['chronic_med_condition'], 0),
    (['child_under_6_months'], 0),
):
    impute_missing_data(df, _cols, _fill)


In [31]:
df.isnull().sum()

h1n1_vaccine                      0
seasonal_vaccine                  0
h1n1_concern                      0
h1n1_knowledge                    0
behavioral_antiviral_meds         0
behavioral_avoidance              0
behavioral_face_mask              0
behavioral_wash_hands             0
behavioral_large_gatherings       0
behavioral_outside_home           0
behavioral_touch_face             0
doctor_recc_h1n1                  0
doctor_recc_seasonal              0
chronic_med_condition             0
child_under_6_months              0
health_worker                     0
health_insurance               9400
opinion_h1n1_vacc_effective       0
opinion_h1n1_risk                 0
opinion_h1n1_sick_from_vacc       0
opinion_seas_vacc_effective       0
opinion_seas_risk                 0
opinion_seas_sick_from_vacc       0
age_group                         0
education                         0
race                              0
sex                               0
income_poverty              

#### train_test_split

In [32]:
# Define X (features) and y (both vaccine targets); 70% training / 30% test

X = df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])
y = df.loc[:, ['h1n1_vaccine', 'seasonal_vaccine']]

# Train test split (fixed random_state so the split is reproducible)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# X_train 15304 rows

#### Imputing Health Insurance 

In [33]:
# (rows, columns) of the training features after the split
X_train.shape

(15304, 32)

In [34]:
# Create dummy columns for the object-typed features so they can be fed to KNNImputer

X_train_obj = X_train.select_dtypes(include=['object'])

# One-hot encoding for the object columns:
# drop='first' omits the first category of every feature, sparse=False asks for
# a dense array that can be wrapped in a DataFrame.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# and `get_feature_names()` to `get_feature_names_out()` - confirm against the
# installed version before upgrading.
ohe = OneHotEncoder(sparse=False, drop='first')

dums = ohe.fit_transform(X_train_obj)

# Keep the original row labels so the dummies can be aligned with X_train later
dums_df = pd.DataFrame(
    data=dums,
    index=X_train_obj.index,
    columns=ohe.get_feature_names(),
)

dums_df.shape  # 15304 rows



(15304, 19)

In [14]:
# Imputing health insurance with KNNImputer
#knnimpute = KNNImputer()
#X_train_imputed = pd.DataFrame(knnimpute.fit_transform(X_train), columns = X_train.columns)

#X_train_imputed

#X_train_imputed.health_insurance.value_counts()
#X_train_imputed.health_insurance = (X_train_imputed.health_insurance > 0.5).astype(int)
#X_train_imputed.health_insurance.value_counts()

In [35]:
# Display the one-hot encoded training columns (indexed by the original row labels)
dums_df

Unnamed: 0,x0_35 - 44 Years,x0_45 - 54 Years,x0_55 - 64 Years,x0_65+ Years,x1_< 12 Years,x1_College Graduate,x1_Some College,x2_Hispanic,x2_Other or Multiple,x2_White,x3_Male,"x4_> $75,000",x4_Below Poverty,x5_Not Married,x6_Rent,x7_Not in Labor Force,x7_Unemployed,"x8_MSA, Principle City",x8_Non-MSA
26356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
1925,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2668,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5325,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
841,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14593,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
26345,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
6555,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1035,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [36]:
# Impute health_insurance using the other (one-hot encoded) socio-economic features

# health_insurance goes first, followed by the dummy columns
dums_df_insurance = pd.concat(
    [X_train['health_insurance'], dums_df],
    axis='columns',
)

knnimpute = KNNImputer()
dums_df_insurance_imputed = pd.DataFrame(
    data=knnimpute.fit_transform(dums_df_insurance),
    index=dums_df_insurance.index,
    columns=dums_df_insurance.columns,
)

dums_df_insurance_imputed  # 15304 rows


Unnamed: 0,health_insurance,x0_35 - 44 Years,x0_45 - 54 Years,x0_55 - 64 Years,x0_65+ Years,x1_< 12 Years,x1_College Graduate,x1_Some College,x2_Hispanic,x2_Other or Multiple,x2_White,x3_Male,"x4_> $75,000",x4_Below Poverty,x5_Not Married,x6_Rent,x7_Not in Labor Force,x7_Unemployed,"x8_MSA, Principle City",x8_Non-MSA
26356,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
1925,0.8,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2668,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5325,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
841,0.8,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14593,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
26345,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
6555,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1035,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [37]:
# Distribution of health_insurance after KNN imputation; imputed entries can be
# fractional, which is why they are thresholded in a later cell.
dums_df_insurance_imputed.health_insurance.value_counts()

1.0    12105
0.8     1278
0.0     1046
0.6      582
0.4      216
0.2       77
Name: health_insurance, dtype: int64

In [38]:
# Convert to 0/1 values: anything strictly above 0.5 becomes 1, everything else 0

dums_df_insurance_imputed['health_insurance'] = (
    dums_df_insurance_imputed['health_insurance'].gt(0.5).astype(int)
)

dums_df_insurance_imputed['health_insurance'].value_counts()


1    13965
0     1339
Name: health_insurance, dtype: int64

In [39]:
# Display the imputed frame after health_insurance was converted to 0/1
dums_df_insurance_imputed


Unnamed: 0,health_insurance,x0_35 - 44 Years,x0_45 - 54 Years,x0_55 - 64 Years,x0_65+ Years,x1_< 12 Years,x1_College Graduate,x1_Some College,x2_Hispanic,x2_Other or Multiple,x2_White,x3_Male,"x4_> $75,000",x4_Below Poverty,x5_Not Married,x6_Rent,x7_Not in Labor Force,x7_Unemployed,"x8_MSA, Principle City",x8_Non-MSA
26356,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
1925,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2668,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5325,1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
841,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14593,1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
26345,1,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
6555,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1035,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [19]:
# Names of the object-typed columns that were one-hot encoded
X_train_obj.columns

Index(['age_group', 'education', 'race', 'sex', 'income_poverty',
       'marital_status', 'rent_or_own', 'employment_status', 'census_msa'],
      dtype='object')

In [40]:
# Remove the original object columns and the un-imputed health_insurance column
# from X_train, then append the imputed/encoded frame column-wise.

X_train_1 = X_train.drop(columns=[*X_train_obj.columns, 'health_insurance'])

X_train_imputed = pd.concat(
    [X_train_1, dums_df_insurance_imputed],
    axis='columns',
)

X_train_imputed


Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,x2_White,x3_Male,"x4_> $75,000",x4_Below Poverty,x5_Not Married,x6_Rent,x7_Not in Labor Force,x7_Unemployed,"x8_MSA, Principle City",x8_Non-MSA
26356,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
1925,2.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2668,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5325,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
841,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14593,2.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
26345,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
6555,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1035,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [41]:
import os

# Write the imputed training features to the current working directory.
# os.path.join builds the path portably instead of concatenating a hard-coded
# "/" separator onto the directory string.
cwd = os.getcwd()
path = os.path.join(cwd, "X_train_imputed.csv")

# index=False: the row labels are not written to the file.
# NOTE(review): those labels are what tie each row back to y_train - confirm
# the downstream consumer relies on row order only.
X_train_imputed.to_csv(path, index=False)

In [22]:
#pd.read_csv('X_train_imputed')