## This file is used to do the data analysis

In [93]:
# import the package
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import export_graphviz
from sklearn import utils
from sklearn.preprocessing import LabelEncoder

In [94]:
# load the data
path = "./data/"
train_data = pd.read_csv(path + "Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Features.csv")
train_label = pd.read_csv(path + "Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Training_Labels.csv")
test_data = pd.read_csv(path + "Flu_Shot_Learning_Predict_H1N1_and_Seasonal_Flu_Vaccines_-_Test_Features.csv")

# check if the data has the null value
print(train_data.isnull().any())
print(train_label.isnull().any())
print(test_data.isnull().any())

# check data type
print(train_data.dtypes)

respondent_id                  False
h1n1_concern                    True
h1n1_knowledge                  True
behavioral_antiviral_meds       True
behavioral_avoidance            True
behavioral_face_mask            True
behavioral_wash_hands           True
behavioral_large_gatherings     True
behavioral_outside_home         True
behavioral_touch_face           True
doctor_recc_h1n1                True
doctor_recc_seasonal            True
chronic_med_condition           True
child_under_6_months            True
health_worker                   True
health_insurance                True
opinion_h1n1_vacc_effective     True
opinion_h1n1_risk               True
opinion_h1n1_sick_from_vacc     True
opinion_seas_vacc_effective     True
opinion_seas_risk               True
opinion_seas_sick_from_vacc     True
age_group                      False
education                       True
race                           False
sex                            False
income_poverty                  True
m

In [95]:
# check the data
train_data[0:5]

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [96]:
# data preprocess
def data_preprocess(data:pd.DataFrame):
    # remove the null row
    data.dropna(axis=0, how='any', inplace=True)
    
    # remove the useless column
    data.drop(['hhs_geo_region', 'census_msa', 'employment_industry', 'employment_occupation', 'employment_status'], axis = 1, inplace=True)
    
    # object encoder
    categorical_columns = [col for col in data.columns.values if data[col].dtype == 'object']
    for feat in categorical_columns:
        print(feat)
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    
    # print(data['employment_status'].unique())
    # data normalization
    numeric_features = [col for col in data.columns.values if data[col].dtype != 'object'][:-1]
    print(numeric_features)
    data[numeric_features] = data[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
    
    return data

# data preprocess
train_data = data_preprocess(train_data)

age_group
education
race
sex
income_poverty
marital_status
rent_or_own
['respondent_id', 'h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'household_adults']


In [97]:
# check the data
print(train_data.isnull().any())
print(train_data.dtypes)
train_data[0:5]

respondent_id                  False
h1n1_concern                   False
h1n1_knowledge                 False
behavioral_antiviral_meds      False
behavioral_avoidance           False
behavioral_face_mask           False
behavioral_wash_hands          False
behavioral_large_gatherings    False
behavioral_outside_home        False
behavioral_touch_face          False
doctor_recc_h1n1               False
doctor_recc_seasonal           False
chronic_med_condition          False
child_under_6_months           False
health_worker                  False
health_insurance               False
opinion_h1n1_vacc_effective    False
opinion_h1n1_risk              False
opinion_h1n1_sick_from_vacc    False
opinion_seas_vacc_effective    False
opinion_seas_risk              False
opinion_seas_sick_from_vacc    False
age_group                      False
education                      False
race                           False
sex                            False
income_poverty                 False
m

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,opinion_seas_sick_from_vacc,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,household_adults,household_children
1,-1.752784,1.776846,1.12778,-0.241218,0.625878,-0.255667,0.470294,-0.60764,1.72923,0.708865,...,1.471329,-0.578621,-1.802733,0.452419,1.102909,2.363243,1.184291,1.856873,-1.281659,0.0
7,-1.752002,-0.557111,-2.323382,-0.241218,0.625878,-0.255667,0.470294,-0.60764,-0.578202,0.708865,...,-0.825771,0.226694,1.11705,0.452419,-0.906552,-0.87995,-0.844256,-0.538456,1.382222,0.0
10,-1.751611,0.609867,-0.597801,-0.241218,0.625878,-0.255667,0.470294,1.645457,-0.578202,-1.410487,...,1.471329,0.226694,-1.802733,0.452419,1.102909,-0.87995,-0.844256,1.856873,1.382222,0.0
11,-1.751481,-0.557111,1.12778,-0.241218,0.625878,-0.255667,0.470294,-0.60764,-0.578202,-1.410487,...,-0.825771,1.03201,0.143789,0.452419,1.102909,0.741647,-0.844256,-0.538456,0.050281,2.0
15,-1.75096,-0.557111,-0.597801,-0.241218,0.625878,-0.255667,-2.126001,-0.60764,-0.578202,-1.410487,...,-0.825771,-0.578621,0.143789,0.452419,-0.906552,0.741647,-0.844256,-0.538456,0.050281,3.0
