In [25]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize,StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier 
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_curve, roc_auc_score


In [26]:
# Load the survey features and the vaccine labels; both files are indexed
# by respondent_id so they can be aligned row-for-row.
features_df = pd.read_csv(
    'Data/training_set_features.csv',
    index_col="respondent_id",
)
labels_df = pd.read_csv(
    'Data/training_set_labels.csv',
    index_col="respondent_id",
)

# Inner join on the shared index: keep only respondents present in both files.
joined_df = features_df.join(labels_df, how='inner')

### A detailed overview and explanation of our data cleaning steps:

The following steps were taken to clean our database.

First, we dropped the employment_industry and employment_occupation columns. Both had a high number of null values with no good way to impute them. Rather than lose 13330 rows of data, we decided to drop these two columns and let the rest of our socio-economic data stand in as a proxy. (The cleaning function below also drops the hhs_geo_region column.)

Next, we faced columns with a scattering of missing values and no real way to impute them. We simply dropped the rows with nulls in these columns, slightly (but only slightly) reducing our data set.

Then we created a few special-case imputations for some of our data. Most of these columns relate to responses to survey questions. Here we imputed default answers where that made sense. For instance, if someone fails to answer whether or not they have a chronic medical condition, we assume that they do not. If a condition were severe enough to be chronic, we feel that a respondent would be aware of it and would mark yes. These assumptions won't be right all the time, but we believe that they'll still add to the predictive power of the model.

The imputation decisions we made are as follows:

* Blanks on survey questions were imputed to the 'neutral' or 'don't know' values.

* In behavioral columns we imputed with the majority class. 

* We assumed that if a patient had a doctor's recommendation they would say so.

* And finally, we assumed that if a respondent had close contact with a child under 6 months they would know that, so we imputed a "no" response.

After this data cleaning we are left with a large number of null values in the health insurance column. We felt that this factor was too important to drop, as we had been forced to do with employment_industry, for instance. Instead we decided to impute that data via KNN imputation. However, that will have to wait until after we've split the test and training data.

In [27]:
# an all-in-one data cleaning function. Do this BEFORE OHE
# Maybe this should be a class and worked into the pipeline?
def datacleaner(maindataframe):
    """Clean the joined survey dataframe in place.

    The steps run in this order:

    1. Drop every row that has a null in any ``general_dropna`` column.
    2. Drop the ``drop_columns`` columns entirely.
    3. Fill the remaining nulls in the survey, behaviour, doctor-recommendation
       and related columns with fixed default values.

    Columns not named in any list below (for example ``health_insurance``)
    are left untouched.

    Parameters
    ----------
    maindataframe : pandas.DataFrame
        Frame containing every column named in the lists below.
        It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If an expected column is missing. This includes calling the function
        a second time on the same frame, because the dropped columns are gone.
    """

    # For dropping whole columns
    def columndrop(dataframe, column_list):
        dataframe.drop(column_list, axis=1, inplace=True)

    # For dropping rows with na values
    def basicdropna(dataframe, column_list):
        dataframe.dropna(subset=column_list, inplace=True)

    # For special case imputation
    def impute_missing_data(dataframe, column_list, fillvalue):
        for column in column_list:
            # Assign the filled Series back onto the frame. Calling
            # fillna(inplace=True) on dataframe[column] is a chained in-place
            # operation: with pandas Copy-on-Write it only fills a temporary
            # object and the frame keeps its nulls.
            dataframe[column] = dataframe[column].fillna(fillvalue)

    # The lists below group columns by how they are processed.
    drop_columns = ['employment_industry', 'employment_occupation', 'hhs_geo_region']

    general_dropna = ['health_worker', 'education', 'income_poverty', 'marital_status',
                      'rent_or_own', 'employment_status', 'household_adults',
                      'household_children']

    survey_col = ['opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
                  'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc']

    behavior_col = ['behavioral_antiviral_meds', 'behavioral_face_mask',
                    'behavioral_large_gatherings', 'behavioral_outside_home']

    behavior_col_2 = ['behavioral_avoidance',
                      'behavioral_wash_hands', 'behavioral_touch_face']

    doc_rec = ['doctor_recc_h1n1', 'doctor_recc_seasonal']

    # Rows are dropped before columns, matching the original processing order.
    basicdropna(maindataframe, general_dropna)
    columndrop(maindataframe, drop_columns)

    # Fixed default for each group of columns.
    impute_missing_data(maindataframe, survey_col, 3)
    impute_missing_data(maindataframe, ['h1n1_concern'], 2)
    impute_missing_data(maindataframe, ['h1n1_knowledge'], 0)
    impute_missing_data(maindataframe, behavior_col, 0)
    impute_missing_data(maindataframe, behavior_col_2, 1)
    impute_missing_data(maindataframe, doc_rec, 0)
    impute_missing_data(maindataframe, ['chronic_med_condition'], 0)
    impute_missing_data(maindataframe, ['child_under_6_months'], 0)


In [28]:
# Cleans joined_df in place (the function returns None). Re-running this cell
# without first re-running the load cell raises a KeyError, because the
# dropped columns no longer exist in joined_df.
datacleaner(joined_df)

In [29]:
# Targets are the two vaccine outcome columns; features are everything else.
y = joined_df[['h1n1_vaccine', 'seasonal_vaccine']]
X = joined_df.drop(columns=['h1n1_vaccine', 'seasonal_vaccine'])

# Hold out 30% of the rows for testing. The split happens before one-hot
# encoding so the encoder is fitted on training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [30]:
# One-hot encoding setup. Encode after the initial cleaning and before the
# health insurance imputation.

# Columns that are obviously non-numeric (object dtype in the training frame).
cat_col_list = list(X_train.select_dtypes(include='object').columns)

# Columns stored as numbers that correspond to non-numeric survey responses.
nb_list_for_ohe = [
    'h1n1_concern',
    'h1n1_knowledge',
    'opinion_h1n1_vacc_effective',
    'opinion_h1n1_risk',
    'opinion_h1n1_sick_from_vacc',
    'opinion_seas_vacc_effective',
    'opinion_seas_risk',
    'opinion_seas_sick_from_vacc',
]

# Shared encoder instance: fitted on a subset of the training columns, whose
# encoded output is then reintegrated into the original dataframe.
ohe = OneHotEncoder(drop='first', sparse=False)

def fit_trans_ohe(X_dataframe, columns):
    """Fit the module-level ``ohe`` on ``columns`` and return the encoded frame.

    Parameters
    ----------
    X_dataframe : pandas.DataFrame
        Frame to fit the encoder on (intended for the training split).
    columns : list of str
        Columns passed to the encoder.

    Returns
    -------
    pandas.DataFrame
        ``X_dataframe`` without the ``cat_col_list`` columns, with the
        encoded columns appended on the right. The input is not modified.
    """
    encoded_values = ohe.fit_transform(X_dataframe[columns])
    encoded_df = pd.DataFrame(
        encoded_values,
        columns=ohe.get_feature_names(),
        index=X_dataframe.index,
    )

    # NOTE(review): only the module-level cat_col_list is dropped, not
    # `columns`, so any other encoded columns remain in the output next to
    # their dummy columns -- confirm this is intended.
    passthrough_df = X_dataframe.drop(columns=cat_col_list)
    return pd.concat([passthrough_df, encoded_df], axis=1)

#We should end up with a fitted ohe instance called 'ohe'

In [31]:
# Fit the shared encoder on the training features and build the encoded
# training frame. This is the only place the encoder should be fitted.
X_train_ohe = fit_trans_ohe(X_train, cat_col_list+nb_list_for_ohe)

In [32]:
# We created a subset list for obviously socio-economic data.    
# Subset of obviously socio-economic columns, plus the health_insurance
# column to be imputed. The "x<i>_..." entries are encoded column names as
# produced by ohe.get_feature_names(), grouped here by their prefix.
socio_economic_column_list = [
    "x0_35 - 44 Years", "x0_45 - 54 Years", "x0_55 - 64 Years", "x0_65+ Years",
    "x1_< 12 Years", "x1_College Graduate", "x1_Some College",
    "x2_Hispanic", "x2_Other or Multiple", "x2_White",
    "x3_Male",
    "x4_> $75,000", "x4_Below Poverty",
    "x5_Not Married",
    "x6_Rent",
    "x7_Not in Labor Force", "x7_Unemployed",
    "x8_MSA, Principle City", 'x8_Non-MSA',
    'health_insurance',
]

# Imputer for health insurance, driven by the socio-economic features. It is
# to be fitted on a TRAINING dataframe that has already been one-hot encoded.
soc_eco_h_i_imputer_knn = KNNImputer()

def soc_eco_KNN_imputer(imputer, dataframe, column_list):
    """Fit ``imputer`` on ``column_list`` and return the frame with those columns imputed.

    Parameters
    ----------
    imputer : object
        Anything with a ``fit_transform`` method; it is fitted by this call.
    dataframe : pandas.DataFrame
        Source frame; it is not modified.
    column_list : list of str
        Columns handed to the imputer. Must include ``health_insurance``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns followed by the imputed ``column_list``
        columns, with ``health_insurance`` rounded to whole numbers.
    """
    subset = dataframe[column_list]
    filled = pd.DataFrame(
        imputer.fit_transform(subset),
        columns=subset.columns,
        index=subset.index,
    )

    # Re-attach the imputed columns to everything that was not imputed.
    result = dataframe.drop(columns=column_list).join(filled)

    # Round the imputed health_insurance values to whole numbers.
    result['health_insurance'] = result['health_insurance'].round()
    return result


In [33]:
# Fit the KNN imputer on the encoded training frame and fill in
# health_insurance; this call fits soc_eco_h_i_imputer_knn as a side effect.
X_train_imputed = soc_eco_KNN_imputer(soc_eco_h_i_imputer_knn, X_train_ohe, socio_economic_column_list)

With the training data treated, we move on to the test data. Everything done in this section should be transforming only, not fitting!

In [35]:
# This function applies the already-fitted OHE to the test set only; it takes the X test dataframe and the list of columns to encode:
def trans_ohe(X_dataframe, columns):
    """Encode ``columns`` with the module-level ``ohe`` without refitting it.

    Parameters
    ----------
    X_dataframe : pandas.DataFrame
        Frame to encode (intended for the test split).
    columns : list of str
        Columns passed to the encoder's ``transform``.

    Returns
    -------
    pandas.DataFrame
        ``X_dataframe`` without the ``cat_col_list`` columns, with the
        encoded columns appended on the right. The input is not modified.
    """
    encoded_values = ohe.transform(X_dataframe[columns])
    encoded_df = pd.DataFrame(
        encoded_values,
        columns=ohe.get_feature_names(),
        index=X_dataframe.index,
    )

    # NOTE(review): only the module-level cat_col_list is dropped, not
    # `columns`, so any other encoded columns remain in the output next to
    # their dummy columns -- confirm this is intended.
    passthrough_df = X_dataframe.drop(columns=cat_col_list)
    return pd.concat([passthrough_df, encoded_df], axis=1)

In [36]:
# Encode the test features with the encoder as already fitted (transform only).
X_test_ohe = trans_ohe(X_test, cat_col_list+nb_list_for_ohe)

In [37]:
def imputer_transform_only(imputer, dataframe, column_list):
    """Impute ``column_list`` with an already-fitted ``imputer`` (no fitting).

    Parameters
    ----------
    imputer : object
        Anything with a ``transform`` method; it is not refitted here.
    dataframe : pandas.DataFrame
        Source frame; it is not modified.
    column_list : list of str
        Columns handed to the imputer. Must include ``health_insurance``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns followed by the imputed ``column_list``
        columns, with ``health_insurance`` rounded to whole numbers.
    """
    subset = dataframe[column_list]
    filled = pd.DataFrame(
        imputer.transform(subset),
        columns=subset.columns,
        index=subset.index,
    )

    # Re-attach the imputed columns to everything that was not imputed.
    result = dataframe.drop(columns=column_list).join(filled)

    # Round the imputed health_insurance values to whole numbers.
    result['health_insurance'] = result['health_insurance'].round()
    return result

In [38]:
# Fill in health_insurance for the test frame with the imputer instance that
# was passed to the training step (transform only, no fitting).
X_test_imputed = imputer_transform_only(soc_eco_h_i_imputer_knn, X_test_ohe, socio_economic_column_list)

### We now have a working dataset of: 
    'X_train_imputed' and 'y_train' to fit models to, 'X_test_imputed' to generate test predictions, and 'y_test' to validate models with.

0