In [46]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize,StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier 
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_curve, roc_auc_score


from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score, cross_validate

# Recursive Feature Elimination 
from sklearn.feature_selection import RFE

In [3]:
features_df = pd.read_csv('../Data/training_set_features.csv', index_col="respondent_id")
labels_df = pd.read_csv('../Data/training_set_labels.csv', index_col="respondent_id")
joined_df = features_df.join(labels_df, how = 'inner')

In [4]:
# an all-in-one data cleaning function. Do this BEFORE OHE
# Maybe this should be a class and worked into the pipeline?
def datacleaner(maindataframe):
    #For dropping whole columns 
    def columndrop(dataframe, column_list):
        dataframe.drop(column_list, axis = 1, inplace=True)
    #For dropping rows with na values
    def basicdropna(dataframe, column_list):
        dataframe.dropna(subset=column_list, inplace=True)
    #For special case imputation
    def impute_missing_data(dataframe, column_list, fillvalue):
        for column in column_list:
            dataframe[column].fillna(fillvalue, inplace = True)
    #This creates a number of lists of columns that fall into a few different 
    #categories, that will be processed in different ways. See notes below on how
    #these choices were made.
    drop_columns =  ['employment_industry',  'employment_occupation', 'hhs_geo_region']       
        
    general_dropna = ['health_worker', 'education','income_poverty', 'marital_status', 
                    'rent_or_own', 'employment_status', 'household_adults', 
                    'household_children' ]
        
    survey_col = ['opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
         'opinion_seas_vacc_effective', 'opinion_seas_risk','opinion_seas_sick_from_vacc']

    behavior_col = ['behavioral_antiviral_meds', 'behavioral_face_mask',
                'behavioral_large_gatherings','behavioral_outside_home']

    behavior_col_2 = ['behavioral_avoidance', 
                'behavioral_wash_hands','behavioral_touch_face']

    doc_rec = ['doctor_recc_h1n1','doctor_recc_seasonal']
    
    basicdropna(maindataframe, general_dropna)
    columndrop(maindataframe, drop_columns)
    impute_missing_data(maindataframe, survey_col, 3)
    impute_missing_data(maindataframe, ['h1n1_concern'], 2)
    impute_missing_data(maindataframe, ['h1n1_knowledge'], 0)
    impute_missing_data(maindataframe, behavior_col, 0)
    impute_missing_data(maindataframe, behavior_col_2, 1)
    impute_missing_data(maindataframe, doc_rec, 0)
    impute_missing_data(maindataframe, ['chronic_med_condition'], 0)
    impute_missing_data(maindataframe, ['child_under_6_months'], 0)


In [5]:
datacleaner(joined_df)

In [6]:
X=joined_df.drop(['h1n1_vaccine','seasonal_vaccine'], axis=1)
y=joined_df[['h1n1_vaccine','seasonal_vaccine']]

# Train test split, do this before OHE

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [7]:
# create OHE for objects, do this before imputer

cat_col_list = [i for i in X_train.select_dtypes(include='object').columns]

nb_list_for_ohe = ['h1n1_concern', 'h1n1_knowledge', 'opinion_h1n1_vacc_effective',
'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
'opinion_seas_risk', 'opinion_seas_sick_from_vacc']

# Fits OHE on a subset of columns, then reintegrates them into the
# Origional dataframe. Do this after initial cleaning, before 
# health insurace imputation.

ohe = OneHotEncoder(drop='first', sparse=False)

def fit_trans_ohe(X_dataframe, columns):
        
    dums = ohe.fit_transform(X_dataframe[columns])
    dums_df = pd.DataFrame(dums,
                       columns=ohe.get_feature_names(),
                       index=X_dataframe.index)
    df_cat_dropped = X_dataframe.drop(cat_col_list, axis = 1)
    dums_df_concated = pd.concat([df_cat_dropped, dums_df], axis=1)
    return dums_df_concated

#We should end up with a fitted ohe instance called 'ohe'

In [8]:
X_train_ohe = fit_trans_ohe(X_train, cat_col_list+nb_list_for_ohe)

In [9]:
    
socio_economic_column_list = ["x0_35 - 44 Years","x0_45 - 54 Years","x0_55 - 64 Years","x0_65+ Years",
                              "x1_< 12 Years","x1_College Graduate","x1_Some College","x2_Hispanic",
                              "x2_Other or Multiple","x2_White","x3_Male", "x4_> $75,000", "x4_Below Poverty",
                              "x5_Not Married", "x6_Rent", "x7_Not in Labor Force","x7_Unemployed",
                              "x8_MSA, Principle City",'x8_Non-MSA', 'health_insurance']

# Fitting an imputer for Health Insurance using socio-economic features, 
# pulling from a dataframe that has already been OneHotEncoded


soc_eco_h_i_imputer_knn = KNNImputer()

def soc_eco_KNN_imputer(imputer, dataframe, column_list):
    soc_econ_base = dataframe[column_list]
    soc_econ_imputed = pd.DataFrame(imputer.fit_transform(soc_econ_base), 
                                         columns = soc_econ_base.columns,
                                        index=soc_econ_base.index)
    remainder_df = dataframe.drop(column_list, axis = 1)
    output_df = remainder_df.join(soc_econ_imputed)
    output_df.health_insurance = output_df.health_insurance.round() 

    return output_df


In [10]:
X_train_imputed = soc_eco_KNN_imputer(soc_eco_h_i_imputer_knn, X_train_ohe, socio_economic_column_list)

In [11]:
# The OHE for the test set only, takes X test dataframe and list of columns to encoded:
def trans_ohe(X_dataframe, columns):
    dums = ohe.transform(X_dataframe[columns])
    dums_df = pd.DataFrame(dums,
                       columns=ohe.get_feature_names(),
                       index=X_dataframe.index)
    df_cat_dropped = X_dataframe.drop(cat_col_list, axis = 1)
    dums_df_concated = pd.concat([df_cat_dropped, dums_df], axis=1)
    return dums_df_concated

In [12]:
X_test_ohe = trans_ohe(X_test, cat_col_list+nb_list_for_ohe)

In [13]:
def imputer_transform_only(imputer, dataframe, column_list):
    soc_econ_base = dataframe[column_list]
    soc_econ_imputed = pd.DataFrame(imputer.transform(soc_econ_base), 
                                         columns = soc_econ_base.columns,
                                        index=soc_econ_base.index)
    remainder_df = dataframe.drop(column_list, axis = 1)
    output_df = remainder_df.join(soc_econ_imputed)
    output_df.health_insurance = output_df.health_insurance.round()
    
    return output_df

In [14]:
X_test_imputed = imputer_transform_only(soc_eco_h_i_imputer_knn, X_test_ohe, socio_economic_column_list)

### We now have a working dataset of: 
    'X_train_imputed' and 'y_train' to fit models to, 'X_test_ohe' to generate predictions, and 'y_test' to validate models with.

In [18]:
# One hot encoding for non-binary features 

non_binary = ['h1n1_concern', 'h1n1_knowledge', 'opinion_h1n1_vacc_effective',
'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
'opinion_seas_risk', 'opinion_seas_sick_from_vacc']


# X_test data
nb_train = X_train_imputed[non_binary]

ohe = OneHotEncoder(drop='first', sparse=False)

dums = ohe.fit_transform(nb_train)

dums_df = pd.DataFrame(dums,
                       columns=ohe.get_feature_names(),
                       index=nb_train.index)




# X_test data

nb_test = X_test_ohe[non_binary]

dums_t = ohe.transform(nb_test)

dums_t_df = pd.DataFrame(dums_t,
                       columns=ohe.get_feature_names(),
                       index=nb_test.index)




In [22]:
# concat one hot encoded df and X 

# X_train data
X_train = X_train_imputed.drop(non_binary, axis=1)

X_train_imputed_ohe = pd.concat([X_train, dums_df], axis=1)

# X_test data
X_test = X_test_ohe.drop(non_binary, axis=1)

X_test_ohe_2 = pd.concat([X_test, dums_t_df], axis=1)

X_train_iputed => some has more than 2 categories

X_train_imputed_ohe => all binary 


X_test_ohe => some has more than 2 categories

X_test_ohe_2 => all binary 

### Baseline Model 

In [23]:
y_train.h1n1_vaccine.value_counts(normalize=True) # =>change to DummyClassifier later 

0    0.783521
1    0.216479
Name: h1n1_vaccine, dtype: float64

In [30]:
y_train.seasonal_vaccine.value_counts(normalize=True) 

0    0.531887
1    0.468113
Name: seasonal_vaccine, dtype: float64

### Logistic Regression 

#### Logistic Regression (default settings)

In [35]:
lr = LogisticRegression()

### with X_train_imputed 

# H1N1 
h1n1 = cross_val_score(estimator=lr, X=X_train_imputed, y=y_train.h1n1_vaccine, 
                cv=5, scoring='roc_auc').mean() 

# Seasonal
sea = cross_val_score(estimator=lr, X=X_train_imputed, y=y_train.seasonal_vaccine, 
                cv=5, scoring='roc_auc').mean() 


### with X_train_imputed_ohe (all binary) 


# H1N1 
h1n1_1 = cross_val_score(estimator=lr, X=X_train_imputed_ohe, y=y_train.h1n1_vaccine, 
                cv=5, scoring='roc_auc').mean() 

# Seasonal
sea_1 = cross_val_score(estimator=lr, X=X_train_imputed_ohe, y=y_train.seasonal_vaccine, 
                cv=5, scoring='roc_auc').mean() 


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

H1N1 Cross-val ROC_AUC score:0.8392174922109827
Seasonal Cross-val ROC_AUC score:0.8563820317814604
H1N1 Cross-val ROC_AUC score (All Binary):0.8392343999877966
Seasonal Cross-val ROC_AUC score (All Binary):0.856398673986693


In [38]:
# Results 
print(f'H1N1 cv-mean ROC_AUC score:{h1n1}')
print(f'Seasonal cv-mean ROC_AUC score:{sea}')

print(f'H1N1 cv-mean ROC_AUC score (All Binary):{h1n1_1}')
print(f'Seasonal cv-mean ROC_AUC score (All Binary):{sea_1}')

H1N1 cv-mean ROC_AUC score:0.8392174922109827
Seasonal cv-mean ROC_AUC score:0.8563820317814604
H1N1 cv-mean ROC_AUC score (All Binary):0.8392343999877966
Seasonal cv-mean ROC_AUC score (All Binary):0.856398673986693


#### Logistic Regression with best params (by grid search) 

In [41]:

# H1N1 
# {'C': 1.0, 'class_weight': 'balanced', 'max_iter': 1000000, 'penalty': 'l1', 'solver': 'saga'}


logreg_h1n1=LogisticRegression(C=1,penalty="l1", class_weight='balanced', max_iter=1000000,
                              solver='saga')

h1n1_bp = cross_val_score(estimator=logreg_h1n1, X=X_train_imputed_ohe, y=y_train.h1n1_vaccine, 
                cv=5, scoring='roc_auc').mean() 

# Seasonal 
#{'C': 1.0, 'class_weight': 'balanced', 'max_iter': 10000, 'penalty': 'l1', 'solver': 'saga'}


logreg_sea=LogisticRegression(C=1,penalty="l1", class_weight='balanced', max_iter=10000,
                              solver='saga')

sea_bp = cross_val_score(estimator=logreg_sea, X=X_train_imputed_ohe, y=y_train.seasonal_vaccine, 
                cv=5, scoring='roc_auc').mean() 


print(f'H1N1 cv-mean ROC_AUC score (All Binary/Best Params):{h1n1_bp}')
print(f'Seasonal cv-mean ROC_AUC score (All Binary/Best Params):{sea_bp}')

H1N1 cv-mean ROC_AUC score (All Binary/Best Params):0.8396002112901335
Seasonal cv-mean ROC_AUC score (All Binary/Best Params):0.8564305668521636


#### Using Feature Ranking with Recursive Feature Elimination, select the most important features.

In [51]:
# H1N1 

# Use the model with all binary / best params 
model_for_RFE = logreg_h1n1=LogisticRegression(C=1,penalty="l1", class_weight='balanced', max_iter=1000000,
                              solver='saga')


# Instantiate and fit the selector
selector = RFE(model_for_RFE)
selector.fit(X_train_imputed_ohe, y_train.h1n1_vaccine) 

# Print the results
#print("Was the column selected?")
#for index, col in enumerate(X_train_imputed_ohe.columns):
#    print(f"{col}: {selector.support_[index]}")

    
#print(selector.support_)
# print(selector.ranking_)
f = selector.get_support(1) #the most important features
X_train_imputed_ohe.columns[f] # final features`



Index(['behavioral_face_mask', 'behavioral_large_gatherings',
       'doctor_recc_h1n1', 'doctor_recc_seasonal', 'child_under_6_months',
       'health_worker', 'x9_3.0', 'x11_3.0', 'x11_4.0', 'x11_5.0', 'x12_2.0',
       'x12_3.0', 'x12_4.0', 'x12_5.0', 'x13_2.0', 'x13_3.0', 'x14_3.0',
       'x15_2.0', 'x15_3.0', 'x15_4.0', 'x15_5.0', 'x16_3.0', 'x16_4.0',
       'x16_5.0', 'x0_55 - 64 Years', 'x0_65+ Years', 'x1_< 12 Years',
       'x2_Hispanic', 'x2_Other or Multiple', 'x2_White', 'x3_Male',
       'x5_Not Married', 'health_insurance', 'x2_3.0', 'x2_4.0', 'x2_5.0',
       'x3_3.0', 'x3_4.0', 'x3_5.0', 'x4_3.0', 'x5_3.0', 'x6_3.0', 'x6_4.0',
       'x6_5.0', 'x7_3.0', 'x7_5.0'],
      dtype='object')

In [52]:
# Seasonal Flu

# Use the model with all binary / best params 
model_for_RFE_1 = logreg_sea=LogisticRegression(C=1,penalty="l1", class_weight='balanced', max_iter=10000,
                              solver='saga')

# Instantiate and fit the selector
selector_1 = RFE(model_for_RFE_1)
selector_1.fit(X_train_imputed_ohe, y_train.seasonal_vaccine) 

# Print the results
#print("Was the column selected?")
#for index, col in enumerate(X_train_imputed_ohe.columns):
#    print(f"{col}: {selector.support_[index]}")

    
#print(selector.support_)
# print(selector.ranking_)
f_1 = selector_1.get_support(1) #the most important features
X_train_imputed_ohe.columns[f_1] # final features`



Index(['behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'health_worker', 'x10_2.0', 'x11_4.0', 'x11_5.0', 'x13_5.0', 'x14_3.0',
       'x14_4.0', 'x14_5.0', 'x15_2.0', 'x15_3.0', 'x15_4.0', 'x15_5.0',
       'x16_2.0', 'x16_3.0', 'x16_4.0', 'x16_5.0', 'x0_35 - 44 Years',
       'x0_45 - 54 Years', 'x0_55 - 64 Years', 'x0_65+ Years', 'x1_< 12 Years',
       'x1_College Graduate', 'x2_Hispanic', 'x2_Other or Multiple',
       'x2_White', 'x4_Below Poverty', 'x7_Unemployed', 'health_insurance',
       'x1_1.0', 'x1_2.0', 'x2_3.0', 'x5_2.0', 'x5_3.0', 'x5_4.0', 'x5_5.0',
       'x6_2.0', 'x6_3.0', 'x6_4.0', 'x6_5.0', 'x7_2.0', 'x7_3.0', 'x7_4.0',
       'x7_5.0'],
      dtype='object')