### Grid Search Codebook - Logistic Regression

This notebook contains the grid-search code for our logistic regression models.

--------------Run this part to create our dataframe---------------

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import normalize,StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the feature table and the label table; both are indexed by respondent_id.
features_df = pd.read_csv(
    '../Data/training_set_features.csv',
    index_col="respondent_id",
)
labels_df = pd.read_csv(
    '../Data/training_set_labels.csv',
    index_col="respondent_id",
)

# Inner join on the shared index: only respondents present in both tables remain.
joined_df = features_df.join(labels_df, how='inner')

In [3]:
# An all-in-one data cleaning function. Run this BEFORE one-hot encoding.
# TODO: maybe this should be a class that can be worked into the pipeline?
def datacleaner(maindataframe):
    """Clean the joined survey dataframe in place.

    Steps, in this order:
      1. Drop every row with a missing value in any ``general_dropna`` column.
      2. Drop the ``drop_columns`` columns entirely.
      3. Fill the remaining missing values of selected columns with a fixed
         constant chosen per group of columns.

    Parameters
    ----------
    maindataframe : pandas.DataFrame
        Frame holding every column named in the lists below. It is modified
        in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the expected columns is not present in ``maindataframe``.
    """
    # For dropping whole columns
    def columndrop(dataframe, column_list):
        dataframe.drop(columns=column_list, inplace=True)

    # For dropping rows with na values
    def basicdropna(dataframe, column_list):
        dataframe.dropna(subset=column_list, inplace=True)

    # For special case imputation
    def impute_missing_data(dataframe, column_list, fillvalue):
        for column in column_list:
            # Assign the filled column back to the frame. Calling
            # `dataframe[column].fillna(..., inplace=True)` acts on a temporary
            # Series: pandas warns about it and, with Copy-on-Write enabled,
            # the frame itself is left unchanged.
            dataframe[column] = dataframe[column].fillna(fillvalue)

    # The lists below group columns by how they are processed.
    # (The original author refers to separate notes for the reasoning behind
    # these choices; those notes are not part of this cell.)
    drop_columns = ['employment_industry', 'employment_occupation', 'hhs_geo_region']

    general_dropna = ['health_worker', 'education', 'income_poverty', 'marital_status',
                      'rent_or_own', 'employment_status', 'household_adults',
                      'household_children']

    survey_col = ['opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc',
                  'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc']

    behavior_col = ['behavioral_antiviral_meds', 'behavioral_face_mask',
                    'behavioral_large_gatherings', 'behavioral_outside_home']

    behavior_col_2 = ['behavioral_avoidance',
                      'behavioral_wash_hands', 'behavioral_touch_face']

    doc_rec = ['doctor_recc_h1n1', 'doctor_recc_seasonal']

    # Rows first, then columns, then the constant fills.
    basicdropna(maindataframe, general_dropna)
    columndrop(maindataframe, drop_columns)

    # (columns, fill value) pairs, applied in the listed order.
    fill_plan = [
        (survey_col, 3),
        (['h1n1_concern'], 2),
        (['h1n1_knowledge'], 0),
        (behavior_col, 0),
        (behavior_col_2, 1),
        (doc_rec, 0),
        (['chronic_med_condition'], 0),
        (['child_under_6_months'], 0),
    ]
    for column_group, fill_value in fill_plan:
        impute_missing_data(maindataframe, column_group, fill_value)


In [4]:
# Clean joined_df in place; datacleaner modifies its argument and returns nothing.
datacleaner(joined_df)

In [5]:
# Targets are the two vaccination labels; the features are every other column.
y = joined_df[['h1n1_vaccine', 'seasonal_vaccine']]
X = joined_df.drop(columns=list(y.columns))

# Train/test split (30% held out). Do this before one-hot encoding.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [6]:
# Set up one-hot encoding for the object columns. Do this before the imputer.

# Every object-dtype feature column of the training split.
cat_col_list = list(X_train.select_dtypes(include='object').columns)

# Additional survey-response columns that are one-hot encoded as well.
nb_list_for_ohe = ['h1n1_concern', 'h1n1_knowledge', 'opinion_h1n1_vacc_effective',
'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
'opinion_seas_risk', 'opinion_seas_sick_from_vacc']

# The encoder is fitted on a subset of columns and its output is then
# reintegrated into the original dataframe. Do this after the initial cleaning
# and before the health insurance imputation.
#
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the new spelling first and fall back to the old
# one on releases that do not know it yet.
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(drop='first', sparse=False)

def fit_trans_ohe(X_dataframe, columns):
    """Fit the shared ``ohe`` encoder on ``columns`` and return the encoded frame.

    Parameters
    ----------
    X_dataframe : pandas.DataFrame
        Frame to fit the encoder on (the training features).
    columns : list of str
        Columns of ``X_dataframe`` that are one-hot encoded.

    Returns
    -------
    pandas.DataFrame
        ``X_dataframe`` without the ``cat_col_list`` columns, followed by the
        dummy columns. Dummy columns are named ``x<i>_<category>``, where
        ``i`` is the position of the source column inside ``columns``.
    """
    dums = ohe.fit_transform(X_dataframe[columns])

    # Build the dummy names from the fitted attributes instead of calling
    # `ohe.get_feature_names()`, which scikit-learn removed in 1.2. The
    # positional `x<i>_<category>` pattern is what that method produced by
    # default, so column names used elsewhere in the notebook stay valid.
    dropped = getattr(ohe, 'drop_idx_', None)
    dummy_names = [
        f"x{position}_{category!s}"
        for position, categories in enumerate(ohe.categories_)
        for index, category in enumerate(categories)
        # Skip the category the encoder dropped for this column, if any.
        if dropped is None or dropped[position] is None or index != dropped[position]
    ]

    dums_df = pd.DataFrame(dums,
                           columns=dummy_names,
                           index=X_dataframe.index)

    # NOTE(review): only the module-level `cat_col_list` columns are removed,
    # so any other encoded column in `columns` stays in the output next to its
    # dummies -- confirm this is intended.
    df_cat_dropped = X_dataframe.drop(cat_col_list, axis=1)
    dums_df_concated = pd.concat([df_cat_dropped, dums_df], axis=1)
    return dums_df_concated

#We should end up with a fitted ohe instance called 'ohe'

In [7]:
# Fit the encoder on the training split and build the encoded training frame.
X_train_ohe = fit_trans_ohe(X_train, cat_col_list+nb_list_for_ohe)

In [8]:
    
# Columns handed to the health-insurance imputer: the socio-economic dummy
# columns of the one-hot encoded frame, plus health_insurance itself.
socio_economic_column_list = [
    "x0_35 - 44 Years",
    "x0_45 - 54 Years",
    "x0_55 - 64 Years",
    "x0_65+ Years",
    "x1_< 12 Years",
    "x1_College Graduate",
    "x1_Some College",
    "x2_Hispanic",
    "x2_Other or Multiple",
    "x2_White",
    "x3_Male",
    "x4_> $75,000",
    "x4_Below Poverty",
    "x5_Not Married",
    "x6_Rent",
    "x7_Not in Labor Force",
    "x7_Unemployed",
    "x8_MSA, Principle City",
    "x8_Non-MSA",
    "health_insurance",
]

# KNN imputer for Health Insurance. It works from the socio-economic features
# of a dataframe that has already been one-hot encoded.
soc_eco_h_i_imputer_knn = KNNImputer()

def soc_eco_KNN_imputer(imputer, dataframe, column_list):
    """Fit ``imputer`` on ``column_list`` and return the frame with imputed values.

    Parameters
    ----------
    imputer : object with a ``fit_transform`` method
        Imputer that is fitted on, and applied to, the selected columns.
    dataframe : pandas.DataFrame
        Source frame; it is not modified.
    column_list : list of str
        Columns passed to the imputer. ``health_insurance`` must end up in
        the result, because it is rounded afterwards.

    Returns
    -------
    pandas.DataFrame
        The columns outside ``column_list`` followed by the imputed columns,
        with ``health_insurance`` rounded to the nearest whole number.
    """
    selected = dataframe[column_list]
    filled = pd.DataFrame(
        imputer.fit_transform(selected),
        index=selected.index,
        columns=selected.columns,
    )

    # Swap the raw columns for their imputed versions (joined on the index).
    output_df = dataframe.drop(columns=column_list).join(filled)

    # Imputed values can be fractional; round them back to whole numbers.
    output_df.health_insurance = output_df.health_insurance.round()
    return output_df


In [9]:
# Fit the KNN imputer on the encoded training frame and impute its health_insurance column.
X_train_imputed = soc_eco_KNN_imputer(soc_eco_h_i_imputer_knn, X_train_ohe, socio_economic_column_list)

In [10]:
# One-hot encoding for the test set only: takes the X test dataframe and the
# list of columns to encode, and reuses the already fitted encoder.
def trans_ohe(X_dataframe, columns):
    """Encode ``columns`` with the already fitted ``ohe`` (no refitting).

    Parameters
    ----------
    X_dataframe : pandas.DataFrame
        Frame to transform (the test features).
    columns : list of str
        Columns of ``X_dataframe`` to one-hot encode; they must be the same
        columns, in the same order, that the encoder was fitted on.

    Returns
    -------
    pandas.DataFrame
        ``X_dataframe`` without the ``cat_col_list`` columns, followed by the
        dummy columns. Dummy columns are named ``x<i>_<category>``, where
        ``i`` is the position of the source column inside ``columns``.
    """
    dums = ohe.transform(X_dataframe[columns])

    # Build the dummy names from the fitted attributes instead of calling
    # `ohe.get_feature_names()`, which scikit-learn removed in 1.2. The
    # positional `x<i>_<category>` pattern is what that method produced by
    # default, so the names match those of the encoded training frame.
    dropped = getattr(ohe, 'drop_idx_', None)
    dummy_names = [
        f"x{position}_{category!s}"
        for position, categories in enumerate(ohe.categories_)
        for index, category in enumerate(categories)
        # Skip the category the encoder dropped for this column, if any.
        if dropped is None or dropped[position] is None or index != dropped[position]
    ]

    dums_df = pd.DataFrame(dums,
                           columns=dummy_names,
                           index=X_dataframe.index)

    # NOTE(review): only the module-level `cat_col_list` columns are removed,
    # so any other encoded column in `columns` stays in the output next to its
    # dummies -- confirm this is intended.
    df_cat_dropped = X_dataframe.drop(cat_col_list, axis=1)
    dums_df_concated = pd.concat([df_cat_dropped, dums_df], axis=1)
    return dums_df_concated

In [11]:
# Encode the test split with the encoder fitted on the training split (transform only).
X_test_ohe = trans_ohe(X_test, cat_col_list+nb_list_for_ohe)

In [12]:
def imputer_transform_only(imputer, dataframe, column_list):
    """Apply an already fitted ``imputer`` to ``column_list`` without refitting.

    Parameters
    ----------
    imputer : object with a ``transform`` method
        Previously fitted imputer.
    dataframe : pandas.DataFrame
        Source frame; it is not modified.
    column_list : list of str
        Columns passed to the imputer. ``health_insurance`` must end up in
        the result, because it is rounded afterwards.

    Returns
    -------
    pandas.DataFrame
        The columns outside ``column_list`` followed by the imputed columns,
        with ``health_insurance`` rounded to the nearest whole number.
    """
    selected = dataframe[column_list]
    filled = pd.DataFrame(
        imputer.transform(selected),
        index=selected.index,
        columns=selected.columns,
    )

    # Swap the raw columns for their imputed versions (joined on the index).
    output_df = dataframe.drop(columns=column_list).join(filled)

    # Imputed values can be fractional; round them back to whole numbers.
    output_df.health_insurance = output_df.health_insurance.round()
    return output_df

In [13]:
# Impute the encoded test frame with the imputer fitted on the training data (transform only).
X_test_imputed = imputer_transform_only(soc_eco_h_i_imputer_knn, X_test_ohe, socio_economic_column_list)

### We now have a working dataset of: 
    'X_train_imputed' and 'y_train' to fit models to, 'X_test_imputed' to generate predictions, and 'y_test' to validate models with.
------------------------------------------------------------------------------------------

### Grid Search for Logistic Regression

- For H1N1 vaccination model

In [None]:
# Hyper-parameters searched: C, penalty, max_iter, solver, class_weight.
# NOTE(review): this is a full cross product, so it also contains
# solver/penalty pairs that LogisticRegression does not support (for example
# 'l1' with 'lbfgs', or 'elasticnet' without an l1_ratio). Those candidates
# cannot be fitted; consider a list of per-solver grids to avoid wasted fits.
grid = {
    "C": np.logspace(-3, 3, 7),
    "penalty": ["l1", "l2", 'elasticnet', 'none'],  # l1 lasso l2 ridge
    "max_iter": [100, 1000, 10000, 100000, 1000000],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    "class_weight": ['balanced', None],
}

# Seed the estimator: the 'sag', 'saga' and 'liblinear' solvers shuffle the
# data, so an unseeded search does not give repeatable scores.
logreg = LogisticRegression(random_state=42)
logreg_cv = GridSearchCV(logreg, grid, cv=5, scoring='roc_auc')

# Fit on the imputed training frame. X_train_ohe has not been through the
# health_insurance imputer yet, so it can still contain missing values.
logreg_cv.fit(X_train_imputed, y_train.h1n1_vaccine)

print("Tuned hpyerparameters(best parameters) for H1N1 model: ",logreg_cv.best_params_)
print("ROC_AUC for H1N1 model :",logreg_cv.best_score_)

- For Seasonal vaccination model

In [None]:
# Hyper-parameters searched: C, penalty, max_iter, solver, class_weight.
# NOTE(review): this is a full cross product, so it also contains
# solver/penalty pairs that LogisticRegression does not support (for example
# 'l1' with 'lbfgs', or 'elasticnet' without an l1_ratio). Those candidates
# cannot be fitted; consider a list of per-solver grids to avoid wasted fits.
grid_s = {
    "C": np.logspace(-3, 3, 7),
    "penalty": ["l1", "l2", 'elasticnet', 'none'],  # l1 lasso l2 ridge
    "max_iter": [100, 1000, 10000, 100000, 1000000],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    "class_weight": ['balanced', None],
}

# Seed the estimator: the 'sag', 'saga' and 'liblinear' solvers shuffle the
# data, so an unseeded search does not give repeatable scores.
logreg = LogisticRegression(random_state=42)
logreg_cv_s = GridSearchCV(logreg, grid_s, cv=5, scoring='roc_auc')

# Fit on the imputed training frame. X_train_ohe has not been through the
# health_insurance imputer yet, so it can still contain missing values.
logreg_cv_s.fit(X_train_imputed, y_train.seasonal_vaccine)


print("Tuned hpyerparameters(best parameters) for Seasonal Flu model: ",logreg_cv_s.best_params_)
print("ROC_AUC for Seasonal Flu model :",logreg_cv_s.best_score_)

### Comparison with a default model 

In [None]:
# Default model: LogisticRegression with its default hyper-parameters, scored
# with the same 5-fold cross-validated ROC AUC as the grid searches.
lr = LogisticRegression()

# Score on the imputed training frame. X_train_ohe has not been through the
# health_insurance imputer yet, so it can still contain missing values.

# H1N1
h1n1 = cross_val_score(estimator=lr, X=X_train_imputed, y=y_train.h1n1_vaccine,
                       cv=5, scoring='roc_auc').mean()

# Seasonal
sea = cross_val_score(estimator=lr, X=X_train_imputed, y=y_train.seasonal_vaccine,
                      cv=5, scoring='roc_auc').mean()

# Label each score with its origin; previously the default and the tuned score
# were printed under identical labels and could not be told apart.
print("ROC_AUC for H1N1 model (default):", h1n1)
print()
print("ROC_AUC for H1N1 model (grid search):", logreg_cv.best_score_)
print()
print("ROC_AUC for Seasonal Flu model (default):", sea)
print()
print("ROC_AUC for Seasonal Flu model (grid search):", logreg_cv_s.best_score_)

In [None]:

# Models built from the parameter sets recorded in the comments below
# (presumably the output of an earlier grid-search run -- verify before reuse).
# Both use the 'saga' solver, which shuffles the data, so random_state is fixed
# to make the cross-validated scores repeatable.

# H1N1
# {'C': 1.0, 'class_weight': 'balanced', 'max_iter': 1000000, 'penalty': 'l1', 'solver': 'saga'}
logreg_h1n1 = LogisticRegression(C=1, penalty="l1", class_weight='balanced', max_iter=1000000,
                                 solver='saga', random_state=42)

# Keeps a fitted H1N1 model around; cross_val_score below fits its own copies.
logreg_h1n1.fit(X_train_imputed, y_train.h1n1_vaccine)

h1n1_bp = cross_val_score(estimator=logreg_h1n1, X=X_train_imputed, y=y_train.h1n1_vaccine,
                          cv=5, scoring='roc_auc').mean()

# Seasonal
# {'C': 1.0, 'class_weight': 'balanced', 'max_iter': 10000, 'penalty': 'l1', 'solver': 'saga'}
logreg_sea = LogisticRegression(C=1, penalty="l1", class_weight='balanced', max_iter=10000,
                                solver='saga', random_state=42)

sea_bp = cross_val_score(estimator=logreg_sea, X=X_train_imputed, y=y_train.seasonal_vaccine,
                         cv=5, scoring='roc_auc').mean()


print(f'H1N1 cv-mean ROC_AUC score (All Binary/Best Params):{h1n1_bp}')
print(f'Seasonal cv-mean ROC_AUC score (All Binary/Best Params):{sea_bp}')