In [41]:
import re

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from statsmodels.discrete.discrete_model import Logit
from statsmodels.discrete.discrete_model import LogitResults
from statsmodels.graphics.mosaicplot import mosaic
from statsmodels.stats.contingency_tables import Table
from statsmodels.tools.tools import add_constant

plt.style.use('ggplot')
pd.set_option('display.max_columns', 100)

### Functions

In [318]:
#pull in function to dummify columns
def dummify_columns(dataframe,var_list):
    '''
    replaces each listed column with its dummy (one-hot) columns
    ------------
    dataframe: full dataframe; it is not modified, a new dataframe is returned
    var_list: list of column names (strings) to dummify

    returns: dataframe in which every column of var_list has been dropped and
             its dummy columns (prefixed with the column name, first level
             dropped) have been appended on the right, in var_list order
    '''
    for vr in var_list:
        dummified_feature = pd.get_dummies(dataframe[vr], prefix=vr, drop_first=True)
        # sort has to be the boolean False. The string 'False' is truthy, so it
        # asks for sorting, and pandas versions that validate the argument
        # reject a non-boolean value outright.
        dataframe = pd.concat([dataframe, dummified_feature], axis=1, sort=False)
    return dataframe.drop(var_list, axis=1)

def xy_split(dataframe,target):
    '''
    separates a dataframe into predictors and target
    ------------
    dataframe: full dataframe
    target: name of the target column

    returns: (X, y) where y is dataframe[target] and X is the dataframe
             without that column
    '''
    response = dataframe[target]
    predictors = dataframe.drop(columns=target)
    return predictors, response

def basic_significance(dataframe, list_to_dummify, target):
    '''
    fits a non-regularized logistic regression on every column of the
    dataframe and reports the statistically significant coefficients
    ------------
    dataframe: full dataframe, including the target column
    list_to_dummify: list of column names to dummify before fitting
    target: name of the target column

    prints the accuracy of the fitted model on the training data
    returns: dataframe indexed by term with columns 'p-values' and 'coefs',
             restricted to terms with p-value < 0.05 and ordered by
             decreasing absolute coefficient
    '''
    #build the design matrix (with intercept) from a copy of the input
    prepared = dummify_columns(dataframe.copy(), list_to_dummify)
    design, response = xy_split(prepared, target)
    design = add_constant(design)

    #fit the model
    model = Logit(response, design)
    result = model.fit()

    #accuracy = correctly classified rows / all rows, from the confusion matrix
    predicted_class = np.round(model.predict(result.params))
    c_mat = confusion_matrix(response, predicted_class)
    accuracy = c_mat.trace()/c_mat.sum()
    print('model train accuracy: %s' %(accuracy))

    #keep the significant terms and rank them by coefficient magnitude
    significant = result.pvalues[result.pvalues<0.05]
    table = significant.to_frame('p-values')
    table['coefs'] = result.params.filter(items=table.index)
    ranked = (
        table.assign(abs_coefs=table['coefs'].abs())
             .sort_values(by='abs_coefs', ascending=False)
             .drop(columns='abs_coefs')
    )
    return ranked

def forward_selection(dataframe, target, list_to_dummify, criteria='bic'):
    '''
    greedy forward selection of predictors for a logistic regression
    ------------
    dataframe: full dataframe, including the target column
    target: name of the target column
    list_to_dummify: column names that must be dummified when they are added
    criteria: 'bic' to score candidate models by BIC; any other value uses AIC

    Starting from an intercept-only design matrix, each step fits one model
    per remaining variable and permanently adds the variable whose model has
    the lowest criterion value. The loop runs until every variable has been
    added, so the caller picks the best step afterwards.

    returns: dict with two parallel lists, one entry per step:
             'model_vars'   - columns of the design matrix at that step. It is
                              recorded before the newly added variable is
                              dummified, so that variable appears under its
                              original name while earlier ones appear as
                              dummy columns.
             'scoring_crit' - criterion value of the model chosen at that step
    '''
    y = dataframe[target]
    #intercept-only design matrix, sized and indexed from the dataframe that
    #was passed in (not from a notebook global) so later concats align row by row
    X = pd.DataFrame({'const': np.ones(dataframe.shape[0])}, index=dataframe.index)
    var_list = list(dataframe.columns)
    var_list.remove(target)

    #output of each step
    models = {'model_vars': [], 'scoring_crit': []}

    #run until all variables have been selected
    while len(var_list) > 0:

        #aic/bic of every candidate model tried at this step
        crit_vals = []

        #try adding variables one by one to find the lowest aic/bic model for the current step
        for var in var_list:
            #all previously selected variables + the new variable being tried
            tempX = pd.concat([X, dataframe[var]], axis=1)
            #dummify the variable if necessary
            if var in list_to_dummify:
                tempX = dummify_columns(tempX, [var])
            #fit the logistic model and keep its criterion value
            fitted_logit = Logit(y, tempX).fit()
            crit_vals.append(fitted_logit.bic if criteria == 'bic' else fitted_logit.aic)

        #the variable whose model scored lowest wins this step
        best_crit = min(crit_vals)
        best_var = var_list[crit_vals.index(best_crit)]

        #add the best variable to the design matrix
        X = pd.concat([X, dataframe[best_var]], axis=1)

        #store the variables and aic/bic for the best model at the current step
        models['model_vars'].append(list(X.columns))
        models['scoring_crit'].append(best_crit)

        #dummify the added variable if necessary
        if best_var in list_to_dummify:
            X = dummify_columns(X, [best_var])

        #remove the added variable from the variable list and track progress
        var_list.remove(best_var)
        print('adding var: %s' %(best_var))

    return models

def _strip_dummy_suffix(column):
    '''heuristically recovers a source column name from a (possibly dummified) column name'''
    #keep the first run of non-digit characters, e.g. 'MRACEHISP_4' -> 'MRACEHISP_'
    match = re.search(r'\D+', column)
    stem = match[0] if match else column
    #drop the separator left behind, then a trailing _Y / _M / _N level marker
    stem = re.sub(r"_$", "", stem)
    return re.sub(r"_(?:Y|M|N|)$", "", stem)


def best_forward_set(forward_models, original_columns=None):
    '''
    returns the source columns of the best model found by forward selection
    ------------
    forward_models: dict with parallel lists 'model_vars' (list of column
                    lists) and 'scoring_crit' (one score per column list)
    original_columns: optional iterable of the column names of the dataframe
                      the models were built from. When given, each model
                      column is mapped to the longest original column that
                      equals it or that it extends with '_<level>'. Without
                      it, names are recovered by stripping digits and
                      _Y/_M/_N suffixes, which truncates original names that
                      themselves contain digits (e.g. 'CIG_0' -> 'CIG').

    returns: list of unique column names for the lowest-scoring model, without
             the 'const' intercept, in order of first appearance
    '''
    scores = forward_models['scoring_crit']
    best_cols = forward_models['model_vars'][scores.index(min(scores))]

    #longest names first so that 'MAR_P_Y' resolves to 'MAR_P' rather than a shorter prefix
    candidates = []
    if original_columns is not None:
        candidates = sorted(original_columns, key=len, reverse=True)

    selected = []
    for col in best_cols:
        if col == 'const':
            continue
        source = next(
            (cand for cand in candidates if col == cand or col.startswith(cand + '_')),
            None,
        )
        if source is None:
            source = _strip_dummy_suffix(col)
        #a list keeps first-appearance order, so the result is reproducible
        if source not in selected:
            selected.append(source)
    return selected

def grid_search_logit(dataframe, columns_to_dummify, target, grid_params):
    '''
    tunes a liblinear logistic regression by 5-fold grid search on accuracy
    ------------
    dataframe: full dataframe, including the target column
    columns_to_dummify: list of column names to dummify before fitting
    target: name of the target column
    grid_params: parameter grid passed to GridSearchCV as param_grid

    returns: (best mean cross-validated accuracy,
              parameters of the best model,
              coefficients of the best model as a Series indexed by column
              name, ordered by decreasing absolute value)
    '''
    df = dummify_columns(dataframe.copy(), columns_to_dummify)
    X, y = xy_split(df, target)
    logit = linear_model.LogisticRegression(solver='liblinear')
    log_grid = GridSearchCV(estimator=logit, param_grid=grid_params, scoring='accuracy',
                            cv=5, return_train_score=True)
    log_grid.fit(X, y)
    #with the default refit=True, GridSearchCV has already refit best_estimator_
    #on all of X, y, so its coefficients are read directly instead of fitting again
    coefs = pd.Series(log_grid.best_estimator_.coef_.ravel(), index=X.columns)
    order = coefs.abs().sort_values(ascending=False)
    return log_grid.best_score_, log_grid.best_params_, coefs[order.index]

### All_Imputed Model

In [143]:
#read the imputed dataset and discard its 'Unnamed: 0' column
#(presumably an index saved with the csv - confirm)
cchd = pd.read_csv('cchd_all_imputed_colfixed.csv').drop(columns='Unnamed: 0')

In [22]:
#column names grouped by how they are handled downstream
variables = {
    #categorical columns that still have to be dummified
    'nominal_categorical_ndummified': [
        'MBSTATE_REC', 'MRACEHISP', 'MAR_P', 'DMAR', 'MEDUC', 'FRACEHISP',
        'FEDUC', 'WIC', 'RF_PDIAB', 'RF_GDIAB', 'RF_PHYPE', 'RF_GHYPE',
        'RF_EHYPE', 'RF_PPTERM', 'RF_FEDRG', 'RF_ARTEC', 'DOB_MM',
        'IP_GON', 'IP_SYPH', 'IP_CHLAM', 'IP_HEPB', 'IP_HEPC', 'PAY', 'SEX',
    ],
    #categorical columns that are already dummified
    'nominal_categorical_dummified': ['lrg_miss_imp'],
    'continuous': [
        'PRECARE', 'MAGER', 'FAGECOMB', 'PRIORTERM', 'PRIORLIVE', 'PRIORDEAD',
        'ILLB_R', 'ILOP_R', 'PREVIS', 'CIG_0', 'CIG_1', 'M_Ht_In', 'BMI',
        'WTGAIN', 'RF_CESARN', 'OEGest_Comb',
    ],
    'target': ['CA_CCHD'],
}

#### *Basic non-regularized model for feature significance*

In [285]:
#fit the non-regularized logit on every feature and show its significant coefficients
basic_significance(
    dataframe=cchd,
    list_to_dummify=variables['nominal_categorical_ndummified'],
    target='CA_CCHD',
)

Optimization terminated successfully.
         Current function value: 0.623179
         Iterations 6
model train accuracy: 0.6525021204410517


  return ptp(axis=axis, out=out, **kwargs)


Unnamed: 0,p-values,coefs
const,7.799771e-41,8.524838
RF_PDIAB_Y,9.91275e-16,1.182882
FRACEHISP_3,0.01546212,-0.754653
MRACEHISP_4,4.208343e-06,-0.687349
MRACEHISP_7,1.324813e-11,-0.516326
RF_FEDRG_Y,0.02438965,0.422994
FRACEHISP_4,0.0102506,-0.405448
FEDUC_3,0.007198027,-0.366683
MRACEHISP_2,1.674608e-05,-0.364069
FRACEHISP_2,0.0006400508,-0.310094


#### *Forward selection for feature significance*

In [211]:
#run forward selection scored by BIC; every step is kept so the best one can be picked afterwards
forward_models = forward_selection(
    dataframe=cchd,
    target='CA_CCHD',
    list_to_dummify=variables['nominal_categorical_ndummified'],
    criteria='bic',
)

Optimization terminated successfully.
         Current function value: 0.691633
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.684592
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.692059
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.693034
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692392
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.684943
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692994
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692888
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.687272
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.691956
  

Optimization terminated successfully.
         Current function value: 0.645917
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.645548
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.642791
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.645676
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.645993
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.645411
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.646151
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.645340
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.645845
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.646121
  

Optimization terminated successfully.
         Current function value: 0.632411
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.632408
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.631984
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.632064
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.632018
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.631586
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.632033
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.631966
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.631039
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.632234
  

Optimization terminated successfully.
         Current function value: 0.630048
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.630022
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.630056
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.630058
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.630061
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.629478
         Iterations 6
adding var: lrg_miss_imp
Optimization terminated successfully.
         Current function value: 0.629402
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.629434
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.629475
         Iterations 6
Optimization terminated successfully.
         Current fu

Optimization terminated successfully.
         Current function value: 0.628601
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.628290
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.628240
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.628190
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.628433
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.628571
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.628538
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.628527
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.628455
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.628507
  

Optimization terminated successfully.
         Current function value: 0.627724
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627714
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627763
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627692
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627766
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627767
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627355
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627349
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627682
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627761
  

Optimization terminated successfully.
         Current function value: 0.627312
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627375
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627353
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627398
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627308
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627402
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627403
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626934
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626988
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627386
  

Optimization terminated successfully.
         Current function value: 0.627066
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627130
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627129
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627129
         Iterations 6
adding var: MBSTATE_REC
Optimization terminated successfully.
         Current function value: 0.626977
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.627024
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626530
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.625758
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626565
         Iterations 6
Optimization terminated successfully.
         Current fun

Optimization terminated successfully.
         Current function value: 0.626795
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626794
         Iterations 6
adding var: MAR_P
Optimization terminated successfully.
         Current function value: 0.626728
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626257
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.625485
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626139
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626692
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626702
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626725
         Iterations 6
Optimization terminated successfully.
         Current function 

Optimization terminated successfully.
         Current function value: 0.626566
         Iterations 6
adding var: RF_ARTEC
Optimization terminated successfully.
         Current function value: 0.626538
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626069
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.625281
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.625952
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626533
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626537
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626536
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626087
         Iterations 6
Optimization terminated successfully.
         Current functi

Optimization terminated successfully.
         Current function value: 0.625926
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626071
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.626110
         Iterations 6
adding var: FRACEHISP
Optimization terminated successfully.
         Current function value: 0.624767
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.624555
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.624783
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.624836
         Iterations 6
adding var: PAY
Optimization terminated successfully.
         Current function value: 0.624291
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.624014
         Iterations 6
Optimization terminated successfully.
      

In [286]:
#view coefficients and accuracy on best forward selected model
best = best_forward_set(forward_models)
cchd2 = cchd[best+['CA_CCHD']]
#dummify whichever selected columns are nominal; a hand-typed list silently
#goes stale as soon as forward selection picks a different set
best_nominal = [var for var in best if var in variables['nominal_categorical_ndummified']]
basic_significance(cchd2, best_nominal, 'CA_CCHD')

Optimization terminated successfully.
         Current function value: 0.628972
         Iterations 6
model train accuracy: 0.6468193384223918


  return ptp(axis=axis, out=out, **kwargs)


Unnamed: 0,p-values,coefs
const,7.609377999999999e-110,8.150823
RF_PDIAB_Y,5.463502e-18,1.240836
MRACEHISP_4,2.1400479999999998e-19,-0.871345
MRACEHISP_7,4.166669e-36,-0.651496
MRACEHISP_2,5.551448999999999e-20,-0.549693
RF_GDIAB_Y,4.66529e-06,0.343603
RF_GHYPE_Y,0.0005865349,0.254297
OEGest_Comb,2.781918e-128,-0.229592
PRECARE,6.208380000000001e-55,0.185728
lrg_miss_imp,0.0002065372,0.170243


#### *Grid-search logistic model using all predictors*

In [315]:
#tune C by grid search for a logistic model that uses every predictor
acc, params, coefs = grid_search_logit(
    cchd,
    variables['nominal_categorical_ndummified'],
    'CA_CCHD',
    {'C': np.logspace(-4, 4, 20)},
)
print('test accracy: %s' % acc)
print('params: %s' % params)
coefs

test accracy: 0.6427480916030535
params: {'C': 1.623776739188721}


RF_PDIAB_Y     1.164239
FRACEHISP_3   -0.704456
MRACEHISP_4   -0.647740
RF_EHYPE_Y     0.532727
IP_GON_Y      -0.527003
PAY_4         -0.507921
MRACEHISP_7   -0.487233
MRACEHISP_5    0.430647
RF_FEDRG_Y     0.416325
FRACEHISP_4   -0.406434
MRACEHISP_3    0.375372
MEDUC_8       -0.357997
MRACEHISP_2   -0.352794
FRACEHISP_2   -0.318562
FEDUC_3       -0.312097
RF_GDIAB_Y     0.308235
FRACEHISP_7   -0.302585
PAY_8         -0.285236
RF_PPTERM_Y    0.261015
RF_GHYPE_Y     0.260324
IP_CHLAM_Y     0.254341
IP_SYPH_Y      0.242110
PAY_5          0.233459
OEGest_Comb   -0.213454
PAY_3         -0.208626
PRECARE        0.192968
FEDUC_6       -0.174350
DOB_MM_4       0.170857
DOB_MM_6       0.170171
FRACEHISP_5    0.161243
                 ...   
DOB_MM_3       0.101327
MRACEHISP_6   -0.095498
PAY_2         -0.088334
FEDUC_7       -0.081391
PRIORLIVE      0.073841
DOB_MM_7       0.067853
IP_HEPB_Y     -0.055388
DMAR_2.0       0.039456
FEDUC_8       -0.035512
FRACEHISP_6    0.034633
IP_HEPC_Y     -0

#### *Grid-search logistic model using the predictors selected by forward AIC*

In [331]:
#select predictors by forward AIC in this cell so that best_aic exists on a fresh kernel
forward_aic = forward_selection(cchd, 'CA_CCHD', variables['nominal_categorical_ndummified'], criteria='aic')
best_aic = best_forward_set(forward_aic)
#best_forward_set cuts a name at its first digit, so CIG_0 / CIG_1 come back as
#'CIG', which is not a column; map it back to CIG_0, as the earlier manual fix-up did
best_aic = ['CIG_0' if var == 'CIG' else var for var in best_aic]
cchd3 = cchd[best_aic+['CA_CCHD']]
dummy = [var for var in best_aic if var in variables['nominal_categorical_ndummified']]
params = {'C':np.logspace(-4,4, 20)}
acc2, params2, coefs2 = grid_search_logit(cchd3,dummy, 'CA_CCHD', params)
print('test accracy: %s' %(acc2))
print('params: %s' %(params2))
coefs2

test accracy: 0.646988973706531
params: {'C': 545.5594781168514}


RF_PDIAB_Y       1.197549
FRACEHISP_3     -0.753628
MRACEHISP_4     -0.671891
PAY_4           -0.650619
RF_EHYPE_Y       0.619194
IP_GON_Y        -0.584385
MRACEHISP_7     -0.494675
MRACEHISP_5      0.483974
MRACEHISP_3      0.441617
RF_FEDRG_Y       0.433806
FRACEHISP_4     -0.423269
FEDUC_3         -0.385981
MRACEHISP_2     -0.348689
RF_GDIAB_Y       0.319024
FRACEHISP_2     -0.314556
PAY_8           -0.311345
FRACEHISP_7     -0.308736
FEDUC_6         -0.285255
IP_CHLAM_Y       0.268125
RF_PPTERM_Y      0.254623
RF_GHYPE_Y       0.246160
FEDUC_4         -0.244459
FEDUC_8         -0.230942
OEGest_Comb     -0.229365
PAY_3           -0.225110
FEDUC_5         -0.222273
PAY_5            0.220142
FEDUC_7         -0.218122
PRECARE          0.192547
FEDUC_2         -0.165785
FRACEHISP_5      0.160870
lrg_miss_imp     0.154250
PAY_6            0.121478
PAY_2           -0.112366
SEX_M            0.112319
MRACEHISP_6     -0.105961
WIC_Y           -0.102027
MBSTATE_REC_2    0.097710
PRIORLIVE   