In [0]:
import pandas as pd
import numpy as np
pd.options.display.max_rows = 400
pd.options.display.max_columns = 400

In [None]:
#function to run simple SQL query from python
def create_table_from_SQL(user, database, password, query):
    '''
    - A function that returns a pandas dataframe from a SQL query in python
    ---------------
    - user: user for your local SQL connection in string format
    - database: schema name where your database is stored in string format
    - password: password to access your local SQL connection in string format
    - query: SQL query in string format; enclose with double quotes and use single quotes
    to designate VARCHAR values within queries; use schema_name.table_name after FROM statement
    '''
    import mysql.connector
    cnx = mysql.connector.connect(user=user, database=database, password=password)
    cursor = cnx.cursor()
    query = query
    cursor.execute(query)
    df = pd.DataFrame(cursor.fetchall())
    df.columns = cursor.column_names
    return df

In [None]:
#modified version of Bettina's function which creates downsampled dataset for specific defects
#vs overall NICU admissions

def downsample_df (df, variable):

    '''
    Remove undefined information on defect presence admissions (defect == 'U'),
    create a binary target vector, and create a "balanced" dataframe
    with all defect cases and matching numbers of randomly selected non-defect cases.
    --------------------
    df: full dataframe
    variable: variable or defect of interest in string format
    '''

    # remove unknown class from df
    df_no_unknown = df[df[variable].isin(['Y', 'N'])]

    # Create binary target vector, NICU = yes classified as class 0
    df_y_n = pd.DataFrame(np.where((df_no_unknown[variable] == 'Y'), 0, 1))

    # Get indicies of each class' observations
    index_class0 = np.where(df_y_n == 0)[0]
    index_class1 = np.where(df_y_n == 1)[0]

    # Get numbers of observations in class 0
    n_class0 = len(index_class0)

    # Randomly sample the same number of observations from class 1 as in class 0, without replacement
    np.random.seed(0)
    index_class1_downsampled = np.random.choice(index_class1, size=n_class0, replace=False)

    # Create dataframes for NICU and downsampled non-NICU
    df_defect = df_no_unknown.iloc[index_class0]
    df_adj_NONdefect = df_no_unknown.iloc[index_class1_downsampled]

    # Append into 1 dataframe
    df_downsampled = df_defect.append(df_adj_NONdefect)

    return df_downsampled

In [None]:
# function to split out holdout test set
def split_sets(dataframe, seed, test_prop=0.1): 
    '''
    - A function that splits specifically a dataframe into a train and test portion
    - Requires multiple assignment: train, test
    ---------------
    - dataframe: dataframe to be split
    - seed: set seed for reproducability
    - test_prop: takes a float - proportion of dataframe that should be allocated to the test set
    '''

    np.random.seed(seed)
    testIdxes = np.random.choice(range(0,dataframe.shape[0]), size=round(dataframe.shape[0]*test_prop), replace=False)
    trainIdxes = list(set(range(0,dataframe.shape[0])) - set(testIdxes))

    train = dataframe.iloc[trainIdxes,:]
    test  = dataframe.iloc[testIdxes,:]
    
    return train, test

In [0]:
def mlp_convert_cont_floats(df):
    #convert continuous variables to float
    #update to split out ordinal categorical separately
    for x in variables['continuous']:
        df[x]=df[x].astype('float')
    return df

def mlp_convert_nom_cat(df):
    #convert nominal categorical variables to category
    for x in variables['nominal_categorical']:
        df[x]=df[x].astype('category')
    return df

def mlp_convert_ord_cat(df):
    #convert ordinal categorical variables
    df.DOB_MM = df.DOB_MM.astype('category')
    df.PRECARE = df.PRECARE.astype('float')
    return df

def mlp_fill_MAR_blanks(df):
    #change true nulls to fit missingness definitions already in the dataset
    df.isnull().sum()
    df.MAR_P = df.MAR_P.fillna(value='U')
    df.DMAR = df.DMAR.fillna(value=9)
    df.DMAR.replace('',9, inplace=True) # need to take care of 1 vs '1'
#     df.DMAR.replace('1',1, inplace=True)
#     df.DMAR.replace('2',2, inplace=True)
    return df

def mlp_reassign_FRACE(df):
    #combine FRACEHISP unknowns columns
    df.FRACEHISP = df.FRACEHISP.replace(8,9)
    return df

def mlp_reassign_X_NA(df):
    #assign 'X' to 'N' for RF_FEDRG RF_ARTEC and 'Y' for MAR_P since paternity assumed for married
    for x in ['RF_FEDRG', 'RF_ARTEC']:
        df[x].replace('X','N', inplace=True)
    df.MAR_P.replace('X','Y', inplace=True)
    return df

def mlp_reassign_ILs(df):
    #assign 888 to mean for ILLB_R and ILP_R
    for x in ['ILLB_R', 'ILP_R', 'ILOP_R']:
        print(x)
        df[x].replace(888,df.MAGER*12, inplace=True)
    return df


In [0]:
def measure_missing(df, target):
    #create table of missingness proportions
    missing_props = pd.DataFrame()
    for i in range(0,len(missing_vals)):
        temp = df.groupby(target)[missing_dict[list(missing_dict.keys())[i]]].apply\
        (lambda x: np.sum(x==missing_vals[i])/(df.shape[0]/2))
        missing_props = pd.concat([missing_props, temp], axis=1)   

    #create lists of variables with high missingness vs. low missingness
    large_miss = list(missing_props.columns[missing_props.apply(lambda x: sum(x)>0.1, axis=0)])
    small_miss = list(missing_props.columns[missing_props.apply(lambda x: sum(x)<0.1, axis=0)])

    #sort low missingness categorical variables into types
    small_cats = {'cat3': [], 'cat8': [], 'cat9': [], 'catU': []}

    for var in small_miss:
        if var in missing_dict['cat3']:
            small_cats['cat3'].append(var) 
        elif var in missing_dict['cat8']:
            small_cats['cat8'].append(var)
        elif var in missing_dict['cat9']:
            small_cats['cat9'].append(var)
        elif var in missing_dict['catU']:
            small_cats['catU'].append(var)    


    #sort low missingness continuous variables
    small_conts = {'cont9': [], 'cont99': [], 'cont999': [], 'cont99.9': []}

    for var in small_miss:
        if var in missing_dict['cont9']:
            small_conts['cont9'].append(var) 
        elif var in missing_dict['cont99']:
            small_conts['cont99'].append(var)
        elif var in missing_dict['cont999']:
            small_conts['cont999'].append(var)
        elif var in missing_dict['cont99.9']:
            small_conts['cont99.9'].append(var)
    return small_conts, small_cats, large_miss



In [0]:
def mlp_impute_s_cat(df,df_w,small_cats):
    #mode imputation of categoricals with low missingness
    small_vals = [3,8,9,'U']
    for i in range(0, len(small_vals)):
        temp_lis = small_cats[list(small_cats.keys())[i]]
        for x in temp_lis:
            major_cat = df_w[x].value_counts().sort_values(ascending=False).index[0]
            df[x]=df[x].replace(small_vals[i],major_cat)
    return df

def mlp_impute_s_num(df,df_w,small_conts):
    #median imputation of categoricals with low missingness
    #statistical significance of relationship with target imnproves on variable by variable basis after 
    #median imputation
    csmall_vals = [9,99,999,99.9]
    for i in range(0, len(csmall_vals)):
        temp_lis = small_conts[list(small_conts.keys())[i]]
        for x in temp_lis:
            df[x]=df[x].replace(csmall_vals[i],df_w[x].median())
    return df


def binarize9(x):
    if x==9:
        return 1
    else:
        return 0
    
def binarize99(x):
    if x==99:
        return 1
    else:
        return 0

def binarize999(x):
    if x==999:
        return 1
    else:
        return 0

    
def mlp_impute_FAGECOMB(df,df_w):
    #Impute FAGECOMB missing vals and store whether column was imputed    
    df['FAGECOMB_IMP'] = df.FAGECOMB.apply(lambda x: binarize99(x))
    df.FAGECOMB.replace(99, df_w.FAGECOMB.median(),inplace = True)
    return df

def mlp_impute_ILPs(df,df_w):
    #Impute ILOP_R and ILP_R missing vals and store whether column was imputed
    for x in ['ILOP_R', 'ILP_R']:
        df[x+'_IMP'] = df[x].apply(lambda x: binarize999(x))
    for x in ['ILOP_R', 'ILP_R']:
        df[x].replace(999,df_w[x].median(), inplace=True)
    return df

def mlp_impute_FRACE_ED(df,df_w):
    #Impute FRACEHISP and FEDUC missing vals and store whether column was imputed
    for x in ['FRACEHISP', 'FEDUC']:
        df[x+'_IMP'] = df[x].apply(lambda x: binarize9(x))
    for x in ['FRACEHISP', 'FEDUC']:
        df[x].replace(9,df_w[x].mode()[0], inplace=True)
    return df

def mlp_impute_combine(df):
    #Combine imputed flag columns into one
    import re
    imputed_col = list(filter(lambda i: re.search('_IMP',i), df.columns))
    print(len(imputed_col))
    if len(imputed_col)==0:
        return df
    df['lrg_miss_imp']= [1 if sum(df[imputed_col].iloc[i])>1 else 0 for i in range(len(df))]
    df.drop(columns = imputed_col, inplace=True)
    return df

In [0]:
def mlp_all_of_the_above(df,df_w,target):
    df = mlp_fill_MAR_blanks(df)
    df = mlp_reassign_FRACE(df)
    df = mlp_reassign_X_NA(df)
#     df = mlp_reassign_ILs(df)
    df = mlp_convert_cont_floats(df)
    small_conts, small_cats, large_miss = measure_missing(df,target)
    df = mlp_impute_s_cat(df,df_w,small_cats)
    df = mlp_impute_s_num(df,df_w,small_conts)
    df = mlp_impute_FAGECOMB(df,df_w)
    df = mlp_impute_ILPs(df,df_w)
    df = mlp_impute_FRACE_ED(df,df_w)
    df = mlp_convert_nom_cat(df)
    df = mlp_convert_ord_cat(df)
#     df = mlp_impute_combine(df)
    return df

In [0]:
#dummfiy columns

def dummify_columns(dataframe,var_list):
    '''
    dummifies a columns, merges with the dataframe, and drops the non-dummified column
    ------------
    dataframe: full dataframe
    variable: column name as string
    '''
    for vr in var_list:
        dummified_feature = pd.get_dummies(dataframe[vr], prefix=vr,drop_first=True,prefix_sep='__')
        dataframe = pd.concat([dataframe,dummified_feature],axis=1,sort=False)
    dataframe.drop(columns = var_list, inplace = True)
    return dataframe

In [0]:
def add_random_column_to_df (dataframe):
    import random
    mylist = []
    for i in range(0,dataframe.shape[0]):
        x = random.randint(1,1000)
        mylist.append(x)
    dataframe['RANDOM'] = mylist

    return dataframe

In [0]:
#LabelEncoding Function. Thanks Ira!
def LabelEncoding(dataframe):
    '''
    Function that takes a dataframe and transforms it with label encoding on all the categorical features.
    '''
    
    import pandas as pd
    
    #create a list using object types since dataframe.dtypes.value_counts() only shows objects and int64
    objlist = list(dataframe.select_dtypes(include=['object','category']).columns)
    
    #change type then transform column using cat codes
    for col in objlist:
        dataframe[col] = dataframe[col].astype('category')
        dataframe[col] = dataframe[col].cat.codes
    
    return dataframe

In [0]:
# begin dictionary of columns to analyze for CCHD - includes pre-pregnancy and gestational features
# features re: delivery and labor are not useful for this use case
variables = {'nominal_categorical':['MBSTATE_REC','MRACEHISP','MAR_P','DMAR','MEDUC','FRACEHISP',\
                                    'FEDUC','WIC','RF_PDIAB','RF_GDIAB','RF_PHYPE','RF_GHYPE',\
                                    'RF_EHYPE','RF_PPTERM','RF_INFTR','RF_FEDRG','RF_ARTEC','RF_CESAR',\
                                  'IP_GON','IP_SYPH','IP_CHLAM','IP_HEPB','IP_HEPC', 'PAY', 'SEX'],\
           'ordinal_categorical':['PRECARE', 'DOB_MM'],\
           'continuous':['MAGER', 'FAGECOMB','PRIORTERM','PRIORLIVE','PRIORDEAD','LBO_REC','TBO_REC',\
                         'ILLB_R','ILOP_R','ILP_R','PREVIS','CIG_0','CIG_1','CIG_2','CIG_3','M_Ht_In','BMI',\
                         'WTGAIN','RF_CESARN','OEGest_Comb'],\
            'target':['CA_CCHD']}

In [0]:
#create missingness types:
missing_dict = {'cont9': ['LBO_REC', 'TBO_REC'],\
                'cont99': ['FAGECOMB', 'PRIORTERM','PRIORLIVE', 'PRIORDEAD', 'PRECARE', 'PREVIS',\
                         'CIG_0', 'CIG_1', 'CIG_2', 'CIG_3', 'M_Ht_In', 'WTGAIN', 'RF_CESARN', 'OEGest_Comb'],\
                'cont999':['ILLB_R', 'ILP_R', 'ILOP_R'],\
                'cont99.9': ['BMI'],\
                'cat3': ['MBSTATE_REC'],\
                'cat8': ['MRACEHISP'],\
                'cat9': ['MEDUC', 'FEDUC', 'PAY', 'FRACEHISP', 'DMAR'],\
                'catU': ['WIC','RF_PDIAB','RF_GDIAB','RF_PHYPE',\
                        'RF_GHYPE','RF_EHYPE','RF_PPTERM','RF_INFTR','RF_FEDRG','RF_ARTEC','RF_CESAR','IP_GON',\
                        'IP_SYPH','IP_CHLAM','IP_HEPB','IP_HEPC', 'MAR_P']}
missing_vals = [9,99,999,99.9,3,8,9,'U']


In [None]:
#pull selected variables from 2016-2018 databases in SQL and append to a single dataframe
query18 = "SELECT MBSTATE_REC,MRACEHISP,MAR_P,DMAR,MEDUC,FRACEHISP,FEDUC,WIC,RF_PDIAB,RF_GDIAB,RF_PHYPE,\
                RF_GHYPE,RF_EHYPE,RF_PPTERM,RF_INFTR,RF_FEDRG,RF_ARTEC,RF_CESAR,IP_GON,IP_SYPH,IP_CHLAM,\
                IP_HEPB,IP_HEPC,PAY,SEX,PRECARE,DOB_MM,MAGER,FAGECOMB,PRIORTERM,PRIORLIVE,PRIORDEAD,\
                LBO_REC,TBO_REC,ILLB_R,ILOP_R,ILP_R,PREVIS,CIG_0,CIG_1,CIG_2,CIG_3,M_Ht_In,BMI,WTGAIN,\
                RF_CESARN,OEGest_Comb,CA_CCHD\
         FROM cdc.us2018"

query17 = "SELECT MBSTATE_REC,MRACEHISP,MAR_P,DMAR,MEDUC,FRACEHISP,FEDUC,WIC,RF_PDIAB,RF_GDIAB,RF_PHYPE,\
                RF_GHYPE,RF_EHYPE,RF_PPTERM,RF_INFTR,RF_FEDRG,RF_ARTEC,RF_CESAR,IP_GON,IP_SYPH,IP_CHLAM,\
                IP_HEPB,IP_HEPC,PAY,SEX,PRECARE,DOB_MM,MAGER,FAGECOMB,PRIORTERM,PRIORLIVE,PRIORDEAD,\
                LBO_REC,TBO_REC,ILLB_R,ILOP_R,ILP_R,PREVIS,CIG_0,CIG_1,CIG_2,CIG_3,M_Ht_In,BMI,WTGAIN,\
                RF_CESARN,OEGest_Comb,CA_CCHD\
         FROM cdc.us2017"

query16 = "SELECT MBSTATE_REC,MRACEHISP,MAR_P,DMAR,MEDUC,FRACEHISP,FEDUC,WIC,RF_PDIAB,RF_GDIAB,RF_PHYPE,\
                RF_GHYPE,RF_EHYPE,RF_PPTERM,RF_INFTR,RF_FEDRG,RF_ARTEC,RF_CESAR,IP_GON,IP_SYPH,IP_CHLAM,\
                IP_HEPB,IP_HEPC,PAY,SEX,PRECARE,DOB_MM,MAGER,FAGECOMB,PRIORTERM,PRIORLIVE,PRIORDEAD,\
                LBO_REC,TBO_REC,ILLB_R,ILOP_R,ILP_R,PREVIS,CIG_0,CIG_1,CIG_2,CIG_3,M_Ht_In,BMI,WTGAIN,\
                RF_CESARN,OEGest_Comb,CA_CCHD\
         FROM cdc.us2016"


In [0]:
queries = [query18, query17, query16]
            
cchd = pd.DataFrame()
test_cchd = pd.DataFrame()

for query in queries:
    temp = create_table_from_SQL('root','cdc',sql_pw, query)
    train, test = split_sets(temp, 0, test_prop=0.1)
    train = downsample_df(train, 'CA_CCHD')
    cchd = cchd.append(train)  
    test_cchd = test_cchd.append(test)

In [0]:
chd=cchd.copy()

In [0]:
chd_test = test_cchd.copy()

In [0]:
target = 'CA_CCHD'

In [0]:
chd = mlp_all_of_the_above(chd,chd,'CA_CCHD')

In [0]:
chd_test = chd_test.loc[(chd_test[target]=='Y')|(chd_test[target]=='N'),:]

In [0]:
chd_test = mlp_all_of_the_above(chd_test,chd,'CA_CCHD')

In [0]:
chd_test.MAR_P = chd_test.MAR_P.replace('U','Y')

In [0]:
#PHYPE
chd = chd[chd['RF_PHYPE'] == 'Y']
chd_test = chd_test[chd_test['RF_PHYPE'] == 'Y']

In [0]:
chd['lrg_miss_imp'] = chd.FAGECOMB_IMP | chd.FRACEHISP_IMP | chd.ILOP_R_IMP | chd.ILP_R_IMP | chd.FEDUC_IMP
chd.drop(columns = ['FAGECOMB_IMP','FRACEHISP_IMP','ILOP_R_IMP','ILP_R_IMP','FEDUC_IMP'],inplace=True)

In [0]:
chd = LabelEncoding(chd)

In [0]:
chd = add_random_column_to_df(chd)

In [0]:
chd_test['lrg_miss_imp'] = chd_test.FAGECOMB_IMP | chd_test.FRACEHISP_IMP | chd_test.ILOP_R_IMP | chd_test.ILP_R_IMP | chd_test.FEDUC_IMP
chd_test.drop(columns = ['FAGECOMB_IMP','FRACEHISP_IMP','ILOP_R_IMP','ILP_R_IMP','FEDUC_IMP'],inplace=True)

In [0]:
chd_test = LabelEncoding(chd_test)

In [0]:
chd_test = add_random_column_to_df(chd_test)

In [0]:
#test train split
target = 'CA_CCHD'
X_train = chd.drop(target, axis=1)
y_train = chd[[target]]

X_test = chd_test.drop(target, axis=1)
y_test = chd_test[[target]]

In [29]:

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import LocalOutlierFactor
y_train_fit = pd.DataFrame([-1 if i == 0 else 1 for i in y_train[list(y_train.columns)[0]]])

lofcv = LocalOutlierFactor(contamination = 0.5, novelty = True)
lof_grid_param = {'n_neighbors': range(20,101,50),
                  'leaf_size': range(10,30,5)}
gsearch = GridSearchCV(lofcv,lof_grid_param,scoring='precision',n_jobs=2,cv=5)
%time gsearch.fit(X_train, y_train_fit)

print(gsearch.best_params_)
print(gsearch.best_score_)
from sklearn.metrics import confusion_matrix
y_test_fit = pd.DataFrame([-1 if i == 0 else 1 for i in y_test[list(y_test.columns)[0]]])
%time cm = confusion_matrix(y_test_fit,gsearch.predict(X_test))
print(cm[1,1]/sum(cm[:,1])*100)
print(cm)



CPU times: user 166 ms, sys: 63.6 ms, total: 229 ms
Wall time: 2.69 s
{'leaf_size': 10, 'n_neighbors': 70}
0.654934367744285
CPU times: user 1.15 s, sys: 13.5 ms, total: 1.16 s
Wall time: 1.17 s
0.16812671445004868
[[10835 11282]
 [   15    19]]


In [30]:
y_train.groupby(list(y_train.columns)).size()

CA_CCHD
0    132
1    231
dtype: int64

In [31]:
y_test.groupby(list(y_test.columns)).size()


CA_CCHD
0    22117
1       34
dtype: int64

In [32]:
from sklearn.ensemble import IsolationForest

y_train_fit = pd.DataFrame([-1 if i == 0 else 1 for i in y_train[list(y_train.columns)[0]]])

#Initial fit
isoForest = IsolationForest()
isoForest.set_params(random_state=0, contamination = 0.1)
isoForest.fit(X_train, y_train_fit) 
isofit = isoForest.predict(X_test)




In [33]:
# set the parameter grid
score_method = 'precision'

import sklearn.model_selection as ms
grid_para_forest = {
    'contamination': np.linspace(0.01,0.1,11),
    'max_features': range(1, 5),
    'n_estimators': range(10, 120, 10)
}

# GRID SEARCH
grid_search_isoforest = ms.GridSearchCV(isoForest, grid_para_forest, scoring=score_method, cv=10, n_jobs=-1,)
grid_search_isoforest.fit(X_train, y_train_fit)


#CONFUSION MATRIX
from sklearn.metrics import confusion_matrix
%time cm = confusion_matrix(y_test_fit, grid_search_isoforest.predict(X_test))

print(score_method, ': ', cm[1,1]/sum(cm[:,1])*100)
print(cm)

#Best Params and Score
print(grid_search_isoforest.best_params_)
print('best score:', grid_search_isoforest.best_score_)



CPU times: user 65.7 ms, sys: 2.96 ms, total: 68.6 ms
Wall time: 68.4 ms
precision :  0.14572462746204107
[[  875 21242]
 [    3    31]]
{'contamination': 0.037000000000000005, 'max_features': 1, 'n_estimators': 10}
best score: 0.6413781306829435


