In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Notebook display settings: show up to 100 rows and never truncate columns.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", None)

## Read in data

### outcome (short)

In [2]:
# Keep the participant id and the two outcome columns, then drop every row
# that has a missing value in any of them.
outcome = (
    pd.read_csv('../data_processed/MESA/mesa_event.csv')
    [['idno', 'cvd_10y_HF', 'cvd_10y_noHF']]
    .dropna()
)

### Neighborhood features (long format, keyed by idno and EXAM)

In [3]:
# Each neighborhood table is reduced to its keys (idno, EXAM) plus the
# measure(s) used downstream.

# Census-tract SES file: F1_PC2
nSES = pd.read_csv(
    '../Data/MESA/neighborhood_data/neighborhood_census_tract_SES.csv'
)[['idno', 'EXAM', 'F1_PC2']]

# Racial segregation file: G_bla_rk
rs = pd.read_csv(
    '../Data/MESA/neighborhood_data/neighborhood_racial_seg.csv'
)[['idno', 'EXAM', 'G_bla_rk']]

# Built-environment file: S1FAV, S1PAI
nb_env = pd.read_csv(
    '../Data/MESA/neighborhood_data/neighborhood_builtenv.csv'
)[['idno', 'EXAM', 'S1FAV', 'S1PAI']]

# Inner-join the three tables so only (idno, EXAM) pairs present in all of
# them are kept.
nb_feature = (
    nSES
    .merge(rs, on=["idno", "EXAM"], how="inner")
    .merge(nb_env, on=["idno", "EXAM"], how="inner")
)


### ind exposures and covariates (long)

In [4]:
# Exam-1 individual-level exposures and covariates.
# Built as one chain so no column is assigned into a column-subset of another
# frame (the chained-assignment pattern pandas warns about) and no in-place
# rename is needed. The resulting frame is the same as before.
v1 = (
    pd.read_csv('../Data/MESA/exam_1/final_label_1.csv')
    [['idno', 'site1c',
      'income1', 'chdiet1c', 'chphysact1c',
      'age1c', 'race1c', 'gender1', 'cig1c', 'curalc1', 'diabet1',
      'hdl1', 'chol1', 'sbp1c']]
    # Tag every row as exam 1 so it can be joined on (idno, EXAM) later.
    .assign(EXAM=1.0)
    # Strip the exam suffix from the column names.
    .rename(columns={
        'site1c': 'site',
        'income1': 'income',
        'chdiet1c': 'chdiet',
        'chphysact1c': 'chphysact',
        'age1c': 'age',
        'race1c': 'race',
        'gender1': 'gender',
        'cig1c': 'cig',
        'curalc1': 'cural',
        'diabet1': 'diabet',
        'hdl1': 'hdl',
        'chol1': 'chol',
        'sbp1c': 'sbp',
    })
    # Code 9 in diabet is recoded to missing
    # (presumably an "unknown" code -- confirm against the codebook).
    .assign(diabet=lambda d: np.where(d['diabet'] == 9, np.nan, d['diabet']))
)

### Merge

In [5]:
# Outcome and baseline X

# Right join: every exam-1 covariate row is kept, with the EXAM == 1
# neighborhood features attached where they exist.
covar = pd.merge(
    nb_feature.loc[nb_feature['EXAM'].eq(1)],
    v1,
    on=['idno', 'EXAM'],
    how='right',
)

# Left join: every outcome row is kept, with covariates attached by idno.
merged = pd.merge(outcome, covar, on=['idno'], how='left')

In [6]:
# Rename raw variable names to the analysis names used in the rest of the
# notebook (names starting with "n" are the neighborhood-level measures).
merged = merged.rename(
    columns={
        # neighborhood measures
        'F1_PC2': 'nSES',
        'G_bla_rk': 'nRS',
        'S1FAV': 'nFavFood',
        'S1PAI': 'nPhysFac',
        # individual exposures
        'income': 'FamIncome',
        'chdiet': 'nutrition',
        'chphysact': 'PhysAct',
        # covariates
        'cig': 'currentSmoker',
        'cural': 'alc',
        'diabet': 'Diabetes',
        'chol': 'totchol',
    }
)

In [7]:
# Collapse the 13 raw FamIncome codes into 4 ordered bands:
#   codes 1-3 -> 1, 4-6 -> 2, 7-11 -> 3, 12-13 -> 4.
# Any code outside 1-13 (and missing values) becomes NaN after .map().
income_mapping = {
    code: band
    for band, codes in enumerate(
        [range(1, 4), range(4, 7), range(7, 12), range(12, 14)], start=1
    )
    for code in codes
}

merged = merged.assign(FamIncome=merged['FamIncome'].map(income_mapping))

In [8]:
# Binary race indicator (black = 1, non-black = 0):
#   race code 3      -> 1
#   any other value  -> 0
#   missing          -> NaN (kept missing rather than coded as 0)
merged['race'] = np.select(
    [merged['race'].isna().to_numpy(), (merged['race'] == 3).to_numpy()],
    [np.nan, 1.0],
    default=0.0,
)

In [None]:
# merged.to_csv('../data_processed/MESA/mesa_raw.csv', index = False)

## Preprocess

In [None]:
# covariates with no missing values:
# A_S1FAV, A_S1PAI, site, age, race, gender

### pipeline

In [9]:
def fillna_cat(df, cat_feat):
    """Impute missing values in categorical columns with each column's mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. The listed columns are overwritten on this object,
        so the caller's frame is modified; pass a copy to keep the original.
    cat_feat : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    When several values tie for the mode, the first one returned by
    ``Series.mode`` is used. A column with no observed values is left as is.
    """
    for feat in cat_feat:
        modes = df[feat].mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to impute with.
            continue
        # Assign the filled column back instead of calling
        # ``df[feat].fillna(..., inplace=True)``: that form operates on a
        # temporary Series and does not update ``df`` under Copy-on-Write.
        df[feat] = df[feat].fillna(modes.iloc[0])
    return df

def fillna_cont(df, cont_feat):
    """Return a new frame with missing values in ``cont_feat`` mean-imputed.

    The mean of each column in ``cont_feat`` is computed from ``df`` and used
    as the fill value for that column. Columns not listed are left untouched,
    and ``df`` itself is not modified.
    """
    column_means = df[cont_feat].mean()
    return df.fillna(column_means)


def quantile_exp(df, con_exp_feat):
    """Replace continuous exposure columns with their quartile label (1-4).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the exposure columns. The listed columns are
        overwritten on this object, so the caller's frame is modified.
    con_exp_feat : iterable of str
        Names of the columns to convert to quartiles.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with each listed column holding 1, 2, 3 or 4
        (NaN where the input was missing).

    Notes
    -----
    Values are ranked with ``method='first'`` before binning, so ties are
    broken by row order and the four groups are as equal in size as possible.
    """
    for feat in con_exp_feat:
        # Rank and bin the column directly. The previous
        # ``.transform(lambda x: ...)`` only worked because pandas first
        # tried the lambda on every scalar, failed, and then retried it on
        # the whole Series.
        ranks = df[feat].rank(method='first')
        quartile = pd.qcut(ranks, q=[0, 0.25, 0.5, 0.75, 1], labels=[1, 2, 3, 4])
        # qcut returns a categorical; convert the labels back to numbers.
        df[feat] = pd.to_numeric(quartile)
    return df

def standardize(df, con_index):
    """Standardize the ``con_index`` columns of ``df`` with ``StandardScaler``.

    The scaler is fitted on the same columns it transforms. The scaled values
    are written back into ``df`` (the caller's frame is modified) and ``df``
    is returned.
    """
    fitted_scaler = StandardScaler().fit(df[con_index])
    df[con_index] = fitted_scaler.transform(df[con_index], copy=True)
    return df


## complete pipeline

def process(df, cont_feat, cat_feat, con_exp_feat):
    """Run the complete preprocessing pipeline and return the processed frame.

    Steps, in order:
      1. mode-impute the ``cat_feat`` columns;
      2. mean-impute, then standardize, the ``cont_feat`` columns;
      3. mean-impute the ``con_exp_feat`` columns, then replace them with
         their quartile label.

    Step 1 writes into the frame it is given, so pass a copy of ``df`` if the
    original must stay untouched.
    """
    # Covariates: categorical imputation, then continuous imputation + scaling.
    covariates_ready = standardize(
        fillna_cont(fillna_cat(df, cat_feat), cont_feat),
        cont_feat,
    )

    # Exposures: impute first so every row receives a quartile.
    exposures_filled = fillna_cont(covariates_ready, con_exp_feat)
    return quantile_exp(exposures_filled, con_exp_feat)

### preprocess

In [11]:
# Neighborhood exposures that the pipeline converts to quartiles.
con_exp_feat = ['nSES', 'nFavFood', 'nPhysFac', 'nRS']

# Continuous covariates that the pipeline mean-imputes and standardizes.
cont_feat = ['sbp', 'hdl', 'totchol']

# Covariates that the pipeline imputes with their mode.
# NOTE(review): 'age' is listed here although it looks continuous -- confirm
# this is intended.
cat_feat = [
    'site', 'age', 'nutrition', 'PhysAct', 'currentSmoker',
    'Diabetes', 'gender', 'race', 'FamIncome', 'alc',
]

# One independent copy of the merged frame per preprocessing variant.
mesa_preprocessed, mesa_imputed, mesa_cate, mesa_std = (
    merged.copy() for _ in range(4)
)

In [None]:
### preprocessed

# Run the pipeline on the dedicated copy, not on `merged`: process() imputes
# the categorical columns directly on the frame it receives, which would
# otherwise alter `merged` for every cell that reads it afterwards.
mesa_processed = process(mesa_preprocessed, cont_feat, cat_feat, con_exp_feat)

# mesa_processed.to_csv('../data_processed/MESA/mesa_processed.csv', index = False)

In [None]:
### missing imputation only

# NOTE: this cell rebinds the notebook-level `cat_feat` / `cont_feat` lists;
# re-running an earlier cell afterwards will pick up these values.
cat_feat = ['FamIncome', 'nutrition', 'PhysAct',
            'currentSmoker', 'alc', 'Diabetes']

cont_feat = ['nSES', 'nFavFood', 'nPhysFac', 'nRS',
             'hdl', 'totchol', 'sbp']

# Work on the dedicated copy so `merged` stays untouched, and feed the result
# of the categorical step into the continuous step. Previously the second
# call was given `merged` again, which kept the categorical imputation only
# because fillna_cat had modified `merged` in place.
imputed = fillna_cat(mesa_imputed, cat_feat)
imputed = fillna_cont(imputed, cont_feat)

# imputed.to_csv('../data_processed/MESA/mesa_imputed.csv', index = False)

In [None]:
### categorization only

# Standardize the exposures and continuous covariates; the scaler is fitted
# on the same rows it transforms.
cols_to_standardize = ['nSES', 'nFavFood', 'nPhysFac', 'nRS', 'sbp', 'hdl', 'totchol']
scaler = StandardScaler()
scaler.fit(mesa_cate[cols_to_standardize])
mesa_cate[cols_to_standardize] = scaler.transform(mesa_cate[cols_to_standardize])

# Replace the neighborhood exposures with their quartile label (1-4).
mesa_cate = quantile_exp(mesa_cate, con_exp_feat)

# mesa_cate.to_csv('../data_processed/MESA/mesa_cate.csv', index = False)

In [12]:
##### standardization only #####
##### final analysis data #####

# Complete-case analysis: drop every row with a missing value in any column.
mesa_std = mesa_std.dropna()

# Standardize the exposures and continuous covariates on the remaining rows.
cols_to_standardize = ['nSES', 'nFavFood', 'nPhysFac', 'nRS', 'sbp', 'hdl', 'totchol']
scaler = StandardScaler()
scaler.fit(mesa_std[cols_to_standardize])
mesa_std[cols_to_standardize] = scaler.transform(mesa_std[cols_to_standardize])

# mesa_std.to_csv('../data_processed/MESA/mesa_std.csv', index = False)

In [13]:
# Summary statistics of the final analysis frame. Sanity check: every column
# should have the same count, and the standardized columns should show
# mean ~ 0 and std ~ 1.
mesa_std.describe()

Unnamed: 0,idno,cvd_10y_HF,cvd_10y_noHF,EXAM,nSES,nRS,nFavFood,nPhysFac,site,FamIncome,nutrition,PhysAct,age,race,gender,currentSmoker,alc,Diabetes,hdl,totchol,sbp
count,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0,4619.0
mean,5580024.0,0.105867,0.087898,1.0,-4.308694e-16,-1.166828e-16,1.024306e-15,-5.336594e-16,5.563109,2.928772,0.381468,1.42152,61.521325,0.269755,0.5315,0.701884,0.703399,0.100238,5.9018e-16,2.338042e-16,-1.840121e-16
std,1676563.0,0.307701,0.283177,0.0,1.000108,1.000108,1.000108,1.000108,1.675774,0.883118,0.49288,0.804424,10.064933,0.443881,0.499061,0.694047,0.456809,0.30035,1.000108,1.000108,1.000108
min,3010031.0,0.0,0.0,1.0,-2.818197,-1.332962,-0.6622944,-0.6323294,3.0,1.0,0.0,0.0,44.0,0.0,0.0,0.0,0.0,0.0,-2.408395,-3.651174,-2.425469
25%,4017068.0,0.0,0.0,1.0,-0.5695914,-0.6313952,-0.5743173,-0.4999441,4.0,2.0,0.0,1.0,53.0,0.0,0.0,0.0,0.0,0.0,-0.736532,-0.6716625,-0.7301144
50%,6011748.0,0.0,0.0,1.0,0.2426,-0.3297106,-0.3983632,-0.3234303,6.0,3.0,0.0,2.0,61.0,0.0,1.0,1.0,1.0,0.0,-0.2015357,-0.04738393,-0.1488501
75%,7016162.0,0.0,0.0,1.0,0.7553456,0.3593914,0.04152196,-0.102788,7.0,4.0,1.0,2.0,69.0,1.0,1.0,1.0,1.0,0.0,0.5340842,0.6052709,0.6261689
max,8024995.0,1.0,1.0,1.0,2.007565,7.215007,4.704305,8.502259,8.0,4.0,2.0,2.0,84.0,1.0,1.0,2.0,1.0,1.0,6.084671,9.11816,5.106748
