In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

pd.options.display.max_rows = 100
pd.options.display.max_columns = None

## Read in data

### outcome (short)

In [None]:
# Load the MESA event file, keep the participant id plus the two columns
# used to build the outcome below, and drop rows missing any of them.
outcome = (
    pd.read_csv('../Data/MESA/outcomes/event.csv')
    .loc[:, ['idno', 'cvda', 'cvdatt']]
    .dropna()
)

In [None]:
# Create the 10-year CVD outcome: 1 when cvda == 1 and cvdatt <= 3650,
# otherwise 0 (3650 is presumably days, i.e. 10 years -- confirm units).
had_event = outcome['cvda'].eq(1)
within_window = outcome['cvdatt'].le(3650)
outcome['cvd_10y'] = (had_event & within_window).astype(int)

### Neighborhood features (long)

In [None]:
## Neighborhood SES: F1_PC2
nSES = pd.read_csv(
    '../Data/MESA/neighborhood_data/neighborhood_census_tract_SES.csv'
)[['idno', 'EXAM', 'F1_PC2']]

## Racial segregation: G_bla_rk
rs = pd.read_csv(
    '../Data/MESA/neighborhood_data/neighborhood_racial_seg.csv'
)[['idno', 'EXAM', 'G_bla_rk']]

## Built environment: S1FAV, S1PAI
## (original note on this file: error in first few rows)
nb_env = pd.read_csv(
    '../Data/MESA/neighborhood_data/neighborhood_builtenv.csv'
)[['idno', 'EXAM', 'S1FAV', 'S1PAI']]

## Merge: inner-join the three tables on participant id and exam number,
## so only (idno, EXAM) pairs present in all three files are kept.
nb_feature = (
    nSES
    .merge(rs, on=['idno', 'EXAM'], how='inner')
    .merge(nb_env, on=['idno', 'EXAM'], how='inner')
)


### Individual-level exposures and covariates (long)

In [None]:
# Exam-1 column names mapped to the exam-agnostic names used downstream.
# Key order matches the column order of the resulting frame (after 'idno').
exam1_names = {
    'site1c': 'site',
    'income1': 'income',
    'chdiet1c': 'chdiet',
    'chphysact1c': 'chphysact',
    'age1c': 'age',
    'race1c': 'race',
    'gender1': 'gender',
    'cig1c': 'cig',
    'curalc1': 'cural',
    'diabet1': 'diabet',
    'hdl1': 'hdl',
    'chol1': 'chol',
    'sbp1c': 'sbp',
}

# Read the exam-1 file, keep the id plus the columns above, tag every row
# with EXAM = 1.0 (float, so it can be joined on 'EXAM' later), and rename.
v1 = (
    pd.read_csv('../Data/MESA/exam_1/final_label_1.csv')
    .loc[:, ['idno', *exam1_names]]
    .assign(EXAM=1.0)
    .rename(columns=exam1_names)
)

# The code 9 in 'diabet' is treated as missing.
v1['diabet'] = np.where(v1['diabet'] == 9, np.nan, v1['diabet'])

### Merge

In [None]:
# Outcome and baseline X
# 1) Attach the exam-1 neighborhood features to the exam-1 covariates;
#    the right join keeps every row of v1 even without neighborhood data.
# 2) Attach those covariates to the outcomes; the left join keeps every
#    outcome row even without covariates.
nb_baseline = nb_feature.loc[nb_feature['EXAM'] == 1]
covar = pd.merge(nb_baseline, v1, how='right', on=['idno', 'EXAM'])
merged = pd.merge(outcome, covar, how='left', on=['idno'])

In [None]:
#merged.to_csv('../processed_data/MESA/Y_BaselineX_raw_full.csv', index = False)

#merged[merged['race'] == 3].to_csv('../processed_data/MESA/Y_BaselineX_raw_bla.csv', index = False)

## Preprocess

In [None]:
# covariates with no missing values:
# A_S1FAV, A_S1PAI, site, age, race, gender

### pipeline

In [None]:
def fillna_cat(df, cat_feat):
    """Fill missing values in categorical columns with each column's mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place (the listed columns are
        overwritten) and also returned, so existing callers that rely on
        either behavior keep working.
    cat_feat : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with missing values in ``cat_feat`` replaced
        by the most frequent value of the respective column. When several
        values tie, the first one returned by ``Series.mode`` is used.
    """
    for feat in cat_feat:
        modes = df[feat].mode()
        if modes.empty:
            # Column has no observed values, so there is no mode to impute
            # with; leave it untouched instead of failing on modes[0].
            continue
        # Assign the filled column back explicitly. The chained form
        # df[feat].fillna(..., inplace=True) acts on a temporary object and
        # does not update df under pandas Copy-on-Write.
        df[feat] = df[feat].fillna(modes.iloc[0])
    return df

def fillna_cont(df, cont_feat):
    """Fill missing values in continuous columns with each column's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cont_feat : list of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        A new frame in which missing values of the ``cont_feat`` columns are
        replaced by that column's mean. Other columns are left as they are.
    """
    column_means = df[cont_feat].mean()
    # Passing a Series keyed by column name restricts the fill to those columns.
    return df.fillna(value=column_means)


def quantile_exp(df, con_exp_feat):
    """Replace continuous exposure columns by their quartile number (1-4).

    Each column is ranked first (ties broken by order of appearance), and the
    ranks are then cut at the 25th, 50th and 75th percentiles. Ranking first
    keeps the bin edges distinct even when the raw values contain many ties.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the exposure columns. It is modified in place and also
        returned.
    con_exp_feat : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with each listed column holding numeric
        quartile labels 1, 2, 3 or 4.
    """
    for feat in con_exp_feat:
        # Call rank/qcut on the column directly. Series.transform with a
        # whole-series lambda only worked through pandas' fallback after its
        # element-wise attempt raised.
        ranks = df[feat].rank(method='first')
        quartile = pd.qcut(ranks, q=[0, 0.25, 0.5, 0.75, 1], labels=[1, 2, 3, 4])
        # qcut returns a categorical; convert the labels back to numbers.
        df[feat] = pd.to_numeric(quartile)
    return df

def standardize(df, con_index):
    """Standardize the given columns with scikit-learn's StandardScaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to scale. It is modified in place and also
        returned.
    con_index : list of str
        Names of the columns to scale.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``con_index`` columns replaced by
        their scaled values. The scaler is fitted on these same columns.
    """
    selected = df[con_index]
    # fit() returns the fitted scaler, so the two steps can be chained.
    df[con_index] = StandardScaler().fit(selected).transform(selected, copy=True)
    return df

# Raw income codes grouped into four bands: band -> raw codes in that band.
_income_bands = {
    1: (1, 2, 3),
    2: (4, 5, 6),
    3: (7, 8, 9, 10, 11),
    4: (12, 13),
}

# Flat lookup: raw income code -> band.
income_mapping = {
    code: band
    for band, codes in _income_bands.items()
    for code in codes
}


def map_income(df):
    """Recode the 'income' column into the four bands of ``income_mapping``.

    ``df`` is modified in place and also returned. Values that are not keys
    of ``income_mapping`` (including missing values) are left unchanged.
    """
    recoded = df['income'].replace(income_mapping)
    df['income'] = recoded
    return df


## complete pipeline

def process(df, cont_feat, cat_feat, con_exp_feat):
    """Run the full preprocessing pipeline and return the processed frame.

    Steps, in order:
      1. mode-impute the categorical columns (``fillna_cat``);
      2. recode 'income' into bands (``map_income``);
      3. mean-impute, then standardize, the continuous covariates;
      4. mean-impute, then convert to quartiles, the continuous exposures.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame. It is not modified.
    cont_feat : list of str
        Continuous covariates to impute and standardize.
    cat_feat : list of str
        Categorical columns to impute.
    con_exp_feat : list of str
        Continuous exposures to impute and convert to quartiles.

    Returns
    -------
    pandas.DataFrame
        A new, fully processed frame.
    """
    # Work on a private copy: fillna_cat and map_income write into the frame
    # they are given, which would otherwise alter the caller's raw data and
    # leak into any later cell that reuses the same frame.
    df = df.copy()

    df = fillna_cat(df, cat_feat)
    df = map_income(df)

    df = fillna_cont(df, cont_feat)
    df = standardize(df, cont_feat)

    df = fillna_cont(df, con_exp_feat)
    df = quantile_exp(df, con_exp_feat)

    return df

### preprocess baseline df

In [None]:
# Column roles for the baseline pipeline (see process):
# continuous exposures -> mean-imputed, then converted to quartiles
con_exp_feat = ['F1_PC2', 'S1FAV', 'S1PAI', 'G_bla_rk']

# continuous covariates -> mean-imputed, then standardized
cont_feat = ['sbp', 'hdl', 'chol']

# categorical columns -> mode-imputed ('income' is additionally recoded)
cat_feat = [
    'site', 'age', 'chdiet', 'chphysact', 'cig',
    'diabet', 'gender', 'race', 'income', 'cural',
]

dat_base_processed = process(
    df=merged,
    cont_feat=cont_feat,
    cat_feat=cat_feat,
    con_exp_feat=con_exp_feat,
)

In [None]:
#dat_base_processed.to_csv('../processed_data/MESA/Y_BaselineX_processed_full.csv', index = False)

#dat_base_processed[dat_base_processed['race']==3].to_csv('../processed_data/MESA/Y_BaselineX_processed_bla.csv', index = False)

### missing imputation only

In [None]:
# Column roles for the imputation-only dataset.
# NOTE: this rebinds cat_feat and cont_feat, replacing the lists that were
# defined for the full preprocessing pipeline earlier in the notebook.
cat_feat = [
    'income',
    'chdiet',
    'chphysact',
    'cig',
    'cural',
    'diabet',
]

cont_feat = [
    'F1_PC2',
    'S1FAV',
    'S1PAI',
    'G_bla_rk',
    'hdl',
    'chol',
    'sbp',
]

In [None]:
# Impute on a copy so the merged frame itself is left untouched, and pass
# the mode-imputed frame on to the mean imputation explicitly. Previously
# the second call received `merged` again and only included the categorical
# imputation because fillna_cat had modified `merged` in place.
imputed = fillna_cat(merged.copy(), cat_feat)
imputed = fillna_cont(imputed, cont_feat)

In [None]:
#imputed.to_csv('../processed_data/MESA/Y_BaselineX_imputed_full.csv', index = False)

#imputed[imputed['race']==3].to_csv('../processed_data/MESA/Y_BaselineX_imputed_bla.csv', index = False)