In [None]:
import numpy as np
import pandas as pd

pd.options.display.max_rows = 100
pd.options.display.max_columns = None

## Read in data

### outcome (short)

In [None]:
# Load the event file and keep only the participant id and the two cvd* columns.
outcome = pd.read_csv('../data/outcomes/event.csv').loc[:, ['idno', 'cvda', 'cvdatt']]

### Neighborhood features (long)

In [None]:
# Each neighborhood table is read in full and then reduced to the join keys
# (idno, EXAM) plus the feature column(s) used downstream.

## nSES: F1_PC2
nSES = pd.read_csv(
    '../data/neighborhood_data/neighborhood_census_tract_SES.csv'
)[['idno', 'EXAM', 'F1_PC2']]

## racial segregation: G_bla_rk
rs = pd.read_csv(
    '../data/neighborhood_data/neighborhood_racial_seg.csv'
)[['idno', 'EXAM', 'G_bla_rk']]

## built environment: A_S1FAV, A_S1PAI
## (original author's note on this file: "error in first few rows")
nb_env = pd.read_csv(
    '../data/neighborhood_data/neighborhood_builtenv.csv'
)[['idno', 'EXAM', 'A_S1FAV', 'A_S1PAI']]

## merge: inner joins, so only (idno, EXAM) pairs present in all three tables survive
nb_feature = (
    nSES
    .merge(rs, on=["idno", "EXAM"], how="inner")
    .merge(nb_env, on=["idno", "EXAM"], how="inner")
)


### Individual exposures and covariates

In [None]:
# Exam 1 covariates: pick the raw columns, rename them to exam-independent
# names, then tag every row with EXAM = 1.0 (added as the last column).
v1 = (
    pd.read_csv('../data/exam_1/final_label_1.csv')
    .loc[:, [
        'idno', 'site1c',
        'income1', 'chdiet1c', 'chphysact1c',
        'age1c', 'race1c', 'gender1', 'cig1c', 'curalc1',
        'diabet1', 'hdl1', 'chol1', 'sbp1c',
    ]]
    .rename(columns={
        'site1c': 'site',
        'income1': 'income',
        'chdiet1c': 'chdiet',
        'chphysact1c': 'chphysact',
        'age1c': 'age',
        'race1c': 'race',
        'gender1': 'gender',
        'cig1c': 'cig',
        'curalc1': 'cural',
        'diabet1': 'diabet',
        'hdl1': 'hdl',
        'chol1': 'chol',
        'sbp1c': 'sbp',
    })
    .assign(EXAM=1.0)
)

In [None]:
# Exam 2 covariates: pick the raw columns, rename them to exam-independent
# names, tag rows with EXAM = 2.0, and add chphysact / chdiet as all-NaN
# columns because they are not selected from this exam's file.
# NOTE(review): the site column read here is 'site1c', whereas exams 3-5 read
# their own 'site{n}c' -- confirm this is intended.
v2 = (
    pd.read_csv('../data/exam_2/final_label_2.csv')
    .loc[:, [
        'idno', 'site1c',
        'income2',
        'age2c', 'race1c', 'gender1', 'cig2c', 'curalc2',
        'hdl2', 'chol2', 'sbp2c',
    ]]
    .rename(columns={
        'site1c': 'site',
        'income2': 'income',
        'age2c': 'age',
        'race1c': 'race',
        'gender1': 'gender',
        'cig2c': 'cig',
        'curalc2': 'cural',
        'hdl2': 'hdl',
        'chol2': 'chol',
        'sbp2c': 'sbp',
    })
    .assign(EXAM=2.0, chphysact=np.nan, chdiet=np.nan)
)

In [None]:
# Exam 3 covariates: pick the raw columns, rename them to exam-independent
# names, tag rows with EXAM = 3.0, and add chphysact / chdiet as all-NaN
# columns because they are not selected from this exam's file.
v3 = (
    pd.read_csv('../data/exam_3/final_label_3.csv')
    .loc[:, [
        'idno', 'site3c',
        'income3',
        'age3c', 'race1c', 'gender1', 'cig3c', 'curalc3',
        'hdl3', 'chol3', 'sbp3c',
    ]]
    .rename(columns={
        'site3c': 'site',
        'income3': 'income',
        'age3c': 'age',
        'race1c': 'race',
        'gender1': 'gender',
        'cig3c': 'cig',
        'curalc3': 'cural',
        'hdl3': 'hdl',
        'chol3': 'chol',
        'sbp3c': 'sbp',
    })
    .assign(EXAM=3.0, chphysact=np.nan, chdiet=np.nan)
)

In [None]:
# Exam 4 covariates: pick the raw columns, rename them to exam-independent
# names, tag rows with EXAM = 4.0, and add income / chphysact / chdiet as
# all-NaN columns because they are not selected from this exam's file.
v4 = (
    pd.read_csv('../data/exam_4/final_label_4.csv')
    .loc[:, [
        'idno', 'site4c',
        'age4c', 'race1c', 'gender1', 'cig4c', 'curalc4',
        'hdl4', 'chol4', 'sbp4c',
    ]]
    .rename(columns={
        'site4c': 'site',
        'age4c': 'age',
        'race1c': 'race',
        'gender1': 'gender',
        'cig4c': 'cig',
        'curalc4': 'cural',
        'hdl4': 'hdl',
        'chol4': 'chol',
        'sbp4c': 'sbp',
    })
    .assign(EXAM=4.0, income=np.nan, chphysact=np.nan, chdiet=np.nan)
)

In [None]:
# Exam 5 covariates: pick the raw columns, rename them to exam-independent
# names, tag rows with EXAM = 5.0, and add chphysact / chdiet as all-NaN
# columns because they are not selected from this exam's file.
v5 = (
    pd.read_csv('../data/exam_5/final_label_5.csv')
    .loc[:, [
        'idno', 'site5c',
        'income5',
        'age5c', 'race1c', 'gender1', 'cig5c', 'curalc5',
        'hdl5', 'chol5', 'sbp5c',
    ]]
    .rename(columns={
        'site5c': 'site',
        'income5': 'income',
        'age5c': 'age',
        'race1c': 'race',
        'gender1': 'gender',
        'cig5c': 'cig',
        'curalc5': 'cural',
        'hdl5': 'hdl',
        'chol5': 'chol',
        'sbp5c': 'sbp',
    })
    .assign(EXAM=5.0, chphysact=np.nan, chdiet=np.nan)
)

In [None]:
# Stack the five per-exam covariate frames into one long frame with a fresh
# 0..n-1 index; columns missing from a given exam come out as NaN.
covar_merged = pd.concat(
    (v1, v2, v3, v4, v5),
    ignore_index=True,
)

### Merge 

In [None]:
# Covariates for all 5 visits (long format): every neighborhood-feature row is
# kept and the individual covariates are attached on (idno, EXAM) where present.
merged_x = pd.merge(nb_feature, covar_merged, on=['idno', 'EXAM'], how='left')
merged_x.to_csv('../data_processed/covariate_raw_common.csv', index=False)

In [None]:
# Y + baseline X:
# exam-1 neighborhood features joined with exam-1 covariates on (idno, EXAM),
# then the outcome columns attached per participant (left joins throughout).
covar = pd.merge(
    nb_feature.loc[nb_feature['EXAM'] == 1],
    v1,
    on=['idno', 'EXAM'],
    how='left',
)
merged = pd.merge(covar, outcome, on=['idno'], how='left')
merged.to_csv('../data_processed/Y_BaselineX_raw_full.csv', index=False)

## Preprocess

In [None]:
# covariates with no missing values:
# A_S1FAV, A_S1PAI, site, age, race, gender

### pipeline

In [None]:
def fillna_cat(df,cat_feat):
    """Impute missing values in categorical columns with each column's mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. The listed columns are overwritten on this object.
    cat_feat : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns filled.

    Notes
    -----
    When several values tie for most frequent, the first entry returned by
    ``Series.mode()`` is used. A column with no observed values has no mode
    and is left untouched instead of raising.
    """
    for feat in cat_feat:
        modes = df[feat].mode()
        if modes.empty:
            # Nothing observed in this column, so there is no value to impute with.
            continue
        # Assign the filled column back explicitly: an in-place fillna on the
        # intermediate ``df[feat]`` object is not guaranteed to reach ``df``.
        df[feat] = df[feat].fillna(modes.iloc[0])
    return df

def fillna_cont(df,cont_feat):
    """Return a copy of ``df`` with NaNs in ``cont_feat`` replaced by column means.

    The means are computed from ``df[cont_feat]`` and passed to
    ``DataFrame.fillna``, so with a list of column names only those columns
    are filled. The input frame is not modified.
    """
    column_means = df[cont_feat].mean()
    filled = df.fillna(value=column_means)
    return filled


def quantile_exp(df,con_exp_feat):
    """Replace each listed column with its quartile number (1-4).

    Values are first ranked with ``method='first'`` so that ties get distinct
    ranks (broken by row order), then the ranks are cut into four equal-sized
    bins labelled 1 (lowest) to 4 (highest) and converted to a numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the exposure columns. The listed columns are overwritten
        on this object.
    con_exp_feat : iterable of str
        Names of the columns to convert to quartile numbers.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    quartile_edges = [0, 0.25, 0.5, 0.75, 1]
    quartile_labels = [1, 2, 3, 4]
    for feat in con_exp_feat:
        # Rank and cut the whole column directly. Wrapping this in
        # Series.transform(lambda ...) only works through pandas' fallback
        # after an element-wise attempt fails.
        ranks = df[feat].rank(method='first')
        quartiles = pd.qcut(ranks, q=quartile_edges, labels=quartile_labels)
        df[feat] = pd.to_numeric(quartiles)
    return df

def standardize(df,con_index):
    """Scale the given columns to zero mean and unit variance.

    Each column is centred on its mean and divided by its population standard
    deviation (``ddof=0``), the convention of scikit-learn's ``StandardScaler``
    that this function previously called. It is computed with pandas/numpy so
    the function does not need ``StandardScaler`` to be in scope.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to scale. The listed columns are overwritten
        on this object.
    con_index : str or list of str
        Column name(s) to standardize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    cols = [con_index] if isinstance(con_index, str) else list(con_index)
    if not cols:
        return df

    values = df[cols]
    center = values.mean()
    spread = values.std(ddof=0)
    # (Near-)constant columns have no usable spread; divide by 1 so they end
    # up centred at 0 instead of becoming NaN/inf.
    spread = spread.where(spread > 10 * np.finfo(float).eps, 1.0)

    df[cols] = (values - center) / spread
    return df



## complete pipeline

def process(df,cont_feat,cat_feat,con_exp_feat):
    """Run the full preprocessing pipeline on one frame.

    Steps, in order:
      1. ``get_outcome(df)``
      2. mode-impute the categorical columns ``cat_feat``
      3. mean-impute, then standardize, the continuous columns ``cont_feat``
      4. mean-impute, then convert to quartile numbers, the continuous
         exposure columns ``con_exp_feat``

    NOTE(review): ``get_outcome`` is not defined in the visible part of this
    notebook -- confirm it is defined before this function is called.

    Returns the processed frame.
    """
    frame = get_outcome(df)

    # categorical covariates
    frame = fillna_cat(frame, cat_feat)

    # continuous covariates: impute first so the scaler sees no NaNs
    frame = standardize(fillna_cont(frame, cont_feat), cont_feat)

    # continuous exposures: impute, then bin into quartiles
    frame = quantile_exp(fillna_cont(frame, con_exp_feat), con_exp_feat)

    return frame

### preprocess by exam