# Prep data

In [None]:
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Display settings: never truncate columns, show at most 100 rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## Neighborhood food and physical facilities

In [None]:
# Load the NETS file, keep the subject id, exam label and the two S1FAV/S1PAI
# measures, and convert the exam label into an integer visit number.
filename_ = "data/raw_data_file/neighborhood/jhs_nets.csv"
visit_mapping = {'exam1': 1,'exam2': 2, 'exam3':3}

nets_data = (
    pd.read_csv(filename_)[['SUBJID', 'exam', 'S1FAV', 'S1PAI']]
    .rename(columns={"SUBJID": "subjid", "exam": "visit"})
    # Exam labels missing from visit_mapping become NaN.
    .assign(visit=lambda frame: frame['visit'].map(visit_mapping))
)


## racial segregation

In [None]:
# Load the racial-segregation file, keep the subject id, exam label and the
# G_bla_rk measure, and convert the exam label into an integer visit number.
filename_ = "data/raw_data_file/neighborhood/jhs_rs.csv"
# Only exam1 and exam2 are mapped; any other exam label becomes NaN.
visit_mapping = {'exam1': 1,'exam2': 2}

rs_data = (
    pd.read_csv(filename_)[['SUBJID', 'exam', 'G_bla_rk']]
    .rename(columns={"SUBJID": "subjid", "exam": "visit"})
    .assign(visit=lambda frame: frame['visit'].map(visit_mapping))
)


## visit dates and other covariates

In [None]:
# Load the analysis file and keep the merge keys, the visit date and the
# lifestyle / income covariates used later.
filename_ = "data/raw_data_file/analysis.csv"
analysis_data = pd.read_csv(filename_).loc[
    :,
    [
        'subjid', 'visit', 'VisitDate',
        'nutrition3cat', 'PA3cat',
        'fmlyinc', 'alc',
    ],
]

## Common dataset

In [None]:
# Load the pre-built common dataset and keep the merge keys, the neighborhood
# SES columns, the clinical covariates and the four event-history flags.
common_data = pd.read_csv("data/processed/common_data_jhs.csv").loc[
    :,
    [
        'subjid', 'visit',
        'nSES', 'nbSESpc2score',
        'currentSmoker', 'Diabetes', 'sex', 'age', 'sbp', 'hdl', 'totchol',
        'MIHx', 'strokeHx', 'CHDHx', 'CVDHx',
    ],
]

## merge

In [None]:
# Left-join each auxiliary table onto the common dataset by subject and visit,
# so every row of common_data is kept even without a match.
merge1 = pd.merge(common_data, nets_data, how='left', on=['subjid', 'visit'])
merge2 = pd.merge(merge1, rs_data, how='left', on=['subjid', 'visit'])
merge3 = pd.merge(merge2, analysis_data, how='left', on=['subjid', 'visit'])
merge = merge3

In [None]:
## map gender and income

gender_mapping = {'Female':0,'Male':1}

# Income letter codes collapse into four ordered tiers:
# A-C -> 1, D-F -> 2, G-I -> 3, J-K -> 4.
income_mapping = {
    letter: tier
    for tier, letters in enumerate(('ABC', 'DEF', 'GHI', 'JK'), start=1)
    for letter in letters
}

# Add the numeric `gender` column and recode `fmlyinc` in one step; values
# missing from either mapping become NaN.
merge = merge.assign(
    gender=merge['sex'].map(gender_mapping),
    fmlyinc=merge['fmlyinc'].map(income_mapping),
)

# Snapshot of the merged data before any imputation or scaling.
merge_raw = merge.copy()

In [None]:
## rename 

# Rename raw column names to shorter analysis names.
# NOTE(review): `jhs_raw` is not assigned anywhere in the visible code before
# this statement, so it raises NameError unless an unseen cell defines it --
# confirm where this frame is supposed to come from.
# NOTE(review): 'nbSESpc2score' is renamed to 'nSES'; if the frame already has
# an 'nSES' column (common_data selects both) this would produce duplicate
# column names -- verify.
jhs_raw = jhs_raw.rename(columns = {'event': 'Y_10y', 
                             'nbSESpc2score': 'nSES',
                            'S1FAV': 'nFavFood',
                            'S1PAI': 'nPhysFac', 
                            'G_bla_rk': 'nRS', 
                            'nutrition3cat': 'nutrition', 
                            'PA3cat': 'PhysAct',
                            'fmlyinc': 'FamIncome'})

# preprocess

## functions

In [None]:
def get_outcome(df):
    """Return a copy of ``df`` with a binary outcome column ``y``.

    ``y`` is 1 for rows where at least one of ``MIHx``, ``strokeHx``,
    ``CHDHx`` or ``CVDHx`` equals 1, and 0 otherwise (rows where the flags
    are missing get 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four history columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left unmodified.

    Raises
    ------
    KeyError
        If any of the four history columns is absent.
    """
    history_cols = ['MIHx', 'strokeHx', 'CHDHx', 'CVDHx']

    # Work on a copy: callers pass boolean-mask slices of a larger frame, and
    # adding a column to such a slice is chained assignment.
    out = df.copy()

    # NaN never compares equal to 1, so missing flags count as "no event".
    out['y'] = out[history_cols].eq(1.0).any(axis=1).astype(int)
    return out

def fillna_cat(df,cat_feat):
    """Fill missing values in categorical columns with each column's mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.
    cat_feat : list of str
        Names of the categorical columns to impute. Other columns are left
        untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left unmodified. When a column has several
        modes the first one returned by ``Series.mode`` is used. A column
        with no observed values has no mode and is left as-is.
    """
    fill_values = {}
    for feat in cat_feat:
        modes = df[feat].mode()
        # mode() is empty when every value is missing; skip instead of
        # failing on the lookup of its first element.
        if not modes.empty:
            fill_values[feat] = modes.iloc[0]

    if not fill_values:
        return df.copy()

    # Fill through the frame rather than `df[feat].fillna(..., inplace=True)`:
    # the in-place call on a column selection is chained assignment, which
    # does not update `df` under pandas Copy-on-Write.
    return df.fillna(value=fill_values)

def fillna_cont(df,cont_feat):
    """Fill missing values in the given columns with each column's mean.

    The means are computed over ``df[cont_feat]`` and passed to
    ``DataFrame.fillna``, so with a list of names only those columns are
    filled. Returns a new frame; ``df`` itself is not modified.
    """
    column_means = df[cont_feat].mean()
    imputed = df.fillna(column_means)
    return imputed


def quantile_exp(df,con_exp_feat):
    """Replace each listed column with its quartile number (1-4).

    Values are ranked first (ties broken by order of appearance) so the bin
    edges are always distinct, then the ranks are cut into four equal-sized
    bins labelled 1 (lowest) to 4 (highest).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the exposure columns.
    con_exp_feat : list of str
        Names of the continuous exposure columns to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame with the listed columns replaced by numeric quartile
        labels; the input is left unmodified.
    """
    quartile_edges = [0, 0.25, 0.5, 0.75, 1]
    quartile_labels = [1, 2, 3, 4]

    out = df.copy()
    for feat in con_exp_feat:
        # Call rank/qcut on the column directly. Wrapping this in
        # Series.transform(lambda ...) only works because pandas first tries
        # the lambda element-wise, fails, and then falls back to the column.
        ranks = out[feat].rank(method='first')
        quartiles = pd.qcut(ranks, q=quartile_edges, labels=quartile_labels)
        # qcut returns a categorical; convert the labels back to numbers.
        out[feat] = pd.to_numeric(quartiles)
    return out

def standardize(df,con_index):
    """Standardize the ``con_index`` columns of ``df`` with ``StandardScaler``.

    The scaler is fitted on the same rows it transforms. The scaled values
    are written back into ``df``, which is then returned.
    """
    features = df[con_index]
    fitted_scaler = StandardScaler().fit(features)
    df[con_index] = fitted_scaler.transform(features, copy = True)
    return df



## complete pipeline

def process(df,cont_feat,cat_feat,con_exp_feat):
    """Run the full preprocessing pipeline on one visit's data.

    Steps, in order:
      1. add the binary outcome column ``y``;
      2. mode-impute the categorical columns ``cat_feat``;
      3. mean-impute, then standardize, the continuous columns ``cont_feat``;
      4. mean-impute the exposure columns ``con_exp_feat`` and convert them
         to quartile labels.

    Returns the processed frame.
    """
    with_outcome = get_outcome(df)
    cat_imputed = fillna_cat(with_outcome, cat_feat)

    # Continuous covariates: impute first so the scaler sees no NaN.
    scaled = standardize(fillna_cont(cat_imputed, cont_feat), cont_feat)

    # Exposures: impute, then bin into quartiles.
    return quantile_exp(fillna_cont(scaled, con_exp_feat), con_exp_feat)



## process by exam 

In [None]:
# One frame per exam visit. `.copy()` makes each an independent frame so that
# later column assignments never target a boolean-mask slice of `merge`
# (chained assignment, whose warning is silenced by the global filter).
dat1 = merge[merge['visit'] == 1].copy()
dat2 = merge[merge['visit'] == 2].copy()
dat3 = merge[merge['visit'] == 3].copy()

In [None]:
## visit 1

# Categorical covariates (mode-imputed).
cat_feat1 = [
    'currentSmoker', 'Diabetes', 'gender', 'fmlyinc',
    'nutrition3cat', 'PA3cat', 'alc',
]

# Continuous covariates (mean-imputed, then standardized).
cont_feat1 = ['sbp', 'hdl', 'totchol']

# Continuous exposures (mean-imputed, then converted to quartiles).
con_exp_feat1 = ['nbSESpc2score', 'S1FAV', 'S1PAI', 'G_bla_rk']

data1_processed = process(
    dat1,
    cont_feat=cont_feat1,
    cat_feat=cat_feat1,
    con_exp_feat=con_exp_feat1,
)

In [None]:
## visit 2

# Categorical covariates (mode-imputed).
cat_feat2 = ['Diabetes', 'gender']

# Continuous covariates (mean-imputed, then standardized).
cont_feat2 = ['sbp']

# Continuous exposures (mean-imputed, then converted to quartiles).
con_exp_feat2 = ['S1FAV', 'S1PAI', 'G_bla_rk']

data2_processed = process(
    dat2,
    cont_feat=cont_feat2,
    cat_feat=cat_feat2,
    con_exp_feat=con_exp_feat2,
)

In [None]:
## visit 3

# Categorical covariates (mode-imputed).
cat_feat3 = ['PA3cat', 'Diabetes', 'gender', 'alc', 'fmlyinc']

# Continuous covariates (mean-imputed, then standardized).
cont_feat3 = ['sbp', 'hdl']

# No exposure columns are converted to quartiles for visit 3.
con_exp_feat3 = []

data3_processed = process(
    dat3,
    cont_feat=cont_feat3,
    cat_feat=cat_feat3,
    con_exp_feat=con_exp_feat3,
)

## combine visits

In [None]:
# Stack the three per-visit frames row-wise; original row labels are kept.
merged = pd.concat(
    [data1_processed, data2_processed, data3_processed],
    axis=0,
)

In [None]:
# exclude subjects with CVD at baseline

# Subject ids whose visit-1 row already has outcome y == 1.
rows_to_exclude = merged.loc[
    (merged['visit'] == 1) & (merged['y'] == 1),
    'subjid',
]

In [None]:
# Keep every row (any visit) whose subject is not in the exclusion list.
filtered_df = merged.loc[~merged['subjid'].isin(rows_to_exclude)]
# filtered_df.to_csv('data/processed/jhs_preprocessed_1103.csv', index = False)

In [None]:
# Same subject exclusion applied to the unprocessed snapshot.
filtered_df_raw = merge_raw.loc[~merge_raw['subjid'].isin(rows_to_exclude)]
# filtered_df_raw.to_csv('data/processed/jhs_raw_full.csv', index = False)

## missing-value imputation only

In [None]:
# Categorical columns (renamed names) to be mode-imputed.
cat_feat = [
    'FamIncome',
    'nutrition',
    'PhysAct',
    'currentSmoker',
    'alc',
    'Diabetes',
]

# Continuous columns (renamed names) to be mean-imputed.
cont_feat = [
    'nSES',
    'nFavFood',
    'nPhysFac',
    'nRS',
    'hdl',
    'totchol',
    'sbp',
]

In [None]:
# Impute only (no scaling or quartile binning): mode for the categorical
# columns, then mean for the continuous columns.
# NOTE(review): `jhs_impute` is not assigned anywhere in the visible code
# before this statement, so it raises NameError unless an unseen cell defines
# it -- presumably a copy of the renamed raw frame; confirm.
jhs_impute = fillna_cat(jhs_impute,cat_feat)  
jhs_impute = fillna_cont(jhs_impute,cont_feat)  