# Prep data

In [1]:
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Show all columns and up to 100 rows when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

## Neighborhood food and physical facilities

In [2]:
filename_ = "../Data/JHS/jhs_nets.csv"

# Read the file, keep only rows whose `exam` is 'exam1' and the three
# columns needed downstream, then lower-case the subject id column name so it
# matches the join key used in the merge step.
nets_data = (
    pd.read_csv(filename_)
    .loc[lambda frame: frame['exam'] == 'exam1', ['SUBJID', 'S1FAV', 'S1PAI']]
    .rename(columns={"SUBJID": "subjid"})
)



## racial segregation

In [3]:
filename_ = "../Data/JHS/jhs_rs.csv"

# Read the file, restrict it to 'exam1' rows and the G_bla_rk column, and
# rename the subject id column to the lower-case join key.
rs_data = (
    pd.read_csv(filename_)
    .loc[lambda frame: frame['exam'] == 'exam1', ['SUBJID', 'G_bla_rk']]
    .rename(columns={"SUBJID": "subjid"})
)

## other covariates

In [4]:
filename_ = "../Data/JHS/analysis.csv"

# Keep only visit-1 rows and the covariates used in the later cells.
analysis_data = pd.read_csv(filename_).loc[
    lambda frame: frame['visit'] == 1,
    [
        'subjid',
        'nbSESpc2score',
        'nutrition3cat', 'PA3cat', 'fmlyinc', 'alc', 'currentSmoker',
        'age', 'sex', 'Diabetes', 'hdl', 'totchol', 'sbp',
    ],
]

## event dataset

In [5]:
# Load the processed event table and keep the subject id plus the two
# cvd_10y_* columns.
event_data = pd.read_csv("../data_processed/JHS/jhs_event.csv").loc[
    :, ['subjid', 'cvd_10y_HF', 'cvd_10y_noHF']
]

## merge

In [6]:
# Left-join each covariate table onto the event table by subject id, so every
# event row is kept even when a covariate table has no match for it.
merge1 = pd.merge(event_data, nets_data, on='subjid', how='left')
merge2 = pd.merge(merge1, rs_data, on='subjid', how='left')
merge3 = pd.merge(merge2, analysis_data, on='subjid', how='left')
merge = merge3

In [7]:
## map gender and income

gender_mapping = {'Female': 0, 'Male': 1}

# Income letter codes collapse into four numeric levels:
# A-C -> 1, D-F -> 2, G-I -> 3, J-K -> 4.
income_mapping = {
    code: level
    for level, codes in enumerate(('ABC', 'DEF', 'GHI', 'JK'), start=1)
    for code in codes
}

# `assign` returns a new frame: `gender` is added as a new column (the original
# `sex` column is kept) and `fmlyinc` is replaced by its numeric level.
# Values absent from a mapping become NaN.
merge = merge.assign(
    gender=merge['sex'].map(gender_mapping),
    fmlyinc=merge['fmlyinc'].map(income_mapping),
)

merge_raw = merge.copy()

In [8]:
## rename 

# Give the source-coded columns the short analysis names used from here on.
merge_raw = merge_raw.rename(
    columns=dict(
        nbSESpc2score='nSES',
        S1FAV='nFavFood',
        S1PAI='nPhysFac',
        G_bla_rk='nRS',
        nutrition3cat='nutrition',
        PA3cat='PhysAct',
        fmlyinc='FamIncome',
    )
)

In [None]:
# merge_raw.to_csv('../data_processed/JHS/jhs_raw.csv', index = False)

# preprocess

## functions

In [9]:
def fillna_cat(df,cat_feat):
    """Fill missing values in the given columns with each column's mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place and also returned.
    cat_feat : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with missing values in ``cat_feat`` replaced
        by the first mode of the respective column. A column with no
        non-missing values has no mode and is left unchanged.
    """
    for feat in cat_feat:
        modes = df[feat].mode()
        if modes.empty:
            # Every value is missing, so there is nothing to impute with.
            continue
        # Assign the filled column back explicitly. The chained
        # ``df[feat].fillna(..., inplace=True)`` form is deprecated and does
        # not update ``df`` when pandas Copy-on-Write is active.
        df[feat] = df[feat].fillna(modes.iloc[0])
    return df

def fillna_cont(df,cont_feat):
    """Return a new frame with missing values replaced by column means.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cont_feat : list of str
        Columns whose means are computed and used as fill values.

    Returns
    -------
    pandas.DataFrame
        A new frame in which NaNs in the ``cont_feat`` columns are replaced
        by that column's mean. Columns not listed are left as they are.
    """
    column_means = df[cont_feat].mean()
    imputed = df.fillna(column_means)
    return imputed


def quantile_exp(df,con_exp_feat):
    """Recode each listed column into quartile labels 1-4.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to recode. It is modified in place and also returned.
    con_exp_feat : iterable of str
        Columns to replace with their quartile label.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with each listed column replaced by numeric
        quartile labels (1 = lowest quarter, 4 = highest).
    """
    quartile_edges = [0, 0.25, 0.5, 0.75, 1]
    quartile_labels = [1, 2, 3, 4]
    for feat in con_exp_feat:
        # Bin the ranks rather than the raw values: with method='first', ties
        # get distinct ranks in order of appearance, so the cut is made on
        # positions instead of on repeated values.
        ranks = df[feat].rank(method='first')
        quartiles = pd.qcut(ranks, q = quartile_edges, labels = quartile_labels)
        # qcut returns a categorical; convert the labels back to numbers.
        df[feat] = pd.to_numeric(quartiles)
    return df

def standardize(df,con_index):
    """Standardize the given columns with a freshly fitted ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. It is modified in place and also returned.
    con_index : list of str
        Columns to fit the scaler on and overwrite with the scaled values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``con_index`` columns replaced by their
        scaled values.
    """
    # Fit and transform on the same columns in one step.
    scaled_values = StandardScaler().fit_transform(df[con_index])
    df[con_index] = scaled_values
    return df



## complete pipeline

def process(df,cont_feat,cat_feat,con_exp_feat):
    """Run the full preprocessing pipeline on ``df``.

    The steps run in this order:

    1. mode-impute the ``cat_feat`` columns (``fillna_cat``);
    2. mean-impute, then standardize, the ``cont_feat`` columns
       (``fillna_cont`` followed by ``standardize``);
    3. mean-impute, then recode into quartile labels, the ``con_exp_feat``
       columns (``fillna_cont`` followed by ``quantile_exp``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. The first step writes into it in place.
    cont_feat, cat_feat, con_exp_feat : list of str
        Column groups handled by the respective steps above.

    Returns
    -------
    pandas.DataFrame
        The processed frame.
    """
    return (
        df
        .pipe(fillna_cat, cat_feat)
        .pipe(fillna_cont, cont_feat)
        .pipe(standardize, cont_feat)
        .pipe(fillna_cont, con_exp_feat)
        .pipe(quantile_exp, con_exp_feat)
    )



## process 

In [10]:
# Columns recoded into quartiles by the preprocessing pipeline.
con_exp_feat = ['nSES', 'nFavFood', 'nPhysFac', 'nRS']

# Columns that are mean-imputed and standardized.
cont_feat = ['sbp', 'hdl', 'totchol']

# Columns that are mode-imputed.
cat_feat = [
    'age', 'nutrition', 'PhysAct', 'currentSmoker',
    'Diabetes', 'gender', 'FamIncome', 'alc',
]

# One independent working copy of the raw merged frame per output variant.
jhs_preprocessed, jhs_imputed, jhs_cate, jhs_std = (
    merge_raw.copy() for _ in range(4)
)

In [None]:
### preprocessed

# Full pipeline: imputation, standardization and quartile recoding.
jhs_preprocessed = process(
    df=jhs_preprocessed,
    cont_feat=cont_feat,
    cat_feat=cat_feat,
    con_exp_feat=con_exp_feat,
)

#jhs_preprocessed.to_csv('../data_processed/JHS/jhs_pre.csv', index = False)

In [11]:
##### standardization only #####
##### final analysis data #####

# Complete-case analysis: drop every row that has a missing value in any column.
jhs_std = jhs_std.dropna()

cols_to_standardize = ['nSES', 'nFavFood', 'nPhysFac', 'nRS', 'sbp', 'hdl', 'totchol']

# Fit the scaler on the complete cases, then overwrite the columns with the
# scaled values; `scaler` stays bound to the fitted object.
scaler = StandardScaler().fit(jhs_std[cols_to_standardize])
jhs_std[cols_to_standardize] = scaler.transform(jhs_std[cols_to_standardize])

#jhs_std.to_csv('../data_processed/JHS/jhs_std.csv', index = False)

In [None]:
### categorization only

# standardize
cols_to_standardize = ['nSES', 'nFavFood', 'nPhysFac', 'nRS', 'sbp', 'hdl', 'totchol']
scaler = StandardScaler().fit(jhs_cate[cols_to_standardize])
jhs_cate[cols_to_standardize] = scaler.transform(jhs_cate[cols_to_standardize])

# categorize: recode the columns in con_exp_feat into quartile labels 1-4
jhs_cate = quantile_exp(df=jhs_cate, con_exp_feat=con_exp_feat)

#jhs_cate.to_csv('../data_processed/JHS/jhs_cate.csv', index = False)

In [None]:
### missing imputation only

# NOTE: this rebinds the notebook-level `cat_feat` and `cont_feat` lists, so
# any cell run after this one sees these values rather than the earlier ones.
cat_feat = [
    'FamIncome', 'nutrition', 'PhysAct',
    'currentSmoker', 'alc', 'Diabetes',
]
cont_feat = [
    'nSES', 'nFavFood', 'nPhysFac', 'nRS',
    'hdl', 'totchol', 'sbp',
]

# Mode-impute the categorical columns first, then mean-impute the continuous ones.
jhs_imputed = fillna_cont(fillna_cat(jhs_imputed, cat_feat), cont_feat)

#jhs_imputed.to_csv('../data_processed/JHS/jhs_imputed.csv', index = False)