# Prep data

In [None]:
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


## NETS dataset

In [None]:
# Load the NETS extract, keep the subject id, the exam label and the
# N_UNFAV_CT00 column, and rename/recode the keys to (subjid, visit) so the
# table can be joined on those two columns later in the notebook.
filename_ = "data/neighborhood/jhs_nets.csv"
visit_mapping = {'exam1': 1,'exam2': 2, 'exam3':3}

nfood_data = (
    pd.read_csv(filename_)[['SUBJID', 'exam', 'N_UNFAV_CT00']]
    .rename(columns={"SUBJID": "subjid", "exam":"visit"})
    # Exam labels that are not keys of visit_mapping become NaN here.
    .assign(visit=lambda frame: frame.visit.map(visit_mapping))
)


## Racial segregation dataset

In [None]:
# Load the racial-segregation extract, keep the subject id, the exam label
# and the G_bla_rk column, and rename/recode the keys to (subjid, visit).
filename_ = "data/neighborhood/jhs_rs.csv"
# NOTE(review): unlike the NETS cell there is no 'exam3' entry here; any
# 'exam3' rows would get a NaN visit and never match in the later merge.
# Presumably this file only covers exams 1 and 2 - confirm.
visit_mapping = {'exam1': 1,'exam2': 2}

rs_data = (
    pd.read_csv(filename_)[['SUBJID', 'exam', 'G_bla_rk']]
    .rename(columns={"SUBJID": "subjid", "exam":"visit"})
    .assign(visit=lambda frame: frame.visit.map(visit_mapping))
)


## Common dataset

In [None]:
# Read the common JHS table and immediately narrow it to the merge keys
# (subjid, visit) and the columns selected for this analysis.
common_data = pd.read_csv("data/common_data_jhs.csv")[[
    'subjid','visit',
    'nSES','nbSESpc2score','nbK3paFacilities',
    'sportIndex', 'hyIndex','activeIndex','darkgrnVeg', 'eggs','fish',
    'currentSmoker','Diabetes','sex','age','sbp','hdl','totchol',
    'MIHx','strokeHx','CHDHx','CVDHx',
]]

# Bare expression: shows the trimmed table as the notebook cell output.
common_data

## Merge datasets

In [None]:
# Left-join the NETS column and then the segregation column onto the common
# table by (subjid, visit); rows of common_data without a match keep NaN in
# the joined columns.
merge1 = pd.merge(common_data, nfood_data, how='left', on=['subjid','visit'])
merge = pd.merge(merge1, rs_data, how='left', on=['subjid','visit'])

# Bare expression: shows the merged table as the notebook cell output.
merge

### check variables

In [None]:
def CheckVar(variable, df=None, iscategorical = False):
    """Print, for visits 1-3, the missing count and a summary of one column.

    For each visit the rows with ``df["visit"] == visit`` are selected and
    the nulls in ``variable`` are counted. When that count is 1000 or more
    the visit is reported as "not available"; otherwise a summary is printed:

    * categorical: ``value_counts()`` divided by the number of rows of that
      visit (rows with a missing value stay in the denominator);
    * continuous: ``describe()`` for visit 1, ``np.nanmean`` and
      ``np.nanvar`` for visits 2 and 3.

    Parameters
    ----------
    variable : str
        Name of the column of ``df`` to inspect.
    df : pandas.DataFrame, optional
        Frame containing a ``visit`` column and ``variable``. When omitted,
        the notebook-level ``merge`` frame is looked up at call time, so a
        re-built ``merge`` is picked up without re-running this definition.
    iscategorical : bool, default False
        Selects the categorical (truthy) or continuous (falsy) summary.

    Returns
    -------
    None
        The report is written with ``print``.
    """
    if df is None:
        # Resolved here rather than in the signature: a default of ``merge``
        # would be evaluated once, when the function is defined.
        df = merge

    print(variable)

    # Slice every visit and count nulls before printing any per-visit line,
    # so an unknown column fails right after the variable name is printed.
    per_visit = []
    for visit in (1, 2, 3):
        rows = df[df["visit"] == visit]
        per_visit.append((visit, rows, rows[variable].isnull().sum()))

    for visit, rows, n_missing in per_visit:
        label = "#na in V%d =" % visit
        is_last = visit == 3

        if n_missing >= 1000:
            # Too many missing values to summarise this visit.
            print(label, n_missing,
                  "not available\n" if is_last else "not available")
            continue

        # The last visit ends with an extra newline argument so that
        # consecutive reports are separated by a blank line.
        tail = ('\n',) if is_last else ()

        if iscategorical:
            print(label, n_missing, "\n",
                  rows[variable].value_counts() / len(rows), *tail)
        elif visit == 1:
            print(label, n_missing)
            print(rows[variable].describe())
        else:
            print(label, n_missing, "\n",
                  "mean =", np.nanmean(rows[variable]), "\n",
                  "Variance =", np.nanvar(rows[variable]), *tail)
        

In [None]:
# Report every exposure per visit. CheckVar only prints and returns None, so
# a plain loop is used instead of building a throw-away list of None values.
for checked_var in ['nbSESpc2score','nbK3paFacilities','N_UNFAV_CT00',
                    'sportIndex', 'hyIndex','activeIndex','darkgrnVeg', 'eggs','fish']:
    CheckVar(checked_var)

# Called directly rather than wrapped in print(): the function writes its own
# report, and printing its return value only appended a stray "None" line.
CheckVar('hdl', df=merge, iscategorical = False)

In [None]:
## Plot each variable's distribution and check the quantile edges used for cutting

# One frame per exam visit (1, 2, 3), filtered from the merged table.
dat1, dat2, dat3 = (merge[merge['visit'] == visit_no] for visit_no in (1, 2, 3))


def plot_distribution(df,variable):
    """Show one histogram per column with quantile marker lines.

    ``variable`` is an iterable of column names of ``df``. For every column a
    20-bin histogram is drawn, dashed vertical lines are added at the 25%,
    50%, 75% and 100% quantiles, and the figure is displayed with
    ``plt.show()``. Nothing is returned.
    """
    # (quantile level, line colour, legend label), in drawing order.
    markers = (
        (0.25, 'r', 'Q1'),
        (0.5, 'g', 'Median'),
        (0.75, 'b', 'Q3'),
        (1, 'm', 'Max'),
    )

    for column in variable:
        values = df[column]
        plt.hist(values, bins=20, edgecolor='black', alpha=0.5)

        cut_points = values.quantile([level for level, _, _ in markers])
        for level, colour, tag in markers:
            plt.axvline(cut_points[level], color=colour, linestyle='--', label=tag)

        plt.xlabel(column)
        plt.ylabel('Frequency')
        plt.legend()

        plt.show()
        
# Columns whose visit-1 distributions are plotted.
exposures = [
    'nbSESpc2score',
    'nbK3paFacilities',
    'N_UNFAV_CT00',
    'G_bla_rk',
    'sportIndex',
    'hyIndex',
    'activeIndex',
    'darkgrnVeg',
    'eggs',
    'fish',
]

plot_distribution(df=dat1, variable=exposures)

# preprocess

## functions

In [None]:
def get_outcome(df):
    """Add a binary outcome column ``y`` to ``df`` in place and return ``df``.

    ``y`` is 1 on rows where at least one of ``MIHx``, ``strokeHx``,
    ``CHDHx`` or ``CVDHx`` equals 1.0, and 0 on every other row (including
    rows where those columns are missing).
    """
    df['y'] = 0

    history_cols = ['MIHx', 'strokeHx', 'CHDHx', 'CVDHx']
    any_event = df[history_cols].eq(1.0).any(axis=1)
    df.loc[any_event, 'y'] = 1

    return df

def fillna_cat(df,cat_feat):
    """Fill missing values of each listed column with that column's mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; its columns are overwritten in place.
    cat_feat : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for chaining.

    Notes
    -----
    When several values tie for the mode, the first entry returned by
    ``Series.mode()`` is used. A column with no non-missing value has no mode
    and is left unchanged.
    """
    for feat in cat_feat:
        modes = df[feat].mode()
        if modes.empty:
            # Every value is missing: nothing to impute with.
            continue
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on df[feat]: that form acts on an intermediate
        # Series and does not update df when pandas Copy-on-Write is active.
        df[feat] = df[feat].fillna(modes.iloc[0])
    return df

def fillna_cont(df,cont_feat):
    """Return a copy of ``df`` with NaNs in ``cont_feat`` replaced by column means.

    The means are computed over the ``cont_feat`` columns only, so other
    columns keep their missing values. The input frame is not modified.
    """
    column_means = df[cont_feat].mean()
    filled = df.fillna(value=column_means)
    return filled

def process_binary(df,bin_feat):
    """Return a copy of ``df`` with a numeric ``gender`` column built from ``sex``.

    'Female' is encoded as 0 and 'Male' as 1; any other value of ``sex``
    becomes NaN. ``bin_feat`` is accepted but not used by this function.
    """
    encoded_sex = df['sex'].map({'Female':0,'Male':1})

    result = df.copy()
    result['gender'] = encoded_sex
    return result

def quantile_exp(df,exp_feat):
    """Replace each listed column of ``df`` by its quartile number (1-4).

    Values are ranked first (ties broken by order of appearance), and the
    ranks are cut at the 25%, 50% and 75% quantiles, so the four groups are
    as equal in size as possible. ``df`` is modified in place and returned.
    """
    def _quartile_of_rank(x):
        # Cutting ranks rather than raw values keeps the bin edges unique
        # even when many rows share the same value.
        return pd.qcut(x.rank(method='first'),
                       q = [0, 0.25, 0.5, 0.75, 1], labels = [1,2,3,4])

    for column in exp_feat:
        binned = df[column].transform(_quartile_of_rank)
        # qcut returns a categorical; store plain numbers instead.
        df[column] = pd.to_numeric(binned)
    return df

def standardize(df,con_index):
    """Rescale the ``con_index`` columns of ``df`` with a ``StandardScaler``.

    The scaler is fitted on those same columns of ``df`` and the transformed
    values are written back over them. ``df`` is modified in place and
    returned.
    """
    features = df[con_index]
    scaler = StandardScaler().fit(features)
    df[con_index] = scaler.transform(features, copy = True)
    return df

def process(df,bin_feat,cont_feat,cat_feat,exp_feat):
    """Run the full preprocessing pipeline on one visit's frame.

    Steps, in order:

    1. ``get_outcome``: add the binary outcome column ``y``.
    2. ``process_binary``: add the ``gender`` column derived from ``sex``.
       This runs before the categorical imputation so that ``gender`` may be
       listed in ``cat_feat``.
    3. ``fillna_cat``: mode-impute the ``cat_feat`` columns.
    4. ``fillna_cont`` + ``standardize``: mean-impute, then rescale the
       ``cont_feat`` columns.
    5. ``fillna_cont`` + ``quantile_exp``: mean-impute, then convert the
       ``exp_feat`` columns to quartile numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single visit. It is not modified.
    bin_feat, cont_feat, cat_feat, exp_feat : list of str
        Column names handed to the corresponding steps above.

    Returns
    -------
    pandas.DataFrame
        A new, processed frame.
    """
    # Work on a private copy: get_outcome writes the 'y' column into the frame
    # it receives, and callers pass filtered slices of the merged table, so
    # without the copy the caller's slice would be altered as a side effect.
    df = df.copy()

    df = get_outcome(df)
    df = process_binary(df,bin_feat)
    df = fillna_cat(df,cat_feat)

    df = fillna_cont(df,cont_feat)
    df = standardize(df,cont_feat)

    df = fillna_cont(df,exp_feat)
    df = quantile_exp(df,exp_feat)

    return df



## process by exam 

In [None]:
# Split the merged table into one frame per exam visit.
dat1 = merge.loc[merge['visit'].eq(1)]
dat2 = merge.loc[merge['visit'].eq(2)]
dat3 = merge.loc[merge['visit'].eq(3)]

In [None]:
# Visit 1 configuration for process().
bin_feat1 = ['sex']
exp_feat1 = [
    'nbSESpc2score', 'nbK3paFacilities', 'N_UNFAV_CT00', 'G_bla_rk',
    'sportIndex', 'hyIndex', 'activeIndex', 'darkgrnVeg', 'eggs', 'fish',
]
cont_feat1 = ['sbp', 'hdl', 'totchol']
# 'gender' is created from 'sex' inside process() before imputation runs.
cat_feat1 = ['currentSmoker', 'Diabetes', 'gender']

data1_processed = process(
    df=dat1,
    bin_feat=bin_feat1,
    cont_feat=cont_feat1,
    cat_feat=cat_feat1,
    exp_feat=exp_feat1,
)
data1_processed

In [None]:
# Visit 2 configuration for process(): only the two listed exposures are
# binned and only 'sbp' is standardised; other columns pass through as-is.
bin_feat2 = ['sex']
exp_feat2 = ['N_UNFAV_CT00', 'G_bla_rk']
cont_feat2 = ['sbp']
# 'gender' is created from 'sex' inside process() before imputation runs.
cat_feat2 = ['Diabetes', 'gender']

data2_processed = process(
    df=dat2,
    bin_feat=bin_feat2,
    cont_feat=cont_feat2,
    cat_feat=cat_feat2,
    exp_feat=exp_feat2,
)
data2_processed

In [None]:
# Visit 3 configuration for process(): the three activity indices are binned
# and 'sbp'/'hdl' are standardised; other columns pass through as-is.
bin_feat3 = ['sex']
exp_feat3 = ['sportIndex', 'hyIndex', 'activeIndex']
cont_feat3 = ['sbp', 'hdl']
# 'gender' is created from 'sex' inside process() before imputation runs.
cat_feat3 = ['Diabetes', 'gender']

data3_processed = process(
    df=dat3,
    bin_feat=bin_feat3,
    cont_feat=cont_feat3,
    cat_feat=cat_feat3,
    exp_feat=exp_feat3,
)
data3_processed

## Combine processed visits

In [None]:
# Stack the three processed visit frames row-wise and write them to one CSV
# file; the row index is not written.
merged = pd.concat(objs=(data1_processed, data2_processed, data3_processed))
merged.to_csv(path_or_buf='data/jhs_complete_0718.csv', index=False)