In [2]:
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import os
from sklearn.preprocessing import StandardScaler

In [3]:
import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides library deprecation warnings;
# consider narrowing it to specific categories/modules.
warnings.filterwarnings('ignore')


In [4]:
def get_outcome(df):
    """Add a binary outcome column ``y`` to ``df`` and return the same frame.

    ``y`` is 1 on every row where at least one of ``MIHx``, ``strokeHx``,
    ``CHDHx`` or ``CVDHx`` equals 1.0, and 0 on all other rows (rows where
    those columns are missing stay 0).

    The frame is modified in place; the returned object is ``df`` itself.
    """
    history_cols = ['MIHx', 'strokeHx', 'CHDHx', 'CVDHx']
    df['y'] = 0
    # Row-wise "any history flag equals 1.0"; NaN compares as False.
    has_event = df[history_cols].eq(1.0).any(axis=1)
    df.loc[has_event, 'y'] = 1
    return df

In [5]:
def fillna_cat(df,cat_feat):
    """Fill missing values in categorical columns with each column's mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. The listed columns are overwritten on this frame.
    cat_feat : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with missing values in ``cat_feat`` replaced
        by the most frequent value of the respective column. When several
        values tie, the first entry returned by ``Series.mode()`` is used.
        Columns that contain no non-missing value are left unchanged.
    """
    for feat in cat_feat:
        modes = df[feat].mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to impute with.
            continue
        # Assign the filled column back instead of calling
        # ``df[feat].fillna(..., inplace=True)``: the in-place call operates
        # on a temporary Series and does not update ``df`` under pandas
        # Copy-on-Write semantics.
        df[feat] = df[feat].fillna(modes.iloc[0])
    return df


In [6]:
def fillna_cont(df,cont_feat):
    """Return a copy of ``df`` with missing continuous values mean-imputed.

    The mean of every column listed in ``cont_feat`` is computed, and
    missing entries in those columns are replaced by the respective mean.
    Columns not listed in ``cont_feat`` are left untouched, and the input
    frame itself is not modified.
    """
    column_means = df[cont_feat].mean()
    # Passing a Series to fillna fills column-by-column, matched on label.
    filled = df.fillna(value=column_means)
    return filled

In [7]:
def process_binary(df,bin_feat):
    """Return a copy of ``df`` with a numeric ``gender`` column added.

    ``gender`` is derived from the ``sex`` column: ``'Female'`` maps to 0 and
    ``'Male'`` maps to 1; any other value becomes NaN.

    NOTE(review): ``bin_feat`` is accepted but not used; only ``sex`` is
    encoded.
    """
    sex_codes = {'Female': 0, 'Male': 1}
    encoded = df['sex'].map(sex_codes)
    return df.assign(gender=encoded)

In [8]:
def dichotomize_nb(df,nb_feat):
    """Binarize each column in ``nb_feat`` at its median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are binarized. The listed columns are
        overwritten on this frame.
    nb_feat : iterable of str
        Names of the numeric columns to dichotomize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Each listed column holds 1 where the original
        value was greater than or equal to the column median (computed
        ignoring NaN) and 0 otherwise. Missing values compare as False and
        therefore become 0.
    """
    for feat in nb_feat:
        median_feat = np.nanmedian(df[feat])
        # Operate on the ``df`` argument only. Reading the column from a
        # notebook-level global here would replace the binarized values with
        # raw ones (or fail when no such global exists).
        df[feat] = (df[feat] >= median_feat).astype('int64')
    return df

In [9]:
def standardize(df,con_index):
    """Standardize the ``con_index`` columns of ``df`` with ``StandardScaler``.

    A scaler is fitted on ``df[con_index]`` and the transformed values are
    written back into those columns of ``df``. Returns the same ``df``.
    """
    features = df[con_index]
    fitted_scaler = StandardScaler().fit(features)
    # copy=True keeps the scaler from modifying the selected block in place.
    df[con_index] = fitted_scaler.transform(features, copy=True)
    return df

In [10]:
def process(df,bin_feat,cont_feat,cat_feat,nb_feat,consider_all_exams = False):
    """Run the preprocessing steps on ``df`` and select the rows to keep.

    Steps, applied in order:
      1. ``get_outcome(df)`` - adds the outcome column ``y`` (on the input
         frame itself).
      2. ``process_binary(df, bin_feat)``
      3. ``fillna_cat(df, cat_feat)``
      4. ``fillna_cont(df, cont_feat)``
      5. ``dichotomize_nb(df, nb_feat)``

    ``standardize`` is not called; that step is currently disabled.

    Returns the full processed frame when ``consider_all_exams`` is true,
    otherwise only the rows where ``visit == 1``.
    """
    pipeline = (
        (get_outcome, ()),
        (process_binary, (bin_feat,)),
        (fillna_cat, (cat_feat,)),
        (fillna_cont, (cont_feat,)),
        (dichotomize_nb, (nb_feat,)),
    )
    for step, extra_args in pipeline:
        df = step(df, *extra_args)

    if consider_all_exams:
        return df
    return df[df['visit']==1]
    

In [11]:
# Alternative input file, currently disabled:
# filename_ = "../../CVD_data/JHS/common_data_jhs.csv"
# CSV to load; the path is relative to the notebook's working directory.
filename_ = "../../CVD_data/JHS/jhs_data.csv"

data = pd.read_csv(filename_)
# NOTE(review): this selects every column except 'VisitDate', but the result
# is neither assigned nor displayed, so the statement has no effect on `data`.
data[data.columns[~data.columns.isin(['VisitDate'])]]
# Last expression of the cell: renders the loaded frame as the cell output.
data

Unnamed: 0,subjid,visit,VisitDate,age,sex,alc,alcw,currentSmoker,everSmoker,weight,...,nbK3paFacilities,nbpctResiden1mi,nbPopDensity1mi,sportIndex,hyIndex,activeIndex,frs_chdtenyrrisk,frs_cvdtenyrrisk,frs_atpiii_tenyrrisk,rrs_tenyrrisk
0,J100079,1,1/22/01,62,Female,1.0,0.00,0.0,0.0,95.0,...,0.58,0.52,3235.71,2.33,2.71,2.25,0.06,0.14,0.02,0.04
1,J100180,1,2/24/01,75,Female,0.0,0.00,0.0,0.0,57.0,...,0.29,0.40,3917.48,2.00,1.86,1.75,,0.12,0.06,0.04
2,J100228,1,10/30/01,69,Female,0.0,0.00,0.0,0.0,92.0,...,0.50,0.39,3241.73,1.00,1.00,1.00,0.20,0.22,0.06,0.09
3,J100291,1,11/2/01,74,Female,0.0,0.00,0.0,0.0,91.1,...,0.55,0.33,3480.61,3.67,1.71,3.50,0.11,0.22,0.14,0.04
4,J100333,1,11/1/01,77,Male,1.0,0.00,0.0,1.0,91.6,...,0.55,0.33,3480.61,4.00,3.86,1.75,,0.25,0.25,0.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13325,J599265,3,,51,Female,1.0,0.15,,1.0,,...,,,,3.33,2.29,3.25,0.17,0.29,0.08,0.06
13326,J599276,3,,50,Male,1.0,6.00,,1.0,,...,,,,3.33,2.43,2.50,0.14,0.16,0.10,0.04
13327,J599282,3,,65,Male,0.0,0.00,,0.0,,...,,,,2.00,4.14,3.00,0.18,0.13,0.10,0.07
13328,J599323,3,,55,Female,0.0,0.00,,0.0,,...,,,,1.00,1.71,1.50,0.07,0.14,0.04,0.03


In [12]:
# Column groups passed to process() below.

# Passed as `bin_feat`.
bin_feat = ['sex']
# Passed as `nb_feat` (handled by dichotomize_nb inside process()).
nb_feat = ['nbSESanascore',
           'nbProblems',
           'nbCohesion',
           'nbViolence',
           'nbK3FavorFoodstore',
           'nbK3paFacilities']
# Passed as `cont_feat` (mean-imputed by fillna_cont inside process()).
cont_feat = ['sportIndex','hyIndex','activeIndex', #physical activity factors
             'darkgrnVeg','eggs','fish','idealHealthNutrition', #diet factors
             'age', 'sbp', 'hdl', 'totchol' #individual health factors 
            ]

# Passed as `cat_feat` (mode-imputed by fillna_cat inside process()).
# NOTE(review): 'idealHealthNutrition' is listed in both cont_feat and
# cat_feat; process() runs fillna_cat before fillna_cont - confirm which
# treatment is intended for this column.
cat_feat = ['idealHealthPA', #physical activity factors
            'idealHealthNutrition', #diet factors
            'currentSmoker','Diabetes', #behaviors and conditions
            'Income' #income information
           ]


In [13]:
# Run the preprocessing pipeline; with consider_all_exams=False only rows
# with visit == 1 are returned.
data_processed = process(data,bin_feat,cont_feat,cat_feat,nb_feat,consider_all_exams = False)

In [14]:
# Render the processed frame as the cell output.
data_processed

Unnamed: 0,subjid,visit,VisitDate,age,sex,alc,alcw,currentSmoker,everSmoker,weight,...,nbPopDensity1mi,sportIndex,hyIndex,activeIndex,frs_chdtenyrrisk,frs_cvdtenyrrisk,frs_atpiii_tenyrrisk,rrs_tenyrrisk,y,gender
0,J100079,1,1/22/01,62,Female,1.0,0.00,0.0,0.0,95.0,...,3235.71,2.33,2.71,2.25,0.06,0.14,0.02,0.04,0,0
1,J100180,1,2/24/01,75,Female,0.0,0.00,0.0,0.0,57.0,...,3917.48,2.00,1.86,1.75,,0.12,0.06,0.04,0,0
2,J100228,1,10/30/01,69,Female,0.0,0.00,0.0,0.0,92.0,...,3241.73,1.00,1.00,1.00,0.20,0.22,0.06,0.09,1,0
3,J100291,1,11/2/01,74,Female,0.0,0.00,0.0,0.0,91.1,...,3480.61,3.67,1.71,3.50,0.11,0.22,0.14,0.04,1,0
4,J100333,1,11/1/01,77,Male,1.0,0.00,0.0,1.0,91.6,...,3480.61,4.00,3.86,1.75,,0.25,0.25,0.08,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5301,J599298,1,3/26/04,68,Male,0.0,0.00,1.0,1.0,77.1,...,3006.90,1.00,1.00,1.00,0.27,0.25,0.16,0.12,1,1
5302,J599302,1,3/30/04,42,Male,1.0,0.12,1.0,1.0,81.5,...,3006.86,2.67,1.00,1.00,0.11,0.09,0.10,0.02,0,1
5303,J599310,1,4/5/04,50,Female,0.0,0.00,1.0,1.0,101.5,...,1233.56,1.00,1.57,1.00,,,,,1,0
5304,J599323,1,3/30/04,48,Female,0.0,0.00,0.0,0.0,108.0,...,1462.61,1.00,2.14,1.00,0.07,0.73,0.01,0.01,0,0


In [15]:
# Write the processed frame to CSV; index=False omits the row index column.
data_processed.to_csv("../../CVD_data/JHS/processed_visit1_jhs_data.csv",index=False)