## Prep

In [None]:
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
import seaborn as sns
from statistics import variance
from statistics import mean
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings about assigning into
# filtered slices, which the cells below do — consider narrowing the filter.
warnings.filterwarnings('ignore')

# calculate outcome y
def get_outcome(df):
    """Return a copy of ``df`` with a binary outcome column ``y``.

    ``y`` is 1 for every row in which at least one of ``MIHx``, ``strokeHx``,
    ``CHDHx`` or ``CVDHx`` equals 1.0, and 0 otherwise. Missing values in
    those columns never compare equal to 1.0, so they count as 0.

    The input frame is not modified. Callers in this notebook pass
    boolean-filtered slices of the raw data; adding a column to such a slice
    in place is the pattern pandas flags with SettingWithCopyWarning, so the
    result is built on an explicit copy instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four history columns listed above.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the integer column ``y`` added (or overwritten).

    Raises
    ------
    KeyError
        If any of the four history columns is missing.
    """
    history_cols = ['MIHx', 'strokeHx', 'CHDHx', 'CVDHx']
    out = df.copy()
    # Row-wise "any history flag is exactly 1.0", stored as 0/1 integers.
    out['y'] = df[history_cols].eq(1.0).any(axis=1).astype('int64')
    return out

# Use the fully qualified option names. The short forms 'max_columns' and
# 'max_rows' are regex-matched against all registered options and became
# ambiguous once pandas added 'styler.render.max_columns' / '...max_rows',
# which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)  # show every column
pd.set_option('display.max_rows', 100)

## JHS

In [None]:
### get the complete df with V1 covariates and V1-V3 outcome y

In [None]:
# Raw JHS table; rows are later filtered on the `visit` column.
df = pd.read_csv('data/common_data_jhs.csv')

# preprocess: numeric gender code derived from the `sex` labels
sex_mapping = {'Female': 0,'Male': 1}
df = df.assign(gender=df['sex'].map(sex_mapping))
# fix id column: strip the "J" marker and store the ids as floats
df["subjid"] = df["subjid"].map(lambda raw_id: float(str(raw_id).replace("J", "")))

# Visit 1 supplies the covariates plus its own outcome (y1).
v1_columns = ['subjid','nSES','nbSESpc2score','currentSmoker','Diabetes','gender','age','sbp','hdl','totchol','y']
df_v1 = get_outcome(df[df["visit"] == 1])[v1_columns].rename(columns={"y": "y1"})

# Visits 2 and 3 only contribute their outcome (y2, y3).
df_v2 = get_outcome(df[df["visit"] == 2])[['subjid','y']].rename(columns={"y": "y2"})
df_v3 = get_outcome(df[df["visit"] == 3])[['subjid','y']].rename(columns={"y": "y3"})

# Default merges: inner join on the columns the frames share.
merge1 = df_v1.merge(df_v2)
jhs_complete_df = merge1.merge(df_v3)

# y_tot = 1 when the outcome is positive at any of the three visits.
jhs_complete_df['y_tot'] = jhs_complete_df[['y1','y2','y3']].any(axis=1).astype('int64')

# jhs_complete_df.to_csv('data/jhs_raw_complete.csv', index = False)

In [None]:
### split train and test sets

In [None]:
# 80/20 split with a fixed seed, stratified on the combined outcome y_tot.
jhs_tr, jhs_te = train_test_split(
    jhs_complete_df,
    test_size=0.2,
    random_state=0,
    stratify=jhs_complete_df[['y_tot']],
)

In [None]:
### impute missing values

In [None]:
# Imputation values are computed from the training split only and then
# reused unchanged for the test split.
cat_cols = ['currentSmoker', 'Diabetes']
con_cols = ['nbSESpc2score', 'sbp', 'hdl', 'totchol']

## categorical: most frequent observed level per column.
## Taken as one scalar per column instead of `mode().squeeze()`: when a
## column has tied modes, `mode()` returns several rows, `squeeze()` leaves a
## DataFrame, and `fillna` would then align on row labels rather than filling
## each column with a single value.
tr_mode = jhs_tr[cat_cols].apply(lambda col: col.value_counts().index[0])
## continuous: training mean per column
tr_mean = jhs_tr[con_cols].mean()

# train set
jhs_tr = jhs_tr.fillna(value=tr_mode).fillna(value=tr_mean)

# test set (filled with the training-derived values)
jhs_te = jhs_te.fillna(value=tr_mode).fillna(value=tr_mean)

In [None]:
# convert to int: ids, categorical codes and age are stored as integers
int_cols = ['subjid', 'nSES', 'currentSmoker', 'Diabetes', 'gender', 'age']
for split in (jhs_tr, jhs_te):
    split[int_cols] = split[int_cols].astype(int)


In [None]:
### standardize (on continuous covariates)

In [None]:
con_index = ['nbSESpc2score', 'age', 'sbp', 'hdl', 'totchol']

# The scaler is fitted on the training split only and applied to both splits.
jhs_scaler = StandardScaler().fit(jhs_tr[con_index])

for split in (jhs_tr, jhs_te):
    split[con_index] = jhs_scaler.transform(split[con_index], copy=True)

In [None]:
### export
# stratified; no smote
for frame, path in ((jhs_tr, 'data/jhs_tr_stratified.csv'),
                    (jhs_te, 'data/jhs_te_stratified.csv')):
    frame.to_csv(path, index=False)

### JHS gender df

In [None]:
# Gender codes follow sex_mapping: 0 = Female, 1 = Male.
gender_code = jhs_complete_df['gender']
jhs_f = jhs_complete_df[gender_code == 0]
jhs_m = jhs_complete_df[gender_code == 1]

In [None]:
class PreProcess:
    """Split, impute and standardise one JHS data frame.

    The whole pipeline runs in the constructor; the results are exposed as
    attributes.

    Parameters
    ----------
    dat : pandas.DataFrame
        Frame containing ``y_tot`` (used for stratification) and the
        covariate columns referenced in ``__init__``.

    Attributes
    ----------
    dat : pandas.DataFrame
        The frame that was passed in.
    jhs_tr : pandas.DataFrame
        Processed training split (80 %).
    jhs_te : pandas.DataFrame
        Processed test split (20 %).
    """

    def __init__(self, dat):
        self.dat = dat

        cat_cols = ['currentSmoker', 'Diabetes']
        mean_cols = ['nbSESpc2score', 'sbp', 'hdl', 'totchol']
        int_cols = ['subjid', 'nSES', 'currentSmoker', 'Diabetes', 'gender', 'age']
        con_index = ['nbSESpc2score', 'age', 'sbp', 'hdl', 'totchol']

        ### split train and test sets (fixed seed, stratified on y_tot)
        jhs_tr, jhs_te = train_test_split(
            self.dat, test_size=0.2, random_state=0, stratify=self.dat[['y_tot']]
        )

        ### impute missing values with statistics of the training split only
        ## categorical: most frequent observed level, one scalar per column.
        ## `mode().squeeze()` is avoided because it stays a multi-row
        ## DataFrame when a column has tied modes, and `fillna` would then
        ## align on row labels instead of filling each column with one value.
        tr_mode = jhs_tr[cat_cols].apply(lambda col: col.value_counts().index[0])
        ## continuous: training mean
        tr_mean = jhs_tr[mean_cols].mean()

        jhs_tr = jhs_tr.fillna(value=tr_mode).fillna(value=tr_mean)
        jhs_te = jhs_te.fillna(value=tr_mode).fillna(value=tr_mean)

        # convert to int
        jhs_tr[int_cols] = jhs_tr[int_cols].astype(int)
        jhs_te[int_cols] = jhs_te[int_cols].astype(int)

        ### standardize (on continuous covariates); scaler fitted on tr set
        jhs_scaler = StandardScaler().fit(jhs_tr[con_index])
        jhs_tr[con_index] = jhs_scaler.transform(jhs_tr[con_index], copy=True)
        jhs_te[con_index] = jhs_scaler.transform(jhs_te[con_index], copy=True)

        ### output
        self.jhs_tr = jhs_tr
        self.jhs_te = jhs_te

        
# Run the pipeline once per gender and read both splits from the same
# instance. Constructing PreProcess separately for the train and the test
# attribute repeats the full split/impute/scale work for no benefit (the
# split uses a fixed random_state, so the results are the same).
jhs_f_prep = PreProcess(jhs_f)
jhs_m_prep = PreProcess(jhs_m)

jhs_f_tr, jhs_f_te = jhs_f_prep.jhs_tr, jhs_f_prep.jhs_te
jhs_m_tr, jhs_m_te = jhs_m_prep.jhs_tr, jhs_m_prep.jhs_te

### export df
jhs_f_tr.to_csv('data/jhs_f_tr.csv', index = False)
jhs_f_te.to_csv('data/jhs_f_te.csv', index = False)
jhs_m_tr.to_csv('data/jhs_m_tr.csv', index = False)
jhs_m_te.to_csv('data/jhs_m_te.csv', index = False)


### JHS - gcomputation df

#### nSES

In [None]:
# No train/test split here: imputation and scaling use the full frame.
## categorical: fill gaps with each column's most frequent level
cat_cols = ['currentSmoker','Diabetes']
jhs_complete_df[cat_cols] = jhs_complete_df[cat_cols].apply(
    lambda col: col.fillna(col.value_counts().index[0])
)
## continuous: fill gaps with the column mean
dat_mean = jhs_complete_df[['nbSESpc2score', 'sbp', 'hdl', 'totchol']].mean()
jhs_complete_df.fillna(value=dat_mean, inplace=True)

# convert to int
int_cols = ['subjid', 'nSES', 'currentSmoker', 'Diabetes', 'gender', 'age']
jhs_complete_df[int_cols] = jhs_complete_df[int_cols].astype(int)

# standardize the continuous covariates
con_index = ['nbSESpc2score', 'age', 'sbp', 'hdl', 'totchol']
jhs_scaler = StandardScaler().fit(jhs_complete_df[con_index])
jhs_complete_df[con_index] = jhs_scaler.transform(jhs_complete_df[con_index], copy=True)

jhs_complete_df.to_csv('data/jhs_gcomputation.csv', index=False)


#### more features for gcomputation

In [None]:
# preprocess (re-applies the same steps as the first JHS cell to `df`)
sex_mapping = {'Female': 0,'Male': 1}
df = df.assign(gender  = df.sex.map(sex_mapping))
# fix id column
df["subjid"] = [float(str(i).replace("J", "")) for i in df["subjid"]]

# Visit 1: covariates (now including PA3cat and the neighbourhood nb* columns) plus outcome y1.
df_v1 = df[df["visit"] == 1]
df_v1 = get_outcome(df_v1)
df_v1 = df_v1[['subjid','nSES','PA3cat','nbSESpc2score','nbK3FavorFoodstore','nbK3paFacilities','nbpctResiden1mi','currentSmoker','Diabetes','gender','age','sbp','hdl','totchol','y']]
df_v1 = df_v1.rename(columns={"y": "y1"})

# Visits 2 and 3: outcome only.
df_v2 = df[df["visit"] == 2]
df_v2 = get_outcome(df_v2)
df_v2 = df_v2[['subjid','y']]
df_v2 = df_v2.rename(columns={"y": "y2"})

df_v3 = df[df["visit"] == 3]
df_v3 = get_outcome(df_v3)
df_v3 = df_v3[['subjid','y']]
df_v3 = df_v3.rename(columns={"y": "y3"})

merge1 = pd.merge(df_v1, df_v2)
jhs_complete_df = pd.merge(merge1, df_v3)
# y_tot = 1 when the outcome is positive at any of the three visits
jhs_complete_df['y_tot'] = jhs_complete_df[['y1','y2','y3']].any(axis=1).astype('int64')

## impute nb features with mean
nb_cols = ['nbK3FavorFoodstore','nbK3paFacilities','nbpctResiden1mi']
nb_dat_mean = jhs_complete_df[nb_cols].mean()
jhs_complete_df.fillna(value = nb_dat_mean, inplace = True)

## dichotomize nb features at their median (1 = at or above the median)
food_med = np.nanmedian(jhs_complete_df['nbK3FavorFoodstore'])
fac_med = np.nanmedian(jhs_complete_df['nbK3paFacilities'])
res_med = np.nanmedian(jhs_complete_df['nbpctResiden1mi'])
jhs_complete_df['nFood'] = (jhs_complete_df['nbK3FavorFoodstore'] >= food_med).astype('int64')
jhs_complete_df['nFac'] = (jhs_complete_df['nbK3paFacilities'] >= fac_med).astype('int64')
jhs_complete_df['nRes'] = (jhs_complete_df['nbpctResiden1mi'] >= res_med).astype('int64')


# fill nan
## categorical: most frequent level per column
cat_cols = ['PA3cat','currentSmoker','Diabetes']
jhs_complete_df[cat_cols] = jhs_complete_df[cat_cols].apply(lambda x: x.fillna(x.value_counts().index[0]))
## continuous: column mean
dat_mean = jhs_complete_df[['nbSESpc2score', 'sbp', 'hdl', 'totchol']].mean()
jhs_complete_df.fillna(value = dat_mean, inplace = True)

# convert to int
int_cols = ['subjid', 'nSES', 'PA3cat','currentSmoker', 'Diabetes', 'gender', 'age','nFood','nFac','nRes']
jhs_complete_df[int_cols] = jhs_complete_df[int_cols].astype(int)

# dummy coding for PA3cat
# dtype is pinned: pandas >= 2.0 defaults get_dummies to bool, which would
# write True/False instead of 0/1 into the exported CSV.
PA3cat_dummy = pd.get_dummies(jhs_complete_df['PA3cat'], prefix='PA3cat', dtype='uint8')
jhs_complete_df = pd.concat([jhs_complete_df, PA3cat_dummy], axis=1)


# standardize
con_index = ['nbSESpc2score', 'age', 'sbp', 'hdl', 'totchol']

jhs_scaler = StandardScaler()
jhs_scaler.fit(jhs_complete_df[con_index],)

jhs_complete_df[con_index] = jhs_scaler.transform(jhs_complete_df[con_index], copy = True)

#jhs_complete_df
#jhs_complete_df.isnull().sum()

jhs_complete_df.to_csv('data/jhs_gcomputation_more.csv', index = False)


## MESA

In [None]:
# SMOTE
# NOTE(review): presumably these are the SMOTE-resampled, scaled MESA splits
# (the no-SMOTE variants are loaded from *_noSMOTE.csv further down) — confirm.
mesa_tr, mesa_te = (
    pd.read_csv(path)
    for path in ("../mesa/code/plan_2/mesa_train_scale.csv",
                 "../mesa/code/plan_2/mesa_test_scale.csv")
)

In [None]:
### merge JHS and MESA ###

In [None]:
# train set

## add the dif cols so both frames carry y1/y2/y3, race and a source tag
for visit_outcome in ('y1', 'y2', 'y3'):
    mesa_tr[visit_outcome] = float('NAN')
mesa_tr['dat'] = "MESA"
jhs_tr['race'] = int(1)
jhs_tr['dat'] = "JHS"

## rename both cohorts to one shared set of column names
jhs_to_common = {"nbSESpc2score": "nSESscore", "y_tot":"y"}
mesa_to_common = {
    "idno":"subjid",
    "F3_PC2": "nSESscore",
    "cig1c":"currentSmoker",
    "diabet1":"Diabetes",
    "gender1":"gender",
    "age1c":"age",
    "sbp1c":"sbp",
    "hdl1":"hdl",
    "chol1":"totchol",
    "race_2":"race",
}
jhs_tr = jhs_tr.rename(columns=jhs_to_common)
mesa_tr = mesa_tr.rename(columns=mesa_to_common)

## merge: stack JHS rows on top of MESA rows
merged_tr = pd.concat([jhs_tr, mesa_tr])


In [None]:
# test set

## add the dif cols so both frames carry y1/y2/y3, race and a source tag
for visit_outcome in ('y1', 'y2', 'y3'):
    mesa_te[visit_outcome] = float('NAN')
mesa_te['dat'] = "MESA"
jhs_te['race'] = int(1)
jhs_te['dat'] = "JHS"

## rename both cohorts to one shared set of column names
jhs_to_common = {"nbSESpc2score": "nSESscore", "y_tot":"y"}
mesa_to_common = {
    "idno":"subjid",
    "F3_PC2": "nSESscore",
    "cig1c":"currentSmoker",
    "diabet1":"Diabetes",
    "gender1":"gender",
    "age1c":"age",
    "sbp1c":"sbp",
    "hdl1":"hdl",
    "chol1":"totchol",
    "race_2":"race",
}
jhs_te = jhs_te.rename(columns=jhs_to_common)
mesa_te = mesa_te.rename(columns=mesa_to_common)

## merge: stack JHS rows on top of MESA rows
merged_te = pd.concat([jhs_te, mesa_te])

In [None]:
# export the merged JHS + MESA splits (row index is not written)
for frame, path in ((merged_tr, 'data/merged_tr.csv'),
                    (merged_te, 'data/merged_te.csv')):
    frame.to_csv(path, index=False)

## Merge MESA and JHS (MESA without SMOTE)

In [None]:
# Reload the JHS splits from disk and load the MESA splits saved without SMOTE.
# NOTE(review): data/jhs_tr.csv and data/jhs_te.csv are not written by any
# visible cell (the JHS exports above go to *_stratified.csv) — confirm these
# files exist and which pipeline produced them.
jhs_tr, jhs_te = (
    pd.read_csv(path) for path in ("data/jhs_tr.csv", "data/jhs_te.csv")
)

mesa_tr, mesa_te = (
    pd.read_csv(path)
    for path in ("../mesa/code/plan_2/mesa_train_scale_noSMOTE.csv",
                 "../mesa/code/plan_2/mesa_test_scale_noSMOTE.csv")
)

In [None]:
# Display how many rows of the reloaded JHS training split fall in each y_tot class.
jhs_tr[['y_tot']].value_counts()

In [None]:
# Display how many rows of the reloaded JHS test split fall in each y_tot class.
jhs_te[['y_tot']].value_counts()

In [None]:
# export
# The frames reloaded in this section (JHS splits + no-SMOTE MESA splits) must
# be merged before writing. `merged_tr` / `merged_te` still hold the result of
# the earlier SMOTE merge, so exporting them here would only write a second
# copy of merged_tr.csv / merged_te.csv under the *_nosmt names.
# NOTE(review): assumes the reloaded CSVs have the same column layout as the
# frames merged in the SMOTE section — confirm.
def _merge_jhs_mesa(jhs, mesa):
    """Stack one JHS split on top of the matching MESA split.

    Applies the same steps as the earlier merge cells: add y1/y2/y3 (NaN) and
    a source tag to MESA, add race = 1 and a source tag to JHS, rename both to
    the shared column names, then concatenate. The inputs are not modified.
    """
    jhs = jhs.assign(race=1, dat="JHS").rename(
        columns={"nbSESpc2score": "nSESscore", "y_tot": "y"}
    )
    mesa = mesa.assign(
        y1=float('NAN'), y2=float('NAN'), y3=float('NAN'), dat="MESA"
    ).rename(
        columns={"idno": "subjid", "F3_PC2": "nSESscore", "cig1c": "currentSmoker",
                 "diabet1": "Diabetes", "gender1": "gender", "age1c": "age",
                 "sbp1c": "sbp", "hdl1": "hdl", "chol1": "totchol", "race_2": "race"}
    )
    return pd.concat([jhs, mesa])


# Separate names so the SMOTE-based merged_tr / merged_te stay untouched.
merged_tr_nosmt = _merge_jhs_mesa(jhs_tr, mesa_tr)
merged_te_nosmt = _merge_jhs_mesa(jhs_te, mesa_te)

merged_tr_nosmt.to_csv('data/merged_tr_nosmt.csv', index = False)
merged_te_nosmt.to_csv('data/merged_te_nosmt.csv', index = False)