In [None]:
import pandas as pd
import numpy as np
import os

import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas Future/SettingWithCopy warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Use the fully qualified option keys. The short patterns 'max_columns' / 'max_rows'
# also match the styler.render.* options (pandas >= 1.4), which makes set_option
# raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)  # None: never truncate columns when displaying
pd.set_option('display.max_rows', 100)


# calculate outcome y
def get_outcome(df):
    """Return a copy of ``df`` with a binary outcome column ``y``.

    ``y`` is 1 for a row when any of the columns ``MIHx``, ``strokeHx``,
    ``CHDHx`` or ``CVDHx`` equals 1.0, and 0 otherwise. A missing (NaN) value
    never equals 1.0, so a row whose four columns are all missing gets 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and columns as ``df`` plus ``y``
        (int64). The input frame is left untouched, so it is safe to pass a
        filtered slice of another frame.

    Raises
    ------
    KeyError
        If any of the four columns is absent from ``df``.
    """
    history_cols = ['MIHx', 'strokeHx', 'CHDHx', 'CVDHx']
    # Work on a copy: callers pass boolean-mask slices, and writing into those
    # in place is the classic chained-assignment hazard.
    out = df.copy()
    has_history = df[history_cols].eq(1.0).any(axis=1)
    # to_numpy() assigns positionally, so a non-unique index cannot trigger realignment.
    out['y'] = has_history.astype('int64').to_numpy()
    return out

In [None]:
# data of predictors
# Read the predictor table from data/common_data_jhs.csv (path is relative to the
# notebook's working directory). Later cells read its 'sex', 'subjid' and 'visit' columns.
df = pd.read_csv('data/common_data_jhs.csv')
# Bare last expression: renders the loaded frame as the cell output.
df

In [None]:
# Encode sex as a 0/1 'gender' column, then keep only the identifier and predictor columns.
# Values of 'sex' other than 'Female'/'Male' map to NaN.
sex_mapping = {'Female': 0, 'Male': 1}
df = (
    df
    .assign(gender=df['sex'].map(sex_mapping))
    .loc[:, [
        'subjid', 'visit', 'age', 'gender', 'currentSmoker', 'sbp',
        'Diabetes', 'hdl', 'totchol', 'nSES', 'nbSESpc2score',
    ]]
)
# Subject ids present in the predictor table; used below to subset the outcome data.
common_subj_index = df['subjid'].tolist()
df

In [None]:
# data of outcome
df_raw = pd.read_csv('data/jhs_data.csv')

# Build one (subjid, y<visit>) frame per visit with a single loop instead of three
# copy-pasted pipelines. Only subjects present in the predictor table are kept.
outcome_by_visit = {}
for visit in (1, 2, 3):
    keep = (df_raw['visit'] == visit) & df_raw['subjid'].isin(common_subj_index)
    # .copy() hands get_outcome an independent frame, so adding 'y' never writes
    # into a slice of df_raw.
    visit_rows = get_outcome(df_raw.loc[keep, :].copy())
    outcome_by_visit[visit] = (
        visit_rows[['subjid', 'y']].rename(columns={'y': 'y{}'.format(visit)})
    )

# Keep the original per-visit names: the merge cell below uses them.
df_y_v1 = outcome_by_visit[1]
df_y_v2 = outcome_by_visit[2]
df_y_v3 = outcome_by_visit[3]

In [None]:
# merge predictors and outcome
# Left-join the three per-visit outcome columns onto the predictor rows by subject id.
# suffixes=(False, False) makes pandas raise instead of renaming if the two sides
# share any column other than the join key.
merge_opts = dict(on='subjid', how='left', suffixes=(False, False))
df_y1 = pd.merge(df, df_y_v1, **merge_opts)
df_y2 = pd.merge(df_y1, df_y_v2, **merge_opts)
df_y123 = pd.merge(df_y2, df_y_v3, **merge_opts)
df_y123

In [None]:
# recode event and time

# Keep the visit-1 row of each subject. .copy() makes df_event an independent
# frame, so the column assignments below never write into a slice of df_y123.
df_event = df_y123[df_y123["visit"] == 1].copy()
df_event['event'] = 0  # default: no event recorded at any visit
df_event['time'] = 3   # default: time 3; also the value kept for events first seen at V3


## incidence in V1
v1_index = df_event['y1'] == 1
df_event.loc[v1_index, 'event'] = 1
df_event.loc[v1_index, 'time'] = 1

## incidence in V2 (y2 is 1 and y1 is exactly 0)
# NOTE(review): a missing y1 fails the `== 0` test, so a row with y1 = NaN and
# y2 = 1 is not counted as an event here — confirm this is intended.
v2_index = (df_event['y2'] == 1) & (df_event['y1'] == 0)
df_event.loc[v2_index, 'event'] = 1
df_event.loc[v2_index, 'time'] = 2

## incidence in V3 (y3 is 1 and both y1 and y2 are exactly 0)
v3_index = (df_event['y3'] == 1) & (df_event['y1'] == 0) & (df_event['y2'] == 0)
df_event.loc[v3_index, 'event'] = 1
# 'time' needs no update for V3: it already holds the default 3.

df_event.head(100)

In [None]:
# prepare df for cox

# Drop identifiers and the per-visit outcome columns. .copy() gives an independent
# frame so the imputations below do not write into a slice of df_event.
cox_df = df_event.loc[:, ~df_event.columns.isin(['subjid','visit','y1','y2','y3'])].copy()

cate_index = ['gender', 'currentSmoker', 'Diabetes', 'nSES']

# fill nan
# Continuous predictors: impute with the column mean. Assign the result back instead
# of calling fillna(inplace=True) on cox_df[col]; that chained in-place form does not
# update cox_df under pandas copy-on-write.
for col in ['nbSESpc2score', 'sbp', 'hdl', 'totchol']:
    cox_df[col] = cox_df[col].fillna(cox_df[col].mean())

# Every remaining gap: impute with the most frequent value of the column.
# A column with no observed values has no mode and is left as-is (the previous
# value_counts().index[0] raised IndexError in that case).
for col in cox_df.columns:
    if cox_df[col].isna().any():
        counts = cox_df[col].value_counts()
        if not counts.empty:
            cox_df[col] = cox_df[col].fillna(counts.index[0])

# Dummy-encode only the non-numeric categoricals. get_dummies passes numeric columns
# through unchanged, so numeric 0/1 codes stay exactly as they are. The old
# `.loc[:, cate_index] = pd.get_dummies(...)` aligned on column labels, which turned
# any non-numeric column into NaN because the dummy columns carry different names.
text_cate_cols = cox_df[cate_index].select_dtypes(exclude=['number', 'bool']).columns.tolist()
if text_cate_cols:
    cox_df = pd.get_dummies(cox_df, columns=text_cate_cols, drop_first=True, dtype=int)

cox_df

In [None]:
# store df
# Write the Cox-ready table to data/cox_df.csv; index=False leaves the row index out of the file.
cox_df.to_csv('data/cox_df.csv', index = False)  

In [None]:
# cox df with y1-y3

# Same preparation as cox_df, but the per-visit outcome columns y1-y3 are kept.
# .copy() gives an independent frame so the imputations below do not write into
# a slice of df_event.
cox_df_full = df_event.loc[:, ~df_event.columns.isin(['subjid','visit'])].copy()

cate_index = ['gender', 'currentSmoker', 'Diabetes', 'nSES']

# fill nan
# Continuous predictors: impute with the column mean. Assign the result back instead
# of calling fillna(inplace=True) on cox_df_full[col]; that chained in-place form does
# not update cox_df_full under pandas copy-on-write.
for col in ['nbSESpc2score', 'sbp', 'hdl', 'totchol']:
    cox_df_full[col] = cox_df_full[col].fillna(cox_df_full[col].mean())

# Every remaining gap (including missing y1-y3): impute with the most frequent value
# of the column. A column with no observed values has no mode and is left as-is
# (the previous value_counts().index[0] raised IndexError in that case).
for col in cox_df_full.columns:
    if cox_df_full[col].isna().any():
        counts = cox_df_full[col].value_counts()
        if not counts.empty:
            cox_df_full[col] = cox_df_full[col].fillna(counts.index[0])

# Dummy-encode only the non-numeric categoricals. get_dummies passes numeric columns
# through unchanged, so numeric 0/1 codes stay exactly as they are. The old
# `.loc[:, cate_index] = pd.get_dummies(...)` aligned on column labels, which turned
# any non-numeric column into NaN because the dummy columns carry different names.
text_cate_cols = cox_df_full[cate_index].select_dtypes(exclude=['number', 'bool']).columns.tolist()
if text_cate_cols:
    cox_df_full = pd.get_dummies(cox_df_full, columns=text_cate_cols, drop_first=True, dtype=int)

cox_df_full

In [None]:
# store df
# Write the table that still contains y1-y3 to data/cox_df_full.csv; index=False leaves the row index out of the file.
cox_df_full.to_csv('data/cox_df_full.csv', index = False)  

In [None]:
# os.remove('data/cox_df.csv')