In [None]:
import pandas as pd
import numpy as np
from lifelines.utils import to_long_format
from sklearn.model_selection import train_test_split
from lifelines import CoxTimeVaryingFitter
import random
import matplotlib.pyplot as plt

import warnings
# NOTE(review): this silences every warning, including the pandas
# chained-assignment warnings that would point at problems in the imputation
# helpers further down. Consider narrowing the filter.
warnings.filterwarnings('ignore')
# Use the fully-qualified option names: the short forms 'max_columns' and
# 'max_rows' match more than one registered option in recent pandas versions
# and raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# calculate outcome y
def get_outcome(df):
    """Return a copy of ``df`` with a binary outcome column ``y`` added.

    ``y`` is 1 on rows where at least one of ``MIHx``, ``strokeHx``,
    ``CHDHx`` or ``CVDHx`` equals 1.0, and 0 on every other row (a missing
    value never equals 1.0, so it counts as 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four history columns. It is not modified, so the
        function is safe to call on a filtered slice of a larger frame.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the integer column ``y``.

    Raises
    ------
    KeyError
        If any of the four history columns is missing.
    """
    history_cols = ['MIHx', 'strokeHx', 'CHDHx', 'CVDHx']
    out = df.copy()
    out['y'] = df[history_cols].eq(1.0).any(axis=1).astype('int64')
    return out

In [None]:
##### get the df of all the covariates and outcome #####

In [None]:
# Load the predictor table, derive a 0/1 `gender` column from `sex`, and keep
# only the id, visit and covariate columns used by the rest of the notebook.
sex_mapping = dict(Female=0, Male=1)
df = pd.read_csv('data/common_data_jhs.csv')
df['gender'] = df['sex'].map(sex_mapping)
df = df.loc[:, ['subjid', 'visit', 'age', 'gender', 'currentSmoker', 'sbp',
                'Diabetes', 'hdl', 'totchol', 'nSES', 'nbSESpc2score']]
# Subject ids present in the predictor table; used to subset the outcome data.
common_subj_index = df['subjid'].tolist()
df

In [None]:
# data of outcome
df_raw = pd.read_csv('data/jhs_data.csv')


def _outcome_for_visit(raw, visit, subject_ids):
    """Return a ``subjid`` / ``y<visit>`` frame for one visit.

    Keeps the rows of ``raw`` recorded at ``visit`` whose ``subjid`` is in
    ``subject_ids``, computes the outcome with ``get_outcome`` and renames the
    resulting ``y`` column to ``y<visit>`` (e.g. ``y2`` for visit 2).
    """
    keep = (raw['visit'] == visit) & raw['subjid'].isin(subject_ids)
    # Explicit copy: get_outcome adds a column, which must not be attempted
    # on a filtered view of df_raw.
    visit_rows = get_outcome(raw.loc[keep].copy())
    return visit_rows[['subjid', 'y']].rename(columns={'y': f'y{visit}'})


# One outcome frame per visit, restricted to the subjects in the predictor table.
df_y_v1 = _outcome_for_visit(df_raw, 1, common_subj_index)
df_y_v2 = _outcome_for_visit(df_raw, 2, common_subj_index)
df_y_v3 = _outcome_for_visit(df_raw, 3, common_subj_index)

In [None]:
# Attach the three per-visit outcome flags (y1, y2, y3) to every predictor
# row by left-joining on the subject id. The suffixes are disabled, so no
# column is renamed by the joins.
df_y1 = pd.merge(df, df_y_v1, how='left', on='subjid', suffixes=(False, False))
df_y2 = pd.merge(df_y1, df_y_v2, how='left', on='subjid', suffixes=(False, False))
df_y123 = pd.merge(df_y2, df_y_v3, how='left', on='subjid', suffixes=(False, False))
df_y123

In [None]:
# Split the merged frame by visit and report how many rows of each visit
# carry an event flag for that same visit.

df_1 = df_y123.loc[df_y123["visit"] == 1]
df_2 = df_y123.loc[df_y123["visit"] == 2]
df_3 = df_y123.loc[df_y123["visit"] == 3]
for outcome_col, visit_rows in (("y1", df_1), ("y2", df_2), ("y3", df_3)):
    print(outcome_col + " = ", int((visit_rows[outcome_col] == 1).sum()))

In [None]:
##### create df in long format #####

In [None]:
# deal with df_1 nan

def FillNan(df, continuous_cols=('nbSESpc2score', 'sbp', 'hdl', 'totchol')):
    """Impute missing values and return the result as a new DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.
    continuous_cols : iterable of str, optional
        Columns whose missing values are replaced by the column mean.
        Defaults to the four continuous predictors imputed at visit 1.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which ``continuous_cols`` are mean-imputed and every
        remaining gap is filled with that column's most frequent value. A
        column with no observed values is returned unchanged.

    Raises
    ------
    KeyError
        If a name in ``continuous_cols`` is not a column of ``df``.
    """
    filled = df.copy()

    # Assign the filled column back rather than calling
    # ``df[col].fillna(..., inplace=True)``: the in-place form operates on a
    # temporary Series and is not guaranteed to reach the frame, in which case
    # the mode fill below would silently replace the intended mean fill.
    for col in continuous_cols:
        filled[col] = filled[col].fillna(filled[col].mean())

    # Every other gap gets the most common observed value of its column.
    # NOTE(review): this applies to all columns, including ids and the
    # outcome column when present -- confirm that is intended.
    for col in filled.columns:
        counts = filled[col].value_counts()
        if not counts.empty:  # all-NaN column: nothing to impute from
            filled[col] = filled[col].fillna(counts.index[0])

    return filled

# Impute the visit-1 rows, then print the number of missing values left in
# each column.
df_1 = FillNan(df_1)
print(df_1.isna().sum())

In [None]:
# Visit-1 rows in long (start/stop) format: `visit` is passed to
# to_long_format as the duration column, and the visit-1 outcome becomes
# the `event` column.
base_df = to_long_format(
    df_1.loc[:, ['subjid', 'visit', 'age', 'gender', 'currentSmoker', 'sbp',
                 'Diabetes', 'hdl', 'totchol', 'nSES', 'nbSESpc2score', 'y1']],
    duration_col="visit",
).rename(columns={'y1': 'event'})
base_df

In [None]:
# Visit-2 rows: keep the id, the covariates taken from visit 2 and the visit-2
# outcome; `visit` becomes the interval end (`stop`) and `y2` the `event`.
sec_df = df_2.loc[:, ['subjid', 'visit', 'age', 'sbp', 'Diabetes', 'y2']].rename(
    columns={'y2': 'event', 'visit': 'stop'}
)

# deal with nan
def FillNan2(df, continuous_cols=('sbp',)):
    """Impute missing values and return the result as a new DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.
    continuous_cols : iterable of str, optional
        Columns whose missing values are replaced by the column mean.
        Defaults to ``sbp`` only.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which ``continuous_cols`` are mean-imputed and every
        remaining gap is filled with that column's most frequent value. A
        column with no observed values is returned unchanged.

    Raises
    ------
    KeyError
        If a name in ``continuous_cols`` is not a column of ``df``.
    """
    filled = df.copy()

    # Assign back instead of ``df[col].fillna(..., inplace=True)``: the
    # in-place form acts on a temporary Series and may never reach the frame,
    # leaving the mode fill below to handle these columns by accident.
    for col in continuous_cols:
        filled[col] = filled[col].fillna(filled[col].mean())

    # Every other gap gets the most common observed value of its column.
    # NOTE(review): this includes the `event` column when it has gaps --
    # confirm that is intended.
    for col in filled.columns:
        counts = filled[col].value_counts()
        if not counts.empty:  # all-NaN column: nothing to impute from
            filled[col] = filled[col].fillna(counts.index[0])

    return filled

# Impute the visit-2 rows and print the number of missing values left.
sec_df = FillNan2(sec_df)
print(sec_df.isna().sum())

# Covariates that are not taken from visit 2 are carried over from the
# (already imputed) visit-1 rows of the same subject.
covar_from_V1 = df_1.loc[:, ['subjid', 'gender', 'currentSmoker', 'hdl',
                             'totchol', 'nSES', 'nbSESpc2score']]
sec_df = pd.merge(sec_df, covar_from_V1, how='left', on='subjid',
                  suffixes=(False, False))

# The visit-2 interval starts at 1.
sec_df = sec_df.assign(start=1)

In [None]:
# Stack the visit-1 and visit-2 intervals into one long-format frame.
# ignore_index=True gives every row its own label: base_df and sec_df are
# numbered independently (sec_df was renumbered by its merge), so a plain
# concat would carry repeated labels. Sorting on `start` as well as `subjid`
# keeps each subject's intervals in chronological order.
com_12 = pd.concat([base_df, sec_df], ignore_index=True)
com_12 = com_12.sort_values(by=['subjid', 'start'])
com_12

In [None]:
# Visit-3 rows: keep the id, the covariates taken from visit 3 and the visit-3
# outcome; `visit` becomes the interval end (`stop`) and `y3` the `event`.
third_df = df_3.loc[:, ['subjid', 'visit', 'age', 'sbp', 'Diabetes', 'hdl', 'y3']].rename(
    columns={'y3': 'event', 'visit': 'stop'}
)

# deal with nan
def FillNan3(df, continuous_cols=('sbp', 'hdl')):
    """Impute missing values and return the result as a new DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.
    continuous_cols : iterable of str, optional
        Columns whose missing values are replaced by the column mean.
        Defaults to ``sbp`` and ``hdl``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which ``continuous_cols`` are mean-imputed and every
        remaining gap is filled with that column's most frequent value. A
        column with no observed values is returned unchanged.

    Raises
    ------
    KeyError
        If a name in ``continuous_cols`` is not a column of ``df``.
    """
    filled = df.copy()

    # Assign back instead of ``df[col].fillna(..., inplace=True)``: the
    # in-place form acts on a temporary Series and may never reach the frame,
    # leaving the mode fill below to handle these columns by accident.
    for col in continuous_cols:
        filled[col] = filled[col].fillna(filled[col].mean())

    # Every other gap gets the most common observed value of its column.
    # NOTE(review): this includes the `event` column when it has gaps --
    # confirm that is intended.
    for col in filled.columns:
        counts = filled[col].value_counts()
        if not counts.empty:  # all-NaN column: nothing to impute from
            filled[col] = filled[col].fillna(counts.index[0])

    return filled

# Impute the visit-3 rows and print the number of missing values left.
third_df = FillNan3(third_df)
print(third_df.isna().sum())

# Covariates that are not taken from visit 3 are carried over from the
# (already imputed) visit-1 rows of the same subject.
covar_from_V1 = df_1.loc[:, ['subjid', 'gender', 'currentSmoker', 'totchol',
                             'nSES', 'nbSESpc2score']]
third_df = pd.merge(third_df, covar_from_V1, how='left', on='subjid',
                    suffixes=(False, False))

# The visit-3 interval starts at 2.
third_df = third_df.assign(start=2)

In [None]:
# Stack all three visit intervals into the long-format modelling frame.
# ignore_index=True gives every row its own label (third_df is numbered from
# zero again after its merge, so a plain concat repeats labels), and sorting
# on `start` as well as `subjid` keeps each subject's intervals in
# chronological order.
cox_df = pd.concat([com_12, third_df], ignore_index=True)
cox_df = cox_df.sort_values(by=['subjid', 'start'])

# dummy coding
# NOTE(review): without `columns=`, get_dummies on a DataFrame only encodes
# object/string/category columns. If these four columns are numeric (gender
# is mapped to 0/1 when the predictors are loaded) this statement leaves them
# unchanged; if any were non-numeric, the renamed dummy columns would not line
# up with `cate_index` on assignment. Confirm the dtypes and intended encoding.
cate_index = ['gender', 'currentSmoker', 'Diabetes', 'nSES']
cox_df.loc[:, cate_index] = pd.get_dummies(cox_df.loc[:, cate_index], drop_first=True)

cox_df

In [None]:
##### model fitting #####

In [None]:
# Split train and test set by subject (80% / 20%).
# All intervals of a subject must stay on the same side of the split: the
# time-varying Cox model is fitted per `subjid`, and a subject present in both
# sets would leak into the evaluation. The subject list is derived from the
# data instead of a hard-coded row count, and rows are selected by `subjid`
# rather than by index label.

random.seed(1)

tot = sorted(cox_df['subjid'].unique().tolist())  # sorted => reproducible sample
n_tr = int(len(tot) * 0.8)
tr_subj = set(random.sample(tot, k=n_tr))

# Boolean mask over the rows of cox_df: True for training rows.
tr_index = cox_df['subjid'].isin(tr_subj).to_numpy()
cox_tr = cox_df.loc[tr_index]
cox_te = cox_df.loc[~tr_index]

assert set(cox_tr['subjid']).isdisjoint(cox_te['subjid']), "subject present in both train and test"
assert len(cox_tr) + len(cox_te) == len(cox_df)

In [None]:
# model without nses

In [None]:
# Fit on every training column except the two neighbourhood-SES covariates.
ctv_no_nses = CoxTimeVaryingFitter(penalizer=0.1)
ctv_no_nses.fit(
    cox_tr.drop(columns=['nSES', 'nbSESpc2score'], errors='ignore'),
    id_col="subjid",
    event_col="event",
    start_col="start",
    stop_col="stop",
    show_progress=False,
)


In [None]:
# Print the fit summary of the model that excludes both nSES covariates.
ctv_no_nses.print_summary()

In [None]:
# Model with the binary nSES covariate: fit on every training column except
# the continuous score `nbSESpc2score`.
ctv_nses = CoxTimeVaryingFitter(penalizer=0.1)
ctv_nses.fit(
    cox_tr.drop(columns='nbSESpc2score', errors='ignore'),
    id_col="subjid",
    event_col="event",
    start_col="start",
    stop_col="stop",
    show_progress=False,
)

In [None]:
# Print the fit summary of the model that uses the binary `nSES` covariate.
ctv_nses.print_summary()

In [None]:
# model with continuous nSES

In [None]:
# Fit on every training column except the binary `nSES` flag, so that the
# continuous score `nbSESpc2score` is the SES covariate in this model.
ctv_con_nses = CoxTimeVaryingFitter(penalizer=0.1)
ctv_con_nses.fit(
    cox_tr.drop(columns='nSES', errors='ignore'),
    id_col="subjid",
    event_col="event",
    start_col="start",
    stop_col="stop",
    show_progress=False,
)

In [None]:
# Print the fit summary of the model that uses the continuous `nbSESpc2score`.
ctv_con_nses.print_summary()

In [None]:
# Plot the fitted coefficients of the continuous-nSES model. The method has
# to be called: a bare `ctv_con_nses.plot` only displays the method object.
ctv_con_nses.plot()
plt.show()

In [None]:
##### model performance #####

In [None]:
# Predicted partial hazards for the test rows from the continuous-nSES model.
# The binary `nSES` flag and the `event` outcome are removed from the frame
# handed to the model.
par_hazard = ctv_con_nses.predict_partial_hazard(
    cox_te.drop(columns=['nSES', 'event'], errors='ignore')
)

In [None]:
# Display the test rows with the `nSES` column removed.
# NOTE(review): this displays the whole frame; `.head()` may be enough.
cox_te.loc[:, ~cox_te.columns.isin(['nSES'])]

In [None]:
# Print the predicted partial hazards as plain text.
print(par_hazard)

In [None]:
# NOTE(review): exploratory introspection of the prediction object; looks
# like leftover debugging -- consider removing before sharing the notebook.
dir(par_hazard)

In [None]:
# Histogram of the predicted partial hazards (`par_hazard`), 50 bins.
# NOTE(review): `statistics` is not used in this cell; the import is kept
# in case later cells rely on it.
import statistics

fig, ax = plt.subplots()
ax.hist(par_hazard, bins=50, color='#1f77b4', edgecolor='black')

# Title and axis labels
ax.set_title('Histogram of predicted hazards')
ax.set_xlabel('hazards')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Display the fitted model's `baseline_cumulative_hazard_` attribute.
# NOTE(review): it is wrapped in a one-element list, so the cell shows the
# list's repr rather than the object itself -- confirm that is intended.
[ctv_con_nses.baseline_cumulative_hazard_]

In [None]:
# Display the held-out test rows.
cox_te

In [None]:
# Display the fitted model's `baseline_survival_` attribute.
ctv_con_nses.baseline_survival_

In [None]:
# NOTE(review): bare attribute access -- this only displays the method object
# and draws nothing. Looks like leftover exploration; confirm whether a call
# was intended and whether this fitter supports it.
ctv_con_nses.plot_covariate_groups

In [None]:
# NOTE(review): bare attribute access -- this only displays the method object
# and draws nothing. The same expression already appears right after the
# continuous-nSES summary; this repeat can probably be removed.
ctv_con_nses.plot

In [None]:
# Display the fitted model's `formula` attribute.
ctv_con_nses.formula

In [None]:
# NOTE(review): exploratory listing of the fitter's attributes; looks like
# leftover debugging -- consider removing before sharing the notebook.
dir(ctv_con_nses)