## prep

In [None]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
from lifelines.utils import to_long_format
from lifelines import KaplanMeierFitter
from lifelines import CoxTimeVaryingFitter
import random
import matplotlib.pyplot as plt
import statistics
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression
from lifelines.statistics import logrank_test
from scipy import stats

import warnings
# Silence all warnings for the rest of the notebook.
# NOTE(review): this is a blanket filter, so pandas/lifelines deprecation and
# convergence warnings are hidden as well -- confirm that is intended.
warnings.filterwarnings('ignore')

# Use the fully-qualified option keys. pandas resolves option names by pattern
# match, and the bare 'max_columns' / 'max_rows' patterns match more than one
# option on pandas versions that also define styler.render.max_columns /
# styler.render.max_rows, which raises OptionError.
pd.set_option('display.max_columns', None)  # never truncate columns
pd.set_option('display.max_rows', 100)      # show at most 100 rows

In [None]:
# Read the processed cohort file and append the two survival columns with
# placeholder values; later cells overwrite them for rows where the outcome
# `y` equals 1.
dat = pd.read_csv('data/jhs_complete_processed.csv').assign(
    event=0,  # default: no event recorded
    time=3,   # because most records were censored
)

## baseline model

### prep

In [None]:
# Visit-1 rows keep every column; they supply the baseline covariates and `y`.
dat_v1 = dat[dat['visit'] == 1]

# From visits 2 and 3 only the outcome is kept, renamed so that it can sit
# next to the visit-1 outcome after merging.
dat_v2 = dat[dat['visit'] == 2][['subjid', 'y']].rename(columns={'y': 'y2'})
dat_v3 = dat[dat['visit'] == 3][['subjid', 'y']].rename(columns={'y': 'y3'})

# Inner joins on subjid: only subjects with a row at all three visits remain.
merged_df = dat_v1.merge(dat_v2, on='subjid').merge(dat_v3, on='subjid')

In [None]:
# Derive the (event, time) pair used by the baseline Cox model: `time` is the
# first visit at which the outcome equals 1 and `event` flags that it occurred.
# dat_base is the same object as merged_df, so the assignments below modify
# merged_df too.
dat_base = merged_df

## incidence in V1
v1_index = dat_base['y'] == 1
dat_base.loc[v1_index, 'event'] = 1
dat_base.loc[v1_index, 'time'] = 1

## incidence in V2 (outcome first seen at visit 2)
v2_index = (dat_base['y2'] == 1) & (dat_base['y'] == 0)
dat_base.loc[v2_index, 'event'] = 1
dat_base.loc[v2_index, 'time'] = 2

## incidence in V3 (outcome first seen at visit 3)
v3_index = (dat_base['y3'] == 1) & (dat_base['y'] == 0) & (dat_base['y2'] == 0)
dat_base.loc[v3_index, 'event'] = 1
# Set the time explicitly instead of relying on the file-wide default of 3, so
# this cell stays correct if that default is ever changed.
dat_base.loc[v3_index, 'time'] = 3

In [None]:
# Columns stored as pandas categoricals before the baseline fit.
cols_to_convert = [
    'nbSESpc2score', 'nbK3paFacilities', 'N_UNFAV_CT00', 'nSES',
    'sportIndex', 'hyIndex', 'activeIndex',
    'darkgrnVeg', 'eggs', 'fish',
    'gender', 'currentSmoker', 'Diabetes',
]

# Cast in one step and write the result back into the same columns.
as_category = dat_base[cols_to_convert].astype('category')
dat_base[cols_to_convert] = as_category

### fit

In [None]:
# Covariates entering the baseline model, in the order they appear in the formula.
base_covariates = [
    'nbSESpc2score', 'nbK3paFacilities', 'N_UNFAV_CT00',
    'sportIndex', 'hyIndex', 'activeIndex',
    'darkgrnVeg', 'eggs', 'fish',
    'age', 'gender', 'currentSmoker', 'Diabetes',
    'hdl', 'sbp', 'totchol',
]

# Cox proportional-hazards fit on the derived time/event columns; the formula
# is the plain "+"-joined list of covariates.
cph_base = CoxPHFitter()
cph_base.fit(
    dat_base,
    duration_col='time',
    event_col='event',
    formula="+".join(base_covariates),
)


In [None]:
# Print the summary of the fitted baseline Cox model.
cph_base.print_summary()

# Plot the fitted model with lifelines' built-in plot and render the figure.
cph_base.plot()
plt.show()

## time-varying covariates

### prep

In [None]:
# V1 in long format
# to_long_format turns the `visit` column into start/stop interval columns.
# The placeholder `event` column is then dropped so that the visit-1 outcome
# `y` can take over the name `event`.
dat_tv1 = (
    to_long_format(dat[dat['visit'] == 1], duration_col="visit")
    .drop(columns='event')
    .rename(columns={'y': 'event'})
)

In [None]:
# V2

## get value from V1 if covariates not available at V2
covar_from_V1 = dat_tv1[[
    'subjid',
    'nbSESpc2score', 'nbK3paFacilities',
    'sportIndex', 'hyIndex', 'activeIndex',
    'darkgrnVeg', 'eggs', 'fish',
    'currentSmoker', 'hdl', 'totchol',
]]

# Columns taken from the visit-2 rows themselves.
v2_own_cols = ['subjid', 'visit', 'Diabetes', 'age', 'sbp', 'N_UNFAV_CT00', 'gender', 'y2']

# suffixes=(False, False) makes the merge raise if the two sides share any
# column other than the key, rather than silently creating _x/_y columns.
## "start" and "stop" are added manually: the interval for this visit is (1, visit]
dat_tv2 = (
    dat.loc[dat['visit'] == 2]
    .rename(columns={'y': 'y2'})
    .loc[:, v2_own_cols]
    .merge(covar_from_V1, on='subjid', how='left', suffixes=(False, False))
    .rename(columns={'y2': 'event', 'visit': 'stop'})
    .assign(start=1)
)

In [None]:
# V3

## covariates not selected from the V3 rows are carried forward:
## N_UNFAV_CT00 from the V2 frame, the remaining ones from the V1 frame
covar_from_V2 = dat_tv2[['subjid', 'N_UNFAV_CT00']]
covar_from_V1 = dat_tv1[[
    'subjid',
    'nbSESpc2score', 'nbK3paFacilities',
    'darkgrnVeg', 'eggs', 'fish',
    'currentSmoker', 'totchol',
]]

# Columns taken from the visit-3 rows themselves.
v3_own_cols = [
    'subjid', 'visit', 'y3',
    'sportIndex', 'hyIndex', 'activeIndex',
    'Diabetes', 'age', 'sbp', 'hdl', 'gender',
]

# suffixes=(False, False) makes each merge raise on any overlapping non-key column.
## "start" and "stop" are added manually: the interval for this visit is (2, visit]
dat_tv3 = (
    dat.loc[dat['visit'] == 3]
    .rename(columns={'y': 'y3'})
    .loc[:, v3_own_cols]
    .merge(covar_from_V2, on='subjid', how='left', suffixes=(False, False))
    .merge(covar_from_V1, on='subjid', how='left', suffixes=(False, False))
    .rename(columns={'y3': 'event', 'visit': 'stop'})
    .assign(start=2)
)

In [None]:
# merge
# Stack the three per-visit interval frames into one long-format table with
# one row per subject and interval.
dat_tv12 = pd.concat([dat_tv1, dat_tv2])
dat_tv123 = pd.concat([dat_tv12, dat_tv3])

# Sort on (subjid, start) rather than subjid alone: a single-key sort with the
# default algorithm does not guarantee the relative order of a subject's rows,
# so the intervals could come out in non-chronological order.
dat_tv = dat_tv123.sort_values(by=['subjid', 'start'])

# Keep the id, the interval/event columns and the model covariates, in a fixed order.
dat_tv = dat_tv.loc[:, ['subjid', 'event', 'start', 'stop',
                        'nbSESpc2score', 'nbK3paFacilities', 'N_UNFAV_CT00',
                        'sportIndex', 'hyIndex', 'activeIndex', 'darkgrnVeg', 'eggs', 'fish',
                        'age', 'gender', 'currentSmoker', 'Diabetes', 'sbp', 'hdl', 'totchol']]

dat_tv

### fit

In [None]:
# Time-varying Cox model on the long-format (start, stop] intervals, with a
# penalizer of 0.1 on the coefficients.
# NOTE(review): dat_tv keeps every visit row of every subject, including rows
# after a visit where event == 1 -- confirm that repeated events are intended.
ctv = CoxTimeVaryingFitter(penalizer=0.1)

# `formula` must be an R-style formula string; the previous dict of
# {column: type} labels is not a formula. The string below encodes the same
# three terms: nbSESpc2score as categorical via C(), gender and sbp as-is.
# NOTE(review): only these three covariates enter the model, whereas the
# baseline model uses the full covariate list -- confirm this is intended.
ctv.fit(
    dat_tv,
    id_col="subjid",
    event_col="event",
    start_col="start",
    stop_col="stop",
    show_progress=False,
    formula="C(nbSESpc2score) + gender + sbp",
)
ctv.print_summary()