## Prep

In [None]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
from lifelines.utils import to_long_format
from lifelines import KaplanMeierFitter
from lifelines import CoxTimeVaryingFitter
import random
import matplotlib.pyplot as plt
import statistics
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression
from lifelines.statistics import logrank_test
from scipy import stats
from lifelines.utils import median_survival_times
from lifelines.statistics import pairwise_logrank_test


import warnings
# Silence every warning for the rest of the session. NOTE: this also hides
# pandas / lifelines diagnostics, so comment it out when debugging.
warnings.filterwarnings('ignore')
# Use the fully-qualified option names: the short forms 'max_columns' and
# 'max_rows' match more than one registered option on recent pandas versions
# (display.* and styler.render.*), which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)  # None = show all columns
pd.set_option('display.max_rows', 100)

In [None]:
# Read the preprocessed visit-level CSV and add an `event` indicator column
# that starts at 0 for every row; later cells set it to 1 where needed.
dat = pd.read_csv('data/jhs_preprocess_0914.csv').assign(event=0)

### calculate time

In [None]:
# Placeholder column; the baseline-model cell overwrites it with the
# follow-up time once the visit-to-visit durations exist.
dat['time'] = ''

# One (subjid, visit date) table per visit, with the date column renamed so
# the three visits can sit side by side in a single wide table.
dat_v1 = dat.loc[dat['visit'] == 1, ['subjid','VisitDate']].rename(columns={'VisitDate': 't1'})
dat_v2 = dat.loc[dat['visit'] == 2, ['subjid','VisitDate']].rename(columns={'VisitDate': 't2'})
dat_v3 = dat.loc[dat['visit'] == 3, ['subjid','VisitDate']].rename(columns={'VisitDate': 't3'})

dat_v1.reset_index(drop=True, inplace=True)
dat_v2.reset_index(drop=True, inplace=True)
dat_v3.reset_index(drop=True, inplace=True)

# Join on the subject id instead of gluing the tables together by row
# position: positional concatenation silently pairs dates from different
# subjects as soon as one subject misses a visit or the row order differs
# between visits. The inner join keeps subjects that have all three visits,
# and validate= raises if a subject appears twice within one visit.
dat_t12 = dat_v1.merge(dat_v2, on='subjid', how='inner', validate='one_to_one')
dat_time = dat_t12.merge(dat_v3, on='subjid', how='inner', validate='one_to_one')

In [None]:
# Parse the three visit-date columns into datetimes.
for date_col in ('t1', 't2', 't3'):
    dat_time[date_col] = pd.to_datetime(dat_time[date_col])

# Elapsed time from visit 1 to visit 2 and to visit 3, converted from a
# timedelta to an integer number of days.
one_day = pd.Timedelta(days=1)
dat_time['time_12'] = ((dat_time['t2'] - dat_time['t1']) / one_day).astype(int)
dat_time['time_13'] = ((dat_time['t3'] - dat_time['t1']) / one_day).astype(int)

# Keep the first column (the subject id) and the two duration columns that
# were just appended at the end.
dat_time = dat_time.iloc[:, [0, -2, -1]]

In [None]:
# Attach the per-subject durations (time_12, time_13) to every visit row.
dat = dat.merge(dat_time, on='subjid')

## Baseline Model

### prep

In [None]:
## Baseline design: every covariate comes from the V1 row; the outcome `y`
## recorded at V2 and V3 is attached as extra columns `y2` and `y3`.
dat_v1 = dat.loc[dat['visit'] == 1]
dat_v2, dat_v3 = (
    dat.loc[dat['visit'] == visit_no, ['subjid', 'y']].rename(columns={'y': f'y{visit_no}'})
    for visit_no in (2, 3)
)

merged_df = dat_v1.merge(dat_v2, on='subjid').merge(dat_v3, on='subjid')

# Default follow-up time is the V1 -> V3 gap, because most subjects were
# right censored; subjects with an event get their time adjusted afterwards.
merged_df['time'] = merged_df['time_13']

In [None]:
# dat_base is the same object as merged_df (no copy is made).
dat_base = merged_df

## Masks for the first visit at which the outcome is recorded as 1.
v1_index = dat_base['y'] == 1
v2_index = (dat_base['y'] == 0) & (dat_base['y2'] == 1)
v3_index = (dat_base['y'] == 0) & (dat_base['y2'] == 0) & (dat_base['y3'] == 1)

## Any of the three cases counts as an observed event.
dat_base.loc[v1_index | v2_index | v3_index, 'event'] = 1

## Event time depends on the visit where the outcome first appears:
## V1 -> 0, V2 -> days from V1 to V2, V3 -> days from V1 to V3.
dat_base.loc[v1_index, 'time'] = 0
dat_base.loc[v2_index, 'time'] = dat_base.loc[v2_index, 'time_12']
dat_base.loc[v3_index, 'time'] = dat_base.loc[v3_index, 'time_13']

In [None]:
# Cast the categorical covariates to pandas `category` dtype so that they
# are dummy-coded automatically when the model is fitted.
cols_to_convert = [
    'nbSESpc2score', 'N_UNFAV_CT00', 'nbK3paFacilities', 'G_bla_rk',
    'PA3cat', 'nutrition3cat',
    'gender', 'currentSmoker', 'Diabetes', 'alc', 'fmlyinc',
]
for cat_col in cols_to_convert:
    dat_base[cat_col] = dat_base[cat_col].astype('category')

### fit

In [None]:
# Fit the baseline Cox proportional-hazards model on dat_base: `time` is the
# duration, `event` the event indicator, and the covariates are listed in the
# formula string.
# NOTE: the backslash line continuations sit inside the string literal, so the
# formula text also contains the leading spaces of the continued lines.
cph_base = CoxPHFitter()
cph_base.fit(dat_base, duration_col='time', event_col='event',
       formula = "nbSESpc2score+nbK3paFacilities+N_UNFAV_CT00+G_bla_rk+ \
                 PA3cat+nutrition3cat+\
                 age+gender+currentSmoker+Diabetes+hdl+sbp+totchol+alc+fmlyinc")


In [None]:
# Print the summary of the fitted baseline Cox model.
cph_base.print_summary()

In [None]:
# Draw the fitted baseline model with lifelines' built-in plot and display
# the figure.
cph_base.plot()
plt.show()

## Time-varying Model

### prep

In [None]:
# V1 in long format

# Work on an explicit copy so that adding a column does not act on a
# filtered view of `dat`.
dat_tv1 = dat[dat['visit'] == 1].copy()
dat_tv1['time_1'] = 0
# to_long_format turns the duration column into `start` / `stop` columns.
dat_tv1 = to_long_format(dat_tv1, duration_col="time_1")
# The V1 outcome `y` becomes the event flag of this row, replacing the
# all-zero placeholder `event` column.
dat_tv1 = dat_tv1.drop('event', axis=1)
dat_tv1 = dat_tv1.rename(columns={'y': 'event'})

## Some subjects already have the event on the day of entry, i.e. a (0, 0)
## interval with event == 1. That is not allowed in CoxTimeVaryingFitter,
## so give those rows a small non-zero end time.
## The event column is compared with 1 explicitly so the mask is boolean.
event_at_entry = (
    (dat_tv1['start'] == dat_tv1['stop'])
    & (dat_tv1['start'] == 0)
    & (dat_tv1['event'] == 1)
)
dat_tv1.loc[event_at_entry, 'stop'] = 1
dat_tv1

In [None]:
# V2

## Columns read from the V2 rows themselves; the V2 outcome `y` is this
## row's event flag.
dat_tv2 = dat.loc[
    dat['visit'] == 2,
    ['subjid','visit','N_UNFAV_CT00','G_bla_rk','Diabetes','age','sbp','gender','y','time_12']
].rename(columns={'y': 'event'})

## Covariates that are not taken from the V2 rows are carried forward from V1.
## suffixes=(False, False) leaves column names untouched, so the merge is
## expected to fail rather than rename if the two tables share a column.
covar_from_V1 = dat_tv1[['subjid','nbSESpc2score', 'nbK3paFacilities',
                        'PA3cat','nutrition3cat',
                        'currentSmoker', 'hdl','totchol','alc','fmlyinc']]
dat_tv2 = dat_tv2.merge(covar_from_V1, on='subjid', how='left', suffixes=(False, False))

## Interval covered by this row: from 0 up to the V1 -> V2 gap.
dat_tv2 = dat_tv2.assign(start=0, stop=dat_tv2['time_12'])


In [None]:
# V3

dat_tv3 = dat.loc[dat['visit'] == 3].rename(columns={'y': 'y3'})

## Columns read from the V3 rows themselves.
dat_tv3 = dat_tv3[['subjid','visit','y3', 'time_12','time_13',
                   'PA3cat',
                   'Diabetes','age','sbp','hdl','gender','alc','fmlyinc']]

## Covariates not taken from the V3 rows are carried forward from V2 or V1.
## `nutrition3cat` is carried forward from V1 as well: without it every V3
## row has a missing value for that covariate once the three visits are
## stacked, and the dummy coding then turns it into an all-zero category.
covar_from_V2 = dat_tv2[['subjid','N_UNFAV_CT00','G_bla_rk']]
covar_from_V1 = dat_tv1[['subjid','nbSESpc2score', 'nbK3paFacilities',
                        'nutrition3cat',
                        'currentSmoker', 'totchol']]
## suffixes=(False, False) leaves column names untouched, so the merges are
## expected to fail rather than rename if two tables share a column.
dat_tv3 = dat_tv3.merge(covar_from_V2, on = 'subjid', how = 'left', suffixes=(False, False))
dat_tv3 = dat_tv3.merge(covar_from_V1, on = 'subjid', how = 'left', suffixes=(False, False))

## manually add "start" and "stop" col: this row covers the V2 -> V3 interval
dat_tv3 = dat_tv3.rename(columns={'y3': 'event'})
dat_tv3['start'] = dat_tv3['time_12']
dat_tv3['stop'] = dat_tv3['time_13']

In [None]:
# merge

# Stack the three per-visit tables. ignore_index=True gives the result a
# fresh 0..n-1 index; without it each table keeps its own labels and the
# stacked frame ends up with repeated index values.
dat_tv12 = pd.concat([dat_tv1, dat_tv2], ignore_index=True)
dat_tv123 = pd.concat([dat_tv12, dat_tv3], ignore_index=True)

# Order rows by subject and then by interval, so each subject's rows are
# contiguous and chronological (sorting on subjid alone leaves the
# within-subject order unspecified).
dat_tv = dat_tv123.sort_values(by=['subjid', 'start', 'stop']).reset_index(drop=True)

# Keep only the id, the interval/event columns and the model covariates.
dat_tv = dat_tv.loc[:,['subjid', 'event','start','stop',
                       'nbSESpc2score', 'nbK3paFacilities','N_UNFAV_CT00', 'G_bla_rk',
                   'PA3cat','nutrition3cat',
                   'age','gender', 'currentSmoker', 'Diabetes','sbp','hdl','totchol','alc','fmlyinc']]


In [None]:
# dummy coding for categorical variables
cols_to_convert = ['nbSESpc2score', 'nbK3paFacilities','N_UNFAV_CT00', 'G_bla_rk',
                   'PA3cat','nutrition3cat', 
                   'gender', 'currentSmoker', 'Diabetes','alc','fmlyinc']

dat_tv[cols_to_convert] = dat_tv[cols_to_convert].astype('category')

# drop_first=True removes one reference level per variable. Keeping every
# level makes the indicator columns of each variable sum to 1, which is
# perfectly collinear in a Cox model and only fits thanks to the penalizer.
# Reference-level coding also matches what the formula-based baseline model
# does. dtype=float keeps the design matrix numeric regardless of the
# pandas default dummy dtype.
# NOTE(review): rows with a missing category get all-zero indicators, i.e.
# they are treated like the reference level - confirm this is acceptable.
dummy_cols = pd.get_dummies(dat_tv[cols_to_convert], drop_first=True, dtype=float)
dat_tv = dat_tv.drop(cols_to_convert, axis=1)
dat_tv = pd.concat([dat_tv, dummy_cols], axis=1)

### fit

In [None]:
# Penalized time-varying Cox model on the long-format table: one row per
# subject interval, identified by `subjid` and bounded by `start` / `stop`.
ctv = CoxTimeVaryingFitter(penalizer=0.1)
ctv.fit(
    dat_tv,
    id_col="subjid",
    event_col="event",
    start_col="start",
    stop_col="stop",
    show_progress=False,
)
ctv.print_summary()

## Kaplan-Meier Curve

In [None]:
## Kaplan-Meier estimate on the same table as the baseline Cox model (dat_base)

# Durations and event indicator (cast to float) shared by the cells below.
time = dat_base.loc[:, 'time']
event = dat_base.loc[:, 'event'].astype(float)

kmf = KaplanMeierFitter()
kmf.fit(durations=time, event_observed=event)

In [None]:
# Helper function for plotting Kaplan-Meier curves at the covariate level
def plot_km(col):
    """Plot one Kaplan-Meier curve per level of ``dat_base[col]``.

    Durations and event indicators are read from ``dat_base['time']`` and
    ``dat_base['event']``. All curves are drawn on the same axes and the
    axes title is set to ``col``.

    Parameters
    ----------
    col : str
        Name of the grouping column in ``dat_base``.
    """
    ax = plt.subplot(111)
    durations = dat_base['time']
    observed = dat_base['event'].astype(float)
    # Skip missing levels: comparing against NaN selects no rows, and
    # fitting on an empty subset fails.
    for level in dat_base[col].dropna().unique():
        in_level = dat_base[col] == level
        # Use a fresh fitter per group so the module-level `kmf` keeps its
        # overall fit instead of being overwritten by the last group.
        group_kmf = KaplanMeierFitter()
        group_kmf.fit(durations[in_level], observed[in_level], label=str(level))
        group_kmf.plot(ax=ax)
    # The title does not depend on the group, so set it once.
    ax.set(title=col)
    
# Helper function for printing out Log-rank test results
def print_logrank(col):
    """Return the summary table of pairwise log-rank tests between the
    levels of ``dat_base[col]``, using ``dat_base['time']`` as durations and
    ``dat_base['event']`` as the event indicator."""
    return pairwise_logrank_test(
        event_durations=dat_base['time'],
        groups=dat_base[col],
        event_observed=dat_base['event'],
    ).summary

In [None]:
# get plot for each variable
# Kaplan-Meier curves split by the levels of G_bla_rk, then show the figure.
plot_km('G_bla_rk')
plt.show()

# Pairwise log-rank summary for the same grouping; as the last expression of
# the cell, the returned table is what the notebook displays.
print_logrank('G_bla_rk')