## prep

In [None]:
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
from lifelines import KaplanMeierFitter
import random
import matplotlib.pyplot as plt
import statistics
from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression
from lifelines.statistics import logrank_test
from scipy import stats

import warnings

# Silence every warning so the notebook output stays readable.
# NOTE(review): this is a blanket filter, so warnings raised during model
# fitting are hidden as well -- consider narrowing it while debugging.
warnings.filterwarnings('ignore')

# Use the fully-qualified option names. The short patterns 'max_columns' and
# 'max_rows' are ambiguous on recent pandas (they also match the
# 'styler.render.*' options) and raise OptionError there.
pd.set_option('display.max_columns', None)  # None = show every column
pd.set_option('display.max_rows', 100)

In [None]:
# Load the processed data and give every row the default survival outcome.
# Rows for subjects that do have an event are overwritten in the
# "baseline model" preparation cells.
dat = pd.read_csv('data/jhs_complete_processed.csv').assign(
    event=0,  # default: no event observed
    time=3,   # because most records were censored
)

## baseline model

### prep

In [None]:
# Visit 1 rows keep all columns; from visits 2 and 3 only the outcome `y`
# is kept, renamed so each visit's outcome gets its own column.
dat_v1 = dat.loc[dat['visit'].eq(1)]
dat_v2 = dat.loc[dat['visit'].eq(2), ['subjid', 'y']].rename(columns={'y': 'y2'})
dat_v3 = dat.loc[dat['visit'].eq(3), ['subjid', 'y']].rename(columns={'y': 'y3'})

# Default (inner) joins on subjid: only subjects that have a row in all
# three visits end up in merged_df.
merged_df = dat_v1.merge(dat_v2, on='subjid').merge(dat_v3, on='subjid')

In [None]:
# Derive the survival outcome (event indicator + event time) from the
# per-visit outcome columns y (visit 1), y2 (visit 2) and y3 (visit 3).
# The event time is the first visit at which the outcome equals 1.
# NOTE: dat_base is the same object as merged_df (no copy is made), so the
# assignments below are visible through merged_df as well.
dat_base = merged_df

## incidence in V1
dat_base.loc[dat_base['y'] == 1, 'event'] = 1
dat_base.loc[dat_base['y'] == 1, 'time'] = 1

## incidence in V2 (outcome absent at V1, present at V2)
v2_index = (dat_base['y2'] == 1) & (dat_base['y'] == 0)
dat_base.loc[v2_index, 'event'] = 1
dat_base.loc[v2_index, 'time'] = 2

## incidence in V3 (outcome absent at V1 and V2, present at V3)
v3_index = (dat_base['y3'] == 1) & (dat_base['y'] == 0) & (dat_base['y2'] == 0)
dat_base.loc[v3_index, 'event'] = 1
# Set the time explicitly instead of relying on the default of 3 that was
# assigned when the data were loaded.
dat_base.loc[v3_index, 'time'] = 3

# Every remaining row keeps its loaded defaults (event = 0, time = 3).

In [None]:
# Declare column dtypes before modelling: the first group becomes pandas
# categoricals, the second group is cast to plain integers.
dat_base = dat_base.astype({
    **dict.fromkeys(
        ['nbSESpc2score', 'nbK3paFacilities', 'N_UNFAV_CT00',
         'sportIndex', 'hyIndex', 'activeIndex',
         'darkgrnVeg', 'eggs', 'fish'],
        'category',
    ),
    **dict.fromkeys(['gender', 'currentSmoker', 'Diabetes'], 'int'),
})

### fit

In [None]:
# Baseline Cox proportional-hazards model on the visit-1 covariates.
# The formula is assembled from a list so each term sits on its own line;
# the joined string is identical to writing the terms out with '+'.
cph_base = CoxPHFitter()
cph_base.fit(
    dat_base,
    duration_col='time',
    event_col='event',
    formula="+".join([
        # columns cast to 'category' in the dtype cell
        'nbSESpc2score',
        'nbK3paFacilities',
        'N_UNFAV_CT00',
        'sportIndex',
        'hyIndex',
        'activeIndex',
        'darkgrnVeg',
        'eggs',
        'fish',
        # remaining covariates
        'age',
        'gender',
        'currentSmoker',
        'Diabetes',
        'hdl',
        'sbp',
        'totchol',
    ]),
)


In [None]:
# Print the fitted baseline model's summary table.
cph_base.print_summary()

# Plot the fitted model and render the figure.
# NOTE(review): per the lifelines docs, CoxPHFitter.plot() draws the
# coefficient estimates with their confidence intervals -- confirm against
# the installed lifelines version.
cph_base.plot()
plt.show()

## time-varying covariates