From 9c8fae077b8cbb7cbd015bf9d0996cc4776758bf Mon Sep 17 00:00:00 2001
From: Cameron Davidson-Pilon
Date: Sun, 17 Dec 2017 21:52:36 -0500
Subject: [PATCH] faster ctv!

---
 CHANGELOG.md                                  |  3 +-
 lifelines/fitters/cox_time_varying_fitter.py  | 41 ++++++++------------
 lifelines/fitters/coxph_fitter.py             | 12 ++++--
 lifelines/utils/__init__.py                   |  2 +
 4 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ae78530f9..e6f328656 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,9 +5,10 @@
 - `CoxPHFitter` and `AalenAdditiveFitter` now have a `score_` property that is the concordance-index of the dataset to the fitted model.
 - `CoxPHFitter` and `AalenAdditiveFitter` no longer have the `data` property. It was an _almost_ duplicate of the training data, but was causing the model to be very large when serialized.
 - Implements a new fitter `CoxTimeVaryingFitter` available under the `lifelines` namespace. This model implements the Cox model for time-varying covariates.
-  - Utils for creating time varying datasets available in
+  - Utils for creating time varying datasets available in `utils`.
 - less noisy check for complete separation.
 - removed `datasets` namespace from the main `lifelines` namespace
+ - `CoxPHFitter` has a slightly more intelligent (barely...) way to pick a step size, so convergence should generally be faster.

 #### 0.12.0
 - removes `include_likelihood` from `CoxPHFitter.fit` - it was not slowing things down much (empirically), and often I wanted it for debugging (I suppose others do too). It's also another exit condition, so we may exit from the NR iterations faster.

diff --git a/lifelines/fitters/cox_time_varying_fitter.py b/lifelines/fitters/cox_time_varying_fitter.py
index 7b9155610..20e64c4aa 100644
--- a/lifelines/fitters/cox_time_varying_fitter.py
+++ b/lifelines/fitters/cox_time_varying_fitter.py
@@ -32,7 +32,7 @@ def __init__(self, alpha=0.95, penalizer=0.0):
         self.alpha = alpha
         self.penalizer = penalizer

-    def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=True, step_size=0.50):
+    def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None):
         df = df.copy()

         if not (id_col in df and event_col in df and start_col in df and stop_col in df):
@@ -41,14 +41,12 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr
         df = df.rename(columns={id_col: 'id', event_col: 'event', start_col: 'start', stop_col: 'stop'})
         df['event'] = df['event'].astype(bool)

-        df['interval'] = df.apply(lambda r: pd.Interval(r['start'], r['stop']), axis=1)
-        df = df.drop(['start'], axis=1)\
-               .set_index(['interval', 'id'])
+        df = df.set_index(['id'])

-        self._check_values(df.drop(["event", "stop"], axis=1), df['event'])
+        self._check_values(df.drop(["event", "stop", "start"], axis=1), df['event'])

-        stop_times_events = df[["event", "stop"]]
-        df = df.drop(["event", "stop"], axis=1)
+        stop_times_events = df[["event", "stop", "start"]]
+        df = df.drop(["event", "stop", "start"], axis=1)

         self._norm_mean = df.mean(0)
         self._norm_std = df.std(0)
@@ -111,7 +109,7 @@ def summary(self):
         df['upper %.2f' % self.alpha] = self.confidence_intervals_.loc['upper-bound'].values
         return df

-    def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size=0.5, precision=10e-6):
+    def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size=None, precision=10e-6):
         """
         Newton-Raphson algorithm for fitting CPH model.
@@ -130,15 +128,17 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
         """
         assert precision <= 1., "precision must be less than or equal to 1."

-        _, d = df.shape
+        n, d = df.shape

         # make sure betas are correct size.
         beta = np.zeros((d, 1))

         i = 0
         converging = True
-        ll = 0
-        previous_ll = 0
+
+        if step_size is None:
+            # empirically determined
+            step_size = 0.95 if n < 1000 else 0.5

         while converging:
             i += 1
@@ -162,8 +162,6 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
             # convergence criteria
             if norm(delta) < precision:
                 converging = False
-            elif abs(ll - previous_ll) < precision:
-                converging = False
             elif i >= 50:
                 # 50 iteration steps with N-R is a lot.
                 # Expected convergence is ~10 steps
@@ -176,11 +174,10 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
             if norm(delta) > 10:
                 step_size *= 0.5

-            # anneal the step size down.
-            step_size *= 0.99
+            # temper the step size down.
+            step_size *= 0.995

             beta += delta
-            previous_ll = ll

         self._hessian_ = hessian
         self._score_ = gradient
@@ -203,13 +200,6 @@ def _get_gradients(self, df, stops_events, beta):
             gradient: (1, d) numpy array
             log_likelihood: double
         """
-        # the below INDEX_ is a faster way to use the at_ function. See https://stackoverflow.com/questions/47621886
-        # def at_(df, t):
-        #     # this adds about a 100%+ runtime increase =(
-        #     return df.iloc[(df.index.get_level_values(0).get_loc(t))]
-
-        INDEX_df = df.index.get_level_values(0)
-        INDEX_stops_events = stops_events.index.get_level_values(0)

         _, d = df.shape
         hessian = np.zeros((d, d))
@@ -224,8 +214,9 @@ def _get_gradients(self, df, stops_events, beta):
             risk_phi_x, tie_phi_x = np.zeros((1, d)), np.zeros((1, d))
             risk_phi_x_x, tie_phi_x_x = np.zeros((d, d)), np.zeros((d, d))

-            df_at_t = df.iloc[INDEX_df.get_loc(t)]
-            stops_events_at_t = stops_events.iloc[INDEX_stops_events.get_loc(t)]
+            ix = (stops_events['start'] < t) & (t <= stops_events['stop'])
+            df_at_t = df.loc[ix]
+            stops_events_at_t = stops_events.loc[ix]

             phi_i = exp(dot(df_at_t, beta))
             phi_x_i = phi_i * df_at_t
diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
index 28a92fd6f..a36001f4a 100644
--- a/lifelines/fitters/coxph_fitter.py
+++ b/lifelines/fitters/coxph_fitter.py
@@ -148,7 +148,7 @@ def _get_efron_values(self, X, beta, T, E):

         return hessian, gradient, log_lik

-    def _newton_rhaphson(self, X, T, E, initial_beta=None, step_size=.8,
+    def _newton_rhaphson(self, X, T, E, initial_beta=None, step_size=None,
                          precision=10e-6, show_progress=True):
         """
         Newton-Raphson algorithm for fitting CPH model.
@@ -178,6 +178,10 @@ def _newton_rhaphson(self, X, T, E, initial_beta=None, step_size=.8,
         else:
             beta = np.zeros((d, 1))

+        if step_size is None:
+            # empirically determined
+            step_size = 0.95 if n < 1000 else 0.5
+
         # Method of choice is just efron right now
         if self.tie_method == 'Efron':
             get_gradients = self._get_efron_values
@@ -235,8 +239,8 @@ def _newton_rhaphson(self, X, T, E, initial_beta=None, step_size=.8,
             if norm(delta) > 10.0:
                 step_size *= 0.5

-            # anneal the step size down.
-            step_size *= 0.99
+            # temper the step size down.
+            step_size *= 0.995

             beta += delta
             previous_ll = ll
@@ -250,7 +254,7 @@ def _newton_rhaphson(self, X, T, E, initial_beta=None, step_size=.8,

     def fit(self, df, duration_col, event_col=None, show_progress=False, initial_beta=None,
-            strata=None, step_size=0.5):
+            strata=None, step_size=None):
         """
         Fit the Cox Proportional Hazard model to a dataset.
         Tied survival times are handled using Efron's tie-method.
diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
index ca4f5fe59..ccb0fb941 100644
--- a/lifelines/utils/__init__.py
+++ b/lifelines/utils/__init__.py
@@ -1071,6 +1071,8 @@ def expand(df, cvs):
             cv = cv.cumsum()
             cv = cv.add_prefix(cumulative_sum_prefix)

+            #import pdb
+            #pdb.set_trace()
             # How do I want to merge existing columns at the same time - could be
             # new observations (update) or new treatment applied (sum).
             # There may be more options in the future.
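
A note on the step-size change: the policy this patch introduces amounts to a few lines of logic, condensed below as a standalone sketch. The helper names (initial_step_size, next_step_size) are illustrative only and are not part of lifelines; the constants (0.95 / 0.5 at startup, halving when the update norm exceeds 10, and the 0.995 tempering) are the ones that appear in the diff.

    import numpy as np
    from numpy.linalg import norm

    def initial_step_size(n_rows):
        # empirically determined in the patch: larger steps for small datasets
        return 0.95 if n_rows < 1000 else 0.5

    def next_step_size(step_size, delta):
        # delta is the proposed Newton-Raphson update to beta
        if norm(delta) > 10:
            step_size *= 0.5      # the step overshot; back off hard
        return step_size * 0.995  # temper the step size down every iteration

    step = initial_step_size(500)            # 0.95 for a small dataset
    step = next_step_size(step, np.ones(3))  # norm(delta) ~ 1.7, so only tempered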
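
The speed-up in CoxTimeVaryingFitter._get_gradients comes from replacing the pd.Interval-indexed lookup with a vectorized boolean mask over the raw start/stop columns: a row belongs to the risk set at event time t when its interval (start, stop] covers t. A minimal sketch of that selection, on a made-up long-format frame:

    import pandas as pd

    long_df = pd.DataFrame({
        'id':    [1, 1, 2, 3],
        'start': [0, 3, 0, 0],
        'stop':  [3, 6, 5, 6],
        'event': [False, True, False, True],
    })

    t = 5  # an observed event time
    at_risk = (long_df['start'] < t) & (t <= long_df['stop'])  # same mask as in the diff
    print(long_df.loc[at_risk, 'id'].tolist())  # [1, 2, 3] - rows whose (start, stop] covers t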
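
Finally, the new fit signature on CoxTimeVaryingFitter expects a long-format dataframe keyed by an id column plus start/stop/event columns. A rough usage sketch follows; the dataframe and the covariate z are invented, and whether such a tiny toy frame actually converges is beside the point - it only shows the call shape.

    import pandas as pd
    from lifelines import CoxTimeVaryingFitter

    df = pd.DataFrame({
        'id':    [1, 1, 2, 2],
        'start': [0, 4, 0, 5],
        'stop':  [4, 8, 5, 9],
        'event': [False, True, False, True],
        'z':     [0.1, 0.5, 1.2, 1.4],   # a time-varying covariate
    })

    ctv = CoxTimeVaryingFitter()
    ctv.fit(df, id_col='id', event_col='event', start_col='start', stop_col='stop')
    # after fitting, the coefficient table with confidence intervals is available
    # via the summary / confidence_intervals_ attributes touched in the diff above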