Commit
faster ctv!
CamDavidsonPilon committed Dec 18, 2017
1 parent edf983a commit 9c8fae0
Showing 4 changed files with 28 additions and 30 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -5,9 +5,10 @@
- `CoxPHFitter` and `AalenAdditiveFitter` now have a `score_` property that is the concordance-index of the fitted model on its training dataset.
- `CoxPHFitter` and `AalenAdditiveFitter` no longer have the `data` property. It was an _almost_ duplicate of the training data, but was causing the model to be very large when serialized.
- Implements a new fitter `CoxTimeVaryingFitter` available under the `lifelines` namespace. This model implements the Cox model for time-varying covariates.
- Utils for creating time varying datasets available in `utils`.
- less noisy check for complete separation.
- removed `datasets` namespace from the main `lifelines` namespace
- `CoxPHFitter` has a slightly more intelligent (barely...) way to pick a step size, so convergence should generally be faster.

#### 0.12.0
- removes `include_likelihood` from `CoxPHFitter.fit` - it was not slowing things down much (empirically), and often I wanted it for debugging (I suppose others do too). It's also another exit condition, so we may exit from the NR iterations faster.
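The headline entry above is the new `CoxTimeVaryingFitter`. A minimal usage sketch, based on the `fit` signature shown in the cox_time_varying_fitter.py diff below; the toy data values are made up for illustration and are not from the repository:

```python
import pandas as pd
from lifelines import CoxTimeVaryingFitter

# Long-format data: one row per (subject, interval); covariates may change
# between a subject's rows, and 'event' flags the interval that ends in death.
df = pd.DataFrame({
    'id':    [1, 1, 2, 2, 3, 3, 4],
    'start': [0, 4, 0, 5, 0, 6, 0],
    'stop':  [4, 9, 5, 8, 6, 10, 7],
    'dose':  [0.0, 0.5, 2.0, 2.0, 1.0, 1.0, 1.5],
    'event': [False, True, False, True, False, False, False],
})

ctv = CoxTimeVaryingFitter()
ctv.fit(df, id_col='id', event_col='event', start_col='start', stop_col='stop')
print(ctv.summary)  # coefficient table from the summary property shown in the diff below
```

The `step_size=None` defaults introduced in this commit (for both this fitter and `CoxPHFitter`) let the fitter pick its own starting step, which is the "slightly more intelligent" choice mentioned in the last bullet above.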
41 changes: 16 additions & 25 deletions lifelines/fitters/cox_time_varying_fitter.py
@@ -32,7 +32,7 @@ def __init__(self, alpha=0.95, penalizer=0.0):
self.alpha = alpha
self.penalizer = penalizer

def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=True, step_size=0.50):
def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_progress=False, step_size=None):

df = df.copy()
if not (id_col in df and event_col in df and start_col in df and stop_col in df):
@@ -41,14 +41,12 @@ def fit(self, df, id_col, event_col, start_col='start', stop_col='stop', show_pr
df = df.rename(columns={id_col: 'id', event_col: 'event', start_col: 'start', stop_col: 'stop'})
df['event'] = df['event'].astype(bool)

df['interval'] = df.apply(lambda r: pd.Interval(r['start'], r['stop']), axis=1)
df = df.drop(['start'], axis=1)\
.set_index(['interval', 'id'])
df = df.set_index(['id'])

self._check_values(df.drop(["event", "stop"], axis=1), df['event'])
self._check_values(df.drop(["event", "stop", "start"], axis=1), df['event'])

stop_times_events = df[["event", "stop"]]
df = df.drop(["event", "stop"], axis=1)
stop_times_events = df[["event", "stop", "start"]]
df = df.drop(["event", "stop", "start"], axis=1)

self._norm_mean = df.mean(0)
self._norm_std = df.std(0)
@@ -111,7 +109,7 @@ def summary(self):
df['upper %.2f' % self.alpha] = self.confidence_intervals_.loc['upper-bound'].values
return df

def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size=0.5, precision=10e-6):
def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size=None, precision=10e-6):
"""
Newton Rhaphson algorithm for fitting CPH model.
@@ -130,15 +128,17 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
"""
assert precision <= 1., "precision must be less than or equal to 1."

_, d = df.shape
n, d = df.shape

# make sure betas are correct size.
beta = np.zeros((d, 1))

i = 0
converging = True
ll = 0
previous_ll = 0

if step_size is None:
# empirically determined
step_size = 0.95 if n < 1000 else 0.5

while converging:
i += 1
@@ -162,8 +162,6 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
# convergence criteria
if norm(delta) < precision:
converging = False
elif abs(ll - previous_ll) < precision:
converging = False
elif i >= 50:
# 50 iterations steps with N-R is a lot.
# Expected convergence is ~10 steps
@@ -176,11 +174,10 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
if norm(delta) > 10:
step_size *= 0.5

# anneal the step size down.
step_size *= 0.99
# temper the step size down.
step_size *= 0.995

beta += delta
previous_ll = ll

self._hessian_ = hessian
self._score_ = gradient
@@ -203,13 +200,6 @@ def _get_gradients(self, df, stops_events, beta):
gradient: (1, d) numpy array
log_likelihood: double
"""
# the below INDEX_ is a faster way to use the at_ function. See https://stackoverflow.com/questions/47621886
# def at_(df, t):
# # this adds about a 100%+ runtime increase =(
# return df.iloc[(df.index.get_level_values(0).get_loc(t))]

INDEX_df = df.index.get_level_values(0)
INDEX_stops_events = stops_events.index.get_level_values(0)

_, d = df.shape
hessian = np.zeros((d, d))
@@ -224,8 +214,9 @@ def _get_gradients(self, df, stops_events, beta):
risk_phi_x, tie_phi_x = np.zeros((1, d)), np.zeros((1, d))
risk_phi_x_x, tie_phi_x_x = np.zeros((d, d)), np.zeros((d, d))

df_at_t = df.iloc[INDEX_df.get_loc(t)]
stops_events_at_t = stops_events.iloc[INDEX_stops_events.get_loc(t)]
ix = (stops_events['start'] < t) & (t <= stops_events['stop'])
df_at_t = df.loc[ix]
stops_events_at_t = stops_events.loc[ix]

phi_i = exp(dot(df_at_t, beta))
phi_x_i = phi_i * df_at_t
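The speed-up behind the commit message lives in `_get_gradients`: instead of building a per-row `pd.Interval` index with `df.apply` and locating rows via `get_level_values(...).get_loc(t)`, the at-risk rows at each event time are now selected with a plain boolean mask over the `start`/`stop` columns. A small standalone sketch of that selection, using made-up data:

```python
import pandas as pd

# Hypothetical long-format intervals, only to illustrate the mask used above.
stops_events = pd.DataFrame({
    'start': [0, 4, 0, 5, 0, 6],
    'stop':  [4, 9, 5, 8, 6, 10],
    'event': [False, True, False, True, False, False],
})

t = 8  # an event time under consideration
# A row is at risk at time t when its interval (start, stop] covers t.
at_risk = (stops_events['start'] < t) & (t <= stops_events['stop'])
print(stops_events.loc[at_risk])
```

The vectorized comparisons stay in NumPy, whereas the old `pd.Interval` objects were constructed row by row with `df.apply`, which the removed comment notes added 100%+ to the runtime.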
12 changes: 8 additions & 4 deletions lifelines/fitters/coxph_fitter.py
@@ -148,7 +148,7 @@ def _get_efron_values(self, X, beta, T, E):

return hessian, gradient, log_lik

def _newton_rhaphson(self, X, T, E, initial_beta=None, step_size=.8,
def _newton_rhaphson(self, X, T, E, initial_beta=None, step_size=None,
precision=10e-6, show_progress=True):
"""
Newton Rhaphson algorithm for fitting CPH model.
@@ -178,6 +178,10 @@ def _newton_rhaphson(self, X, T, E, initial_beta=None, step_size=.8,
else:
beta = np.zeros((d, 1))

if step_size is None:
# empirically determined
step_size = 0.95 if n < 1000 else 0.5

# Method of choice is just efron right now
if self.tie_method == 'Efron':
get_gradients = self._get_efron_values
@@ -235,8 +239,8 @@ def _newton_rhaphson(self, X, T, E, initial_beta=None, step_size=.8,
if norm(delta) > 10.0:
step_size *= 0.5

# anneal the step size down.
step_size *= 0.99
# temper the step size down.
step_size *= 0.995

beta += delta
previous_ll = ll
@@ -250,7 +254,7 @@ def _newton_rhaphson(self, X, T, E, initial_beta=None, step_size=.8,

def fit(self, df, duration_col, event_col=None,
show_progress=False, initial_beta=None,
strata=None, step_size=0.5):
strata=None, step_size=None):
"""
Fit the Cox Propertional Hazard model to a dataset. Tied survival times
are handled using Efron's tie-method.
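Both `_newton_rhaphson` implementations now share the same step-size policy: when `step_size=None`, start at 0.95 for datasets with fewer than 1000 rows and 0.5 otherwise, halve the step whenever a Newton update is unusually large, and temper it by 0.995 each iteration (previously 0.99). A standalone sketch of that schedule, not library API:

```python
import numpy as np

def initial_step_size(n_rows):
    # empirically determined default used when step_size is None
    return 0.95 if n_rows < 1000 else 0.5

def next_step_size(step_size, delta):
    # halve the step when the Newton update is large (likely overshooting)...
    if np.linalg.norm(delta) > 10.0:
        step_size *= 0.5
    # ...and always temper it slightly so later iterations take smaller steps
    return step_size * 0.995
```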
2 changes: 2 additions & 0 deletions lifelines/utils/__init__.py
@@ -1071,6 +1071,8 @@ def expand(df, cvs):
cv = cv.cumsum()
cv = cv.add_prefix(cumulative_sum_prefix)

#import pdb
#pdb.set_trace()
# How do I want to merge existing columns at the same time - could be
# new observations (update) or new treatment applied (sum).
# There may be more options in the future.
