Merge pull request #429 from CamDavidsonPilon/cp-grouped-fitter
Cp grouped fitter
CamDavidsonPilon committed Mar 21, 2018
2 parents 4d22dd9 + e03fe46 commit 1f2315f
Showing 5 changed files with 81 additions and 37 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,10 @@
### Changelogs

+#### 0.15.0
+- even smarter `step_size` calculations for iterative optimizations.
+- fixed bug with using weights and strata in `CoxPHFitter`


#### 0.14.0
- adding `plot_covariate_groups` to `CoxPHFitter` to visualize what happens to survival as we vary a covariate, all else being equal.
- `utils` functions like `qth_survival_times` and `median_survival_times` now return the transpose of the DataFrame compared to previous version of lifelines. The reason for this is that we often treat survival curves as columns in DataFrames, and functions of the survival curve as index (ex: KaplanMeierFitter.survival_function_ returns a survival curve _at_ time _t_).
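As context for the 0.15.0 weights-and-strata bullet, a minimal sketch of the usage this release fixes (the frame and the column names `T`, `E`, `x`, `w`, and `group` are hypothetical):

```python
import pandas as pd
from lifelines import CoxPHFitter

df = pd.DataFrame({
    'T': [5., 6., 7., 8., 9., 10.],            # durations
    'E': [1, 0, 1, 1, 0, 1],                   # event indicator
    'x': [0.1, 0.5, 0.3, 0.9, 0.2, 0.4],       # covariate
    'w': [1.5, 0.7, 2.0, 1.1, 0.9, 1.3],       # observation weights
    'group': ['a', 'a', 'a', 'b', 'b', 'b'],   # stratifying column
})

# Previously the full weight vector was reused inside every stratum; the fix
# in this commit slices the weights per stratum alongside X, T, and E.
cph = CoxPHFitter()
cph.fit(df, duration_col='T', event_col='E', weights_col='w', strata=['group'])
```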
16 changes: 5 additions & 11 deletions lifelines/fitters/cox_time_varying_fitter.py
@@ -14,7 +14,7 @@
significance_code, normalize,\
pass_for_numeric_dtypes_or_raise, check_low_var,\
check_for_overlapping_intervals, check_complete_separation_low_variance,\
-ConvergenceWarning
+ConvergenceWarning, StepSizer


class CoxTimeVaryingFitter(BaseFitter):
@@ -138,9 +138,8 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
converging = True
ll, previous_ll = 0, 0

-if step_size is None:
-    # empirically determined
-    step_size = min(0.98, 3.0 / np.log10(n))
+step_sizer = StepSizer(step_size)
+step_size = step_sizer.next()

while converging:
i += 1
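For scale, the removed heuristic and the `StepSizer` default differ only modestly at the start; the real change is that the step size now adapts during the iterations. A quick sketch using the expressions above:

```python
import numpy as np

n = 10_000
# old inline heuristic (removed above): shrink the initial step for large n
old_initial = min(0.98, 3.0 / np.log10(n))   # 3.0 / 4 = 0.75
# new behaviour: StepSizer falls back to 0.95 when step_size is None,
# then re-tunes it from the norm of every Newton-Raphson update
new_initial = 0.95
print(old_initial, new_initial)
```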
@@ -177,12 +176,7 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
See https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faqwhat-is-complete-or-quasi-complete-separation-in-logisticprobit-regression-and-how-do-we-deal-with-them/ ", ConvergenceWarning)
converging, completed = False, False

-# Only allow small steps
-if norm(delta) > 10:
-    step_size *= 0.5
-
-# temper the step size down.
-step_size *= 0.995
+step_size = step_sizer.update(norm(delta)).next()

beta += delta

@@ -213,7 +207,7 @@ def _get_gradients(self, df, stops_events, beta):
gradient = np.zeros((1, d))
log_lik = 0

-unique_death_times = np.sort(stops_events['stop'].loc[stops_events['event']].unique())
+unique_death_times = np.unique(stops_events['stop'].loc[stops_events['event']])

for t in unique_death_times:
x_death_sum = np.zeros((1, d))
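The swap above is behaviour-preserving: `np.unique` already returns its result sorted, so the explicit `np.sort` around `.unique()` was redundant. A quick check:

```python
import numpy as np

stops = np.array([9.0, 2.0, 9.0, 5.0, 2.0])
# np.unique both deduplicates and sorts in one call
assert (np.unique(stops) == np.sort(np.unique(stops))).all()
print(np.unique(stops))  # [2. 5. 9.]
```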
50 changes: 25 additions & 25 deletions lifelines/fitters/coxph_fitter.py
@@ -2,6 +2,7 @@
from __future__ import print_function
from __future__ import division

+import time
import warnings
import numpy as np
import pandas as pd
@@ -15,7 +16,10 @@
from lifelines.utils import survival_table_from_events, inv_normal_cdf, normalize,\
significance_code, concordance_index, _get_index, qth_survival_times,\
pass_for_numeric_dtypes_or_raise, check_low_var, coalesce,\
-check_complete_separation, check_nans, StatError, ConvergenceWarning
+check_complete_separation, check_nans, StatError, ConvergenceWarning,\
+StepSizer




class CoxPHFitter(BaseFitter):
@@ -84,6 +88,8 @@ def fit(self, df, duration_col, event_col=None,
self, with additional properties: hazards_
"""
+#import pdb
+#pdb.set_trace()
df = df.copy()

# Sort on time
@@ -105,15 +111,15 @@
del df[event_col]

if weights_col:
-weights = df.pop(weights_col).values
+weights = df.pop(weights_col)
if (weights.astype(int) != weights).any():
warnings.warn("""It looks like your weights are not integers, possibly prospenity scores then?
warnings.warn("""It looks like your weights are not integers, possibly propensity scores then?
It's important to know that the naive variance estimates of the coefficients are biased. Instead use Monte Carlo to
estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"
""", RuntimeWarning)

else:
-weights = np.ones(self._n_examples)
+weights = pd.DataFrame(np.ones((self._n_examples,1)), index=df.index)

self._check_values(df, T, E)
df = df.astype(float)
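The `weights.astype(int) != weights` test above is what routes non-integer weights to the propensity-score warning; a small sketch of the check in isolation:

```python
import pandas as pd

w = pd.Series([0.31, 1.0, 2.0, 0.87])
# astype(int) truncates, so any fractional weight makes the comparison True,
# which triggers the RuntimeWarning about biased naive variance estimates
print((w.astype(int) != w).any())  # True
```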
@@ -184,9 +190,8 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N
else:
beta = np.zeros((d, 1))

-if step_size is None:
-    # "empirically" determined
-    step_size = min(0.98, 3.0 / np.log10(n))
+step_sizer = StepSizer(step_size)
+step_size = step_sizer.next()

# Method of choice is just efron right now
if self.tie_method == 'Efron':
@@ -198,19 +203,20 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N
converging = True
warn_ll = True
ll, previous_ll = 0, 0
+start = time.time()

while converging:
self.path.append(beta.copy())
i += 1
if self.strata is None:
-h, g, ll = get_gradients(X.values, beta, T.values, E.values, weights)
+h, g, ll = get_gradients(X.values, beta, T.values, E.values, weights.values)
else:
g = np.zeros_like(beta).T
h = np.zeros((beta.shape[0], beta.shape[0]))
ll = 0
for strata in np.unique(X.index):
-stratified_X, stratified_T, stratified_E = X.loc[[strata]], T.loc[[strata]], E.loc[[strata]]
-_h, _g, _ll = get_gradients(stratified_X.values, beta, stratified_T.values, stratified_E.values, weights)
+stratified_X, stratified_T, stratified_E, stratified_W = X.loc[[strata]], T.loc[[strata]], E.loc[[strata]], weights.loc[[strata]]
+_h, _g, _ll = get_gradients(stratified_X.values, beta, stratified_T.values, stratified_E.values, stratified_W.values)
g += _g
h += _h
ll += _ll
@@ -230,7 +236,7 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N
hessian, gradient = h, g

if show_progress:
print("Iteration %d: norm_delta = %.5f, step_size = %.5f, ll = %.5f" % (i, norm(delta), step_size, ll))
print("Iteration %d: norm_delta = %.5f, step_size = %.5f, ll = %.5f, seconds_since_start = %.1f" % (i, norm(delta), step_size, ll, time.time() - start))
# convergence criteria
if norm(delta) < precision:
converging, completed = False, True
@@ -247,12 +253,7 @@ def _newton_rhaphson(self, X, T, E, weights=None, initial_beta=None, step_size=N
See https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faqwhat-is-complete-or-quasi-complete-separation-in-logisticprobit-regression-and-how-do-we-deal-with-them/ ", ConvergenceWarning)
converging, completed = False, False

-# Only allow small steps
-if norm(delta) > 10.0:
-    step_size *= 0.5
-
-# temper the step size down.
-step_size *= 0.995
+step_size = step_sizer.update(norm(delta)).next()

beta += delta
previous_ll = ll
@@ -272,17 +273,13 @@ def _get_efron_values(self, X, beta, T, E, weights):
"""
Calculates the first and second order vector differentials,
with respect to beta.
Note that X, T, E are assumed to be sorted on T!
Parameters:
X: (n,d) numpy array of observations.
beta: (1, d) numpy array of coefficients.
T: (n) numpy array representing observed durations.
E: (n) numpy array representing death events.
weights: (n) an array representing weights per observation.
Returns:
hessian: (d, d) numpy array,
gradient: (1, d) numpy array
@@ -367,11 +364,14 @@ def _get_efron_values(self, X, beta, T, E, weights):
tie_phi_x_x = np.zeros((d, d))
return hessian, gradient, log_lik


def _compute_baseline_cumulative_hazard(self):
return self.baseline_hazard_.cumsum()

@staticmethod
def _check_values(df, T, E):
+#import pdb
+#pdb.set_trace()
pass_for_numeric_dtypes_or_raise(df)
check_nans(T)
check_nans(E)
@@ -463,13 +463,13 @@ def predict_log_partial_hazard(self, X):
can be in any order. If a numpy array, columns must be in the
same order as the training data.
This is equivalent to R's linear.predictors.
-Returns the log of the partial hazard for the individuals, partial since the
-baseline hazard is not included. Equal to \beta (X - \bar{X})
If X is a dataframe, the order of the columns do not matter. But
if X is an array, then the column ordering is assumed to be the
same as the training dataset.
+Returns the log of the partial hazard for the individuals, partial since the
+baseline hazard is not included. Equal to \beta X
"""
if isinstance(X, pd.DataFrame):
order = self.hazards_.columns
@@ -486,7 +486,7 @@ def predict_log_hazard_relative_to_mean(self, X):
same order as the training data.
Returns the log hazard relative to the hazard of the mean covariates. This is the behaviour
-of R's predict.coxph. Equal to \beta X - \beta \bar{X}
+of R's predict.coxph. Equal to \beta X - \beta \bar{X_{train}}
"""

return self.predict_log_partial_hazard(X) - self._train_log_partial_hazard.squeeze()
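Spelled out, the corrected docstring says the baseline hazard cancels in the ratio, leaving only the linear predictors (here $\bar{x}_{\text{train}}$ is the vector of training covariate means):

$$\log \frac{h(t \mid x)}{h(t \mid \bar{x}_{\text{train}})} = \beta^\top x - \beta^\top \bar{x}_{\text{train}}$$

which is what the `return` line above computes by subtracting the cached `_train_log_partial_hazard`.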
45 changes: 45 additions & 0 deletions lifelines/utils/__init__.py
@@ -1038,6 +1038,7 @@ def check_complete_separation_low_variance(df, events):
warnings.warn(warning_text, ConvergenceWarning)

def check_complete_separation_close_to_perfect_correlation(df, durations):
+# slow for many columns
THRESHOLD = 0.99
for col, series in df.iteritems():
if abs(stats.spearmanr(series, durations).correlation) >= THRESHOLD:
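For reference, a small sketch of the check this loop performs per column, which is why the new comment warns it is slow for wide DataFrames:

```python
import numpy as np
from scipy import stats

durations = np.arange(100, dtype=float)
covariate = durations + np.random.normal(0, 0.01, size=100)  # near-perfect rank correlation
# |Spearman rho| >= 0.99 is treated as a sign of possible complete separation
print(abs(stats.spearmanr(covariate, durations).correlation) >= 0.99)  # True
```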
@@ -1214,3 +1215,47 @@ def covariates_from_event_matrix(df, id_col):
df.columns = [id_col, 'event', 'duration']
df['_counter'] = 1
return df.pivot_table(index=[id_col, 'duration'], columns='event', fill_value=0)['_counter'].reset_index()
+
+
+class StepSizer():
+
+    def __init__(self, initial_step_size):
+        initial_step_size = coalesce(initial_step_size, 0.95)
+        self.initial_step_size = initial_step_size
+        self.step_size = initial_step_size
+        self.temper_back_up = False
+        self.norm_of_deltas = []
+
+    def update(self, norm_of_delta):
+        SCALE = 1.2
+        LOOKBACK = 3
+
+        self.norm_of_deltas.append(norm_of_delta)
+
+        # Only allow small steps
+        if norm_of_delta >= 15.0:
+            self.step_size *= 0.25
+            self.temper_back_up = True
+        elif 15.0 > norm_of_delta > 5.0:
+            self.step_size *= 0.75
+            self.temper_back_up = True
+
+        # speed up convergence by increasing step size again
+        if self.temper_back_up:
+            self.step_size = min(self.step_size * SCALE, self.initial_step_size)
+
+        # recent non-monotonically decreasing is a concern
+        if len(self.norm_of_deltas) >= LOOKBACK and \
+           np.any(np.diff(self.norm_of_deltas[-LOOKBACK:]) > 0):
+            self.step_size *= 0.98
+
+        # recent monotonically decreasing is good though
+        if len(self.norm_of_deltas) >= LOOKBACK and \
+           np.all(np.diff(self.norm_of_deltas[-LOOKBACK:]) < 0):
+            self.step_size = min(self.step_size * SCALE, 0.95)
+
+        return self
+
+    def next(self):
+        return self.step_size
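Putting the pieces together, a minimal sketch of how the fitters now drive `StepSizer` inside the Newton-Raphson loop (the `norm_delta` values are fabricated for illustration):

```python
from lifelines.utils import StepSizer

step_sizer = StepSizer(None)          # None falls back to the 0.95 default
step_size = step_sizer.next()

for norm_delta in [20.0, 8.0, 3.0, 1.0, 0.3]:   # pretend Newton update norms
    # large norms shrink the step; a monotonically decreasing run grows it back
    step_size = step_sizer.update(norm_delta).next()
    print(round(step_size, 4))
```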
2 changes: 1 addition & 1 deletion tests/test_estimation.py
@@ -806,7 +806,7 @@ def test_efron_computed_by_hand_examples(self, data_nus):

def test_efron_newtons_method(self, data_nus):
newton = CoxPHFitter()._newton_rhaphson
-X, T, E, W = data_nus[['x']], data_nus['t'], data_nus['E'], np.ones_like(data_nus['t'])
+X, T, E, W = data_nus[['x']], data_nus['t'], data_nus['E'], pd.Series(np.ones_like(data_nus['t']))
assert np.abs(newton(X, T, E, W)[0][0] - -0.0335) < 0.0001

def test_fit_method(self, data_nus):
