
Commit

Merge 3dcc9bd into 99df6ea
CamDavidsonPilon committed Mar 15, 2019
2 parents 99df6ea + 3dcc9bd commit 820dfee
Showing 4 changed files with 29 additions and 22 deletions.
2 changes: 1 addition & 1 deletion docs/Survival analysis with lifelines.rst
@@ -646,7 +646,7 @@ of time to birth. This is available as the ``cumulative_density_`` property afte
.. image:: images/lifelines_intro_lcd.png

.. note:: Other types of censoring, like interval-censoring, are not implemented in *lifelines* yet.
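
For readers following along, a minimal sketch of the left-censoring usage this note sits next to, assuming the ``left_censorship`` keyword accepted by ``KaplanMeierFitter.fit`` around this release and a toy dataset (not part of the diff):

    import numpy as np
    from lifelines import KaplanMeierFitter

    # toy times and event flags: 1 = event observed, 0 = left-censored
    T = np.array([3, 5, 8, 12, 16])
    E = np.array([1, 0, 1, 1, 0])

    kmf = KaplanMeierFitter()
    kmf.fit(T, event_observed=E, left_censorship=True)
    print(kmf.cumulative_density_)  # populated after fitting, as described above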


Left truncated data
39 changes: 22 additions & 17 deletions lifelines/fitters/coxph_fitter.py
@@ -57,7 +57,7 @@ def decide(batch_mode, T):
# https://github.com/CamDavidsonPilon/lifelines/issues/591 for original issue.
# new values from perf/batch_vs_single script.
(batch_mode is None)
and (0.713869 + -0.000028 * n_total + 0.684229 * frac_dups + 0.000201 * n_total * frac_dups < 1)
and (0.712085 + -0.000025 * n_total + 0.579359 * frac_dups + 0.000044 * n_total * frac_dups < 1)
):
return "batch"
return "single"
@@ -683,27 +683,30 @@ def _get_efron_values_single(self, X, T, E, weights, beta):
@staticmethod
def _trivial_log_likelihood_batch(T, E, weights):
# used for log-likelihood test
n = T.shape[0]
log_lik = 0
unique_death_times = np.unique(T)
unique_death_times_and_counts = zip(*np.unique(-T, return_counts=True))
risk_phi = 0
pos = n

for t in reversed(unique_death_times):
for _, count_of_removals in unique_death_times_and_counts:

ix = T == t
slice_ = slice(pos - count_of_removals, pos)

weights_at_t = weights[ix]
weights_at_t = weights[slice_]

phi_i = weights_at_t

# Calculate sums of Risk set
risk_phi = risk_phi + array_sum_to_scalar(phi_i)

# Calculate the sums of Tie set
deaths = E[ix]
deaths = E[slice_]

tied_death_counts = array_sum_to_scalar(deaths.astype(int))
if tied_death_counts == 0:
# no deaths, can continue
pos -= count_of_removals
continue

weights_deaths = weights_at_t[deaths]
@@ -716,6 +719,7 @@ def _trivial_log_likelihood_batch(T, E, weights):
factor = np.log(risk_phi)

log_lik = log_lik - weight_count / tied_death_counts * factor
pos -= count_of_removals

return log_lik
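
The rewrite above trades per-time boolean masks (``T == t``) for contiguous slices taken from the end of the ascending-sorted arrays: ``np.unique(-T, return_counts=True)`` yields the distinct death times largest-first along with their multiplicities, so each iteration peels ``count_of_removals`` rows off the tail. A standalone sketch of that indexing trick on toy data (not library code):

    import numpy as np

    T = np.array([1, 1, 2, 3, 3, 3, 5])  # assumed sorted ascending, as the fitter requires
    weights = np.ones_like(T, dtype=float)

    pos = T.shape[0]
    for neg_t, count_of_removals in zip(*np.unique(-T, return_counts=True)):
        slice_ = slice(pos - count_of_removals, pos)
        # same rows the old mask T == -neg_t would have selected, but indexed in O(1)
        print(-neg_t, T[slice_], weights[slice_].sum())
        pos -= count_of_removals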

@@ -774,6 +778,7 @@ def _trivial_log_likelihood_single(T, E, weights):

def _get_efron_values_batch(self, X, T, E, weights, beta): # pylint: disable=too-many-locals
"""
Assumes T is sorted ascending.
Calculates the first and second order vector differentials, with respect to beta.
A good explanation of how Efron handles ties: consider three of five subjects who fail at the same time.
@@ -788,7 +793,7 @@ def _get_efron_values_batch(self, X, T, E, weights, beta): # pylint: disable=to
log_likelihood: float
"""

_, d = X.shape
n, d = X.shape
hessian = np.zeros((d, d))
gradient = np.zeros((d,))
log_lik = 0
@@ -799,20 +804,18 @@ def _get_efron_values_batch(self, X, T, E, weights, beta): # pylint: disable=to
risk_phi_x, tie_phi_x = np.zeros((d,)), np.zeros((d,))
risk_phi_x_x, tie_phi_x_x = np.zeros((d, d)), np.zeros((d, d))

unique_death_times = np.unique(T)
unique_death_times_and_counts = zip(*np.unique(-T, return_counts=True))
scores = weights * np.exp(np.dot(X, beta))
pos = n

for t in reversed(unique_death_times):

ix = T == t
for _, count_of_removals in unique_death_times_and_counts:

# this can be improved by "stepping", ex: X[pos: pos+removals], since X is sorted by T
# slice_ = slice(pos - ix.sum(), pos)
slice_ = slice(pos - count_of_removals, pos)

X_at_t = X[ix]
weights_at_t = weights[ix]
X_at_t = X[slice_]
weights_at_t = weights[slice_]

phi_i = scores[ix][:, None]
phi_i = scores[slice_, None]
phi_x_i = phi_i * X_at_t
phi_x_x_i = np.dot(X_at_t.T, phi_x_i)

@@ -822,11 +825,12 @@ def _get_efron_values_batch(self, X, T, E, weights, beta): # pylint: disable=to
risk_phi_x_x = risk_phi_x_x + phi_x_x_i

# Calculate the sums of Tie set
deaths = E[ix]
deaths = E[slice_]

tied_death_counts = array_sum_to_scalar(deaths.astype(int))
if tied_death_counts == 0:
# no deaths, can continue
pos -= count_of_removals
continue

xi_deaths = X_at_t[deaths]
@@ -875,6 +879,7 @@ def _get_efron_values_batch(self, X, T, E, weights, beta): # pylint: disable=to
gradient = gradient + x_death_sum - weighted_average * summand.sum(0)
log_lik = log_lik + np.dot(x_death_sum, beta) + weighted_average * np.log(denom).sum()
hessian = hessian + weighted_average * (a2 - a1)
pos -= count_of_removals

return hessian, gradient, log_lik
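
For the docstring's tie example, Efron's correction removes an average share of the tied subjects' own scores from the risk-set denominator at each of the tied deaths. A small numeric sketch (toy values, ignoring sample weights and the linear-predictor term; names follow the diff but the snippet is illustrative only):

    import numpy as np

    risk_phi, tie_phi, d = 10.0, 3.0, 3  # risk-set score, tied subjects' score, number of tied deaths

    # Efron denominators: the l-th tied death sees the risk set minus l/d of the tied scores
    denoms = np.array([risk_phi - (l / d) * tie_phi for l in range(d)])
    print(denoms)                 # [10.  9.  8.]
    print(-np.log(denoms).sum())  # the tied set's (partial) contribution to the log-likelihood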

5 changes: 3 additions & 2 deletions perf_tests/batch_vs_single.py
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
from time import time
import pandas as pd
import numpy as np
@@ -13,11 +14,11 @@
results = {}


for n_copies in [1, 2, 4, 6, 8, 10, 13, 17, 20]:
for n_copies in [1, 2, 4, 6, 8, 10, 13, 17, 20, 25]:

# lower percents means more ties.
# original rossi dataset has 0.113
for fraction in np.linspace(0.01, 0.99, 12):
for fraction in np.linspace(0.01, 0.99, 15):
print(n_copies, fraction)

df = pd.concat([load_rossi()] * n_copies)
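
The grid above produces the timing data behind the new coefficients in ``decide`` earlier in this commit. A hedged sketch of how such a boundary could be fit from the recorded triples, assuming ``results`` maps ``(n_total, frac_dups)`` to the batch/single time ratio (the script's real output format may differ):

    import numpy as np

    results = {(432, 0.11): 0.72, (4320, 0.11): 0.65, (4320, 0.85): 1.30, (8640, 0.85): 1.60}  # made-up numbers

    X = np.array([[1.0, n, f, n * f] for (n, f) in results])  # intercept, n_total, frac_dups, interaction
    y = np.array(list(results.values()))

    coefs, *_ = np.linalg.lstsq(X, y, rcond=None)
    print(coefs)  # ordering mirrors the inequality in decide()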
5 changes: 3 additions & 2 deletions perf_tests/cp_perf_test.py
@@ -11,10 +11,11 @@
from lifelines.datasets import load_rossi

df = load_rossi()
df = pd.concat([df] * 500)
df = pd.concat([df] * 20)
# df = df.reset_index()
# df['week'] = np.random.exponential(1, size=df.shape[0])
cp = CoxPHFitter()
start_time = time.time()
cp.fit(df, duration_col="week", event_col="arrest", batch_mode=True, show_progress=True)
cp.fit(df, duration_col="week", event_col="arrest", batch_mode=True)
print("--- %s seconds ---" % (time.time() - start_time))
cp.print_summary()
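
A hedged variant that could be appended to the script above to time both code paths side by side, assuming ``batch_mode=False`` forces the single (per-observation) algorithm as the ``decide`` heuristic suggests:

    for mode in (True, False):
        cp = CoxPHFitter()
        start_time = time.time()
        cp.fit(df, duration_col="week", event_col="arrest", batch_mode=mode)
        print("batch_mode=%s --- %s seconds ---" % (mode, time.time() - start_time))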
