
Commit

Merge 3dcc9bd into 99df6ea
CamDavidsonPilon committed Mar 15, 2019
2 parents 99df6ea + 3dcc9bd commit 820dfee
Showing 4 changed files with 29 additions and 22 deletions.
2 changes: 1 addition & 1 deletion docs/Survival analysis with lifelines.rst
@@ -646,7 +646,7 @@ of time to birth. This is available as the ``cumulative_density_`` property afte
.. image:: images/lifelines_intro_lcd.png

.. note:: Other types of censoring, like interval-censoring, are not implemented in *lifelines* yet.
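
For readers following along, a minimal sketch of the left-censoring usage this note sits next to, assuming the ``left_censorship`` keyword accepted by ``KaplanMeierFitter.fit`` around this release and a toy dataset (not part of the diff):

    import numpy as np
    from lifelines import KaplanMeierFitter

    # toy times and event flags: 1 = event observed, 0 = left-censored
    T = np.array([3, 5, 8, 12, 16])
    E = np.array([1, 0, 1, 1, 0])

    kmf = KaplanMeierFitter()
    kmf.fit(T, event_observed=E, left_censorship=True)
    print(kmf.cumulative_density_)  # populated after fitting, as described above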


Left truncated data
39 changes: 22 additions & 17 deletions lifelines/fitters/coxph_fitter.py
@@ -57,7 +57,7 @@ def decide(batch_mode, T):
# https://github.com/CamDavidsonPilon/lifelines/issues/591 for original issue.
# new values from perf/batch_vs_single script.
(batch_mode is None)
and (0.713869 + -0.000028 * n_total + 0.684229 * frac_dups + 0.000201 * n_total * frac_dups < 1)
and (0.712085 + -0.000025 * n_total + 0.579359 * frac_dups + 0.000044 * n_total * frac_dups < 1)
):
return "batch"
return "single"
@@ -683,27 +683,30 @@ def _get_efron_values_single(self, X, T, E, weights, beta):
@staticmethod
def _trivial_log_likelihood_batch(T, E, weights):
# used for log-likelihood test
n = T.shape[0]
log_lik = 0
unique_death_times = np.unique(T)
unique_death_times_and_counts = zip(*np.unique(-T, return_counts=True))
risk_phi = 0
pos = n

for t in reversed(unique_death_times):
for _, count_of_removals in unique_death_times_and_counts:

ix = T == t
slice_ = slice(pos - count_of_removals, pos)

weights_at_t = weights[ix]
weights_at_t = weights[slice_]

phi_i = weights_at_t

# Calculate sums of Risk set
risk_phi = risk_phi + array_sum_to_scalar(phi_i)

# Calculate the sums of Tie set
deaths = E[ix]
deaths = E[slice_]

tied_death_counts = array_sum_to_scalar(deaths.astype(int))
if tied_death_counts == 0:
# no deaths, can continue
pos -= count_of_removals
continue

weights_deaths = weights_at_t[deaths]
@@ -716,6 +719,7 @@ def _trivial_log_likelihood_batch(T, E, weights):
factor = np.log(risk_phi)

log_lik = log_lik - weight_count / tied_death_counts * factor
pos -= count_of_removals

return log_lik
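
The rewrite above trades per-time boolean masks (``T == t``) for contiguous slices taken from the end of the ascending-sorted arrays: ``np.unique(-T, return_counts=True)`` yields the distinct death times largest-first along with their multiplicities, so each iteration peels ``count_of_removals`` rows off the tail. A standalone sketch of that indexing trick on toy data (not library code):

    import numpy as np

    T = np.array([1, 1, 2, 3, 3, 3, 5])  # assumed sorted ascending, as the fitter requires
    weights = np.ones_like(T, dtype=float)

    pos = T.shape[0]
    for neg_t, count_of_removals in zip(*np.unique(-T, return_counts=True)):
        slice_ = slice(pos - count_of_removals, pos)
        # same rows the old mask T == -neg_t would have selected, but indexed in O(1)
        print(-neg_t, T[slice_], weights[slice_].sum())
        pos -= count_of_removals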

@@ -774,6 +778,7 @@ def _trivial_log_likelihood_single(T, E, weights):

def _get_efron_values_batch(self, X, T, E, weights, beta): # pylint: disable=too-many-locals
"""
Assumes T is sorted ascending.
Calculates the first and second order vector differentials, with respect to beta.
A good explanation of how Efron handles ties: consider three of five subjects who fail at the same time.
@@ -788,7 +793,7 @@ def _get_efron_values_batch(self, X, T, E, weights, beta): # pylint: disable=to
log_likelihood: float
"""

_, d = X.shape
n, d = X.shape
hessian = np.zeros((d, d))
gradient = np.zeros((d,))
log_lik = 0
@@ -799,20 +804,18 @@ def _get_efron_values_batch(self, X, T, E, weights, beta): # pylint: disable=to
risk_phi_x, tie_phi_x = np.zeros((d,)), np.zeros((d,))
risk_phi_x_x, tie_phi_x_x = np.zeros((d, d)), np.zeros((d, d))

unique_death_times = np.unique(T)
unique_death_times_and_counts = zip(*np.unique(-T, return_counts=True))
scores = weights * np.exp(np.dot(X, beta))
pos = n

for t in reversed(unique_death_times):

ix = T == t
for _, count_of_removals in unique_death_times_and_counts:

# this can be improved by "stepping", ex: X[pos: pos+removals], since X is sorted by T
# slice_ = slice(pos - ix.sum(), pos)
slice_ = slice(pos - count_of_removals, pos)

X_at_t = X[ix]
weights_at_t = weights[ix]
X_at_t = X[slice_]
weights_at_t = weights[slice_]

phi_i = scores[ix][:, None]
phi_i = scores[slice_, None]
phi_x_i = phi_i * X_at_t
phi_x_x_i = np.dot(X_at_t.T, phi_x_i)

@@ -822,11 +825,12 @@ def _get_efron_values_batch(self, X, T, E, weights, beta): # pylint: disable=to
risk_phi_x_x = risk_phi_x_x + phi_x_x_i

# Calculate the sums of Tie set
deaths = E[ix]
deaths = E[slice_]

tied_death_counts = array_sum_to_scalar(deaths.astype(int))
if tied_death_counts == 0:
# no deaths, can continue
pos -= count_of_removals
continue

xi_deaths = X_at_t[deaths]
@@ -875,6 +879,7 @@ def _get_efron_values_batch(self, X, T, E, weights, beta): # pylint: disable=to
gradient = gradient + x_death_sum - weighted_average * summand.sum(0)
log_lik = log_lik + np.dot(x_death_sum, beta) + weighted_average * np.log(denom).sum()
hessian = hessian + weighted_average * (a2 - a1)
pos -= count_of_removals

return hessian, gradient, log_lik
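
For the docstring's tie example, Efron's correction removes an average share of the tied subjects' own scores from the risk-set denominator at each of the tied deaths. A small numeric sketch (toy values, ignoring sample weights and the linear-predictor term; names follow the diff but the snippet is illustrative only):

    import numpy as np

    risk_phi, tie_phi, d = 10.0, 3.0, 3  # risk-set score, tied subjects' score, number of tied deaths

    # Efron denominators: the l-th tied death sees the risk set minus l/d of the tied scores
    denoms = np.array([risk_phi - (l / d) * tie_phi for l in range(d)])
    print(denoms)                 # [10.  9.  8.]
    print(-np.log(denoms).sum())  # the tied set's (partial) contribution to the log-likelihood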

5 changes: 3 additions & 2 deletions perf_tests/batch_vs_single.py
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
from time import time
import pandas as pd
import numpy as np
@@ -13,11 +14,11 @@
results = {}


for n_copies in [1, 2, 4, 6, 8, 10, 13, 17, 20]:
for n_copies in [1, 2, 4, 6, 8, 10, 13, 17, 20, 25]:

# lower percents means more ties.
# original rossi dataset has 0.113
for fraction in np.linspace(0.01, 0.99, 12):
for fraction in np.linspace(0.01, 0.99, 15):
print(n_copies, fraction)

df = pd.concat([load_rossi()] * n_copies)
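
The grid above produces the timing data behind the new coefficients in ``decide`` earlier in this commit. A hedged sketch of how such a boundary could be fit from the recorded triples, assuming ``results`` maps ``(n_total, frac_dups)`` to the batch/single time ratio (the script's real output format may differ):

    import numpy as np

    results = {(432, 0.11): 0.72, (4320, 0.11): 0.65, (4320, 0.85): 1.30, (8640, 0.85): 1.60}  # made-up numbers

    X = np.array([[1.0, n, f, n * f] for (n, f) in results])  # intercept, n_total, frac_dups, interaction
    y = np.array(list(results.values()))

    coefs, *_ = np.linalg.lstsq(X, y, rcond=None)
    print(coefs)  # ordering mirrors the inequality in decide()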
5 changes: 3 additions & 2 deletions perf_tests/cp_perf_test.py
@@ -11,10 +11,11 @@
from lifelines.datasets import load_rossi

df = load_rossi()
df = pd.concat([df] * 500)
df = pd.concat([df] * 20)
# df = df.reset_index()
# df['week'] = np.random.exponential(1, size=df.shape[0])
cp = CoxPHFitter()
start_time = time.time()
cp.fit(df, duration_col="week", event_col="arrest", batch_mode=True, show_progress=True)
cp.fit(df, duration_col="week", event_col="arrest", batch_mode=True)
print("--- %s seconds ---" % (time.time() - start_time))
cp.print_summary()
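
A hedged variant that could be appended to the script above to time both code paths side by side, assuming ``batch_mode=False`` forces the single (per-observation) algorithm as the ``decide`` heuristic suggests:

    for mode in (True, False):
        cp = CoxPHFitter()
        start_time = time.time()
        cp.fit(df, duration_col="week", event_col="arrest", batch_mode=mode)
        print("batch_mode=%s --- %s seconds ---" % (mode, time.time() - start_time))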
