Skip to content

Commit

Permalink
slight perf improvement, and rerun the script to look at large counts
Browse files Browse the repository at this point in the history
  • Loading branch information
CamDavidsonPilon committed Jul 14, 2019
1 parent 441ee59 commit f232b55
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 10 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -1,6 +1,6 @@
### Changelog

### 0.22.1
#### 0.22.1

##### New features
- New univariate model, `GeneralizedGammaFitter`. This model contains many sub-models, so it is a good model to check fits.
Expand Down
24 changes: 19 additions & 5 deletions lifelines/fitters/coxph_fitter.py
Expand Up @@ -11,6 +11,7 @@
from scipy.integrate import trapz
from scipy import stats
from bottleneck import nansum as array_sum_to_scalar
from numpy import sum as array_sum_to_scalar

from lifelines.fitters import BaseFitter
from lifelines.plotting import set_kwargs_ax, set_kwargs_drawstyle
Expand Down Expand Up @@ -58,7 +59,17 @@ def decide(batch_mode, T):
# https://github.com/CamDavidsonPilon/lifelines/issues/591 for original issue.
# new values from perf/batch_vs_single script.
(batch_mode is None)
and (0.712085 + -0.000025 * n_total + 0.579359 * frac_dups + 0.000044 * n_total * frac_dups < 1)
and (
(
5.302813e-01
+ -1.789398e-06 * n_total
+ -3.496285e-11 * n_total ** 2
+ 2.756569e00 * frac_dups
+ -1.306258e00 * frac_dups ** 2
+ 9.535042e-06 * n_total * frac_dups
)
< 1
)
):
return "batch"
return "single"
Expand Down Expand Up @@ -616,19 +627,22 @@ def _get_efron_values_single(self, X, T, E, weights, beta):
tied_death_counts = 0
scores = weights * np.exp(np.dot(X, beta))

phi_x_is = scores[:, None] * X
phi_x_x_i = np.empty((d, d))

# Iterate backwards to utilize recursive relationship
for i in range(n - 1, -1, -1):
# Doing it like this to preserve shape
ti = T[i]
ei = E[i]
xi = X[i]
score = scores[i]
w = weights[i]

# Calculate phi values
phi_i = score
phi_x_i = phi_i * xi
phi_x_x_i = np.outer(xi, phi_x_i)
phi_i = scores[i]
phi_x_i = phi_x_is[i]
# https://stackoverflow.com/a/51481295/1895939
phi_x_x_i = np.multiply.outer(xi, phi_x_i)

# Calculate sums of Risk set
risk_phi = risk_phi + phi_i
Expand Down
2 changes: 1 addition & 1 deletion lifelines/fitters/weibull_fitter.py
Expand Up @@ -71,7 +71,7 @@ class WeibullFitter(KnownModelParametericUnivariateFitter):
entry: array or None
The entry array provided, or None
See Also
Notes
----------
Looking for a 3-parameter Weibull model? See notes here: https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Piecewise%20Exponential%20Models%20and%20Creating%20Custom%20Models.html#3-parameter-Weibull-distribution
"""
Expand Down
6 changes: 4 additions & 2 deletions perf_tests/batch_vs_single.py
Expand Up @@ -14,7 +14,7 @@
results = {}


for n_copies in [1, 2, 4, 6, 8, 10, 13, 17, 20, 25]:
for n_copies in [1, 2, 4, 6, 8, 10, 15, 20, 50, 100, 150]:

# lower percents means more ties.
# original rossi dataset has 0.113
Expand Down Expand Up @@ -56,8 +56,10 @@


results["N * frac"] = results["N"] * results["frac"]
results["N**2"] = results["N"] ** 2
results["frac**2"] = results["frac"] ** 2

X = results[["N", "frac", "N * frac"]]
X = results[["N", "frac", "N * frac", "frac**2", "N**2"]]
X = sm.add_constant(X)

Y = results["ratio"]
Expand Down
2 changes: 1 addition & 1 deletion perf_tests/cp_perf_test.py
Expand Up @@ -13,7 +13,7 @@
df = load_rossi()
df = pd.concat([df] * 16)
# df = df.reset_index()
# df['week'] = np.random.exponential(1, size=df.shape[0])
df["week"] = np.random.exponential(1, size=df.shape[0])
cp = CoxPHFitter()
start_time = time.time()
cp.fit(df, duration_col="week", event_col="arrest", batch_mode=True)
Expand Down

0 comments on commit f232b55

Please sign in to comment.