Merge 8a7cd99 into fc99934

CamDavidsonPilon · Jun 30, 2018 · dd27a93 · dd27a93
2 parents fc99934 + 8a7cd99
commit dd27a93
Show file tree

Hide file tree

Showing 6 changed files with 120 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,9 @@
 ### Changelogs
 
+#### 0.14.5
+ - fix for n > 2 groups in `multivariate_logrank_test`
+ - fix weights in `KaplanMeierFitter` when using a pandas Series.
+
 #### 0.14.4
  - Adds `baseline_cumulative_hazard_` and `baseline_survival_` to `CoxTimeVaryingFitter`. Because of this, new prediction methods are available. 
  - fixed a bug in `add_covariate_to_timeline` when using `cumulative_sum` with multiple columns.

diff --git a/lifelines/datasets/__init__.py b/lifelines/datasets/__init__.py
@@ -363,3 +363,22 @@ def load_dfcv():
     """
     from lifelines.datasets.dfcv_dataset import dfcv
     return dfcv
+
+
+def load_lymphoma(**kwargs):
+    """
+    Load the bundled ``lymphoma.csv`` dataset; ``kwargs`` are forwarded to ``load_dataset``.
+
+    From https://www.statsdirect.com/help/content/survival_analysis/logrank.htm
+    Size: (80, 3)
+
+    Example:
+
+       Stage_group  Time  Censor
+    0            1     6       1
+    1            1    19       1
+    2            1    32       1
+    3            1    42       1
+    4            1    42       1
+    """
+    return load_dataset('lymphoma.csv', **kwargs)
diff --git a/lifelines/datasets/lymphoma.csv b/lifelines/datasets/lymphoma.csv
@@ -0,0 +1,81 @@
+Stage_group,Time,Censor
+1,6,1
+1,19,1
+1,32,1
+1,42,1
+1,42,1
+1,43,0
+1,94,1
+1,126,0
+1,169,0
+1,207,1
+1,211,0
+1,227,0
+1,253,1
+1,255,0
+1,270,0
+1,310,0
+1,316,0
+1,335,0
+1,346,0
+2,4,1
+2,6,1
+2,10,1
+2,11,1
+2,11,1
+2,11,1
+2,13,1
+2,17,1
+2,20,1
+2,20,1
+2,21,1
+2,22,1
+2,24,1
+2,24,1
+2,29,1
+2,30,1
+2,30,1
+2,31,1
+2,33,1
+2,34,1
+2,35,1
+2,39,1
+2,40,1
+2,41,0
+2,43,0
+2,45,1
+2,46,1
+2,50,1
+2,56,1
+2,61,0
+2,61,0
+2,63,1
+2,68,1
+2,82,1
+2,85,1
+2,88,1
+2,89,1
+2,90,1
+2,93,1
+2,104,1
+2,110,1
+2,134,1
+2,137,1
+2,160,0
+2,169,1
+2,171,1
+2,173,1
+2,175,1
+2,184,1
+2,201,1
+2,222,1
+2,235,0
+2,247,0
+2,260,0
+2,284,0
+2,290,0
+2,291,0
+2,302,0
+2,304,0
+2,341,0
+2,345,0
diff --git a/lifelines/statistics.py b/lifelines/statistics.py
@@ -223,16 +223,17 @@ def multivariate_logrank_test(event_durations, groups, event_observed=None,
     assert abs(Z_j.sum()) < 10e-8, "Sum is not zero."  # this should move to a test eventually.
 
     # compute covariance matrix
-    factor = (((n_i - d_i) / (n_i - 1)).replace(np.inf, 1)) * d_i
+    factor = (((n_i - d_i) / (n_i - 1)).replace(np.inf, 1)) * d_i / n_i ** 2
     n_ij['_'] = n_i.values
-    V_ = n_ij.mul(np.sqrt(factor) / n_i, axis='index').fillna(1)
-    V = -np.dot(V_.T, V_)
+    V_ = n_ij.mul(np.sqrt(factor), axis='index').fillna(1)
+
+    V = -np.dot(V_.T, V_) + 1
     ix = np.arange(n_groups)
-    V[ix, ix] = -V[-1, ix] + V[ix, ix]
+    V[ix, ix] = V[ix, ix] - V[-1, ix]
     V = V[:-1, :-1]
 
     # take the first n-1 groups
-    U = Z_j.iloc[:-1].dot(np.linalg.pinv(V[:-1, :-1]).dot(Z_j.iloc[:-1]))  # Z.T*inv(V)*Z
+    U = Z_j.iloc[:-1].dot(np.linalg.pinv(V[:-1, :-1])).dot(Z_j.iloc[:-1])  # Z.T*inv(V)*Z
 
     # compute the p-values and tests
     test_result, p_value = chisq_test(U, n_groups - 1, alpha)

diff --git a/lifelines/version.py b/lifelines/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '0.14.4'
+__version__ = '0.14.5'
diff --git a/tests/test_statistics.py b/tests/test_statistics.py
@@ -5,7 +5,7 @@
 import pytest
 
 from lifelines import statistics as stats
-from lifelines.datasets import load_waltons, load_g3
+from lifelines.datasets import load_waltons, load_g3, load_lymphoma
 
 
 def test_sample_size_necessary_under_cph():
@@ -67,6 +67,14 @@ def test_rank_test_output_against_R_no_censorship():
     assert abs(result.test_statistic - r_stat) < 10e-6
 
 
+def test_n_more_than_2_multivariate_logrank():
+    # from https://www.statsdirect.com/help/content/survival_analysis/logrank.htm
+    df_ = load_lymphoma()  # NOTE(review): lymphoma.csv only has Stage_group values 1 and 2 -- confirm this covers n > 2
+    results = stats.multivariate_logrank_test(df_['Time'], df_['Stage_group'], df_['Censor'])
+    assert abs(results.test_statistic - 6.70971) < 1e-4
+    assert abs(results.p_value - 0.0096) < 1e-4
+
+
 def test_rank_test_output_against_R_with_censorship():
     """
     > time <- c(10,20,30,10,20,50)