Merge pull request #540 from CamDavidsonPilon/cluster
Adding cluster arg to coxph
CamDavidsonPilon committed Nov 18, 2018
2 parents 10b5994 + efc64a0 commit 9fc5256
Showing 3 changed files with 107 additions and 17 deletions.
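
For context, a minimal usage sketch of the new `cluster_col` argument (the DataFrame and column names below are illustrative, not taken from this diff): rows sharing an id are treated as potentially correlated, and passing `cluster_col` forces the robust (sandwich) variance estimator.

```python
import pandas as pd
from lifelines import CoxPHFitter

df = pd.DataFrame({
    "T":  [7.3, 5.2, 11.7, 12.7, 6.6],   # durations
    "E":  [1, 1, 1, 1, 1],               # event indicators
    "x":  [1, 1, 2, 2, 2],               # a covariate
    "id": [1, 1, 2, 3, 4],               # rows sharing an id may be correlated
})

cph = CoxPHFitter()
# cluster_col implies the robust (sandwich) variance estimator is used
cph.fit(df, duration_col="T", event_col="E", cluster_col="id")
cph.print_summary()
```
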
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -20,6 +20,7 @@
- In Python3, Univariate models are now serialisable with `pickle`. Thanks @dwilson1988 for the contribution. For Python2, `dill` is still the preferred method.
- `baseline_cumulative_hazard_` (and derivatives of that) on `CoxPHFitter` now correctly incorporate the `weights_col`.
- Fixed a bug in `KaplanMeierFitter` when late entry times lined up with death events. Thanks @pzivich
- Added a `cluster_col` argument to `CoxPHFitter` so users can specify groups of subjects/rows that may be correlated.

#### 0.14.6
- fix for n > 2 groups in `multivariate_logrank_test` (again).
49 changes: 40 additions & 9 deletions lifelines/fitters/coxph_fitter.py
@@ -61,7 +61,7 @@ def __init__(self, alpha=0.95, tie_method='Efron', penalizer=0.0, strata=None):
def fit(self, df, duration_col, event_col=None,
show_progress=False, initial_beta=None,
strata=None, step_size=None, weights_col=None,
robust=False):
cluster_col=None, robust=False):
"""
Fit the Cox Proportional Hazards model to a dataset. Tied survival times
are handled using Efron's tie-method.
@@ -93,7 +93,8 @@ def fit(self, df, duration_col, event_col=None,
robust: Compute the robust errors using the Huber sandwich estimator, aka the Wei-Lin estimate. This does not handle
ties, so if there is a high number of ties, results may differ significantly. See
"The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074-1078
cluster_col: specifies the column containing ids that mark groups of correlated subjects/rows; the variance estimate is clustered on these ids. Using this forces the sandwich estimator (robust variance estimator) to
be used.
Returns:
self, with additional properties: hazards_, confidence_intervals_, baseline_survival_, etc.
@@ -108,6 +109,8 @@ def fit(self, df, duration_col, event_col=None,
self.duration_col = duration_col
self.event_col = event_col
self.robust = robust
self.cluster_col = cluster_col
self.weights_col = weights_col
self._n_examples = df.shape[0]
self.strata = coalesce(strata, self.strata)
if self.strata is not None:
@@ -136,6 +139,9 @@ def fit(self, df, duration_col, event_col=None,
else:
weights = pd.Series(np.ones((self._n_examples,)), index=df.index)

if self.cluster_col:
self._clusters = df.pop(self.cluster_col)

self._check_values(df, T, E)
df = df.astype(float)

@@ -453,21 +459,38 @@ def _compute_confidence_intervals(self):

def _compute_sandwich_estimator(self, X, T, E, weights):

if self.strata is None:
score_residuals = self._compute_residuals_within_strata(X.values, T.values, E.values, weights.values)
_, d = X.shape

else:
score_residuals = np.empty((0,1))
if self.strata is not None and self.cluster_col is not None:
raise NotImplementedError("Providing clusters and strata is not implemented yet")

if self.strata is not None:
score_residuals = np.empty((0, d))
for strata in np.unique(X.index):
# TODO: use pandas .groupby
stratified_X, stratified_T, stratified_E, stratified_W = X.loc[[strata]], T.loc[[strata]], E.loc[[strata]], weights.loc[[strata]]

score_residuals = np.append(score_residuals,
self._compute_residuals_within_strata(stratified_X.values, stratified_T.values, stratified_E.values, stratified_W.values),
self._compute_residuals_within_strata(stratified_X.values, stratified_T.values, stratified_E.values, stratified_W.values) * stratified_W[:, None],
axis=0)

else:
score_residuals = self._compute_residuals_within_strata(X.values, T.values, E.values, weights.values) * weights[:, None]

if self.cluster_col:

score_residuals_ = np.empty((0, d))
for cluster in np.unique(self._clusters):
ix = self._clusters == cluster
weights_ = weights.values[ix]

score_residuals_ = np.append(score_residuals_,
(score_residuals[ix, :] * weights_[:, None]).sum(0).reshape(1, d),
axis=0)
score_residuals = score_residuals_

naive_var = inv(self._hessian_)
delta_betas = score_residuals.dot(naive_var) * weights[:, None]
delta_betas = score_residuals.dot(naive_var)
sandwich_estimator = delta_betas.T.dot(delta_betas) / np.outer(self._norm_std, self._norm_std)
return sandwich_estimator
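
A minimal numpy sketch of the cluster-robust ("sandwich") variance that this new code path computes, assuming per-row (already weighted) score residuals, the observed information matrix (Hessian), and an array of cluster ids are available; it omits lifelines' internal covariate rescaling (the division by `np.outer(self._norm_std, self._norm_std)` above):

```python
import numpy as np

def clustered_sandwich(score_residuals, hessian, clusters):
    """score_residuals: (n, d) weighted score residuals, one row per subject-row
    hessian:          (d, d) observed information matrix
    clusters:         (n,)   cluster id for each row
    """
    # Sum score residuals within each cluster, so correlated rows
    # contribute a single (1, d) score per cluster.
    grouped = np.vstack([
        score_residuals[clusters == c].sum(axis=0)
        for c in np.unique(clusters)
    ])
    bread = np.linalg.inv(hessian)         # naive variance, the "bread"
    delta_betas = grouped.dot(bread)       # per-cluster influence on the coefficients
    return delta_betas.T.dot(delta_betas)  # bread @ (sum of outer products) @ bread
```
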

@@ -515,7 +538,7 @@ def _compute_residuals_within_strata(self, X, T, E, weights):


def _compute_standard_errors(self, df, T, E, weights):
if self.robust:
if self.robust or self.cluster_col:
se = np.sqrt(self._compute_sandwich_estimator(df, T, E, weights).diagonal()) # / self._norm_std
else:
se = np.sqrt(self.variance_matrix_.diagonal())
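
As an aside, a hedged sketch (not the library's internal code) of how these standard errors translate into the usual normal-approximation confidence intervals for the coefficients:

```python
from scipy import stats

def normal_approx_ci(coefs, se, alpha=0.95):
    # two-sided normal-approximation interval for each coefficient
    z = stats.norm.ppf(1 - (1 - alpha) / 2)  # ~1.96 when alpha=0.95
    return coefs - z * se, coefs + z * se
```
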
@@ -561,6 +584,14 @@ def print_summary(self):
print(self)
print("{} = {}".format(justify('duration col'), self.duration_col))
print("{} = {}".format(justify('event col'), self.event_col))
if self.weights_col:
print("{} = {}".format(justify('weights col'), self.weights_col))

if self.cluster_col:
print("{} = {}".format(justify('cluster col'), self.cluster_col))

if self.robust or self.cluster_col:
print("{} = {}".format(justify('robust variance'), True))

if self.strata:
print('{} = {}'.format(justify('strata'), self.strata))
74 changes: 66 additions & 8 deletions tests/test_estimation.py
@@ -1174,6 +1174,61 @@ def test_robust_errors_with_trivial_weights_is_the_same_than_R(self, regression_
expected = pd.Series({'var1': 2.097, 'var2': 0.827})
assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)

def test_cluster_option(self, regression_dataset):
"""
library(survival)
df <- data.frame(
"var1" = c(1, 1, 2, 2, 2),
"var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
"id" = c(1, 1, 2, 3, 4),
"T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
)
df['E'] = 1
c = coxph(formula=Surv(T, E) ~ var1 + var2 + cluster(id), data=df)
"""

df = pd.DataFrame({
"var1": [1, 1, 2, 2, 2],
"var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
"T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666],
"id": [1, 1, 2, 3, 4],
})
df['E'] = 1

cph = CoxPHFitter()
cph.fit(df, 'T', 'E', cluster_col='id', show_progress=True)
expected = pd.Series({'var1': 5.9752, 'var2': 4.0683})
assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)

@pytest.mark.xfail(reason="can't do this yet")
def test_cluster_option_with_strata(self, regression_dataset):
"""
library(survival)
df <- data.frame(
"var1" = c(1, 1, 2, 2, 2),
"var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
"id" = c(1, 1, 2, 3, 4),
"T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
)
df['E'] = 1
c = coxph(formula=Surv(T, E) ~ strata(var1) + var2 + cluster(id), data=df)
"""

df = pd.DataFrame({
"var1": [1, 1, 2, 2, 2],
"var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
"T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666],
"id": [1, 1, 2, 3, 4],
})
df['E'] = 1

cph = CoxPHFitter()
cph.fit(df, 'T', 'E', cluster_col='id', strata=['var1'], show_progress=True)
expected = pd.Series({'var2': 3.34})
assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)


def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regression_dataset):
"""
@@ -1818,25 +1873,27 @@ def test_robust_errors_against_R_no_ties(self, regression_dataset):
def test_robust_errors_with_strata_against_R(self, rossi):
"""
df <- data.frame(
"var1" = c(1, 1, 2, 2, 2),
"var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
"T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
"var1" = c(1, 1, 2, 2, 2, 1),
"var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092, 0.5),
"var3" = c(1, 2, 3, 2, 1, 2),
"T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666, 8.)
)
df['E'] = 1
coxph(formula=Surv(T, E) ~ strata(var1) + var2, data=df, robust=TRUE)
coxph(formula=Surv(T, E) ~ strata(var1) + var2 + var3, data=df, robust=TRUE)
"""

df = pd.DataFrame({
"var1": [1, 1, 2, 2, 2],
"var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
"T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666]
"var1": [1, 1, 2, 2, 2, 1],
"var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092, 0.5],
"var3": [1, 2, 3, 2, 1, 2],
"T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666, 8.0]
})
df['E'] = 1

cf = CoxPHFitter()
cf.fit(df, duration_col='T', event_col='E', strata=['var1'], robust=True)
npt.assert_allclose(cf.summary['se(coef)'].values, 2.78649, rtol=1e-2)
npt.assert_allclose(cf.summary['se(coef)'].values, np.array([1.076, 0.680]), rtol=1e-2)


@pytest.mark.xfail
@@ -1967,6 +2024,7 @@ def test_aalen_additive_median_predictions_split_data(self):
hz, coef, X = generate_hazard_rates(n, d, timeline)
T = generate_random_lifetimes(hz, timeline)
X['T'] = T
X = X.replace([np.inf, -np.inf], 10.0)
# fit it to Aalen's model
aaf = AalenAdditiveFitter()
aaf.fit(X, 'T')
