Merge pull request #540 from CamDavidsonPilon/cluster
Adding cluster arg to coxph
CamDavidsonPilon committed Nov 18, 2018
2 parents 10b5994 + efc64a0 commit 9fc5256
Showing 3 changed files with 107 additions and 17 deletions.
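
For context, a minimal usage sketch of the new `cluster_col` argument (the DataFrame and column names below are illustrative, not taken from this diff): rows sharing an id are treated as potentially correlated, and passing `cluster_col` forces the robust (sandwich) variance estimator.

```python
import pandas as pd
from lifelines import CoxPHFitter

df = pd.DataFrame({
    "T":  [7.3, 5.2, 11.7, 12.7, 6.6],   # durations
    "E":  [1, 1, 1, 1, 1],               # event indicators
    "x":  [1, 1, 2, 2, 2],               # a covariate
    "id": [1, 1, 2, 3, 4],               # rows sharing an id may be correlated
})

cph = CoxPHFitter()
# cluster_col implies the robust (sandwich) variance estimator is used
cph.fit(df, duration_col="T", event_col="E", cluster_col="id")
cph.print_summary()
```
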
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -20,6 +20,7 @@
- In Python3, Univariate models are now serialisable with `pickle`. Thanks @dwilson1988 for the contribution. For Python2, `dill` is still the preferred method.
- `baseline_cumulative_hazard_` (and derivatives of that) on `CoxPHFitter` now correctly incorporate the `weights_col`.
- Fixed a bug in `KaplanMeierFitter` when late entry times lined up with death events. Thanks @pzivich
- Added a `cluster_col` argument to `CoxPHFitter` so users can specify groups of subjects/rows that may be correlated.

#### 0.14.6
- fix for n > 2 groups in `multivariate_logrank_test` (again).
49 changes: 40 additions & 9 deletions lifelines/fitters/coxph_fitter.py
@@ -61,7 +61,7 @@ def __init__(self, alpha=0.95, tie_method='Efron', penalizer=0.0, strata=None):
def fit(self, df, duration_col, event_col=None,
show_progress=False, initial_beta=None,
strata=None, step_size=None, weights_col=None,
robust=False):
cluster_col=None, robust=False):
"""
Fit the Cox Proportional Hazards model to a dataset. Tied survival times
are handled using Efron's tie-method.
@@ -93,7 +93,8 @@ def fit(self, df, duration_col, event_col=None,
robust: Compute the robust errors using the Huber sandwich estimator, aka the Wei-Lin estimate. This does not handle
ties, so if there is a high number of ties, results may differ significantly. See
"The Robust Inference for the Cox Proportional Hazards Model", Journal of the American Statistical Association, Vol. 84, No. 408 (Dec., 1989), pp. 1074-1078
cluster_col: specifies the column containing ids that mark groups of correlated subjects/rows; the variance estimate is clustered on these ids. Using this forces the sandwich estimator (robust variance estimator) to
be used.
Returns:
self, with additional properties: hazards_, confidence_intervals_, baseline_survival_, etc.
@@ -108,6 +109,8 @@ def fit(self, df, duration_col, event_col=None,
self.duration_col = duration_col
self.event_col = event_col
self.robust = robust
self.cluster_col = cluster_col
self.weights_col = weights_col
self._n_examples = df.shape[0]
self.strata = coalesce(strata, self.strata)
if self.strata is not None:
@@ -136,6 +139,9 @@ def fit(self, df, duration_col, event_col=None,
else:
weights = pd.Series(np.ones((self._n_examples,)), index=df.index)

if self.cluster_col:
self._clusters = df.pop(self.cluster_col)

self._check_values(df, T, E)
df = df.astype(float)

@@ -453,21 +459,38 @@ def _compute_confidence_intervals(self):

def _compute_sandwich_estimator(self, X, T, E, weights):

if self.strata is None:
score_residuals = self._compute_residuals_within_strata(X.values, T.values, E.values, weights.values)
_, d = X.shape

else:
score_residuals = np.empty((0,1))
if self.strata is not None and self.cluster_col is not None:
raise NotImplementedError("Providing clusters and strata is not implemented yet")

if self.strata is not None:
score_residuals = np.empty((0, d))
for strata in np.unique(X.index):
# TODO: use pandas .groupby
stratified_X, stratified_T, stratified_E, stratified_W = X.loc[[strata]], T.loc[[strata]], E.loc[[strata]], weights.loc[[strata]]

score_residuals = np.append(score_residuals,
self._compute_residuals_within_strata(stratified_X.values, stratified_T.values, stratified_E.values, stratified_W.values),
self._compute_residuals_within_strata(stratified_X.values, stratified_T.values, stratified_E.values, stratified_W.values) * stratified_W[:, None],
axis=0)

else:
score_residuals = self._compute_residuals_within_strata(X.values, T.values, E.values, weights.values) * weights[:, None]

if self.cluster_col:

score_residuals_ = np.empty((0, d))
for cluster in np.unique(self._clusters):
ix = self._clusters == cluster
weights_ = weights.values[ix]

score_residuals_ = np.append(score_residuals_,
(score_residuals[ix, :] * weights_[:, None]).sum(0).reshape(1, d),
axis=0)
score_residuals = score_residuals_

naive_var = inv(self._hessian_)
delta_betas = score_residuals.dot(naive_var) * weights[:, None]
delta_betas = score_residuals.dot(naive_var)
sandwich_estimator = delta_betas.T.dot(delta_betas) / np.outer(self._norm_std, self._norm_std)
return sandwich_estimator
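
A minimal numpy sketch of the cluster-robust ("sandwich") variance that this new code path computes, assuming per-row (already weighted) score residuals, the observed information matrix (Hessian), and an array of cluster ids are available; it omits lifelines' internal covariate rescaling (the division by `np.outer(self._norm_std, self._norm_std)` above):

```python
import numpy as np

def clustered_sandwich(score_residuals, hessian, clusters):
    """score_residuals: (n, d) weighted score residuals, one row per subject-row
    hessian:          (d, d) observed information matrix
    clusters:         (n,)   cluster id for each row
    """
    # Sum score residuals within each cluster, so correlated rows
    # contribute a single (1, d) score per cluster.
    grouped = np.vstack([
        score_residuals[clusters == c].sum(axis=0)
        for c in np.unique(clusters)
    ])
    bread = np.linalg.inv(hessian)         # naive variance, the "bread"
    delta_betas = grouped.dot(bread)       # per-cluster influence on the coefficients
    return delta_betas.T.dot(delta_betas)  # bread @ (sum of outer products) @ bread
```
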

@@ -515,7 +538,7 @@ def _compute_residuals_within_strata(self, X, T, E, weights):


def _compute_standard_errors(self, df, T, E, weights):
if self.robust:
if self.robust or self.cluster_col:
se = np.sqrt(self._compute_sandwich_estimator(df, T, E, weights).diagonal()) # / self._norm_std
else:
se = np.sqrt(self.variance_matrix_.diagonal())
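
As an aside, a hedged sketch (not the library's internal code) of how these standard errors translate into the usual normal-approximation confidence intervals for the coefficients:

```python
from scipy import stats

def normal_approx_ci(coefs, se, alpha=0.95):
    # two-sided normal-approximation interval for each coefficient
    z = stats.norm.ppf(1 - (1 - alpha) / 2)  # ~1.96 when alpha=0.95
    return coefs - z * se, coefs + z * se
```
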
@@ -561,6 +584,14 @@ def print_summary(self):
print(self)
print("{} = {}".format(justify('duration col'), self.duration_col))
print("{} = {}".format(justify('event col'), self.event_col))
if self.weights_col:
print("{} = {}".format(justify('weights col'), self.weights_col))

if self.cluster_col:
print("{} = {}".format(justify('cluster col'), self.cluster_col))

if self.robust or self.cluster_col:
print("{} = {}".format(justify('robust variance'), True))

if self.strata:
print('{} = {}'.format(justify('strata'), self.strata))
74 changes: 66 additions & 8 deletions tests/test_estimation.py
@@ -1174,6 +1174,61 @@ def test_robust_errors_with_trivial_weights_is_the_same_than_R(self, regression_
expected = pd.Series({'var1': 2.097, 'var2': 0.827})
assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)

def test_cluster_option(self, regression_dataset):
"""
library(survival)
df <- data.frame(
"var1" = c(1, 1, 2, 2, 2),
"var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
"id" = c(1, 1, 2, 3, 4),
"T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
)
df['E'] = 1
c = coxph(formula=Surv(T, E) ~ var1 + var2 + cluster(id), data=df)
"""

df = pd.DataFrame({
"var1": [1, 1, 2, 2, 2],
"var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
"T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666],
"id": [1, 1, 2, 3, 4],
})
df['E'] = 1

cph = CoxPHFitter()
cph.fit(df, 'T', 'E', cluster_col='id', show_progress=True)
expected = pd.Series({'var1': 5.9752, 'var2': 4.0683})
assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)

@pytest.mark.xfail(reason="can't do this yet")
def test_cluster_option_with_strata(self, regression_dataset):
"""
library(survival)
df <- data.frame(
"var1" = c(1, 1, 2, 2, 2),
"var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
"id" = c(1, 1, 2, 3, 4),
"T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
)
df['E'] = 1
c = coxph(formula=Surv(T, E) ~ strata(var1) + var2 + cluster(id), data=df)
"""

df = pd.DataFrame({
"var1": [1, 1, 2, 2, 2],
"var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
"T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666],
"id": [1, 1, 2, 3, 4],
})
df['E'] = 1

cph = CoxPHFitter()
cph.fit(df, 'T', 'E', cluster_col='id', strata=['var1'], show_progress=True)
expected = pd.Series({'var2': 3.34})
assert_series_equal(cph.summary['se(coef)'], expected, check_less_precise=2, check_names=False)


def test_robust_errors_with_less_trival_weights_is_the_same_as_R(self, regression_dataset):
"""
@@ -1818,25 +1873,27 @@ def test_robust_errors_against_R_no_ties(self, regression_dataset):
def test_robust_errors_with_strata_against_R(self, rossi):
"""
df <- data.frame(
"var1" = c(1, 1, 2, 2, 2),
"var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092),
"T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666)
"var1" = c(1, 1, 2, 2, 2, 1),
"var2" = c(0.184677, 0.071893, 1.364646, 0.098375, 1.663092, 0.5),
"var3" = c(1, 2, 3, 2, 1, 2),
"T" = c( 7.335846, 5.269797, 11.684092, 12.678458, 6.601666, 8.)
)
df['E'] = 1
coxph(formula=Surv(T, E) ~ strata(var1) + var2, data=df, robust=TRUE)
coxph(formula=Surv(T, E) ~ strata(var1) + var2 + var3, data=df, robust=TRUE)
"""

df = pd.DataFrame({
"var1": [1, 1, 2, 2, 2],
"var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092],
"T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666]
"var1": [1, 1, 2, 2, 2, 1],
"var2": [0.184677, 0.071893, 1.364646, 0.098375, 1.663092, 0.5],
"var3": [1, 2, 3, 2, 1, 2],
"T": [7.335846, 5.269797, 11.684092, 12.678458, 6.601666, 8.0]
})
df['E'] = 1

cf = CoxPHFitter()
cf.fit(df, duration_col='T', event_col='E', strata=['var1'], robust=True)
npt.assert_allclose(cf.summary['se(coef)'].values, 2.78649, rtol=1e-2)
npt.assert_allclose(cf.summary['se(coef)'].values, np.array([1.076, 0.680]), rtol=1e-2)


@pytest.mark.xfail
@@ -1967,6 +2024,7 @@ def test_aalen_additive_median_predictions_split_data(self):
hz, coef, X = generate_hazard_rates(n, d, timeline)
T = generate_random_lifetimes(hz, timeline)
X['T'] = T
X = X.replace([np.inf, -np.inf], 10.0)
# fit it to Aalen's model
aaf = AalenAdditiveFitter()
aaf.fit(X, 'T')
