Skip to content

Commit

Permalink
Merge pull request #293 from CamDavidsonPilon/coxph-baseline
Browse files Browse the repository at this point in the history
Coxph baseline fix; remove normalize; bump version
  • Loading branch information
CamDavidsonPilon committed May 29, 2017
2 parents 6184551 + c9a0ae2 commit 0fe0d10
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 85 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
### Changelogs

#### 0.10.0
- corrected bug that was returning the wrong baseline survival and hazard values in `CoxPHFitter` when `normalize=True`.
- removed `normalize` kwarg in `CoxPHFitter`. This was causing lots of confusion for users, and added code complexity. It's really nice to be able to remove it.
- correcting column name in `CoxPHFitter.baseline_survival_`
- `CoxPHFitter.baseline_cumulative_hazard_` is always centered, to mimic R's `basehaz` API.
- new `predict_log_partial_hazards` to `CoxPHFitter`

#### 0.9.4
- adding `plot_loglogs` to `KaplanMeierFitter`
- added a (correct) check to see if some columns in a dataset will cause convergence problems.
Expand Down
75 changes: 43 additions & 32 deletions lifelines/fitters/coxph_fitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class CoxPHFitter(BaseFitter):
The penalty is 1/2 * penalizer * ||beta||^2.
"""

def __init__(self, alpha=0.95, tie_method='Efron', normalize=True, penalizer=0.0, strata=None):
def __init__(self, alpha=0.95, tie_method='Efron', penalizer=0.0, strata=None):
if not (0 < alpha <= 1.):
raise ValueError('alpha parameter must be between 0 and 1.')
if penalizer < 0:
Expand All @@ -42,7 +42,6 @@ def __init__(self, alpha=0.95, tie_method='Efron', normalize=True, penalizer=0.0
raise NotImplementedError("Only Efron is available atm.")

self.alpha = alpha
self.normalize = normalize
self.tie_method = tie_method
self.penalizer = penalizer
self.strata = strata
Expand Down Expand Up @@ -295,32 +294,27 @@ def fit(self, df, duration_col, event_col=None,
E = df[event_col]
del df[event_col]

# Store original non-normalized data
self.data = df if self.strata is None else df.reset_index()
self._check_values(df)

if self.normalize:
# Need to normalize future inputs as well
self._norm_mean = df.mean(0)
self._norm_std = df.std(0)
df = normalize(df)
self._norm_mean = df.mean(0)
self._norm_std = df.std(0)
df = normalize(df, self._norm_mean, self._norm_std)

E = E.astype(bool)

hazards_ = self._newton_rhaphson(df, T, E, initial_beta=initial_beta,
show_progress=show_progress,
include_likelihood=include_likelihood)

self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns,
index=['coef'])
self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
self.confidence_intervals_ = self._compute_confidence_intervals()

self.durations = T
self.event_observed = E

self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E)
self.baseline_hazard_ = self._compute_baseline_hazards(normalize(df, 0, 1 / self._norm_std), T, E)
self.baseline_cumulative_hazard_ = self.baseline_hazard_.cumsum()
self.baseline_survival_ = exp(-self.baseline_cumulative_hazard_)
self.baseline_survival_ = self._compute_baseline_survival()
return self

def _check_values(self, X):
Expand All @@ -342,7 +336,7 @@ def _compute_confidence_intervals(self):
columns=self.hazards_.columns)

def _compute_standard_errors(self):
se = np.sqrt(inv(-self._hessian_).diagonal())
se = np.sqrt(inv(-self._hessian_).diagonal()) / self._norm_std
return pd.DataFrame(se[None, :],
index=['se'], columns=self.hazards_.columns)

Expand Down Expand Up @@ -404,8 +398,6 @@ def predict_partial_hazard(self, X):
can be in any order. If a numpy array, columns must be in the
same order as the training data.
If covariates were normalized during fitting, they are normalized
in the same way here.
If X is a dataframe, the order of the columns do not matter. But
if X is an array, then the column ordering is assumed to be the
Expand All @@ -414,17 +406,28 @@ def predict_partial_hazard(self, X):
Returns the partial hazard for the individuals, partial since the
baseline hazard is not included. Equal to \exp{\beta X}
"""
index = _get_index(X)
return exp(self.predict_log_partial_hazard(X))

def predict_log_partial_hazard(self, X):
"""
X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
can be in any order. If a numpy array, columns must be in the
same order as the training data.
If X is a dataframe, the order of the columns do not matter. But
if X is an array, then the column ordering is assumed to be the
same as the training dataset.
Returns the log of the partial hazard for the individuals, partial since the
baseline hazard is not included. Equal to \beta X
"""
if isinstance(X, pd.DataFrame):
order = self.hazards_.columns
X = X[order]

if self.normalize:
# Assuming correct ordering and number of columns
X = normalize(X, self._norm_mean.values, self._norm_std.values)

return pd.DataFrame(exp(np.dot(X, self.hazards_.T)), index=index)
index = _get_index(X)
return pd.DataFrame(np.dot(X, self.hazards_.T), index=index)

def predict_log_hazard_relative_to_mean(self, X):
"""
Expand All @@ -433,29 +436,31 @@ def predict_log_hazard_relative_to_mean(self, X):
same order as the training data.
Returns the log hazard relative to the hazard of the mean covariates. This is the behaviour
of R's predict.coxph.
of R's predict.coxph. Equal to \beta X - \beta \bar{X}
"""
mean_covariates = self.data.mean(0).to_frame().T
return np.log(self.predict_partial_hazard(X) / self.predict_partial_hazard(mean_covariates).squeeze())
return self.predict_log_partial_hazard(X) - self.predict_log_partial_hazard(mean_covariates).squeeze()

def predict_cumulative_hazard(self, X):
"""
X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
can be in any order. If a numpy array, columns must be in the
same order as the training data.
Returns the cumulative hazard of individuals.
"""
if self.strata:
cumulative_hazard_ = pd.DataFrame()
for stratum, stratified_X in X.groupby(self.strata):
s_0 = self.baseline_survival_[[stratum]]
c_0 = self.baseline_cumulative_hazard_[[stratum]]
col = _get_index(stratified_X)
v = self.predict_partial_hazard(stratified_X)
cumulative_hazard_ = cumulative_hazard_.merge(pd.DataFrame(-np.dot(np.log(s_0), v.T), index=s_0.index, columns=col), how='outer', right_index=True, left_index=True)
cumulative_hazard_ = cumulative_hazard_.merge(pd.DataFrame(np.dot(c_0, v.T), index=c_0.index, columns=col), how='outer', right_index=True, left_index=True)
else:
s_0 = self.baseline_survival_
c_0 = self.baseline_cumulative_hazard_
col = _get_index(X)
v = self.predict_partial_hazard(X)
cumulative_hazard_ = pd.DataFrame(-np.dot(np.log(s_0), v.T), columns=col, index=s_0.index)
cumulative_hazard_ = pd.DataFrame(np.dot(c_0, v.T), columns=col, index=c_0.index)

return cumulative_hazard_

Expand Down Expand Up @@ -520,11 +525,17 @@ def _compute_baseline_hazards(self, df, T, E):
baseline_hazards_ = pd.DataFrame(index=self.durations.unique())
for stratum in df.index.unique():
baseline_hazards_ = baseline_hazards_.merge(
self._compute_baseline_hazard(data=df.ix[[stratum]], durations=T.ix[[stratum]], event_observed=E.ix[[stratum]], name=stratum),
left_index=True,
right_index=True,
how='left')
self._compute_baseline_hazard(data=df.ix[[stratum]], durations=T.ix[[stratum]], event_observed=E.ix[[stratum]], name=stratum),
left_index=True,
right_index=True,
how='left')
return baseline_hazards_.fillna(0)

else:
return self._compute_baseline_hazard(data=df, durations=T, event_observed=E, name='baseline hazard')

def _compute_baseline_survival(self):
survival_df = exp(-self.baseline_cumulative_hazard_)
if self.strata is None:
survival_df.columns = ['baseline survival']
return survival_df
2 changes: 1 addition & 1 deletion lifelines/version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from __future__ import unicode_literals

__version__ = '0.9.4'
__version__ = '0.10.0'
82 changes: 30 additions & 52 deletions tests/test_estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from lifelines.estimation import CoxPHFitter, AalenAdditiveFitter, KaplanMeierFitter, \
NelsonAalenFitter, BreslowFlemingHarringtonFitter, ExponentialFitter, \
WeibullFitter, BaseFitter
from lifelines.datasets import load_regression_dataset, load_larynx, load_waltons, load_kidney_transplant, load_rossi,\
from lifelines.datasets import load_larynx, load_waltons, load_kidney_transplant, load_rossi,\
load_lcd, load_panel_test, load_g3, load_holly_molly_polly
from lifelines.generate_datasets import generate_hazard_rates, generate_random_lifetimes, cumulative_integral
from lifelines.utils import concordance_index
Expand Down Expand Up @@ -678,13 +678,13 @@ def test_efron_newtons_method(self, data_nus):
assert np.abs(newton(X, T, E)[0][0] - -0.0335) < 0.0001

def test_fit_method(self, data_nus):
cf = CoxPHFitter(normalize=False)
cf = CoxPHFitter()
cf.fit(data_nus, duration_col='t', event_col='E')
assert np.abs(cf.hazards_.ix[0][0] - -0.0335) < 0.0001

def test_using_dataframes_vs_numpy_arrays(self, data_pred2):
# First without normalization
cf = CoxPHFitter(normalize=False)
cf = CoxPHFitter()
cf.fit(data_pred2, 't', 'E')

X = data_pred2[cf.data.columns]
Expand All @@ -694,22 +694,12 @@ def test_using_dataframes_vs_numpy_arrays(self, data_pred2):
hazards_n = cf.predict_partial_hazard(np.array(X))
assert np.all(hazards == hazards_n)

# Now with normalization
cf = CoxPHFitter(normalize=True)
cf.fit(data_pred2, 't', 'E')

hazards = cf.predict_partial_hazard(X)

# Compare with array argument
hazards_n = cf.predict_partial_hazard(np.array(X))
assert np.all(hazards == hazards_n)

def test_data_normalization(self, data_pred2):
# During fit, CoxPH copies the training data and normalizes it.
# Future calls should be normalized in the same way and
# internal training set should not be saved in a normalized state.

cf = CoxPHFitter(normalize=True)
cf = CoxPHFitter()
cf.fit(data_pred2, duration_col='t', event_col='E')

# Internal training set
Expand All @@ -731,20 +721,17 @@ def test_cox_ph_prediction_monotonicity(self, data_pred2):
e = data_pred2['E']
X = data_pred2[['x1', 'x2']]

for normalize in [True, False]:
msg = ("Predict methods should get the same concordance" +
" when {}normalizing".format('' if normalize else 'not '))
cf = CoxPHFitter(normalize=normalize)
cf.fit(data_pred2, duration_col='t', event_col='E')
cf = CoxPHFitter()
cf.fit(data_pred2, duration_col='t', event_col='E')

# Base comparison is partial_hazards
ci_ph = concordance_index(t, -cf.predict_partial_hazard(X).values, e)
# Base comparison is partial_hazards
ci_ph = concordance_index(t, -cf.predict_partial_hazard(X).values, e)

ci_med = concordance_index(t, cf.predict_median(X).ravel(), e)
assert ci_ph == ci_med, msg
ci_med = concordance_index(t, cf.predict_median(X).ravel(), e)
assert ci_ph == ci_med

ci_exp = concordance_index(t, cf.predict_expectation(X).ravel(), e)
assert ci_ph == ci_exp, msg
ci_exp = concordance_index(t, cf.predict_expectation(X).ravel(), e)
assert ci_ph == ci_exp

def test_crossval_for_cox_ph_with_normalizing_times(self, data_pred2, data_pred1):
cf = CoxPHFitter()
Expand Down Expand Up @@ -828,7 +815,7 @@ def test_output_against_R(self, rossi):
cat(round(mod.allison$coefficients, 4), sep=", ")
"""
expected = np.array([[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849, 0.0915]])
cf = CoxPHFitter(normalize=False)
cf = CoxPHFitter()
cf.fit(rossi, duration_col='week', event_col='arrest')
npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=3)

Expand All @@ -838,21 +825,20 @@ def test_output_with_strata_against_R(self, rossi):
r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
paro, mar, wexp) + prio, data = rossi)
"""
expected = np.array([[-0.335, -0.059, 0.100]])
cf = CoxPHFitter(normalize=False)
expected = np.array([[-0.3355, -0.0590, 0.1002]])
cf = CoxPHFitter()
cf.fit(rossi, duration_col='week', event_col='arrest', strata=['race', 'paro', 'mar', 'wexp'])
npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=3)


def test_penalized_output_against_R(self, rossi):
# R code:
#
# rossi <- read.csv('.../lifelines/datasets/rossi.csv')
# mod.allison <- coxph(Surv(week, arrest) ~ ridge(fin, age, race, wexp, mar, paro, prio,
# theta=1.0, scale=FALSE), data=rossi)
# theta=1.0, scale=TRUE), data=rossi)
# cat(round(mod.allison$coefficients, 4), sep=", ")
expected = np.array([[-0.3641, -0.0580, 0.2894, -0.1496, -0.3837, -0.0822, 0.0913]])
cf = CoxPHFitter(normalize=False, penalizer=1.0)
expected = np.array([[-0.3761, -0.0565, 0.3099, -0.1532, -0.4295, -0.0837, 0.0909]])
cf = CoxPHFitter(penalizer=1.0)
cf.fit(rossi, duration_col='week', event_col='arrest')
npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=3)

Expand All @@ -861,7 +847,7 @@ def test_coef_output_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschbe
df = load_kidney_transplant(usecols=['time', 'death',
'black_male', 'white_male',
'black_female'])
cf = CoxPHFitter(normalize=False)
cf = CoxPHFitter()
cf.fit(df, duration_col='time', event_col='death')

# coefs
Expand All @@ -872,7 +858,7 @@ def test_coef_output_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschbe
def test_se_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschberger(self):
# see table 8.1 in Survival Analysis by John P. Klein and Melvin L. Moeschberger, Second Edition
df = load_larynx()
cf = CoxPHFitter(normalize=False)
cf = CoxPHFitter()
cf.fit(df, duration_col='time', event_col='death')

# standard errors
Expand Down Expand Up @@ -911,16 +897,16 @@ def test_strata_works_if_only_a_single_element_is_in_the_strata(self):
cp.fit(df, 'T', 'Status', strata=['Stratum'])
assert True

def test_strata_against_r_output(self, rossi):
def test_strata_against_R_output(self, rossi):
"""
> library(survival)
> ross = read.csv('rossi.csv')
> rossi = read.csv('.../lifelines/datasets/rossi.csv')
> r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
paro, mar, wexp) + prio, data = rossi)
> r$loglik
"""

cp = CoxPHFitter(normalize=False)
cp = CoxPHFitter()
cp.fit(rossi, 'week', 'arrest', strata=['race', 'paro', 'mar', 'wexp'], include_likelihood=True)

npt.assert_almost_equal(cp.summary['coef'].values, [-0.335, -0.059, 0.100], decimal=3)
Expand All @@ -929,26 +915,18 @@ def test_strata_against_r_output(self, rossi):
def test_hazard_works_as_intended_with_strata_against_R_output(self, rossi):
"""
> library(survival)
> ross = read.csv('rossi.csv')
> rossi = read.csv('.../lifelines/datasets/rossi.csv')
> r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
paro, mar, wexp) + prio, data = rossi)
> basehaz(r, centered=FALSE)
> basehaz(r, centered=TRUE)
"""
cp = CoxPHFitter(normalize=False)
cp = CoxPHFitter()
cp.fit(rossi, 'week', 'arrest', strata=['race', 'paro', 'mar', 'wexp'])
npt.assert_almost_equal(cp.baseline_cumulative_hazard_[(0, 0, 0, 0)].ix[[14, 35, 37, 43, 52]].values, [0.28665890, 0.63524149, 1.01822603, 1.48403930, 1.48403930], decimal=2)
npt.assert_almost_equal(cp.baseline_cumulative_hazard_[(0, 0, 0, 1)].ix[[27, 43, 48, 52]].values, [0.35738173, 0.76415714, 1.26635373, 1.26635373], decimal=2)

def test_predict_log_hazard_relative_to_mean_with_normalization(self, rossi):
cox = CoxPHFitter(normalize=True)
cox.fit(rossi, 'week', 'arrest')
npt.assert_almost_equal(cp.baseline_cumulative_hazard_[(0, 0, 0, 0)].ix[[14, 35, 37, 43, 52]].values, [0.076600555, 0.169748261, 0.272088807, 0.396562717, 0.396562717], decimal=2)
npt.assert_almost_equal(cp.baseline_cumulative_hazard_[(0, 0, 0, 1)].ix[[27, 43, 48, 52]].values, [0.095499001, 0.204196905, 0.338393113, 0.338393113], decimal=2)

# they are equal because the data is normalized, so the mean of the covarites is all 0,
# thus exp(beta * 0) == 1, so exp(beta * X)/exp(beta * 0) = exp(beta * X)
assert_frame_equal(cox.predict_log_hazard_relative_to_mean(rossi), np.log(cox.predict_partial_hazard(rossi)))

def test_predict_log_hazard_relative_to_mean_without_normalization(self, rossi):
cox = CoxPHFitter(normalize=False)
def test_predict_log_hazard_relative_to_mean(self, rossi):
cox = CoxPHFitter()
cox.fit(rossi, 'week', 'arrest')
log_relative_hazards = cox.predict_log_hazard_relative_to_mean(rossi)
means = rossi.mean(0).to_frame().T
Expand Down

0 comments on commit 0fe0d10

Please sign in to comment.