Merge 84e5852 into 6184551
CamDavidsonPilon committed May 29, 2017
2 parents 6184551 + 84e5852 commit ea8f160
Showing 4 changed files with 54 additions and 77 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
### Changelogs

#### 0.10.0
- corrected a bug that returned the wrong baseline survival and hazard values from `CoxPHFitter` when `normalize=True`.
- removed the `normalize` kwarg from `CoxPHFitter`; it caused confusion for users and added code complexity (a sketch of the new constructor follows this hunk).
- corrected the column name in `CoxPHFitter.baseline_survival_`.
- `CoxPHFitter.baseline_cumulative_hazard_` is now always centered, to mimic R's `basehaz` API.

#### 0.9.4
- adding `plot_loglogs` to `KaplanMeierFitter`
- added a (correct) check to see if some columns in a dataset will cause convergence problems.
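A minimal sketch of the constructor after this change (the keyword values shown are just the existing defaults, not new options):

from lifelines import CoxPHFitter

# `normalize` is no longer accepted; covariates are standardized internally
# during fit, and all reported quantities come back on the original scale.
cph = CoxPHFitter(alpha=0.95, tie_method='Efron', penalizer=0.0, strata=None)
# CoxPHFitter(normalize=True) would now raise a TypeError (unexpected keyword).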
45 changes: 19 additions & 26 deletions lifelines/fitters/coxph_fitter.py
@@ -33,7 +33,7 @@ class CoxPHFitter(BaseFitter):
The penalty is 1/2 * penalizer * ||beta||^2.
"""

def __init__(self, alpha=0.95, tie_method='Efron', normalize=True, penalizer=0.0, strata=None):
def __init__(self, alpha=0.95, tie_method='Efron', penalizer=0.0, strata=None):
if not (0 < alpha <= 1.):
raise ValueError('alpha parameter must be between 0 and 1.')
if penalizer < 0:
@@ -42,7 +42,6 @@ def __init__(self, alpha=0.95, tie_method='Efron', normalize=True, penalizer=0.0
raise NotImplementedError("Only Efron is available atm.")

self.alpha = alpha
self.normalize = normalize
self.tie_method = tie_method
self.penalizer = penalizer
self.strata = strata
@@ -295,32 +294,27 @@ def fit(self, df, duration_col, event_col=None,
E = df[event_col]
del df[event_col]

# Store original non-normalized data
self.data = df if self.strata is None else df.reset_index()
self._check_values(df)

if self.normalize:
# Need to normalize future inputs as well
self._norm_mean = df.mean(0)
self._norm_std = df.std(0)
df = normalize(df)
self._norm_mean = df.mean(0)
self._norm_std = df.std(0)
df = normalize(df, self._norm_mean, self._norm_std)

E = E.astype(bool)

hazards_ = self._newton_rhaphson(df, T, E, initial_beta=initial_beta,
show_progress=show_progress,
include_likelihood=include_likelihood)

self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns,
index=['coef'])
self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
self.confidence_intervals_ = self._compute_confidence_intervals()

self.durations = T
self.event_observed = E

self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E)
self.baseline_hazard_ = self._compute_baseline_hazards(normalize(df, 0, 1 / self._norm_std), T, E)
self.baseline_cumulative_hazard_ = self.baseline_hazard_.cumsum()
self.baseline_survival_ = exp(-self.baseline_cumulative_hazard_)
self.baseline_survival_ = self._compute_baseline_survival()
return self
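How the rescaling in `fit` works, as a minimal sketch with made-up numbers (`X`, `mu`, `sigma`, and `beta_z` are illustrative names, not attributes from this commit): fitting on standardized covariates Z = (X - mu) / sigma yields coefficients beta_z; dividing by sigma recovers coefficients on the original scale of X, and the leftover shift beta'mu is absorbed into the baseline hazard. Assuming the `normalize` helper follows the usual (x - mean) / std convention, `normalize(df, 0, 1 / self._norm_std)` turns the already standardized data back into mean-centered covariates, so the baseline hazard describes a subject at the covariate means, mirroring R's `basehaz(..., centered=TRUE)`.

import numpy as np
import pandas as pd

X = pd.DataFrame({'age': [25., 40., 60.], 'prio': [0., 2., 5.]})   # toy covariates
mu, sigma = X.mean(0), X.std(0)
beta_z = pd.Series({'age': 0.8, 'prio': -0.2})    # pretend coefficients from the standardized fit
beta = beta_z / sigma                             # original-scale coefficients, as in hazards_ / _norm_std
# The linear predictors agree; only the (centered) baseline absorbs the difference.
assert np.allclose(np.dot((X - mu) / sigma, beta_z), np.dot(X - mu, beta))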

def _check_values(self, X):
@@ -342,7 +336,7 @@ def _compute_confidence_intervals(self):
columns=self.hazards_.columns)

def _compute_standard_errors(self):
se = np.sqrt(inv(-self._hessian_).diagonal())
se = np.sqrt(inv(-self._hessian_).diagonal()) / self._norm_std
return pd.DataFrame(se[None, :],
index=['se'], columns=self.hazards_.columns)
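The division by `self._norm_std` above follows from the same rescaling (illustrative names, not code from this commit): if beta = beta_z / sigma, then Var(beta) = Var(beta_z) / sigma^2, so the standard errors from the standardized fit are divided by sigma and the confidence intervals built from them land on the original covariate scale.

import numpy as np

sigma = np.array([10.0, 0.5])           # illustrative per-covariate standard deviations
var_z = np.array([0.04, 0.09])          # illustrative variances from the Hessian of the standardized fit
se_original = np.sqrt(var_z) / sigma    # same as np.sqrt(var_z / sigma ** 2)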

@@ -404,8 +398,6 @@ def predict_partial_hazard(self, X):
can be in any order. If a numpy array, columns must be in the
same order as the training data.
If covariates were normalized during fitting, they are normalized
in the same way here.
If X is a dataframe, the order of the columns does not matter. But
if X is an array, then the column ordering is assumed to be the
@@ -414,16 +406,11 @@
Returns the partial hazard for the individuals, partial since the
baseline hazard is not included. Equal to \exp{\beta X}
"""
index = _get_index(X)

if isinstance(X, pd.DataFrame):
order = self.hazards_.columns
X = X[order]

if self.normalize:
# Assuming correct ordering and number of columns
X = normalize(X, self._norm_mean.values, self._norm_std.values)

index = _get_index(X)
return pd.DataFrame(exp(np.dot(X, self.hazards_.T)), index=index)
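A minimal usage sketch of the simplified prediction path (this assumes the `load_rossi` loader in `lifelines.datasets`; selecting columns via `hazards_.columns` just makes the ordering explicit, since a DataFrame argument is reordered to the fitted columns anyway):

from lifelines import CoxPHFitter
from lifelines.datasets import load_rossi

rossi = load_rossi()
cph = CoxPHFitter()
cph.fit(rossi, duration_col='week', event_col='arrest')

X = rossi[cph.hazards_.columns]           # covariates only, in the fitted order
partial = cph.predict_partial_hazard(X)   # exp(beta' x) per subject; the baseline hazard is not included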

def predict_log_hazard_relative_to_mean(self, X):
@@ -520,11 +507,17 @@ def _compute_baseline_hazards(self, df, T, E):
baseline_hazards_ = pd.DataFrame(index=self.durations.unique())
for stratum in df.index.unique():
baseline_hazards_ = baseline_hazards_.merge(
self._compute_baseline_hazard(data=df.ix[[stratum]], durations=T.ix[[stratum]], event_observed=E.ix[[stratum]], name=stratum),
left_index=True,
right_index=True,
how='left')
self._compute_baseline_hazard(data=df.ix[[stratum]], durations=T.ix[[stratum]], event_observed=E.ix[[stratum]], name=stratum),
left_index=True,
right_index=True,
how='left')
return baseline_hazards_.fillna(0)

else:
return self._compute_baseline_hazard(data=df, durations=T, event_observed=E, name='baseline hazard')

def _compute_baseline_survival(self):
survival_df = exp(-self.baseline_cumulative_hazard_)
if self.strata is None:
survival_df.columns = ['baseline survival']
return survival_df
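The new `_compute_baseline_survival` is the standard identity S0(t) = exp(-H0(t)) applied to the (now centered) baseline cumulative hazard; a minimal sketch with made-up values:

import numpy as np
import pandas as pd

baseline_cumulative_hazard = pd.DataFrame({'baseline hazard': [0.05, 0.12, 0.30]},
                                          index=[7, 14, 21])       # illustrative event times
baseline_survival = np.exp(-baseline_cumulative_hazard)
baseline_survival.columns = ['baseline survival']                   # the column name fixed in this commit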
2 changes: 1 addition & 1 deletion lifelines/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals

__version__ = '0.9.4'
__version__ = '0.10.0'
78 changes: 28 additions & 50 deletions tests/test_estimation.py
@@ -678,13 +678,13 @@ def test_efron_newtons_method(self, data_nus):
assert np.abs(newton(X, T, E)[0][0] - -0.0335) < 0.0001

def test_fit_method(self, data_nus):
cf = CoxPHFitter(normalize=False)
cf = CoxPHFitter()
cf.fit(data_nus, duration_col='t', event_col='E')
assert np.abs(cf.hazards_.ix[0][0] - -0.0335) < 0.0001

def test_using_dataframes_vs_numpy_arrays(self, data_pred2):
# First without normalization
cf = CoxPHFitter(normalize=False)
cf = CoxPHFitter()
cf.fit(data_pred2, 't', 'E')

X = data_pred2[cf.data.columns]
@@ -694,22 +694,12 @@ def test_using_dataframes_vs_numpy_arrays(self, data_pred2):
hazards_n = cf.predict_partial_hazard(np.array(X))
assert np.all(hazards == hazards_n)

# Now with normalization
cf = CoxPHFitter(normalize=True)
cf.fit(data_pred2, 't', 'E')

hazards = cf.predict_partial_hazard(X)

# Compare with array argument
hazards_n = cf.predict_partial_hazard(np.array(X))
assert np.all(hazards == hazards_n)

def test_data_normalization(self, data_pred2):
# During fit, CoxPH copies the training data and normalizes it.
# Future calls should be normalized in the same way and
# internal training set should not be saved in a normalized state.

cf = CoxPHFitter(normalize=True)
cf = CoxPHFitter()
cf.fit(data_pred2, duration_col='t', event_col='E')

# Internal training set
@@ -731,20 +721,17 @@ def test_cox_ph_prediction_monotonicity(self, data_pred2):
e = data_pred2['E']
X = data_pred2[['x1', 'x2']]

for normalize in [True, False]:
msg = ("Predict methods should get the same concordance" +
" when {}normalizing".format('' if normalize else 'not '))
cf = CoxPHFitter(normalize=normalize)
cf.fit(data_pred2, duration_col='t', event_col='E')
cf = CoxPHFitter()
cf.fit(data_pred2, duration_col='t', event_col='E')

# Base comparison is partial_hazards
ci_ph = concordance_index(t, -cf.predict_partial_hazard(X).values, e)
# Base comparison is partial_hazards
ci_ph = concordance_index(t, -cf.predict_partial_hazard(X).values, e)

ci_med = concordance_index(t, cf.predict_median(X).ravel(), e)
assert ci_ph == ci_med, msg
ci_med = concordance_index(t, cf.predict_median(X).ravel(), e)
assert ci_ph == ci_med

ci_exp = concordance_index(t, cf.predict_expectation(X).ravel(), e)
assert ci_ph == ci_exp, msg
ci_exp = concordance_index(t, cf.predict_expectation(X).ravel(), e)
assert ci_ph == ci_exp

def test_crossval_for_cox_ph_with_normalizing_times(self, data_pred2, data_pred1):
cf = CoxPHFitter()
@@ -828,7 +815,7 @@ def test_output_against_R(self, rossi):
cat(round(mod.allison$coefficients, 4), sep=", ")
"""
expected = np.array([[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849, 0.0915]])
cf = CoxPHFitter(normalize=False)
cf = CoxPHFitter()
cf.fit(rossi, duration_col='week', event_col='arrest')
npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=3)

@@ -839,20 +826,19 @@ def test_output_with_strata_against_R(self, rossi):
paro, mar, wexp) + prio, data = rossi)
"""
expected = np.array([[-0.335, -0.059, 0.100]])
cf = CoxPHFitter(normalize=False)
cf = CoxPHFitter()
cf.fit(rossi, duration_col='week', event_col='arrest', strata=['race', 'paro', 'mar', 'wexp'])
npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=3)


def test_penalized_output_against_R(self, rossi):
# R code:
#
# rossi <- read.csv('.../lifelines/datasets/rossi.csv')
# mod.allison <- coxph(Surv(week, arrest) ~ ridge(fin, age, race, wexp, mar, paro, prio,
# theta=1.0, scale=FALSE), data=rossi)
# theta=1.0, scale=TRUE), data=rossi)
# cat(round(mod.allison$coefficients, 4), sep=", ")
expected = np.array([[-0.3641, -0.0580, 0.2894, -0.1496, -0.3837, -0.0822, 0.0913]])
cf = CoxPHFitter(normalize=False, penalizer=1.0)
expected = np.array([[-0.3761, -0.0565, 0.3099, -0.1532, -0.4295, -0.0837, 0.0909]])
cf = CoxPHFitter(penalizer=1.0)
cf.fit(rossi, duration_col='week', event_col='arrest')
npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=3)
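For context on the updated penalized comparison: the objective maximized is the penalized log partial likelihood, in the notation of the class docstring above (a standard formulation, not code from this diff):

l(beta) - 1/2 * penalizer * ||beta||^2

Because covariates are now standardized inside `fit`, the penalty acts on the standardized coefficients, which is why the R reference switched to `ridge(..., scale=TRUE)` and the expected values were updated.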

@@ -861,7 +847,7 @@ def test_coef_output_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschbe
df = load_kidney_transplant(usecols=['time', 'death',
'black_male', 'white_male',
'black_female'])
cf = CoxPHFitter(normalize=False)
cf = CoxPHFitter()
cf.fit(df, duration_col='time', event_col='death')

# coefs
@@ -872,7 +858,7 @@ def test_se_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschberger(self):
def test_se_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschberger(self):
# see table 8.1 in Survival Analysis by John P. Klein and Melvin L. Moeschberger, Second Edition
df = load_larynx()
cf = CoxPHFitter(normalize=False)
cf = CoxPHFitter()
cf.fit(df, duration_col='time', event_col='death')

# standard errors
@@ -911,16 +897,16 @@ def test_strata_works_if_only_a_single_element_is_in_the_strata(self):
cp.fit(df, 'T', 'Status', strata=['Stratum'])
assert True

def test_strata_against_r_output(self, rossi):
def test_strata_against_R_output(self, rossi):
"""
> library(survival)
> ross = read.csv('rossi.csv')
> rossi = read.csv('.../lifelines/datasets/rossi.csv')
> r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
paro, mar, wexp) + prio, data = rossi)
> r$loglik
"""

cp = CoxPHFitter(normalize=False)
cp = CoxPHFitter()
cp.fit(rossi, 'week', 'arrest', strata=['race', 'paro', 'mar', 'wexp'], include_likelihood=True)

npt.assert_almost_equal(cp.summary['coef'].values, [-0.335, -0.059, 0.100], decimal=3)
@@ -929,26 +915,18 @@ def test_strata_against_r_output(self, rossi):
def test_hazard_works_as_intended_with_strata_against_R_output(self, rossi):
"""
> library(survival)
> ross = read.csv('rossi.csv')
> rossi = read.csv('.../lifelines/datasets/rossi.csv')
> r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
paro, mar, wexp) + prio, data = rossi)
> basehaz(r, centered=FALSE)
> basehaz(r, centered=TRUE)
"""
cp = CoxPHFitter(normalize=False)
cp = CoxPHFitter()
cp.fit(rossi, 'week', 'arrest', strata=['race', 'paro', 'mar', 'wexp'])
npt.assert_almost_equal(cp.baseline_cumulative_hazard_[(0, 0, 0, 0)].ix[[14, 35, 37, 43, 52]].values, [0.28665890, 0.63524149, 1.01822603, 1.48403930, 1.48403930], decimal=2)
npt.assert_almost_equal(cp.baseline_cumulative_hazard_[(0, 0, 0, 1)].ix[[27, 43, 48, 52]].values, [0.35738173, 0.76415714, 1.26635373, 1.26635373], decimal=2)

def test_predict_log_hazard_relative_to_mean_with_normalization(self, rossi):
cox = CoxPHFitter(normalize=True)
cox.fit(rossi, 'week', 'arrest')
npt.assert_almost_equal(cp.baseline_cumulative_hazard_[(0, 0, 0, 0)].ix[[14, 35, 37, 43, 52]].values, [0.076600555, 0.169748261, 0.272088807, 0.396562717, 0.396562717], decimal=2)
npt.assert_almost_equal(cp.baseline_cumulative_hazard_[(0, 0, 0, 1)].ix[[27, 43, 48, 52]].values, [0.095499001, 0.204196905, 0.338393113, 0.338393113], decimal=2)

# they are equal because the data is normalized, so the mean of the covariates is all 0,
# thus exp(beta * 0) == 1, so exp(beta * X)/exp(beta * 0) = exp(beta * X)
assert_frame_equal(cox.predict_log_hazard_relative_to_mean(rossi), np.log(cox.predict_partial_hazard(rossi)))

def test_predict_log_hazard_relative_to_mean_without_normalization(self, rossi):
cox = CoxPHFitter(normalize=False)
def test_predict_log_hazard_relative_to_mean(self, rossi):
cox = CoxPHFitter()
cox.fit(rossi, 'week', 'arrest')
log_relative_hazards = cox.predict_log_hazard_relative_to_mean(rossi)
means = rossi.mean(0).to_frame().T
