Skip to content

Commit

Permalink
Merge 25f6776 into 8136156
Browse files Browse the repository at this point in the history
  • Loading branch information
CamDavidsonPilon committed Dec 29, 2016
2 parents 8136156 + 25f6776 commit caf702f
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 62 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
- adding `plot_loglogs` to `KaplanMeierFitter`
- added a (correct) check to see if some columns in a dataset will cause convergence problems.
- removing `flat` argument in `plot` methods. It was causing confusion. To replicate it, one can set `ci_force_lines=True` and `show_censors=True`.
- adding `strata` keyword argument to `CoxPHFitter` on initialization (ex: `CoxPHFitter(strata=['v1', 'v2'])`. Why? Fitters initialized with `strata` can now be passed into `k_fold_cross_validation`, plus it makes unit testing `strata` fitters easier.

#### 0.9.2
- deprecates Pandas versions before 0.18.
Expand Down
9 changes: 5 additions & 4 deletions docs/Survival Regression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -468,19 +468,20 @@ Sometimes a covariate may not obey the proportional hazard assumption. In this c
Concordance = 0.638
"""
Model Selection in Survival Regression
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

With censorship, it's not correct to use a loss function like mean-squared-error or
mean-absolute-loss. Instead, one measure is the c-index, or concordance-index. This measure
evaluates the ordering of predicted times: how correct is the ordering? It is infact a generalization
If censorship is present, it's not appropriate to use a loss function like mean-squared-error or
mean-absolute-loss. Instead, one measure is the concordance-index, also known as the c-index. This measure
evaluates the accuracy of the ordering of predicted time. It is infact a generalization
of AUC, another common loss function, and is interpreted similarly:

* 0.5 is the expected result from random predictions,
* 1.0 is perfect concordance and,
* 0.0 is perfect anti-concordance (multiply predictions with -1 to get 1.0)

The measure is implemented in lifelines under `lifelines.utils.concordance_index` and accepts the actual times (along with any censorships), and the predicted times.
The measure is implemented in lifelines under `lifelines.utils.concordance_index` and accepts the actual times (along with any censorships) and the predicted times.

Cross Validation
######################################
Expand Down
4 changes: 2 additions & 2 deletions lifelines/fitters/coxph_fitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class CoxPHFitter(BaseFitter):
The penalty is 1/2 * penalizer * ||beta||^2.
"""

def __init__(self, alpha=0.95, tie_method='Efron', normalize=True, penalizer=0.0):
def __init__(self, alpha=0.95, tie_method='Efron', normalize=True, penalizer=0.0, strata=None):
if not (0 < alpha <= 1.):
raise ValueError('alpha parameter must be between 0 and 1.')
if penalizer < 0:
Expand All @@ -45,7 +45,7 @@ def __init__(self, alpha=0.95, tie_method='Efron', normalize=True, penalizer=0.0
self.normalize = normalize
self.tie_method = tie_method
self.penalizer = penalizer
self.strata = None
self.strata = strata

def _get_efron_values(self, X, beta, T, E, include_likelihood=False):
"""
Expand Down
95 changes: 39 additions & 56 deletions tests/test_estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from collections import Counter, Iterable
import os
import warnings
from itertools import combinations

try:
from StringIO import StringIO as stringio, StringIO
Expand Down Expand Up @@ -520,79 +521,61 @@ def test_BHF_fit(self):

class TestRegressionFitters():

def test_pickle(self, rossi):
@pytest.fixture
def regression_models(self):
return [CoxPHFitter(), AalenAdditiveFitter(), CoxPHFitter(strata=['race', 'paro', 'mar', 'wexp'])]

def test_pickle(self, rossi, regression_models):
from pickle import dump
for fitter in [CoxPHFitter, AalenAdditiveFitter]:
for fitter in regression_models:
output = stringio()
f = fitter().fit(rossi, 'week', 'arrest')
f = fitter.fit(rossi, 'week', 'arrest')
dump(f, output)

def test_fit_methods_require_duration_col(self):
X = load_regression_dataset()

aaf = AalenAdditiveFitter()
cph = CoxPHFitter()

with pytest.raises(TypeError):
aaf.fit(X)
with pytest.raises(TypeError):
cph.fit(X)
def test_fit_methods_require_duration_col(self, rossi, regression_models):
for fitter in regression_models:
with pytest.raises(TypeError):
fitter.fit(rossi)

def test_fit_methods_can_accept_optional_event_col_param(self):
X = load_regression_dataset()

aaf = AalenAdditiveFitter()
aaf.fit(X, 'T', event_col='E')
assert_series_equal(aaf.event_observed.sort_index(), X['E'].astype(bool), check_names=False)

aaf.fit(X, 'T')
npt.assert_array_equal(aaf.event_observed.values, np.ones(X.shape[0]))
def test_fit_methods_can_accept_optional_event_col_param(self, regression_models, rossi):
for model in regression_models:
model.fit(rossi, 'week', event_col='arrest')
assert_series_equal(model.event_observed.sort_index(), rossi['arrest'].astype(bool), check_names=False)

cph = CoxPHFitter()
cph.fit(X, 'T', event_col='E')
assert_series_equal(cph.event_observed.sort_index(), X['E'].astype(bool), check_names=False)
model.fit(rossi, 'week')
npt.assert_array_equal(model.event_observed.values, np.ones(rossi.shape[0]))

cph.fit(X, 'T')
npt.assert_array_equal(cph.event_observed.values, np.ones(X.shape[0]))
def test_predict_methods_in_regression_return_same_types(self, regression_models, rossi):

def test_predict_methods_in_regression_return_same_types(self):
X = load_regression_dataset()

aaf = AalenAdditiveFitter()
cph = CoxPHFitter()

aaf.fit(X, duration_col='T', event_col='E')
cph.fit(X, duration_col='T', event_col='E')
fitted_regression_models = map(lambda model: model.fit(rossi, duration_col='week', event_col='arrest'), regression_models)

for fit_method in ['predict_percentile', 'predict_median', 'predict_expectation', 'predict_survival_function', 'predict_cumulative_hazard']:
assert isinstance(getattr(aaf, fit_method)(X), type(getattr(cph, fit_method)(X)))
for fitter1, fitter2 in combinations(fitted_regression_models, 2):
assert isinstance(getattr(fitter1, fit_method)(rossi), type(getattr(fitter2, fit_method)(rossi)))

def test_duration_vector_can_be_normalized(self):
df = load_kidney_transplant()
t = df['time']
normalized_df = df.copy()
normalized_df['time'] = (normalized_df['time'] - t.mean()) / t.std()
def test_duration_vector_can_be_normalized(self, regression_models, rossi):
t = rossi['week']
normalized_rossi = rossi.copy()
normalized_rossi['week'] = (normalized_rossi['week'] - t.mean()) / t.std()

for fitter in [CoxPHFitter(), AalenAdditiveFitter()]:
for fitter in regression_models:
# we drop indexs since aaf will have a different "time" index.
hazards = fitter.fit(df, duration_col='time', event_col='death').hazards_.reset_index(drop=True)
hazards_norm = fitter.fit(normalized_df, duration_col='time', event_col='death').hazards_.reset_index(drop=True)
hazards = fitter.fit(rossi, duration_col='week', event_col='arrest').hazards_.reset_index(drop=True)
hazards_norm = fitter.fit(normalized_rossi, duration_col='week', event_col='arrest').hazards_.reset_index(drop=True)
assert_frame_equal(hazards, hazards_norm)

def test_prediction_methods_respect_index(self, data_pred2):
x = data_pred2[['x1', 'x2']].ix[:3].sort_index(ascending=False)
def test_prediction_methods_respect_index(self, regression_models, rossi):
X = rossi.ix[:3].sort_index(ascending=False)
expected_index = pd.Index(np.array([3, 2, 1, 0]))

cph = CoxPHFitter()
cph.fit(data_pred2, duration_col='t', event_col='E')
npt.assert_array_equal(cph.predict_partial_hazard(x).index, expected_index)
npt.assert_array_equal(cph.predict_percentile(x).index, expected_index)
npt.assert_array_equal(cph.predict_expectation(x).index, expected_index)

aaf = AalenAdditiveFitter()
aaf.fit(data_pred2, duration_col='t', event_col='E')
npt.assert_array_equal(aaf.predict_percentile(x).index, expected_index)
npt.assert_array_equal(aaf.predict_expectation(x).index, expected_index)
for fitter in regression_models:
fitter.fit(rossi, duration_col='week', event_col='arrest')
npt.assert_array_equal(fitter.predict_percentile(X).index, expected_index)
npt.assert_array_equal(fitter.predict_expectation(X).index, expected_index)
try:
npt.assert_array_equal(fitter.predict_partial_hazard(X).index, expected_index)
except AttributeError:
pass


class TestCoxPHFitter():
Expand Down
5 changes: 5 additions & 0 deletions tests/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,11 @@ def test_cross_validator_with_predictor_and_kwargs():
assert len(results_06) == 3


def test_cross_validator_with_stratified_cox_model():
cf = CoxPHFitter(strata=['race'])
utils.k_fold_cross_validation(cf, load_rossi(), duration_col='week', event_col='arrest')


def test_cross_validator_with_specific_loss_function():
def square_loss(y_actual, y_pred):
return ((y_actual - y_pred) ** 2).mean()
Expand Down

0 comments on commit caf702f

Please sign in to comment.