Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding strata to coxphfitter args #270

Merged
merged 3 commits into from
Dec 29, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
- adding `plot_loglogs` to `KaplanMeierFitter`
- added a (correct) check to see if some columns in a dataset will cause convergence problems.
- removing `flat` argument in `plot` methods. It was causing confusion. To replicate it, one can set `ci_force_lines=True` and `show_censors=True`.
- adding `strata` keyword argument to `CoxPHFitter` on initialization (ex: `CoxPHFitter(strata=['v1', 'v2'])`. Why? Fitters initialized with `strata` can now be passed into `k_fold_cross_validation`, plus it makes unit testing `strata` fitters easier.

#### 0.9.2
- deprecates Pandas versions before 0.18.
Expand Down
9 changes: 5 additions & 4 deletions docs/Survival Regression.rst
Original file line number Diff line number Diff line change
Expand Up @@ -468,19 +468,20 @@ Sometimes a covariate may not obey the proportional hazard assumption. In this c
Concordance = 0.638
"""


Model Selection in Survival Regression
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

With censorship, it's not correct to use a loss function like mean-squared-error or
mean-absolute-loss. Instead, one measure is the c-index, or concordance-index. This measure
evaluates the ordering of predicted times: how correct is the ordering? It is infact a generalization
If censorship is present, it's not appropriate to use a loss function like mean-squared-error or
mean-absolute-loss. Instead, one measure is the concordance-index, also known as the c-index. This measure
evaluates the accuracy of the ordering of predicted time. It is infact a generalization
of AUC, another common loss function, and is interpreted similarly:

* 0.5 is the expected result from random predictions,
* 1.0 is perfect concordance and,
* 0.0 is perfect anti-concordance (multiply predictions with -1 to get 1.0)

The measure is implemented in lifelines under `lifelines.utils.concordance_index` and accepts the actual times (along with any censorships), and the predicted times.
The measure is implemented in lifelines under `lifelines.utils.concordance_index` and accepts the actual times (along with any censorships) and the predicted times.

Cross Validation
######################################
Expand Down
4 changes: 2 additions & 2 deletions lifelines/fitters/coxph_fitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class CoxPHFitter(BaseFitter):
The penalty is 1/2 * penalizer * ||beta||^2.
"""

def __init__(self, alpha=0.95, tie_method='Efron', normalize=True, penalizer=0.0):
def __init__(self, alpha=0.95, tie_method='Efron', normalize=True, penalizer=0.0, strata=None):
if not (0 < alpha <= 1.):
raise ValueError('alpha parameter must be between 0 and 1.')
if penalizer < 0:
Expand All @@ -45,7 +45,7 @@ def __init__(self, alpha=0.95, tie_method='Efron', normalize=True, penalizer=0.0
self.normalize = normalize
self.tie_method = tie_method
self.penalizer = penalizer
self.strata = None
self.strata = strata

def _get_efron_values(self, X, beta, T, E, include_likelihood=False):
"""
Expand Down
95 changes: 39 additions & 56 deletions tests/test_estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from collections import Counter, Iterable
import os
import warnings
from itertools import combinations

try:
from StringIO import StringIO as stringio, StringIO
Expand Down Expand Up @@ -520,79 +521,61 @@ def test_BHF_fit(self):

class TestRegressionFitters():

def test_pickle(self, rossi):
@pytest.fixture
def regression_models(self):
return [CoxPHFitter(), AalenAdditiveFitter(), CoxPHFitter(strata=['race', 'paro', 'mar', 'wexp'])]

def test_pickle(self, rossi, regression_models):
from pickle import dump
for fitter in [CoxPHFitter, AalenAdditiveFitter]:
for fitter in regression_models:
output = stringio()
f = fitter().fit(rossi, 'week', 'arrest')
f = fitter.fit(rossi, 'week', 'arrest')
dump(f, output)

def test_fit_methods_require_duration_col(self):
X = load_regression_dataset()

aaf = AalenAdditiveFitter()
cph = CoxPHFitter()

with pytest.raises(TypeError):
aaf.fit(X)
with pytest.raises(TypeError):
cph.fit(X)
def test_fit_methods_require_duration_col(self, rossi, regression_models):
for fitter in regression_models:
with pytest.raises(TypeError):
fitter.fit(rossi)

def test_fit_methods_can_accept_optional_event_col_param(self):
X = load_regression_dataset()

aaf = AalenAdditiveFitter()
aaf.fit(X, 'T', event_col='E')
assert_series_equal(aaf.event_observed.sort_index(), X['E'].astype(bool), check_names=False)

aaf.fit(X, 'T')
npt.assert_array_equal(aaf.event_observed.values, np.ones(X.shape[0]))
def test_fit_methods_can_accept_optional_event_col_param(self, regression_models, rossi):
for model in regression_models:
model.fit(rossi, 'week', event_col='arrest')
assert_series_equal(model.event_observed.sort_index(), rossi['arrest'].astype(bool), check_names=False)

cph = CoxPHFitter()
cph.fit(X, 'T', event_col='E')
assert_series_equal(cph.event_observed.sort_index(), X['E'].astype(bool), check_names=False)
model.fit(rossi, 'week')
npt.assert_array_equal(model.event_observed.values, np.ones(rossi.shape[0]))

cph.fit(X, 'T')
npt.assert_array_equal(cph.event_observed.values, np.ones(X.shape[0]))
def test_predict_methods_in_regression_return_same_types(self, regression_models, rossi):

def test_predict_methods_in_regression_return_same_types(self):
X = load_regression_dataset()

aaf = AalenAdditiveFitter()
cph = CoxPHFitter()

aaf.fit(X, duration_col='T', event_col='E')
cph.fit(X, duration_col='T', event_col='E')
fitted_regression_models = map(lambda model: model.fit(rossi, duration_col='week', event_col='arrest'), regression_models)

for fit_method in ['predict_percentile', 'predict_median', 'predict_expectation', 'predict_survival_function', 'predict_cumulative_hazard']:
assert isinstance(getattr(aaf, fit_method)(X), type(getattr(cph, fit_method)(X)))
for fitter1, fitter2 in combinations(fitted_regression_models, 2):
assert isinstance(getattr(fitter1, fit_method)(rossi), type(getattr(fitter2, fit_method)(rossi)))

def test_duration_vector_can_be_normalized(self):
df = load_kidney_transplant()
t = df['time']
normalized_df = df.copy()
normalized_df['time'] = (normalized_df['time'] - t.mean()) / t.std()
def test_duration_vector_can_be_normalized(self, regression_models, rossi):
t = rossi['week']
normalized_rossi = rossi.copy()
normalized_rossi['week'] = (normalized_rossi['week'] - t.mean()) / t.std()

for fitter in [CoxPHFitter(), AalenAdditiveFitter()]:
for fitter in regression_models:
# we drop indexs since aaf will have a different "time" index.
hazards = fitter.fit(df, duration_col='time', event_col='death').hazards_.reset_index(drop=True)
hazards_norm = fitter.fit(normalized_df, duration_col='time', event_col='death').hazards_.reset_index(drop=True)
hazards = fitter.fit(rossi, duration_col='week', event_col='arrest').hazards_.reset_index(drop=True)
hazards_norm = fitter.fit(normalized_rossi, duration_col='week', event_col='arrest').hazards_.reset_index(drop=True)
assert_frame_equal(hazards, hazards_norm)

def test_prediction_methods_respect_index(self, data_pred2):
x = data_pred2[['x1', 'x2']].ix[:3].sort_index(ascending=False)
def test_prediction_methods_respect_index(self, regression_models, rossi):
X = rossi.ix[:3].sort_index(ascending=False)
expected_index = pd.Index(np.array([3, 2, 1, 0]))

cph = CoxPHFitter()
cph.fit(data_pred2, duration_col='t', event_col='E')
npt.assert_array_equal(cph.predict_partial_hazard(x).index, expected_index)
npt.assert_array_equal(cph.predict_percentile(x).index, expected_index)
npt.assert_array_equal(cph.predict_expectation(x).index, expected_index)

aaf = AalenAdditiveFitter()
aaf.fit(data_pred2, duration_col='t', event_col='E')
npt.assert_array_equal(aaf.predict_percentile(x).index, expected_index)
npt.assert_array_equal(aaf.predict_expectation(x).index, expected_index)
for fitter in regression_models:
fitter.fit(rossi, duration_col='week', event_col='arrest')
npt.assert_array_equal(fitter.predict_percentile(X).index, expected_index)
npt.assert_array_equal(fitter.predict_expectation(X).index, expected_index)
try:
npt.assert_array_equal(fitter.predict_partial_hazard(X).index, expected_index)
except AttributeError:
pass


class TestCoxPHFitter():
Expand Down
5 changes: 5 additions & 0 deletions tests/utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,11 @@ def test_cross_validator_with_predictor_and_kwargs():
assert len(results_06) == 3


def test_cross_validator_with_stratified_cox_model():
cf = CoxPHFitter(strata=['race'])
utils.k_fold_cross_validation(cf, load_rossi(), duration_col='week', event_col='arrest')


def test_cross_validator_with_specific_loss_function():
def square_loss(y_actual, y_pred):
return ((y_actual - y_pred) ** 2).mean()
Expand Down