From 20809c17d6e7cb4e71f52b9cdc2755d2ce7b0ac1 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Tue, 23 Jun 2015 22:42:18 -0400 Subject: [PATCH 01/11] bump version to 0.8.0 --- lifelines/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lifelines/version.py b/lifelines/version.py index 8b4603bbb..94d895c07 100644 --- a/lifelines/version.py +++ b/lifelines/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '0.7.1.0' +__version__ = '0.8.0.0' From 5858b3885486e3dda8f15578d84531f77e030a0d Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Tue, 23 Jun 2015 22:25:51 -0400 Subject: [PATCH 02/11] move files around first --- .travis.yml | 2 +- lifelines/estimation.py | 16 ++++++++-------- .../{_base_fitter.py => fitters/__init__.py} | 2 ++ lifelines/{ => fitters}/aalen_additive_fitter.py | 6 ++++-- .../breslow_fleming_harrington_fitter.py | 4 ++-- lifelines/{ => fitters}/coxph_fitter.py | 2 +- lifelines/{ => fitters}/exponential_fitter.py | 2 +- lifelines/{ => fitters}/kaplan_meier_fitter.py | 2 +- lifelines/{ => fitters}/nelson_aalen_fitter.py | 2 +- lifelines/{ => fitters}/weibull_fitter.py | 3 ++- lifelines/tests/__init__.py | 0 lifelines/{utils.py => utils/__init__.py} | 0 lifelines/{ => utils}/progress_bar.py | 0 setup.py | 5 +++-- {lifelines/tests => tests}/__main__.py | 0 {lifelines/tests => tests}/conftest.py | 0 {lifelines/tests => tests}/test_estimation.py | 10 +++++----- .../tests => tests}/test_generate_datasets.py | 4 ++-- {lifelines/tests => tests}/test_plotting.py | 6 +++--- {lifelines/tests => tests}/test_statistics.py | 4 ++-- {lifelines/tests => tests}/test_weibull.py | 2 +- {lifelines/tests => tests/utils}/test_utils.py | 5 +++-- 22 files changed, 42 insertions(+), 35 deletions(-) rename lifelines/{_base_fitter.py => fitters/__init__.py} (99%) rename lifelines/{ => fitters}/aalen_additive_fitter.py (99%) rename lifelines/{ => fitters}/breslow_fleming_harrington_fitter.py (96%) 
rename lifelines/{ => fitters}/coxph_fitter.py (99%) rename lifelines/{ => fitters}/exponential_fitter.py (98%) rename lifelines/{ => fitters}/kaplan_meier_fitter.py (98%) rename lifelines/{ => fitters}/nelson_aalen_fitter.py (99%) rename lifelines/{ => fitters}/weibull_fitter.py (99%) delete mode 100644 lifelines/tests/__init__.py rename lifelines/{utils.py => utils/__init__.py} (100%) rename lifelines/{ => utils}/progress_bar.py (100%) rename {lifelines/tests => tests}/__main__.py (100%) rename {lifelines/tests => tests}/conftest.py (100%) rename {lifelines/tests => tests}/test_estimation.py (99%) rename {lifelines/tests => tests}/test_generate_datasets.py (87%) rename {lifelines/tests => tests}/test_plotting.py (97%) rename {lifelines/tests => tests}/test_statistics.py (98%) rename {lifelines/tests => tests}/test_weibull.py (92%) rename {lifelines/tests => tests/utils}/test_utils.py (99%) diff --git a/.travis.yml b/.travis.yml index 663e0b860..b799c623f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,7 +28,7 @@ install: - conda create -n test-environment python=$TRAVIS_PYTHON_VERSION pip numpy scipy pandas matplotlib pytest - source activate test-environment # Build in place so we can run tests - - python setup.py build_ext --inplace + - python setup.py install - pip install coveralls - pip install pytest-cov # command to run tests diff --git a/lifelines/estimation.py b/lifelines/estimation.py index 906506b6c..00925a61b 100644 --- a/lifelines/estimation.py +++ b/lifelines/estimation.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- -from lifelines._base_fitter import BaseFitter -from lifelines.weibull_fitter import WeibullFitter -from lifelines.exponential_fitter import ExponentialFitter -from lifelines.nelson_aalen_fitter import NelsonAalenFitter -from lifelines.kaplan_meier_fitter import KaplanMeierFitter -from lifelines.breslow_fleming_harrington_fitter import BreslowFlemingHarringtonFitter -from lifelines.coxph_fitter import CoxPHFitter -from 
lifelines.aalen_additive_fitter import AalenAdditiveFitter +from lifelines.fitters import BaseFitter +from lifelines.fitters.weibull_fitter import WeibullFitter +from lifelines.fitters.exponential_fitter import ExponentialFitter +from lifelines.fitters.nelson_aalen_fitter import NelsonAalenFitter +from lifelines.fitters.kaplan_meier_fitter import KaplanMeierFitter +from lifelines.fitters.breslow_fleming_harrington_fitter import BreslowFlemingHarringtonFitter +from lifelines.fitters.coxph_fitter import CoxPHFitter +from lifelines.fitters.aalen_additive_fitter import AalenAdditiveFitter diff --git a/lifelines/_base_fitter.py b/lifelines/fitters/__init__.py similarity index 99% rename from lifelines/_base_fitter.py rename to lifelines/fitters/__init__.py index 089ccf9dc..3cba379b2 100644 --- a/lifelines/_base_fitter.py +++ b/lifelines/fitters/__init__.py @@ -1,7 +1,9 @@ # -*- coding: utf-8 -*- from __future__ import print_function + import numpy as np import pandas as pd + from lifelines.plotting import plot_estimate from lifelines.utils import qth_survival_times diff --git a/lifelines/aalen_additive_fitter.py b/lifelines/fitters/aalen_additive_fitter.py similarity index 99% rename from lifelines/aalen_additive_fitter.py rename to lifelines/fitters/aalen_additive_fitter.py index 59dffcd4c..c905a023b 100644 --- a/lifelines/aalen_additive_fitter.py +++ b/lifelines/fitters/aalen_additive_fitter.py @@ -1,13 +1,15 @@ # -*- coding: utf-8 -*- from __future__ import print_function + import numpy as np import pandas as pd from numpy.linalg import LinAlgError from scipy.integrate import trapz -from lifelines._base_fitter import BaseFitter + +from lifelines.fitters import BaseFitter from lifelines.utils import _get_index, inv_normal_cdf, epanechnikov_kernel, \ ridge_regression as lr, qth_survival_times -from lifelines.progress_bar import progress_bar +from lifelines.utils.progress_bar import progress_bar from lifelines.plotting import plot_regressions diff --git 
a/lifelines/breslow_fleming_harrington_fitter.py b/lifelines/fitters/breslow_fleming_harrington_fitter.py similarity index 96% rename from lifelines/breslow_fleming_harrington_fitter.py rename to lifelines/fitters/breslow_fleming_harrington_fitter.py index 4090706fa..9561d50e7 100644 --- a/lifelines/breslow_fleming_harrington_fitter.py +++ b/lifelines/fitters/breslow_fleming_harrington_fitter.py @@ -2,8 +2,8 @@ from __future__ import print_function import numpy as np -from lifelines._base_fitter import UnivariateFitter -from lifelines.nelson_aalen_fitter import NelsonAalenFitter +from lifelines.fitters import UnivariateFitter +from lifelines.fitters.nelson_aalen_fitter import NelsonAalenFitter from lifelines.utils import median_survival_times diff --git a/lifelines/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py similarity index 99% rename from lifelines/coxph_fitter.py rename to lifelines/fitters/coxph_fitter.py index 1aea9c5f1..fbe530ff5 100644 --- a/lifelines/coxph_fitter.py +++ b/lifelines/fitters/coxph_fitter.py @@ -8,7 +8,7 @@ from scipy.integrate import trapz import scipy.stats as stats -from lifelines._base_fitter import BaseFitter +from lifelines.fitters import BaseFitter from lifelines.utils import survival_table_from_events, inv_normal_cdf, normalize,\ significance_code, concordance_index, _get_index, qth_survival_times diff --git a/lifelines/exponential_fitter.py b/lifelines/fitters/exponential_fitter.py similarity index 98% rename from lifelines/exponential_fitter.py rename to lifelines/fitters/exponential_fitter.py index 9596c6000..64b5828d0 100644 --- a/lifelines/exponential_fitter.py +++ b/lifelines/fitters/exponential_fitter.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd -from lifelines._base_fitter import UnivariateFitter +from lifelines.fitters import UnivariateFitter from lifelines.utils import inv_normal_cdf diff --git a/lifelines/kaplan_meier_fitter.py b/lifelines/fitters/kaplan_meier_fitter.py similarity index 98% rename from 
lifelines/kaplan_meier_fitter.py rename to lifelines/fitters/kaplan_meier_fitter.py index 740e7c8a3..9d6fa9aef 100644 --- a/lifelines/kaplan_meier_fitter.py +++ b/lifelines/fitters/kaplan_meier_fitter.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd -from lifelines._base_fitter import UnivariateFitter +from lifelines.fitters import UnivariateFitter from lifelines.utils import _preprocess_inputs, _additive_estimate, StatError, inv_normal_cdf,\ median_survival_times diff --git a/lifelines/nelson_aalen_fitter.py b/lifelines/fitters/nelson_aalen_fitter.py similarity index 99% rename from lifelines/nelson_aalen_fitter.py rename to lifelines/fitters/nelson_aalen_fitter.py index 5cf73589d..359fe4a20 100644 --- a/lifelines/nelson_aalen_fitter.py +++ b/lifelines/fitters/nelson_aalen_fitter.py @@ -3,7 +3,7 @@ import numpy as np import pandas as pd -from lifelines._base_fitter import UnivariateFitter +from lifelines.fitters import UnivariateFitter from lifelines.utils import _preprocess_inputs, _additive_estimate, epanechnikov_kernel,\ inv_normal_cdf diff --git a/lifelines/weibull_fitter.py b/lifelines/fitters/weibull_fitter.py similarity index 99% rename from lifelines/weibull_fitter.py rename to lifelines/fitters/weibull_fitter.py index 5f6afd1f3..dc86e139b 100644 --- a/lifelines/weibull_fitter.py +++ b/lifelines/fitters/weibull_fitter.py @@ -2,8 +2,9 @@ from __future__ import print_function, division import numpy as np import pandas as pd + from numpy.linalg import solve, norm, inv -from lifelines._base_fitter import UnivariateFitter +from lifelines.fitters import UnivariateFitter from lifelines.utils import inv_normal_cdf def _negative_log_likelihood(lambda_rho, T, E): diff --git a/lifelines/tests/__init__.py b/lifelines/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/lifelines/utils.py b/lifelines/utils/__init__.py similarity index 100% rename from lifelines/utils.py rename to lifelines/utils/__init__.py diff --git 
a/lifelines/progress_bar.py b/lifelines/utils/progress_bar.py similarity index 100% rename from lifelines/progress_bar.py rename to lifelines/utils/progress_bar.py diff --git a/setup.py b/setup.py index d4ab610b8..f75e6aaee 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,9 @@ def read(fname): url="https://github.com/CamDavidsonPilon/lifelines", packages=['lifelines', 'lifelines.datasets', - 'lifelines.tests'], + 'lifelines.fitters', + 'lifelines.utils', + ], long_description=read('README.txt'), classifiers=[ "Development Status :: 4 - Beta", @@ -44,7 +46,6 @@ def read(fname): "../README.txt", "../LICENSE", "../MANIFEST.in", - "../*.ipynb", "datasets/*", ] }, diff --git a/lifelines/tests/__main__.py b/tests/__main__.py similarity index 100% rename from lifelines/tests/__main__.py rename to tests/__main__.py diff --git a/lifelines/tests/conftest.py b/tests/conftest.py similarity index 100% rename from lifelines/tests/conftest.py rename to tests/conftest.py diff --git a/lifelines/tests/test_estimation.py b/tests/test_estimation.py similarity index 99% rename from lifelines/tests/test_estimation.py rename to tests/test_estimation.py index b8168abdf..209e1b753 100644 --- a/lifelines/tests/test_estimation.py +++ b/tests/test_estimation.py @@ -9,14 +9,14 @@ from pandas.util.testing import assert_frame_equal, assert_series_equal import numpy.testing as npt -from ..utils import k_fold_cross_validation, StatError -from ..estimation import CoxPHFitter, AalenAdditiveFitter, KaplanMeierFitter, \ +from lifelines.utils import k_fold_cross_validation, StatError +from lifelines.estimation import CoxPHFitter, AalenAdditiveFitter, KaplanMeierFitter, \ NelsonAalenFitter, BreslowFlemingHarringtonFitter, ExponentialFitter, \ WeibullFitter, BaseFitter -from ..datasets import load_regression_dataset, load_larynx, load_waltons, load_kidney_transplant, load_rossi,\ +from lifelines.datasets import load_regression_dataset, load_larynx, load_waltons, load_kidney_transplant, load_rossi,\ 
load_lcd, load_panel_test, load_g3, load_holly_molly_polly -from ..generate_datasets import generate_hazard_rates, generate_random_lifetimes, cumulative_integral -from ..utils import concordance_index +from lifelines.generate_datasets import generate_hazard_rates, generate_random_lifetimes, cumulative_integral +from lifelines.utils import concordance_index @pytest.fixture diff --git a/lifelines/tests/test_generate_datasets.py b/tests/test_generate_datasets.py similarity index 87% rename from lifelines/tests/test_generate_datasets.py rename to tests/test_generate_datasets.py index e9f46cdd8..6091a42d4 100644 --- a/lifelines/tests/test_generate_datasets.py +++ b/tests/test_generate_datasets.py @@ -5,8 +5,8 @@ import pytest import matplotlib.pyplot as plt -from ..estimation import KaplanMeierFitter, NelsonAalenFitter -from ..generate_datasets import exponential_survival_data +from lifelines.estimation import KaplanMeierFitter, NelsonAalenFitter +from lifelines.generate_datasets import exponential_survival_data def test_exponential_data_sets_correct_censor(): diff --git a/lifelines/tests/test_plotting.py b/tests/test_plotting.py similarity index 97% rename from lifelines/tests/test_plotting.py rename to tests/test_plotting.py index a87584072..efc64e9c6 100644 --- a/lifelines/tests/test_plotting.py +++ b/tests/test_plotting.py @@ -3,9 +3,9 @@ import os import pytest import numpy as np -from ..estimation import NelsonAalenFitter, KaplanMeierFitter, AalenAdditiveFitter -from ..generate_datasets import generate_random_lifetimes, generate_hazard_rates -from ..plotting import plot_lifetimes +from lifelines.estimation import NelsonAalenFitter, KaplanMeierFitter, AalenAdditiveFitter +from lifelines.generate_datasets import generate_random_lifetimes, generate_hazard_rates +from lifelines.plotting import plot_lifetimes @pytest.mark.plottest diff --git a/lifelines/tests/test_statistics.py b/tests/test_statistics.py similarity index 98% rename from 
lifelines/tests/test_statistics.py rename to tests/test_statistics.py index 8dd4f1bd9..1d999109d 100644 --- a/lifelines/tests/test_statistics.py +++ b/tests/test_statistics.py @@ -4,8 +4,8 @@ import numpy.testing as npt import pytest -from .. import statistics as stats -from ..datasets import load_waltons, load_g3 +from lifelines import statistics as stats +from lifelines.datasets import load_waltons, load_g3 def test_unequal_intensity_with_random_data(): diff --git a/lifelines/tests/test_weibull.py b/tests/test_weibull.py similarity index 92% rename from lifelines/tests/test_weibull.py rename to tests/test_weibull.py index 1c302a680..0eb21f674 100644 --- a/lifelines/tests/test_weibull.py +++ b/tests/test_weibull.py @@ -1,6 +1,6 @@ import numpy as np -from lifelines import weibull_fitter as wf +from lifelines.fitters import weibull_fitter as wf def test_lambda_gradient(): diff --git a/lifelines/tests/test_utils.py b/tests/utils/test_utils.py similarity index 99% rename from lifelines/tests/test_utils.py rename to tests/utils/test_utils.py index 9dcf006f0..d5d2a5676 100644 --- a/lifelines/tests/test_utils.py +++ b/tests/utils/test_utils.py @@ -7,9 +7,10 @@ import numpy.testing as npt from numpy.linalg import norm, lstsq from numpy.random import randn -from ..estimation import CoxPHFitter -from ..datasets import (load_regression_dataset, load_larynx, +from lifelines.estimation import CoxPHFitter +from lifelines.datasets import (load_regression_dataset, load_larynx, load_waltons, load_rossi) + from lifelines import utils from lifelines.utils import _concordance_index as fast_cindex from lifelines.utils import _naive_concordance_index as slow_cindex From eb4afe0616596310bb99b3f3f9f97d32838c18ad Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Thu, 9 Jul 2015 20:51:38 -0400 Subject: [PATCH 03/11] add at-risk column to survival table --- lifelines/plotting.py | 2 +- lifelines/utils/__init__.py | 34 ++++++++++++++++++---------------- tests/utils/test_utils.py | 
7 +++++++ 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/lifelines/plotting.py b/lifelines/plotting.py index 787b8b0ed..d2ccdafeb 100644 --- a/lifelines/plotting.py +++ b/lifelines/plotting.py @@ -2,7 +2,7 @@ from __future__ import print_function import numpy as np -from lifelines.utils import coalesce +from .utils import coalesce def is_latex_enabled(): diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py index 36cc0ec76..3481fa9c2 100644 --- a/lifelines/utils/__init__.py +++ b/lifelines/utils/__init__.py @@ -121,6 +121,9 @@ def group_survival_table_from_events(groups, durations, event_observed, birth_ti unique_groups = groups.unique() # set first group + + ### This function is terrible. clean it up! + g = unique_groups[0] ix = (groups == g) T = durations[ix] @@ -147,7 +150,8 @@ def group_survival_table_from_events(groups, durations, event_observed, birth_ti def survival_table_from_events(death_times, event_observed, birth_times=None, - columns=["removed", "observed", "censored", "entrance"], weights=None): + columns=["removed", "observed", "censored", "entrance", "at_risk"], + weights=None): """ Parameters: death_times: (n,) array of event times @@ -167,21 +171,18 @@ def survival_table_from_events(death_times, event_observed, birth_times=None, left the population due to event_observed) Example: - #input - survival_table_from_events( waltonT, np.ones_like(waltonT)) #available in test suite - - #output - removed observed censored entrance + removed observed censored entrance at_risk event_at - 0 0 0 0 11 - 6 1 1 0 0 - 7 2 2 0 0 - 9 3 3 0 0 - 13 3 3 0 0 - 15 2 2 0 0 + 0 0 0 0 11 11 + 6 1 1 0 0 11 + 7 2 2 0 0 10 + 9 3 3 0 0 8 + 13 3 3 0 0 5 + 15 2 2 0 0 2 """ + removed, observed, censored, entrance, at_risk = columns death_times = np.asarray(death_times) if birth_times is None: birth_times = min(0, death_times.min()) * np.ones(death_times.shape[0]) @@ -192,17 +193,18 @@ def survival_table_from_events(death_times, event_observed, 
birth_times=None, # deal with deaths and censorships df = pd.DataFrame(death_times, columns=["event_at"]) - df[columns[0]] = 1 if weights is None else weights - df[columns[1]] = np.asarray(event_observed) + df[removed] = 1 if weights is None else weights + df[observed] = np.asarray(event_observed) death_table = df.groupby("event_at").sum() - death_table[columns[2]] = (death_table[columns[0]] - death_table[columns[1]]).astype(int) + death_table[censored] = (death_table[removed] - death_table[observed]).astype(int) # deal with late births births = pd.DataFrame(birth_times, columns=['event_at']) - births[columns[3]] = 1 + births[entrance] = 1 births_table = births.groupby('event_at').sum() event_table = death_table.join(births_table, how='outer', sort=True).fillna(0) # http://wesmckinney.com/blog/?p=414 + event_table[at_risk] = event_table[entrance].cumsum() - event_table[removed].cumsum().shift(1).fillna(0) return event_table.astype(float) diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index d5d2a5676..42465ea50 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -207,6 +207,13 @@ def test_group_survival_table_from_events_on_waltons_data(): assert all(removed.index == observed.index) assert all(removed.index == censored.index) +def test_survival_table_from_events_at_risk_column(): + df = load_waltons() + # from R + expected = [163.0, 162.0, 160.0, 157.0, 154.0, 152.0, 151.0, 148.0, 144.0, 139.0, 134.0, 133.0, 130.0, 128.0, 126.0, 119.0, 118.0, + 108.0, 107.0, 99.0, 96.0, 89.0, 87.0, 69.0, 65.0, 49.0, 38.0, 36.0, 27.0, 24.0, 14.0, 1.0] + df = utils.survival_table_from_events(df['T'], df['E']) + assert list(df['at_risk'][1:]) == expected # skip the first event as that is the birth time, 0. 
def test_survival_table_to_events_casts_to_float(): T, C = np.array([1, 2, 3, 4, 4, 5]), np.array([True, False, True, True, True, True]) From 607945d6b155d746131f11293257323f7fc6b53a Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Sun, 12 Jul 2015 20:29:58 -0400 Subject: [PATCH 04/11] Adding at risk column to survival_table_from_events --- lifelines/utils/__init__.py | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py index 3481fa9c2..d77f8e590 100644 --- a/lifelines/utils/__init__.py +++ b/lifelines/utils/__init__.py @@ -108,8 +108,10 @@ def group_survival_table_from_events(groups, durations, event_observed, birth_ti ] """ + n = np.max(groups.shape) assert n == np.max(durations.shape) == np.max(event_observed.shape), "inputs must be of the same length." + if birth_times is None: # Create some birth times birth_times = np.zeros(np.max(durations.shape)) @@ -117,35 +119,26 @@ def group_survival_table_from_events(groups, durations, event_observed, birth_ti assert n == np.max(birth_times.shape), "inputs must be of the same length." - groups, durations, event_observed, birth_times = map(lambda x: pd.Series(np.reshape(x, (n,))), [groups, durations, event_observed, birth_times]) + groups, durations, event_observed, birth_times = [pd.Series(np.reshape(data, (n,))) for data in [groups, durations, event_observed, birth_times]] unique_groups = groups.unique() - # set first group - - ### This function is terrible. clean it up! 
- - g = unique_groups[0] - ix = (groups == g) - T = durations[ix] - C = event_observed[ix] - B = birth_times[ix] - - g_name = str(g) - data = survival_table_from_events(T, C, B, - columns=['removed:' + g_name, "observed:" + g_name, 'censored:' + g_name, 'entrance' + g_name]) - for g in unique_groups[1:]: - ix = groups == g + for i, group in enumerate(unique_groups): + ix = groups == group T = durations[ix] C = event_observed[ix] B = birth_times[ix] - g_name = str(g) - data = data.join(survival_table_from_events(T, C, B, - columns=['removed:' + g_name, "observed:" + g_name, 'censored:' + g_name, 'entrance' + g_name]), - how='outer') + group_name = str(group) + columns = [event_name + ":" + group_name for event_name in ['removed', 'observed', 'censored', 'entrance', 'at_risk']] + if i == 0: + data = survival_table_from_events(T, C, B, columns=columns) + else: + data = data.join(survival_table_from_events(T, C, B, columns=columns), how='outer') + data = data.fillna(0) # hmmm pandas its too bad I can't do data.ix[:limit] and leave out the if. 
if int(limit) != -1: data = data.ix[:limit] + return unique_groups, data.filter(like='removed:'), data.filter(like='observed:'), data.filter(like='censored:') From cca47600a72decbc8658af5c697a286138b87deb Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Sun, 12 Jul 2015 20:33:07 -0400 Subject: [PATCH 05/11] use at risk column in additive estimates --- lifelines/utils/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py index d77f8e590..b48507c5b 100644 --- a/lifelines/utils/__init__.py +++ b/lifelines/utils/__init__.py @@ -582,13 +582,13 @@ def _additive_estimate(events, timeline, _additive_f, _additive_var, reverse): """ if reverse: events = events.sort_index(ascending=False) - population = events['entrance'].sum() - events['removed'].cumsum().shift(1).fillna(0) + at_risk = events['at_risk'] deaths = events['observed'].shift(1).fillna(0) estimate_ = np.cumsum(_additive_f(population, deaths)).ffill().sort_index() var_ = np.cumsum(_additive_var(population, deaths)).ffill().sort_index() else: deaths = events['observed'] - population = events['entrance'].cumsum() - events['removed'].cumsum().shift(1).fillna(0) # slowest line here. 
+ at_risk = events['at_risk'] estimate_ = np.cumsum(_additive_f(population, deaths)) var_ = np.cumsum(_additive_var(population, deaths)) From d14819f5b1316afdae65a69a1370f600529bdafc Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Mon, 13 Jul 2015 19:35:00 -0400 Subject: [PATCH 06/11] complete rename --- lifelines/utils/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py index b48507c5b..f61e7ec20 100644 --- a/lifelines/utils/__init__.py +++ b/lifelines/utils/__init__.py @@ -584,13 +584,13 @@ def _additive_estimate(events, timeline, _additive_f, _additive_var, reverse): events = events.sort_index(ascending=False) at_risk = events['at_risk'] deaths = events['observed'].shift(1).fillna(0) - estimate_ = np.cumsum(_additive_f(population, deaths)).ffill().sort_index() - var_ = np.cumsum(_additive_var(population, deaths)).ffill().sort_index() + estimate_ = np.cumsum(_additive_f(at_risk, deaths)).ffill().sort_index() + var_ = np.cumsum(_additive_var(at_risk, deaths)).ffill().sort_index() else: deaths = events['observed'] at_risk = events['at_risk'] - estimate_ = np.cumsum(_additive_f(population, deaths)) - var_ = np.cumsum(_additive_var(population, deaths)) + estimate_ = np.cumsum(_additive_f(at_risk, deaths)) + var_ = np.cumsum(_additive_var(at_risk, deaths)) timeline = sorted(timeline) estimate_ = estimate_.reindex(timeline, method='pad').fillna(0) From a2a68b8177d0c5c942cd06cfc47bfef54b9a731e Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Mon, 13 Jul 2015 19:38:58 -0400 Subject: [PATCH 07/11] add survival_table_from_events to docs --- docs/Quickstart.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docs/Quickstart.rst b/docs/Quickstart.rst index 173d6bf98..aa7a6780a 100644 --- a/docs/Quickstart.rst +++ b/docs/Quickstart.rst @@ -114,6 +114,27 @@ Lifelines has some utility functions to transform this dataset into durations an T, 
C = datetimes_to_durations(start_times, end_times, freq='h') +Alternatively, perhaps you are intersted in viewing the survival table given some durations and censorship vectors. + + +.. code:: python + + from lifelines.utils import survival_table_from_events + + table = survival_table_from_events(T, C) + print table.head() + + """ + removed observed censored entrance at_risk + event_at + 0 0 0 0 60 60 + 2 2 1 1 0 60 + 3 3 1 2 0 58 + 4 5 3 2 0 55 + 5 12 6 6 0 50 + """ + + Survival Regression --------------------------------- From dc3c61ef6c9e1bb2c222bde6aa40abbc6efe6f9d Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Mon, 13 Jul 2015 19:40:03 -0400 Subject: [PATCH 08/11] more docs --- docs/Examples.rst | 21 +++++++++++++++++++++ docs/Quickstart.rst | 4 ++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/docs/Examples.rst b/docs/Examples.rst index 39e96722a..6a0648529 100644 --- a/docs/Examples.rst +++ b/docs/Examples.rst @@ -156,6 +156,27 @@ time (months, days, ...) observed deaths censored print C # np.array([1,1,1,1,1,1,1,0,1,1, ...]) +Alternatively, perhaps you are interested in viewing the survival table given some durations and censorship vectors. + + +.. 
code:: python + + from lifelines.utils import survival_table_from_events + + table = survival_table_from_events(T, C) + print table.head() + + """ + removed observed censored entrance at_risk + event_at + 0 0 0 0 60 60 + 2 2 1 1 0 60 + 3 3 1 2 0 58 + 4 5 3 2 0 55 + 5 12 6 6 0 50 + """ + + Plotting multiple figures on an plot ############################################## diff --git a/docs/Quickstart.rst b/docs/Quickstart.rst index aa7a6780a..d1a3c247e 100644 --- a/docs/Quickstart.rst +++ b/docs/Quickstart.rst @@ -114,7 +114,7 @@ Lifelines has some utility functions to transform this dataset into durations an T, C = datetimes_to_durations(start_times, end_times, freq='h') -Alternatively, perhaps you are intersted in viewing the survival table given some durations and censorship vectors. +Alternatively, perhaps you are interested in viewing the survival table given some durations and censorship vectors. .. code:: python @@ -123,7 +123,7 @@ Alternatively, perhaps you are intersted in viewing the survival table given som table = survival_table_from_events(T, C) print table.head() - + """ removed observed censored entrance at_risk event_at From 84e96a9f33bc1e9f81868ec1601a2b784c392faa Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Sat, 1 Aug 2015 12:00:58 -0400 Subject: [PATCH 09/11] adding sample size and power cals --- lifelines/fitters/weibull_fitter.py | 1 + lifelines/statistics.py | 58 +++++++++++++++++++++++++++++ lifelines/utils/__init__.py | 6 +-- tests/test_statistics.py | 9 +++++ 4 files changed, 71 insertions(+), 3 deletions(-) diff --git a/lifelines/fitters/weibull_fitter.py b/lifelines/fitters/weibull_fitter.py index dc86e139b..2d48d2e93 100644 --- a/lifelines/fitters/weibull_fitter.py +++ b/lifelines/fitters/weibull_fitter.py @@ -7,6 +7,7 @@ from lifelines.fitters import UnivariateFitter from lifelines.utils import inv_normal_cdf + def _negative_log_likelihood(lambda_rho, T, E): if np.any(lambda_rho < 0): return np.inf diff --git 
a/lifelines/statistics.py b/lifelines/statistics.py index 3a8ec9ac1..f5b9f4a73 100644 --- a/lifelines/statistics.py +++ b/lifelines/statistics.py @@ -9,6 +9,64 @@ from lifelines.utils import group_survival_table_from_events +def sample_size_necessary_under_cph(power, ratio_of_participants, p_exp, p_con, + postulated_hazard_ratio, alpha=0.05): + """ + This computes the sample size for needed power to compare two groups under a Cox + Proportional Hazard model. + + References: + https://cran.r-project.org/web/packages/powerSurvEpi/powerSurvEpi.pdf + + Parameters: + power: power to detect the magnitude of the hazard ratio as small as that specified by postulated_hazard_ratio. + ratio_of_participants: ratio of participants in experimental group over control group. + p_exp: probability of failure in experimental group over period of study. + p_con: probability of failure in control group over period of study + postulated_hazard_ratio: the postulated hazard ratio + alpha: type I error rate + + Returns: + n_exp, n_con: the samples sizes need for the experiment and control group, respectively, to achieve desired power + """ + z = lambda p: stats.norm.ppf(p) + + m = 1.0 / ratio_of_participants \ + * ((ratio_of_participants * postulated_hazard_ratio + 1.0) / (postulated_hazard_ratio - 1.0)) ** 2 \ + * (z(1. - alpha / 2.) + z(power)) ** 2 + + n_exp = m * ratio_of_participants / (ratio_of_participants * p_exp + p_con) + n_con = m / (ratio_of_participants * p_exp + p_con) + + return int(np.ceil(n_exp)), int(np.ceil(n_con)) + + +def power_under_cph(n_exp, n_con, p_exp, p_con, postulated_hazard_ratio, alpha=0.05): + """ + This computes the sample size for needed power to compare two groups under a Cox + Proportional Hazard model. + + References: + https://cran.r-project.org/web/packages/powerSurvEpi/powerSurvEpi.pdf + + Parameters: + n_exp: size of the experiment group. + n_con: size of the control group. + p_exp: probability of failure in experimental group over period of study. 
+ p_con: probability of failure in control group over period of study + postulated_hazard_ratio: the postulated hazard ratio + alpha: type I error rate + + Returns: + power: power to detect the magnitude of the hazard ratio as small as that specified by postulated_hazard_ratio. + """ + z = lambda p: stats.norm.ppf(p) + + m = n_exp * p_exp + n_con * p_con + k = float(n_exp) / float(n_con) + return stats.norm.cdf(np.sqrt(k * m) * abs(postulated_hazard_ratio - 1) / (k * postulated_hazard_ratio + 1) - z(1 - alpha / 2.)) + + def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_observed_B=None, alpha=0.95, t_0=-1, **kwargs): """ diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py index f61e7ec20..5334ded2b 100644 --- a/lifelines/utils/__init__.py +++ b/lifelines/utils/__init__.py @@ -111,7 +111,7 @@ def group_survival_table_from_events(groups, durations, event_observed, birth_ti n = np.max(groups.shape) assert n == np.max(durations.shape) == np.max(event_observed.shape), "inputs must be of the same length." 
- + if birth_times is None: # Create some birth times birth_times = np.zeros(np.max(durations.shape)) @@ -143,7 +143,7 @@ def group_survival_table_from_events(groups, durations, event_observed, birth_ti def survival_table_from_events(death_times, event_observed, birth_times=None, - columns=["removed", "observed", "censored", "entrance", "at_risk"], + columns=["removed", "observed", "censored", "entrance", "at_risk"], weights=None): """ Parameters: @@ -172,7 +172,7 @@ def survival_table_from_events(death_times, event_observed, birth_times=None, 7 2 2 0 0 10 9 3 3 0 0 8 13 3 3 0 0 5 - 15 2 2 0 0 2 + 15 2 2 0 0 2 """ removed, observed, censored, entrance, at_risk = columns diff --git a/tests/test_statistics.py b/tests/test_statistics.py index 1d999109d..38ad9a8d5 100644 --- a/tests/test_statistics.py +++ b/tests/test_statistics.py @@ -7,6 +7,15 @@ from lifelines import statistics as stats from lifelines.datasets import load_waltons, load_g3 +def test_sample_size_necessary_under_cph(): + assert stats.sample_size_necessary_under_cph(0.8, 1, 0.8, 0.2, 0.139) == (14, 14) + assert stats.sample_size_necessary_under_cph(0.8, 1, 0.5, 0.5, 1.2) == (950, 950) + assert stats.sample_size_necessary_under_cph(0.8, 1.5, 0.5, 0.5, 1.2) == (1231, 821) + assert stats.sample_size_necessary_under_cph(0.8, 1.5, 0.5, 0.5, 1.2, alpha=0.01) == (1832, 1221) + +def test_power_under_cph(): + assert abs(stats.power_under_cph(12,12, 0.8, 0.2, 0.139) - 0.744937) < 10e-6 + assert abs(stats.power_under_cph(12,20, 0.8, 0.2, 1.2) - 0.05178317) < 10e-6 def test_unequal_intensity_with_random_data(): data1 = np.random.exponential(5, size=(2000, 1)) From b401593c9705ead104d90345bb10eb38c657eb95 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Sat, 1 Aug 2015 13:33:24 -0400 Subject: [PATCH 10/11] fixing left censorship + improved test --- lifelines/utils/__init__.py | 10 ++++++---- tests/test_estimation.py | 8 +++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git 
a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py index f61e7ec20..da4b84022 100644 --- a/lifelines/utils/__init__.py +++ b/lifelines/utils/__init__.py @@ -582,10 +582,12 @@ def _additive_estimate(events, timeline, _additive_f, _additive_var, reverse): """ if reverse: events = events.sort_index(ascending=False) - at_risk = events['at_risk'] - deaths = events['observed'].shift(1).fillna(0) - estimate_ = np.cumsum(_additive_f(at_risk, deaths)).ffill().sort_index() - var_ = np.cumsum(_additive_var(at_risk, deaths)).ffill().sort_index() + at_risk = events['entrance'].sum() - events['removed'].cumsum().shift(1).fillna(0) + + deaths = events['observed'] + + estimate_ = np.cumsum(_additive_f(at_risk, deaths)).sort_index().shift(-1).fillna(0) + var_ = np.cumsum(_additive_var(at_risk, deaths)).sort_index().shift(-1).fillna(0) else: deaths = events['observed'] at_risk = events['at_risk'] diff --git a/tests/test_estimation.py b/tests/test_estimation.py index 209e1b753..8936b4078 100644 --- a/tests/test_estimation.py +++ b/tests/test_estimation.py @@ -349,12 +349,14 @@ def test_passing_in_left_censorship_creates_a_cumulative_density(self, sample_li assert not hasattr(kmf, 'survival_function_') def test_kmf_left_censorship_stats(self): + # from http://www.public.iastate.edu/~pdixon/stat505/Chapter%2011.pdf T = [3, 5, 5, 5, 6, 6, 10, 12] - C = [1, 0, 0, 1, 1, 1, 0, 1] + C = [1, 0, 0, 1, 1, 1, 0, 1] kmf = KaplanMeierFitter() kmf.fit(T, C, left_censorship=True) - assert kmf.cumulative_density_[kmf._label].ix[0] == 0.0 - assert kmf.cumulative_density_[kmf._label].ix[12] == 1.0 + + actual = kmf.cumulative_density_[kmf._label].values + npt.assert_almost_equal(actual, np.array([0, 0.437500, 0.5833333, 0.875, 0.875, 1])) def test_shifting_durations_doesnt_affect_survival_function_values(self): T = np.random.exponential(10, size=100) From 545a05faeb0dab2d7fc30876ff43f37a393c6fa4 Mon Sep 17 00:00:00 2001 From: Cameron Davidson-Pilon Date: Sat, 1 Aug 2015 13:49:57 -0400 Subject: 
[PATCH 11/11] update changelog --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 28d147599..874a3a8a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ ### Changelogs +#### 0.8.0 + - reorganized lifelines directories: + - moved test files out of main directory. + - moved `utils.py` into its own directory. + - moved all estimators into the `fitters` directory. + - added an `at_risk` column to the output of `group_survival_table_from_events` and `survival_table_from_events` + - added sample size and power calculations for statistical tests. See `lifelines.statistics.sample_size_necessary_under_cph` and `lifelines.statistics.power_under_cph`. + - fixed a bug when using KaplanMeierFitter for left-censored data. + + #### 0.7.1 - addition of a l2 `penalizer` to `CoxPHFitter`. - dropped Fortran implementation of efficient Python version. Lifelines is pure python once again!