Commit 870ce6a: Merge 75a051e into d7f19d2
CamDavidsonPilon committed Aug 1, 2015
2 parents d7f19d2 + 75a051e

Showing 27 changed files with 204 additions and 81 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -28,7 +28,7 @@ install:
   - conda create -n test-environment python=$TRAVIS_PYTHON_VERSION pip numpy scipy pandas matplotlib pytest
   - source activate test-environment
   # Build in place so we can run tests
-  - python setup.py build_ext --inplace
+  - python setup.py install
   - pip install coveralls
   - pip install pytest-cov
   # command to run tests
21 changes: 21 additions & 0 deletions docs/Examples.rst
@@ -156,6 +156,27 @@ time (months, days, ...) observed deaths censored

    print C # np.array([1,1,1,1,1,1,1,0,1,1, ...])

Alternatively, perhaps you are interested in viewing the survival table given some durations and censorship vectors.

.. code:: python

    from lifelines.utils import survival_table_from_events
    table = survival_table_from_events(T, C)
    print table.head()

    """
              removed  observed  censored  entrance  at_risk
    event_at
    0               0         0         0        60       60
    2               2         1         1         0       60
    3               3         1         2         0       58
    4               5         3         2         0       55
    5              12         6         6         0       50
    """

Plotting multiple figures on one plot
##############################################
21 changes: 21 additions & 0 deletions docs/Quickstart.rst
@@ -114,6 +114,27 @@ Lifelines has some utility functions to transform this dataset into durations an

    T, C = datetimes_to_durations(start_times, end_times, freq='h')

Alternatively, perhaps you are interested in viewing the survival table given some durations and censorship vectors.

.. code:: python

    from lifelines.utils import survival_table_from_events
    table = survival_table_from_events(T, C)
    print table.head()

    """
              removed  observed  censored  entrance  at_risk
    event_at
    0               0         0         0        60       60
    2               2         1         1         0       60
    3               3         1         2         0       58
    4               5         3         2         0       55
    5              12         6         6         0       50
    """

Survival Regression
---------------------------------
16 changes: 8 additions & 8 deletions lifelines/estimation.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
-from lifelines._base_fitter import BaseFitter
-from lifelines.weibull_fitter import WeibullFitter
-from lifelines.exponential_fitter import ExponentialFitter
-from lifelines.nelson_aalen_fitter import NelsonAalenFitter
-from lifelines.kaplan_meier_fitter import KaplanMeierFitter
-from lifelines.breslow_fleming_harrington_fitter import BreslowFlemingHarringtonFitter
-from lifelines.coxph_fitter import CoxPHFitter
-from lifelines.aalen_additive_fitter import AalenAdditiveFitter
+from lifelines.fitters import BaseFitter
+from lifelines.fitters.weibull_fitter import WeibullFitter
+from lifelines.fitters.exponential_fitter import ExponentialFitter
+from lifelines.fitters.nelson_aalen_fitter import NelsonAalenFitter
+from lifelines.fitters.kaplan_meier_fitter import KaplanMeierFitter
+from lifelines.fitters.breslow_fleming_harrington_fitter import BreslowFlemingHarringtonFitter
+from lifelines.fitters.coxph_fitter import CoxPHFitter
+from lifelines.fitters.aalen_additive_fitter import AalenAdditiveFitter
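With this hunk, estimation.py becomes a thin re-export shim over the new lifelines.fitters package, so user code importing from either path should keep working. A minimal sanity sketch, assuming the re-exports stay exactly as shown above:

    # both import paths should resolve to the very same class object
    from lifelines.estimation import KaplanMeierFitter
    from lifelines.fitters.kaplan_meier_fitter import KaplanMeierFitter as KMF

    assert KaplanMeierFitter is KMF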
2 changes: 2 additions & 0 deletions lifelines/_base_fitter.py → lifelines/fitters/__init__.py
@@ -1,7 +1,9 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function
+
 import numpy as np
 import pandas as pd
+
 from lifelines.plotting import plot_estimate
 from lifelines.utils import qth_survival_times
@@ -1,13 +1,15 @@
 # -*- coding: utf-8 -*-
 from __future__ import print_function

 import numpy as np
 import pandas as pd
 from numpy.linalg import LinAlgError
 from scipy.integrate import trapz
-from lifelines._base_fitter import BaseFitter
+
+from lifelines.fitters import BaseFitter
 from lifelines.utils import _get_index, inv_normal_cdf, epanechnikov_kernel, \
     ridge_regression as lr, qth_survival_times
-from lifelines.progress_bar import progress_bar
+from lifelines.utils.progress_bar import progress_bar
 from lifelines.plotting import plot_regressions
@@ -2,8 +2,8 @@
 from __future__ import print_function
 import numpy as np

-from lifelines._base_fitter import UnivariateFitter
-from lifelines.nelson_aalen_fitter import NelsonAalenFitter
+from lifelines.fitters import UnivariateFitter
+from lifelines.fitters.nelson_aalen_fitter import NelsonAalenFitter
 from lifelines.utils import median_survival_times
@@ -8,7 +8,7 @@
 from scipy.integrate import trapz
 import scipy.stats as stats

-from lifelines._base_fitter import BaseFitter
+from lifelines.fitters import BaseFitter
 from lifelines.utils import survival_table_from_events, inv_normal_cdf, normalize,\
     significance_code, concordance_index, _get_index, qth_survival_times
@@ -3,7 +3,7 @@
 import numpy as np
 import pandas as pd

-from lifelines._base_fitter import UnivariateFitter
+from lifelines.fitters import UnivariateFitter
 from lifelines.utils import inv_normal_cdf
@@ -3,7 +3,7 @@
 import numpy as np
 import pandas as pd

-from lifelines._base_fitter import UnivariateFitter
+from lifelines.fitters import UnivariateFitter
 from lifelines.utils import _preprocess_inputs, _additive_estimate, StatError, inv_normal_cdf,\
     median_survival_times
@@ -3,7 +3,7 @@
 import numpy as np
 import pandas as pd

-from lifelines._base_fitter import UnivariateFitter
+from lifelines.fitters import UnivariateFitter
 from lifelines.utils import _preprocess_inputs, _additive_estimate, epanechnikov_kernel,\
     inv_normal_cdf
@@ -2,10 +2,12 @@
 from __future__ import print_function, division
 import numpy as np
 import pandas as pd
+
 from numpy.linalg import solve, norm, inv
-from lifelines._base_fitter import UnivariateFitter
+from lifelines.fitters import UnivariateFitter
 from lifelines.utils import inv_normal_cdf
+

 def _negative_log_likelihood(lambda_rho, T, E):
     if np.any(lambda_rho < 0):
         return np.inf
2 changes: 1 addition & 1 deletion lifelines/plotting.py
@@ -2,7 +2,7 @@
 from __future__ import print_function

 import numpy as np
-from lifelines.utils import coalesce
+from .utils import coalesce


 def is_latex_enabled():
58 changes: 58 additions & 0 deletions lifelines/statistics.py
@@ -9,6 +9,64 @@
from lifelines.utils import group_survival_table_from_events


def sample_size_necessary_under_cph(power, ratio_of_participants, p_exp, p_con,
                                    postulated_hazard_ratio, alpha=0.05):
    """
    This computes the sample size needed to achieve a given power when comparing
    two groups under a Cox proportional hazards model.

    References:
        https://cran.r-project.org/web/packages/powerSurvEpi/powerSurvEpi.pdf

    Parameters:
        power: power to detect a hazard ratio as small as the postulated_hazard_ratio.
        ratio_of_participants: ratio of participants in the experimental group to the control group.
        p_exp: probability of failure in the experimental group over the period of the study.
        p_con: probability of failure in the control group over the period of the study.
        postulated_hazard_ratio: the postulated hazard ratio.
        alpha: type I error rate.

    Returns:
        n_exp, n_con: the sample sizes needed for the experimental and control groups,
        respectively, to achieve the desired power.
    """
    z = lambda p: stats.norm.ppf(p)

    m = 1.0 / ratio_of_participants \
        * ((ratio_of_participants * postulated_hazard_ratio + 1.0) / (postulated_hazard_ratio - 1.0)) ** 2 \
        * (z(1. - alpha / 2.) + z(power)) ** 2

    n_exp = m * ratio_of_participants / (ratio_of_participants * p_exp + p_con)
    n_con = m / (ratio_of_participants * p_exp + p_con)

    return int(np.ceil(n_exp)), int(np.ceil(n_con))
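A quick usage sketch of the new function; the study parameters below are purely illustrative, not taken from this commit:

    from lifelines.statistics import sample_size_necessary_under_cph

    # illustrative inputs: 80% power, 1:1 allocation, failure probabilities of
    # 0.25 (experimental) and 0.35 (control), postulated hazard ratio of 0.7
    n_exp, n_con = sample_size_necessary_under_cph(0.80, 1.0, 0.25, 0.35, 0.7, alpha=0.05)
    print n_exp, n_con  # required group sizes, rounded up to whole participants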


def power_under_cph(n_exp, n_con, p_exp, p_con, postulated_hazard_ratio, alpha=0.05):
    """
    This computes the power of a comparison of two groups under a Cox
    proportional hazards model, given the sample sizes of each group.

    References:
        https://cran.r-project.org/web/packages/powerSurvEpi/powerSurvEpi.pdf

    Parameters:
        n_exp: size of the experimental group.
        n_con: size of the control group.
        p_exp: probability of failure in the experimental group over the period of the study.
        p_con: probability of failure in the control group over the period of the study.
        postulated_hazard_ratio: the postulated hazard ratio.
        alpha: type I error rate.

    Returns:
        power: power to detect a hazard ratio as small as the postulated_hazard_ratio.
    """
    z = lambda p: stats.norm.ppf(p)

    m = n_exp * p_exp + n_con * p_con
    k = float(n_exp) / float(n_con)
    return stats.norm.cdf(np.sqrt(k * m) * abs(postulated_hazard_ratio - 1) / (k * postulated_hazard_ratio + 1) - z(1 - alpha / 2.))
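And the complementary check, reusing the sizes computed in the sketch above; by construction the recovered power should come out close to the requested 0.80:

    from lifelines.statistics import power_under_cph

    print power_under_cph(n_exp, n_con, 0.25, 0.35, 0.7, alpha=0.05)  # ~0.80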


def logrank_test(event_times_A, event_times_B, event_observed_A=None, event_observed_B=None,
                 alpha=0.95, t_0=-1, **kwargs):
    """
Empty file removed lifelines/tests/__init__.py
77 changes: 37 additions & 40 deletions lifelines/utils.py → lifelines/utils/__init__.py
@@ -108,46 +108,43 @@ def group_survival_table_from_events(groups, durations, event_observed, birth_ti
     ]
     """

     n = np.max(groups.shape)
     assert n == np.max(durations.shape) == np.max(event_observed.shape), "inputs must be of the same length."

     if birth_times is None:
         # Create some birth times
         birth_times = np.zeros(np.max(durations.shape))
         birth_times[:] = np.min(durations)

     assert n == np.max(birth_times.shape), "inputs must be of the same length."

-    groups, durations, event_observed, birth_times = map(lambda x: pd.Series(np.reshape(x, (n,))), [groups, durations, event_observed, birth_times])
+    groups, durations, event_observed, birth_times = [pd.Series(np.reshape(data, (n,))) for data in [groups, durations, event_observed, birth_times]]
     unique_groups = groups.unique()

-    # set first group
-    g = unique_groups[0]
-    ix = (groups == g)
-    T = durations[ix]
-    C = event_observed[ix]
-    B = birth_times[ix]
-
-    g_name = str(g)
-    data = survival_table_from_events(T, C, B,
-                                      columns=['removed:' + g_name, "observed:" + g_name, 'censored:' + g_name, 'entrance' + g_name])
-    for g in unique_groups[1:]:
-        ix = groups == g
+    for i, group in enumerate(unique_groups):
+        ix = groups == group
         T = durations[ix]
         C = event_observed[ix]
         B = birth_times[ix]
-        g_name = str(g)
-        data = data.join(survival_table_from_events(T, C, B,
-                                                    columns=['removed:' + g_name, "observed:" + g_name, 'censored:' + g_name, 'entrance' + g_name]),
-                         how='outer')
+        group_name = str(group)
+        columns = [event_name + ":" + group_name for event_name in ['removed', 'observed', 'censored', 'entrance', 'at_risk']]
+        if i == 0:
+            data = survival_table_from_events(T, C, B, columns=columns)
+        else:
+            data = data.join(survival_table_from_events(T, C, B, columns=columns), how='outer')

     data = data.fillna(0)
     # hmmm pandas its too bad I can't do data.ix[:limit] and leave out the if.
     if int(limit) != -1:
         data = data.ix[:limit]

     return unique_groups, data.filter(like='removed:'), data.filter(like='observed:'), data.filter(like='censored:')
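A short usage sketch of the refactored function's public behavior, on synthetic data (illustrative only; only the three required arguments are passed):

    import numpy as np
    from lifelines.utils import group_survival_table_from_events

    groups = np.array(['control'] * 4 + ['treated'] * 4)
    durations = np.array([3, 5, 7, 7, 2, 4, 4, 9])
    observed = np.ones(8)  # every subject's event was observed

    # returns the group labels, plus one table each for removed, observed, censored,
    # with one 'removed:<group>'-style column per group
    unique_groups, removed, observed_table, censored = group_survival_table_from_events(groups, durations, observed)
    print removed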


def survival_table_from_events(death_times, event_observed, birth_times=None,
-                               columns=["removed", "observed", "censored", "entrance"], weights=None):
+                               columns=["removed", "observed", "censored", "entrance", "at_risk"],
+                               weights=None):
     """
     Parameters:
       death_times: (n,) array of event times

@@ -167,21 +164,18 @@ def survival_table_from_events(death_times, event_observed, birth_times=None,
       left the population due to event_observed)

     Example:
         #input
         survival_table_from_events( waltonT, np.ones_like(waltonT)) #available in test suite
         #output
-                  removed  observed  censored  entrance
+                  removed  observed  censored  entrance  at_risk
         event_at
-        0               0         0         0        11
-        6               1         1         0         0
-        7               2         2         0         0
-        9               3         3         0         0
-        13              3         3         0         0
-        15              2         2         0         0
+        0               0         0         0        11       11
+        6               1         1         0         0       11
+        7               2         2         0         0       10
+        9               3         3         0         0        8
+        13              3         3         0         0        5
+        15              2         2         0         0        2
     """
+    removed, observed, censored, entrance, at_risk = columns
     death_times = np.asarray(death_times)
     if birth_times is None:
         birth_times = min(0, death_times.min()) * np.ones(death_times.shape[0])

@@ -192,17 +186,18 @@

     # deal with deaths and censorships
     df = pd.DataFrame(death_times, columns=["event_at"])
-    df[columns[0]] = 1 if weights is None else weights
-    df[columns[1]] = np.asarray(event_observed)
+    df[removed] = 1 if weights is None else weights
+    df[observed] = np.asarray(event_observed)
     death_table = df.groupby("event_at").sum()
-    death_table[columns[2]] = (death_table[columns[0]] - death_table[columns[1]]).astype(int)
+    death_table[censored] = (death_table[removed] - death_table[observed]).astype(int)

     # deal with late births
     births = pd.DataFrame(birth_times, columns=['event_at'])
-    births[columns[3]] = 1
+    births[entrance] = 1
     births_table = births.groupby('event_at').sum()

     event_table = death_table.join(births_table, how='outer', sort=True).fillna(0)  # http://wesmckinney.com/blog/?p=414
+    event_table[at_risk] = event_table[entrance].cumsum() - event_table[removed].cumsum().shift(1).fillna(0)
     return event_table.astype(float)
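To make the new at_risk bookkeeping concrete: at each event time it counts everyone who has entered, minus everyone removed strictly before that time, which is exactly the cumsum/shift identity added above. A small check on synthetic data:

    import numpy as np
    from lifelines.utils import survival_table_from_events

    T = np.array([1, 2, 2, 4, 5])  # durations
    C = np.array([1, 0, 1, 1, 0])  # 1 if the event was observed, 0 if censored
    table = survival_table_from_events(T, C)

    expected = table['entrance'].cumsum() - table['removed'].cumsum().shift(1).fillna(0)
    assert (table['at_risk'] == expected).all()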


@@ -587,15 +582,17 @@ def _additive_estimate(events, timeline, _additive_f, _additive_var, reverse):
"""
if reverse:
events = events.sort_index(ascending=False)
population = events['entrance'].sum() - events['removed'].cumsum().shift(1).fillna(0)
deaths = events['observed'].shift(1).fillna(0)
estimate_ = np.cumsum(_additive_f(population, deaths)).ffill().sort_index()
var_ = np.cumsum(_additive_var(population, deaths)).ffill().sort_index()
at_risk = events['entrance'].sum() - events['removed'].cumsum().shift(1).fillna(0)

deaths = events['observed']

estimate_ = np.cumsum(_additive_f(at_risk, deaths)).sort_index().shift(-1).fillna(0)
var_ = np.cumsum(_additive_var(at_risk, deaths)).sort_index().shift(-1).fillna(0)
else:
deaths = events['observed']
population = events['entrance'].cumsum() - events['removed'].cumsum().shift(1).fillna(0) # slowest line here.
estimate_ = np.cumsum(_additive_f(population, deaths))
var_ = np.cumsum(_additive_var(population, deaths))
at_risk = events['at_risk']
estimate_ = np.cumsum(_additive_f(at_risk, deaths))
var_ = np.cumsum(_additive_var(at_risk, deaths))

timeline = sorted(timeline)
estimate_ = estimate_.reindex(timeline, method='pad').fillna(0)
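In the non-reversed branch, the estimate now reads straight off the precomputed at_risk column rather than recomputing the population at each call. For intuition, a Nelson-Aalen-style cumulative hazard can be sketched from the same event table (a hand-rolled illustration, not the library's fitter):

    import numpy as np
    from lifelines.utils import survival_table_from_events

    T = np.array([1, 2, 2, 4, 5])
    C = np.array([1, 0, 1, 1, 0])
    events = survival_table_from_events(T, C)

    # Nelson-Aalen: cumulative sum of d_i / n_i over the event times
    cumulative_hazard = (events['observed'] / events['at_risk']).cumsum()
    print cumulative_hazard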
File renamed without changes.
2 changes: 1 addition & 1 deletion lifelines/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '0.7.1.0'
+__version__ = '0.8.0.0'
