v0.18.1 (#624)
* refactor gradients to use autograd - jacobian is too slow in _bounds

* lint

* lint

* bump version

* lint and changelog

* adding log-logistic fitter. This is much easier with the new patterns and autograd

* adding log-logistic fitter. This is much easier with the new patterns and autograd

* I had no idea this test file existed

* lint
CamDavidsonPilon committed Feb 2, 2019
1 parent 3276e6e commit d84384f
Showing 17 changed files with 578 additions and 240 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
### Changelogs

### 0.18.1
- bug fixes in `LogNormalFitter` variance estimates
- improve convergence of `LogNormalFitter`. We now model the log of sigma internally, but still expose sigma externally.
- use the `autograd` lib to help with gradients.
- New `LogLogisticFitter` univariate fitter available.

### 0.18.0
- `LogNormalFitter` is a new univariate fitter you can use.
- `WeibullFitter` now correctly returns the confidence intervals (previously returned only NaNs)
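
To illustrate the log-of-sigma note in the 0.18.1 entry above: a minimal sketch of the reparameterization idea (hypothetical code, not lifelines' internals). The optimizer searches over `log(sigma)`, which is unconstrained, while `sigma = exp(log(sigma))` stays positive and is what gets exposed:

import numpy as np
from scipy.optimize import minimize

def neg_log_likelihood(params, log_T):
    # hypothetical uncensored log-normal negative log-likelihood, up to a constant;
    # parameterized by (mu, log(sigma)) so the search space is unconstrained
    mu, log_sigma = params
    sigma = np.exp(log_sigma)  # exp keeps sigma positive automatically
    Z = (log_T - mu) / sigma
    return np.sum(np.log(sigma) + 0.5 * Z ** 2)

log_T = np.log(np.array([1.0, 4.0, 5.0, 10.0]))
result = minimize(neg_log_likelihood, x0=np.array([0.0, 0.0]), args=(log_T,))
mu_, sigma_ = result.x[0], np.exp(result.x[1])  # expose sigma, not log(sigma)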
15 changes: 9 additions & 6 deletions docs/Survival analysis with lifelines.rst
@@ -537,8 +537,8 @@ In lifelines, estimation is available using the ``WeibullFitter`` class. The ``p



Other parametric models: Exponential & Log-Normal
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Other parametric models: Exponential, Log-Logistic & Log-Normal
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Similarly, there are other parametric models in lifelines. Generally, which parametric model to choose is determined by either knowledge of the distribution of durations, or some sort of model goodness-of-fit. Below are four parametric models, and the Nelson-Aalen nonparametric model, of the same data.

@@ -547,6 +547,7 @@ Similarly, there are other parametric models in lifelines. Generally, which para
from lifelines import WeibullFitter
from lifelines import ExponentialFitter
from lifelines import LogNormalFitter
from lifelines import LogLogisticFitter
from lifelines import NelsonAalenFitter
from lifelines.datasets import load_waltons
@@ -560,11 +561,13 @@ Similarly, there are other parametric models in lifelines. Generally, which para
exf = ExponentialFitter().fit(T, E, label='ExponentialFitter')
lnf = LogNormalFitter().fit(T, E, label='LogNormalFitter')
naf = NelsonAalenFitter().fit(T, E, label='NelsonAalenFitter')
llf = LogLogisticFitter().fit(T, E, label='LogLogisticFitter')
ax = wf.plot()
ax = exf.plot(ax=ax)
ax = lnf.plot(ax=ax)
ax = naf.plot(ax=ax)
ax = wf.plot(ci_show=False)
ax = exf.plot(ax=ax, ci_show=False)
ax = lnf.plot(ax=ax, ci_show=False)
ax = naf.plot(ax=ax, ci_show=False)
ax = llf.plot(ax=ax, ci_show=False)
plt.title("Cumulative hazard rate estimates\n of Walton's data")
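
For a numeric complement to the visual comparison against the Nelson-Aalen curve, one rough goodness-of-fit sketch (assuming the private ``_log_likelihood`` attribute that the parametric univariate fitters store in this release; higher is better when the models have the same number of parameters):

for model in [wf, lnf, llf]:
    print(model._label, model._log_likelihood)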
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -60,7 +60,7 @@
#
# The short X.Y version.

version = "0.18.0"
version = "0.18.1"
# The full version, including dev info
release = version

Binary file modified docs/images/survival_weibull.png
Binary file modified docs/images/waltons_cumulative_hazard.png
2 changes: 2 additions & 0 deletions lifelines/__init__.py
@@ -11,6 +11,7 @@
from lifelines.fitters.aalen_additive_fitter import AalenAdditiveFitter
from lifelines.fitters.aalen_johansen_fitter import AalenJohansenFitter
from lifelines.fitters.log_normal_fitter import LogNormalFitter
from lifelines.fitters.log_logistic_fitter import LogLogisticFitter

from lifelines.version import __version__

@@ -25,4 +26,5 @@
"CoxTimeVaryingFitter",
"AalenJohansenFitter",
"LogNormalFitter",
"LogLogisticFitter",
]
292 changes: 292 additions & 0 deletions lifelines/fitters/log_logistic_fitter.py
@@ -0,0 +1,292 @@
# -*- coding: utf-8 -*-
from __future__ import print_function, division
import warnings
import numpy as np
import pandas as pd
import autograd.numpy as anp
from autograd import hessian, value_and_grad
from scipy.optimize import minimize

from scipy import stats
from numpy.linalg import inv
from lifelines.fitters import UnivariateFitter
from lifelines.utils import (
    _to_array,
    inv_normal_cdf,
    check_nans_or_infs,
    ConvergenceError,
    string_justify,
    format_p_value,
    format_floats,
)


def _negative_log_likelihood(params, T, E):
    n = T.shape[0]
    alpha_, beta_ = params
    ll = (
        E
        * (
            anp.log(beta_)
            - anp.log(alpha_)
            + (beta_ - 1) * anp.log(T)
            - (beta_ - 1) * anp.log(alpha_)
            - anp.log1p((T / alpha_) ** beta_)
        )
    ).sum() - anp.log1p((T / alpha_) ** beta_).sum()
    return -ll / n
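

# Note: for right-censored data, an observed subject contributes
# log f(t) = log h(t) + log S(t) and a censored subject contributes log S(t).
# With S(t) = 1 / (1 + (t/alpha)^beta), the expression above is
#   ll = sum_i E_i * [log(beta/alpha) + (beta - 1) * log(t_i/alpha)
#                     - log(1 + (t_i/alpha)^beta)]
#        - sum_i log(1 + (t_i/alpha)^beta),
# and the function returns the negative *mean* log-likelihood, -ll / n,
# which keeps the objective well-scaled for the optimizer.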


class LogLogisticFitter(UnivariateFitter):

    r"""
    This class implements a Log-Logistic model for univariate data. The model has the parameterized
    form:

    .. math:: S(t) = (1 + (t/\alpha)^{\beta})^{-1},   \alpha > 0, \beta > 0,

    and the hazard rate is:

    .. math:: h(t) = (\beta/\alpha)(t / \alpha) ^ {\beta-1} / (1 + (t/\alpha)^{\beta})

    After calling the ``.fit`` method, you have access to properties like:
    ``cumulative_hazard_``, ``plot``, ``survival_function_``, ``alpha_`` and ``beta_``.

    A summary of the fit is available with the method ``print_summary()``.

    Examples
    --------

    >>> from lifelines import LogLogisticFitter
    >>> from lifelines.datasets import load_waltons
    >>> waltons = load_waltons()
    >>> llf = LogLogisticFitter()
    >>> llf.fit(waltons['T'], waltons['E'])
    >>> llf.plot()
    >>> print(llf.alpha_)
    """

    def fit(
        self,
        durations,
        event_observed=None,
        timeline=None,
        label="LogLogistic_estimate",
        alpha=None,
        ci_labels=None,
        show_progress=False,
    ):  # pylint: disable=too-many-arguments
        """
        Parameters
        ----------
        durations: an array, or pd.Series
            length n, duration subject was observed for
        event_observed: numpy array or pd.Series, optional
            length n, True if the death was observed, False if the event
            was lost (right-censored). Defaults to all True if event_observed is None.
        timeline: list, optional
            return the estimate at the values in timeline (positively increasing)
        label: string, optional
            a string to name the column of the estimate.
        alpha: float, optional
            the alpha value in the confidence intervals. Overrides the initializing
            alpha for this call to fit only.
        ci_labels: list, optional
            add custom column names to the generated confidence intervals
            as a length-2 list: [<lower-bound name>, <upper-bound name>]. Default: <label>_lower_<alpha>
        show_progress: boolean, optional
            since this is an iterative fitting algorithm, switching this to True will display some iteration details.

        Returns
        -------
        self : LogLogisticFitter
            self with new properties like ``cumulative_hazard_``, ``survival_function_``, ``alpha_``, and ``beta_``.
        """

        check_nans_or_infs(durations)
        if event_observed is not None:
            check_nans_or_infs(event_observed)

        self.durations = np.asarray(durations, dtype=float)
        # check for negative or 0 durations - these are not allowed in a log-logistic model.
        if np.any(self.durations <= 0):
            raise ValueError(
                "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
            )

        self.event_observed = (
            np.asarray(event_observed, dtype=int) if event_observed is not None else np.ones_like(self.durations)
        )

        if timeline is not None:
            self.timeline = np.sort(np.asarray(timeline))
        else:
            self.timeline = np.linspace(self.durations.min(), self.durations.max(), self.durations.shape[0])

        self._label = label
        alpha = alpha if alpha is not None else self.alpha

        # estimation
        (self.alpha_, self.beta_), self._log_likelihood, self._hessian_ = self._fit_model(
            self.durations, self.event_observed, show_progress=show_progress
        )
        self.variance_matrix_ = inv(self._hessian_)
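        # Note: self._hessian_ is the Hessian of the (unnormalized) negative
        # log-likelihood at the MLE, i.e. the observed Fisher information, so
        # its inverse is the usual asymptotic estimate of the covariance
        # matrix of (alpha_, beta_).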

        self.survival_function_ = self.survival_function_at_times(self.timeline).to_frame(name=self._label)

        self.hazard_ = self.hazard_at_times(self.timeline).to_frame(self._label)

        self.cumulative_hazard_ = self.cumulative_hazard_at_times(self.timeline).to_frame(self._label)

        self.confidence_interval_ = self._bounds(alpha, ci_labels)
        self.median_ = self.alpha_

        # estimation methods
        self._estimate_name = "cumulative_hazard_"
        self._predict_label = label
        self._update_docstrings()

        # plotting - Cumulative hazard takes priority.
        self.plot_cumulative_hazard = self.plot

        return self

    def _estimation_method(self, t):
        return self.survival_function_at_times(t)

    def hazard_at_times(self, times):
        a, b = self.alpha_, self.beta_

        return pd.Series((b / a) * (times / a) ** (b - 1) / (1 + (times / a) ** b), index=_to_array(times))

    def survival_function_at_times(self, times):
        return pd.Series(np.exp(-self.cumulative_hazard_at_times(times)), index=_to_array(times))

    def cumulative_hazard_at_times(self, times):
        alpha_, beta_ = self.alpha_, self.beta_
        return pd.Series(np.log((times / alpha_) ** beta_ + 1), index=_to_array(times))

    def _fit_model(self, T, E, show_progress=True):

        initial_values = np.ones(2)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            results = minimize(
                value_and_grad(_negative_log_likelihood),  # pylint: disable=no-value-for-parameter
                initial_values,
                jac=True,
                method="L-BFGS-B",
                args=(T, E),
                bounds=((0.000001, None), (0.000001, None)),  # to stay well away from 0.
                options={"disp": show_progress},
            )

            if results.success:
                hessian_ = hessian(_negative_log_likelihood)(results.x, T, E)  # pylint: disable=no-value-for-parameter
                return results.x, -results.fun, hessian_ * T.shape[0]
            print(results)
            raise ConvergenceError("Did not converge. This is a lifelines problem, not yours;")
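
    # Note: autograd's value_and_grad returns (objective, gradient) in a single
    # call, which is what jac=True tells scipy's minimize to expect, so
    # L-BFGS-B runs on exact gradients rather than finite differences. Because
    # _negative_log_likelihood is the *mean* over the n subjects, the Hessian
    # is multiplied by n to recover the observed information of the full
    # log-likelihood.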

    def _bounds(self, alpha, ci_labels):
        alpha2 = inv_normal_cdf((1.0 + alpha) / 2.0)
        df = pd.DataFrame(index=self.timeline)

        def _d_cumulative_hazard_d_alpha_(alpha_, beta_, T):
            return -beta_ / alpha_ / ((alpha_ / T) ** beta_ + 1)

        def _d_cumulative_hazard_d_beta_(alpha_, beta_, T):
            return np.log(T / alpha_) / ((alpha_ / T) ** beta_ + 1)

        gradient_at_mle = np.stack(
            [
                _d_cumulative_hazard_d_alpha_(self.alpha_, self.beta_, self.timeline),
                _d_cumulative_hazard_d_beta_(self.alpha_, self.beta_, self.timeline),
            ]
        ).T

        std_cumulative_hazard = np.sqrt(
            np.einsum("nj,jk,nk->n", gradient_at_mle, self.variance_matrix_, gradient_at_mle)
        )

        if ci_labels is None:
            ci_labels = ["%s_upper_%.2f" % (self._label, alpha), "%s_lower_%.2f" % (self._label, alpha)]
        assert len(ci_labels) == 2, "ci_labels should be a length 2 array."

        df[ci_labels[0]] = self.cumulative_hazard_at_times(self.timeline) + alpha2 * std_cumulative_hazard
        df[ci_labels[1]] = self.cumulative_hazard_at_times(self.timeline) - alpha2 * std_cumulative_hazard
        return df
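
    # Note: this is the delta method. With H(t) = log(1 + (t/alpha)^beta) and
    # gradient g(t) = (dH/dalpha, dH/dbeta) evaluated at the MLE,
    #   Var(H(t)) ~= g(t)' Cov(alpha_, beta_) g(t),
    # which the einsum computes for every t in the timeline at once. The
    # derivatives are written out by hand because, per the commit message,
    # autograd's jacobian was too slow in _bounds.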

    def _compute_standard_errors(self):
        var_alpha_, var_beta_ = self.variance_matrix_.diagonal()
        return pd.DataFrame([[np.sqrt(var_alpha_), np.sqrt(var_beta_)]], index=["se"], columns=["alpha_", "beta_"])

    def _compute_confidence_bounds_of_parameters(self):
        se = self._compute_standard_errors().loc["se"]
        alpha2 = inv_normal_cdf((1.0 + self.alpha) / 2.0)
        return pd.DataFrame(
            [np.array([self.alpha_, self.beta_]) + alpha2 * se, np.array([self.alpha_, self.beta_]) - alpha2 * se],
            columns=["alpha_", "beta_"],
            index=["upper-bound", "lower-bound"],
        )

    def _compute_z_values(self):
        return np.asarray([self.alpha_ - 1, self.beta_ - 1]) / self._compute_standard_errors().loc["se"]

    def _compute_p_values(self):
        U = self._compute_z_values() ** 2
        return stats.chi2.sf(U, 1)
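
    # Note: the z-values (and hence the p-values) test the null hypotheses
    # alpha = 1 and beta = 1, which is why print_summary reports the
    # hypothesis "alpha != 1, beta != 1". A squared standard-normal z is
    # chi-square with 1 degree of freedom, so stats.chi2.sf(z**2, 1) gives a
    # two-sided Wald p-value.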

    @property
    def summary(self):
        """Summary statistics describing the fit.
        Set alpha property in the object before calling.

        Returns
        -------
        df : pd.DataFrame
            Contains columns coef, se(coef), lower, upper, p, -log2(p)
        """
        lower_upper_bounds = self._compute_confidence_bounds_of_parameters()
        df = pd.DataFrame(index=["alpha_", "beta_"])
        df["coef"] = [self.alpha_, self.beta_]
        df["se(coef)"] = self._compute_standard_errors().loc["se"]
        df["lower %.2f" % self.alpha] = lower_upper_bounds.loc["lower-bound"]
        df["upper %.2f" % self.alpha] = lower_upper_bounds.loc["upper-bound"]
        df["p"] = self._compute_p_values()
        with np.warnings.catch_warnings():
            np.warnings.filterwarnings("ignore")
            df["-log2(p)"] = -np.log2(df["p"])
        return df

    def print_summary(self, decimals=2, **kwargs):
        """
        Print summary statistics describing the fit, the coefficients, and the error bounds.

        Parameters
        -----------
        decimals: int, optional (default=2)
            specify the number of decimal places to show
        kwargs:
            print additional metadata in the output (useful to provide model names, dataset names, etc.) when comparing
            multiple outputs.
        """
        justify = string_justify(18)
        print(self)
        print("{} = {}".format(justify("number of subjects"), self.durations.shape[0]))
        print("{} = {}".format(justify("number of events"), np.where(self.event_observed)[0].shape[0]))
        print("{} = {:.3f}".format(justify("log-likelihood"), self._log_likelihood))
        print("{} = {}".format(justify("hypothesis"), "alpha != 1, beta != 1"))

        for k, v in kwargs.items():
            print("{} = {}\n".format(justify(k), v))

        print(end="\n")
        print("---")

        df = self.summary
        print(df.to_string(float_format=format_floats(decimals), formatters={"p": format_p_value(decimals)}))