Merge 949d782 into 7f4ab63

CamDavidsonPilon · Dec 10, 2016 · e51f818 · e51f818
2 parents 7f4ab63 + 949d782
commit e51f818
Show file tree

Hide file tree

Showing 7 changed files with 73 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,8 @@
 ### Changelogs
 
+#### 0.9.3 (in dev)
+ - adding `plot_loglogs` to `KaplanMeierFitter`
+
 #### 0.9.2
  - deprecates Pandas versions before 0.18.
  - throw an error if no admissible pairs in the c-index calculation. Previously a NaN was returned.

diff --git a/docs/Survival Regression.rst b/docs/Survival Regression.rst
@@ -427,6 +427,21 @@ This example data is from the paper `here <http://socserv.socsci.mcmaster.ca/jfo
 
 To access the coefficients and the baseline hazard, you can use ``cf.hazards_`` and ``cf.baseline_hazard_`` respectively. After fitting, you can use the suite of prediction methods (similar to Aalen's additive model above): ``.predict_hazard(X)``, ``.predict_survival_function(X)``, etc. 
 
+Checking the proportional hazards assumption
+############################################
+
+A quick and visual way to check the proportional hazards assumption of a variable is to plot the survival curves segmented by the values of the variable. If the survival curves are the same "shape", and differ only by a constant factor, then the assumption holds. A clearer way to see this is to plot what's called the loglogs curve: the log(-log(survival curve)) vs log(time). If the curves are parallel (and hence do not cross each other), then it's likely the variable satisfies the assumption. If the curves do cross, you'll likely have to "stratify" the variable (see next section). In lifelines, the ``KaplanMeierFitter`` object has a ``.plot_loglogs`` function for this purpose. 
+
+The following are the loglogs curves of two variables in our regime dataset. The first is the democracy type, which does have (close to) parallel lines, hence satisfies our assumption:
+
+.. image:: images/lls_democracy.png
+
+
+The second variable is the regime type, and this variable does not follow the proportional hazards assumption.
+
+.. image:: images/lls_regime_type.png
+
+
 Stratification
 ################
 

diff --git a/docs/images/lls_democracy.png b/docs/images/lls_democracy.png
diff --git a/docs/images/lls_regime_type.png b/docs/images/lls_regime_type.png
diff --git a/lifelines/datasets/__init__.py b/lifelines/datasets/__init__.py
@@ -52,14 +52,12 @@ def load_holly_molly_polly(**kwargs):
     From https://stat.ethz.ch/education/semesters/ss2011/seminar/contents/presentation_10.pdf
     Used as a toy example for CoxPH in recurrent SA.
 
-     ID Status Stratum Start(days) Stop(days) tx T
-     M 1 1 0 100 1 100
-     M 1 2 100 105 1 5
-     H 1 1 0 30 0 30
-     H 1 2 30 50 0 20
-     P 1 1 0 20 0 20
-     P 1 2 20 60 0 40
-     P 1 3 60 85 0 25
+      ID  Status  Stratum  Start(days)  Stop(days)  tx    T
+    0  M       1        1            0         100   1  100
+    1  M       1        2          100         105   1    5
+    2  H       1        1            0          30   0   30
+    3  H       1        2           30          50   0   20
+    4  P       1        1            0          20   0   20
 
     """
     return load_dataset('holly_molly_polly.tsv', sep="\s", **kwargs)

diff --git a/lifelines/fitters/kaplan_meier_fitter.py b/lifelines/fitters/kaplan_meier_fitter.py
@@ -6,6 +6,7 @@
 from lifelines.fitters import UnivariateFitter
 from lifelines.utils import _preprocess_inputs, _additive_estimate, StatError, inv_normal_cdf,\
     median_survival_times
+from lifelines.plotting import plot_loglogs
 
 
 class KaplanMeierFitter(UnivariateFitter):
@@ -76,6 +77,7 @@ def fit(self, durations, event_observed=None, timeline=None, entry=None, label='
         # plotting functions
         self.plot = self._plot_estimate(estimate_name)
         setattr(self, "plot_" + estimate_name, self.plot)
+        self.plot_loglogs = plot_loglogs(self)
         return self
 
     def _bounds(self, cumulative_sq_, alpha, ci_labels):

diff --git a/lifelines/plotting.py b/lifelines/plotting.py
@@ -168,6 +168,7 @@ def add_at_risk_counts(*fitters, **kwargs):
     return ax
 
 
+
 def plot_lifetimes(lifetimes, event_observed=None, birthtimes=None,
                    order=False, block=True):
     """
@@ -242,6 +243,52 @@ def create_dataframe_slicer(iloc, ix):
     return lambda df: getattr(df, get_method)[user_submitted_slice]
 
 
+def plot_loglogs(cls):  # factory: builds and returns a plotting closure bound to the fitter ``cls``
+    doc_string = """
+    Specifies a plot of the log(-log(SV)) versus log(time) where SV is the estimated survival function.
+    """
+    # doc_string is attached to the closure as its __doc__ just before it is returned.
+    def _plot_loglogs(ix=None, iloc=None, show_censors=False, censor_styles=None, **kwargs):
+        # ix / iloc select a slice of the data to plot; remaining kwargs are forwarded to DataFrame.plot.
+        loglog = lambda s: np.log(-np.log(s))  # the log(-log(.)) transform applied to survival values
+
+        if (ix is not None) and (iloc is not None):  # the two slicing options are mutually exclusive
+            raise ValueError('Cannot set both ix and iloc in call to .plot().')  # NOTE(review): message names .plot(), not .plot_loglogs()
+
+        if censor_styles is None:
+            censor_styles = {}  # no user overrides for the censor-marker style
+        # kwargs['ax'] and kwargs['color'] are read below; presumably these helpers fill in defaults -- confirm
+        set_kwargs_ax(kwargs)
+        set_kwargs_color(kwargs)
+        set_kwargs_drawstyle(kwargs)
+        kwargs['logx'] = True  # always passed on to .plot() below, overriding any caller-supplied logx
+
+        dataframe_slicer = create_dataframe_slicer(iloc, ix)  # callable applied to a DataFrame to take the requested slice
+
+        # axes and colour shared by the censor markers and the returned value
+        ax = kwargs['ax']
+        colour = kwargs['color']
+
+        if show_censors and cls.event_table['censored'].sum() > 0:  # only when requested and at least one censored entry exists
+            cs = {  # default marker style; entries in censor_styles override these
+                'marker': '+',
+                'ms': 12,
+                'mew': 1
+            }
+            cs.update(censor_styles)
+            times = dataframe_slicer(cls.event_table.ix[(cls.event_table['censored'] > 0)]).index.values.astype(float)  # index values (as floats) of sliced rows with censored > 0
+            v = cls.predict(times)  # presumably the fitted estimate evaluated at those times -- confirm
+            # don't log times, as Pandas will take care of all log-scaling later.
+            ax.plot(times, loglog(v), linestyle='None',
+                    color=colour, **cs)
+
+        # plot estimate: transform the survival function, slice it, then plot with the prepared kwargs
+        dataframe_slicer(loglog(cls.survival_function_)).plot(**kwargs)
+        return ax
+    _plot_loglogs.__doc__ = doc_string
+    return _plot_loglogs
+
+
 def plot_estimate(cls, estimate):
     doc_string = """"
         Plots a pretty version of the fitted %s.