
Merge pull request #471 from CamDavidsonPilon/v0.14.3
V0.14.3
CamDavidsonPilon committed May 24, 2018
2 parents 3556625 + 94c6472 commit c1bc505
Showing 7 changed files with 155 additions and 29 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,13 @@
### Changelogs

#### 0.14.3
- fixes a bug when subtracting or dividing two `UnivariateFitters` with labels.
- fixes an import error when using `CoxTimeVaryingFitter` predict methods (see the sketch below).
- adds a `columns` argument to the `CoxTimeVaryingFitter` and `CoxPHFitter` `plot` methods to plot only a subset of columns.

#### 0.14.2
- some quality of life improvements for working with `CoxTimeVaryingFitter` including new `predict_` methods.

#### 0.14.1
- fixed bug with using weights and strata in `CoxPHFitter`
- fixed bug in using non-integer weights in `KaplanMeierFitter`
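
A minimal sketch of the predict-methods fix in use, assuming the Stanford heart transplant dataset bundled with lifelines (the same data the new tests further down use):

```python
from lifelines.estimation import CoxTimeVaryingFitter
from lifelines.datasets import load_stanford_heart_transplants

# long-format data with id, start/stop intervals, an event flag and covariates
heart = load_stanford_heart_transplants()

ctv = CoxTimeVaryingFitter()
ctv.fit(heart, id_col='id', event_col='event')

# these previously failed with an import error because `_get_index` was not
# imported in cox_time_varying_fitter.py (see the diff below)
partial_hazards = ctv.predict_partial_hazard(heart)
log_partial_hazards = ctv.predict_log_partial_hazard(heart)
print(partial_hazards.shape, log_partial_hazards.shape)
```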
18 changes: 12 additions & 6 deletions lifelines/fitters/__init__.py
@@ -46,9 +46,12 @@ def subtract(other):
other_estimate = getattr(other, estimate)
new_index = np.concatenate((other_estimate.index, self_estimate.index))
new_index = np.unique(new_index)
return self_estimate.reindex(new_index, method='ffill') - \
other_estimate.reindex(new_index, method='ffill')

return pd.DataFrame(
self_estimate.reindex(new_index, method='ffill').values - \
other_estimate.reindex(new_index, method='ffill').values,
index=new_index,
columns=['diff']
)
subtract.__doc__ = doc_string
return subtract

@@ -67,9 +70,12 @@ def divide(other):
other_estimate = getattr(other, estimate)
new_index = np.concatenate((other_estimate.index, self_estimate.index))
new_index = np.unique(new_index)
return self_estimate.reindex(new_index, method='ffill') / \
other_estimate.reindex(new_index, method='ffill')

return pd.DataFrame(
self_estimate.reindex(new_index, method='ffill').values / \
other_estimate.reindex(new_index, method='ffill').values,
index=new_index,
columns=['ratio']
)
divide.__doc__ = doc_string
return divide
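
With the change above, `subtract` and `divide` now return a single-column DataFrame ('diff' or 'ratio') indexed on the union of the two timelines, which sidesteps the label-mismatch bug noted in the changelog. A minimal sketch with made-up durations:

```python
import numpy as np
from lifelines.estimation import KaplanMeierFitter

# two small, made-up samples of durations
T_a = np.array([1, 3, 3, 6, 8, 9])
T_b = np.array([2, 4, 5, 5, 7, 10])

kmf_a = KaplanMeierFitter()
kmf_a.fit(T_a, label='A')

kmf_b = KaplanMeierFitter()
kmf_b.fit(T_b, label='B')

diff = kmf_a.subtract(kmf_b)   # DataFrame with a single 'diff' column
ratio = kmf_a.divide(kmf_b)    # DataFrame with a single 'ratio' column

print(diff.columns.tolist())   # ['diff']
print(ratio.columns.tolist())  # ['ratio']
```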

38 changes: 27 additions & 11 deletions lifelines/fitters/cox_time_varying_fitter.py
@@ -16,7 +16,7 @@
significance_code, normalize,\
pass_for_numeric_dtypes_or_raise, check_low_var,\
check_for_overlapping_intervals, check_complete_separation_low_variance,\
ConvergenceWarning, StepSizer
ConvergenceWarning, StepSizer, _get_index


class CoxTimeVaryingFitter(BaseFitter):
@@ -184,12 +184,13 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size

# Save these as pending result
hessian, gradient = h, g
norm_delta = norm(delta)

if show_progress:
print("Iteration %d: norm_delta = %.6f, step_size = %.3f, ll = %.6f, seconds_since_start = %.1f" % (i, norm(delta), step_size, ll, time.time() - start))
print("Iteration %d: norm_delta = %.6f, step_size = %.3f, ll = %.6f, seconds_since_start = %.1f" % (i, norm_delta, step_size, ll, time.time() - start))

# convergence criteria
if norm(delta) < precision:
if norm_delta < precision:
converging, completed = False, True
elif i >= 50:
# 50 iterations steps with N-R is a lot.
@@ -199,12 +200,12 @@ def _newton_rhaphson(self, df, stop_times_events, show_progress=False, step_size
converging, completed = False, False
elif abs(ll - previous_ll) < precision:
converging, completed = False, True
elif abs(ll) < 0.0001 and norm(delta) > 1.0:
elif abs(ll) < 0.0001 and norm_delta > 1.0:
warnings.warn("The log-likelihood is getting suspciously close to 0 and the delta is still large. There may be complete separation in the dataset. This may result in incorrect inference of coefficients. \
See https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faqwhat-is-complete-or-quasi-complete-separation-in-logisticprobit-regression-and-how-do-we-deal-with-them/ ", ConvergenceWarning)
converging, completed = False, False

step_size = step_sizer.update(norm(delta)).next()
step_size = step_sizer.update(norm_delta).next()

beta += delta

@@ -358,20 +359,35 @@ def print_summary(self):
end='\n\n')
return

def plot(self, standardized=False, **kwargs):
def plot(self, standardized=False, columns=None, **kwargs):
"""
standardized: standardize each estimated coefficient and confidence interval endpoints by the standard error of the estimate.
Produces a visual representation of the fitted coefficients, including their standard errors and magnitudes.
Parameters:
standardized: standardize each estimated coefficient and confidence interval
endpoints by the standard error of the estimate.
columns : list-like, default None
plot only this subset of the fitted coefficients; if None, all coefficients are plotted.
Returns:
ax: the matplotlib axis that can be edited.
"""
from matplotlib import pyplot as plt

ax = kwargs.get('ax', None) or plt.figure().add_subplot(111)
yaxis_locations = range(len(self.hazards_.columns))

summary = self.summary
lower_bound = self.confidence_intervals_.loc['lower-bound'].copy()
upper_bound = self.confidence_intervals_.loc['upper-bound'].copy()
hazards = self.hazards_.values[0].copy()
if columns is not None:
yaxis_locations = range(len(columns))
summary = self.summary.loc[columns]
lower_bound = self.confidence_intervals_[columns].loc['lower-bound'].copy()
upper_bound = self.confidence_intervals_[columns].loc['upper-bound'].copy()
hazards = self.hazards_[columns].values[0].copy()
else:
yaxis_locations = range(len(self.hazards_.columns))
summary = self.summary
lower_bound = self.confidence_intervals_.loc['lower-bound'].copy()
upper_bound = self.confidence_intervals_.loc['upper-bound'].copy()
hazards = self.hazards_.values[0].copy()

if standardized:
se = summary['se(coef)']
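A minimal sketch of the new `columns` argument on `CoxTimeVaryingFitter.plot`, mirroring the plotting test added below and assuming the Stanford heart transplant dataset:

```python
from matplotlib import pyplot as plt
from lifelines.estimation import CoxTimeVaryingFitter
from lifelines.datasets import load_stanford_heart_transplants

heart = load_stanford_heart_transplants()
ctv = CoxTimeVaryingFitter()
ctv.fit(heart, id_col='id', event_col='event')

# plot only the 'age' and 'year' coefficients; omit `columns` to plot all of them
ax = ctv.plot(columns=['age', 'year'])
ax.set_title('age and year coefficients with confidence intervals')
plt.show()
```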
21 changes: 14 additions & 7 deletions lifelines/fitters/coxph_fitter.py
@@ -609,27 +609,34 @@ def _compute_baseline_survival(self):
survival_df.columns = ['baseline survival']
return survival_df

def plot(self, standardized=False, **kwargs):
def plot(self, standardized=False, columns=None, **kwargs):
"""
Produces a visual representation of the fitted coefficients, including their standard errors and magnitudes.
Parameters:
standardized: standardize each estimated coefficient and confidence interval
endpoints by the standard error of the estimate.
columns : list-like, default None
plot only this subset of the fitted coefficients; if None, all coefficients are plotted.
Returns:
ax: the matplotlib axis that can be edited.
"""
from matplotlib import pyplot as plt

ax = kwargs.get('ax', None) or plt.figure().add_subplot(111)
yaxis_locations = range(len(self.hazards_.columns))

summary = self.summary
lower_bound = self.confidence_intervals_.loc['lower-bound'].copy()
upper_bound = self.confidence_intervals_.loc['upper-bound'].copy()
hazards = self.hazards_.values[0].copy()
if columns is not None:
yaxis_locations = range(len(columns))
summary = self.summary.loc[columns]
lower_bound = self.confidence_intervals_[columns].loc['lower-bound'].copy()
upper_bound = self.confidence_intervals_[columns].loc['upper-bound'].copy()
hazards = self.hazards_[columns].values[0].copy()
else:
yaxis_locations = range(len(self.hazards_.columns))
summary = self.summary
lower_bound = self.confidence_intervals_.loc['lower-bound'].copy()
upper_bound = self.confidence_intervals_.loc['upper-bound'].copy()
hazards = self.hazards_.values[0].copy()

if standardized:
se = summary['se(coef)']
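The same `columns` argument on `CoxPHFitter.plot`, here combined with `standardized`; a sketch assuming the bundled regression dataset, whose covariates include var1 and var2 (as in the tests below):

```python
from lifelines.estimation import CoxPHFitter
from lifelines.datasets import load_regression_dataset

df = load_regression_dataset()
cph = CoxPHFitter()
cph.fit(df, duration_col='T', event_col='E')

# standardize each coefficient by its standard error and show only var1 and var2
ax = cph.plot(standardized=True, columns=['var1', 'var2'])
```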
2 changes: 1 addition & 1 deletion lifelines/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals

__version__ = '0.14.2'
__version__ = '0.14.3'
67 changes: 66 additions & 1 deletion tests/test_estimation.py
@@ -239,6 +239,20 @@ def test_subtraction_function(self, positive_sample_lifetimes, univariate_fitters

npt.assert_array_almost_equal(f1.subtract(f1).sum().values, 0.0)

def test_subtract_function_with_labelled_data(self, positive_sample_lifetimes, univariate_fitters):
T2 = np.arange(1, 50)
for fitter in univariate_fitters:
f1 = fitter()
f2 = fitter()

f1.fit(positive_sample_lifetimes[0], label='A')
f2.fit(T2, label='B')

result = f1.subtract(f2)
assert result.columns == ['diff']
assert result.shape[1] == 1


def test_divide_function(self, positive_sample_lifetimes, univariate_fitters):
T2 = np.arange(1, 50)
for fitter in univariate_fitters:
@@ -248,11 +262,25 @@ def test_divide_function(self, positive_sample_lifetimes, univariate_fitters):
f1.fit(positive_sample_lifetimes[0])
f2.fit(T2)

result = f1.subtract(f2)
result = f1.divide(f2)
assert result.shape[0] == (np.unique(np.concatenate((f1.timeline, f2.timeline))).shape[0])

npt.assert_array_almost_equal(np.log(f1.divide(f1)).sum().values, 0.0)

def test_divide_function_with_labelled_data(self, positive_sample_lifetimes, univariate_fitters):
T2 = np.arange(1, 50)
for fitter in univariate_fitters:
f1 = fitter()
f2 = fitter()

f1.fit(positive_sample_lifetimes[0], label='A')
f2.fit(T2, label='B')

result = f1.divide(f2)
assert result.columns == ['ratio']
assert result.shape[1] == 1


def test_valueerror_is_thrown_if_alpha_out_of_bounds(self, univariate_fitters):
for fitter in univariate_fitters:
with pytest.raises(ValueError):
@@ -1532,3 +1560,40 @@ def test_output_versus_Rs_against_standford_heart_transplant(self, ctv, heart):
npt.assert_almost_equal(ctv.summary['se(coef)'].values, [0.0137, 0.0705, 0.3672, 0.3138], decimal=3)
npt.assert_almost_equal(ctv.summary['p'].values, [0.048, 0.038, 0.083, 0.974], decimal=3)


def test_error_is_raised_if_using_non_numeric_data(self, ctv):
df = pd.DataFrame.from_dict({
'id': [1, 2, 3,],
'start': [0., 0., 0.],
'end': [1., 2., 3.],
'e': [1, 1, 1],
'bool_': [True, True, False],
'int_': [1, -1, 0],
'uint8_': pd.Series([1, -1, 0], dtype="uint8"),
'string_': ['test', 'a', '2.5'],
'float_': [1.2, -0.5, 0.0],
'categorya_': pd.Series([1, 2, 3], dtype='category'),
'categoryb_': pd.Series(['a', 'b', 'a'], dtype='category'),

})

for subset in [
['start', 'end', 'e', 'id', 'categorya_'],
['start', 'end', 'e', 'id', 'categoryb_'],
['start', 'end', 'e', 'id', 'string_'],
]:
with pytest.raises(TypeError):
ctv.fit(df[subset], id_col='id', event_col='e', stop_col='end')

for subset in [
['start', 'end', 'e', 'id', 'bool_'],
['start', 'end', 'e', 'id', 'int_'],
['start', 'end', 'e', 'id', 'float_'],
['start', 'end', 'e', 'id', 'uint8_'],
]:
ctv.fit(df[subset], id_col='id', event_col='e', stop_col='end')

def test_ctv_prediction_methods(self, ctv, heart):
ctv.fit(heart, id_col='id', event_col='event')
assert ctv.predict_log_partial_hazard(heart).shape[0] == heart.shape[0]
assert ctv.predict_partial_hazard(heart).shape[0] == heart.shape[0]
30 changes: 27 additions & 3 deletions tests/test_plotting.py
@@ -5,11 +5,11 @@
import pandas as pd
import numpy as np
from lifelines.estimation import NelsonAalenFitter, KaplanMeierFitter, AalenAdditiveFitter,\
CoxPHFitter
CoxPHFitter, CoxTimeVaryingFitter
from lifelines.generate_datasets import generate_random_lifetimes, generate_hazard_rates
from lifelines.plotting import plot_lifetimes
from lifelines.datasets import load_waltons, load_regression_dataset, load_lcd,\
load_panel_test
load_panel_test, load_stanford_heart_transplants
from lifelines.generate_datasets import cumulative_integral


@@ -233,12 +233,36 @@ def test_coxph_plotting(self, block):
self.plt.title('test_coxph_plotting')
self.plt.show(block=block)

def test_coxph_plotting_with_subset_of_columns(self, block):
df = load_regression_dataset()
cp = CoxPHFitter()
cp.fit(df, "T", "E")
cp.plot(columns=['var1', 'var2'])
self.plt.title('test_coxph_plotting_with_subset_of_columns')
self.plt.show(block=block)

def test_coxph_plotting_with_subset_of_columns_and_standardized(self, block):
df = load_regression_dataset()
cp = CoxPHFitter()
cp.fit(df, "T", "E")
cp.plot(True, columns=['var1', 'var2'])
self.plt.title('test_coxph_plotting_with_subset_of_columns_and_standardized')
self.plt.show(block=block)

def test_coxph_plotting_normalized(self, block):
df = load_regression_dataset()
cp = CoxPHFitter()
cp.fit(df, "T", "E")
cp.plot(True)
self.plt.title('test_coxph_plotting')
self.plt.title('test_coxph_plotting_normalized')
self.plt.show(block=block)

def test_coxtv_plotting_with_subset_of_columns_and_standardized(self, block):
df = load_stanford_heart_transplants()
ctv = CoxTimeVaryingFitter()
ctv.fit(df, id_col='id', event_col='event')
ctv.plot(True, columns=['age', 'year'])
self.plt.title('test_coxtv_plotting_with_subset_of_columns_and_standardized')
self.plt.show(block=block)

def test_kmf_left_censorship_plots(self, block):
