Merge ce2135a into 75bd954

CamDavidsonPilon · May 23, 2016 · eabc8f9 · eabc8f9
2 parents 75bd954 + ce2135a
commit eabc8f9
Show file tree

Hide file tree

Showing 11 changed files with 53 additions and 36 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,13 @@
 ### Changelogs
 
-#### Forthcoming 0.9.0
+#### 0.9.2
+ - deprecates Pandas versions before 0.18.
+ - throw an error if there are no admissible pairs in the c-index calculation. Previously a NaN was returned.
+
+#### 0.9.1
+ - add two summary functions to Weibull and Exponential fitter, solves #224
+
+#### 0.9.0
  - new prediction function in `CoxPHFitter`, `predict_log_hazard_relative_to_mean`, that mimics what R's `predict.coxph` does.
  - removing the `predict` method in CoxPHFitter and AalenAdditiveFitter. This is because the choice of `predict_median` as a default was causing too much confusion, and no other natural choice as a default was available. All other `predict_` methods remain. 
  - Default predict method in `k_fold_cross_validation` is now `predict_expectation`

diff --git a/lifelines/fitters/__init__.py b/lifelines/fitters/__init__.py
@@ -111,6 +111,6 @@ def _conditional_time_to_event_(self):
         """
         age = self.survival_function_.index.values[:, None]
         columns = ['%s - Conditional time remaining to event' % self._label]
-        return pd.DataFrame(qth_survival_times(self.survival_function_[self._label] * 0.5, self.survival_function_).T.sort(ascending=False).values,
+        return pd.DataFrame(qth_survival_times(self.survival_function_[self._label] * 0.5, self.survival_function_).T.sort_index(ascending=False).values,
                             index=self.survival_function_.index,
                             columns=columns) - age
diff --git a/lifelines/fitters/aalen_additive_fitter.py b/lifelines/fitters/aalen_additive_fitter.py
@@ -32,7 +32,7 @@ class AalenAdditiveFitter(BaseFitter):
         For example, this shrinks the absolute value of c_{i,t}. Recommended, even if a small value.
       smoothing_penalizer: Attach a L2 penalizer to difference between adjacent (over time) coefficents. For
         example, this shrinks the absolute value of c_{i,t} - c_{i,t+1}.
-      nn_cumulative_hazard: If True, forces the negative values in cumulative hazards to be 0 instead. Default True. 
+      nn_cumulative_hazard: If True, forces the negative values in cumulative hazards to be 0 instead. Default True.
 
     """
 
@@ -428,13 +428,11 @@ def plot(self, ix=None, iloc=None, columns=[], legend=True, **kwargs):
         """
         from matplotlib import pyplot as plt
 
-
         def shaded_plot(ax, x, y, y_upper, y_lower, **kwargs):
             base_line, = ax.plot(x, y, drawstyle='steps-post', **kwargs)
             fill_between_steps(x, y_lower, y2=y_upper, ax=ax, alpha=0.25,
                                color=base_line.get_color(), linewidth=1.0)
 
-
         assert (ix is None or iloc is None), 'Cannot set both ix and iloc in call to .plot'
 
         get_method = "ix" if ix is not None else "iloc"
@@ -448,7 +446,7 @@ def shaded_plot(ax, x, y, y_upper, y_lower, **kwargs):
             columns = self.cumulative_hazards_.columns
 
         if 'ax' in kwargs:
-            # don't use a .get here, as the default parameter will be called. In this case, 
+            # don't use a .get here, as the default parameter will be called. In this case,
             # plt.figure().add_subplot(111), which instantiates a new window
             ax = kwargs['ax']
         else:

diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
@@ -223,7 +223,7 @@ def _newton_rhaphson(self, X, T, E, initial_beta=None, step_size=1.,
             delta = solve(-h, step_size * g.T)
             if np.any(np.isnan(delta)):
                 raise ValueError("delta contains nan value(s). Convergence halted.")
-                
+
             # Save these as pending result
             hessian, gradient = h, g
 
@@ -403,7 +403,7 @@ def predict_partial_hazard(self, X):
         """
         X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
             can be in any order. If a numpy array, columns must be in the
-            same order as the training data.        
+            same order as the training data.
 
         If covariates were normalized during fitting, they are normalized
         in the same way here.
@@ -431,20 +431,19 @@ def predict_log_hazard_relative_to_mean(self, X):
         """
         X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
             can be in any order. If a numpy array, columns must be in the
-            same order as the training data.        
+            same order as the training data.
 
-        Returns the log hazard relative to the hazard of the mean covariates. This is the behaviour 
+        Returns the log hazard relative to the hazard of the mean covariates. This is the behaviour
         of R's predict.coxph.
         """
         mean_covariates = self.data.mean(0).to_frame().T
-        return np.log(self.predict_partial_hazard(X)/self.predict_partial_hazard(mean_covariates).squeeze())
-
+        return np.log(self.predict_partial_hazard(X) / self.predict_partial_hazard(mean_covariates).squeeze())
 
     def predict_cumulative_hazard(self, X):
         """
         X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
             can be in any order. If a numpy array, columns must be in the
-            same order as the training data.        
+            same order as the training data.
 
         Returns the cumulative hazard for the individuals.
         """
@@ -457,7 +456,7 @@ def predict_survival_function(self, X):
         """
         X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
             can be in any order. If a numpy array, columns must be in the
-            same order as the training data.        
+            same order as the training data.
 
         Returns the estimated survival functions for the individuals
         """
@@ -467,7 +466,7 @@ def predict_percentile(self, X, p=0.5):
         """
         X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
             can be in any order. If a numpy array, columns must be in the
-            same order as the training data.   
+            same order as the training data.
 
         By default, returns the median lifetimes for the individuals.
         http://stats.stackexchange.com/questions/102986/percentile-loss-functions
@@ -479,7 +478,7 @@ def predict_median(self, X):
         """
         X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
             can be in any order. If a numpy array, columns must be in the
-            same order as the training data.   
+            same order as the training data.
 
         Returns the median lifetimes for the individuals
         """
@@ -489,8 +488,8 @@ def predict_expectation(self, X):
         """
         X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
             can be in any order. If a numpy array, columns must be in the
-            same order as the training data. 
-                   
+            same order as the training data.
+
         Compute the expected lifetime, E[T], using covarites X.
         """
         index = _get_index(X)

diff --git a/lifelines/fitters/exponential_fitter.py b/lifelines/fitters/exponential_fitter.py
@@ -100,9 +100,9 @@ def _compute_confidence_bounds_of_parameters(self):
         se = self._compute_standard_errors().ix['se']
         alpha2 = inv_normal_cdf((1. + self.alpha) / 2.)
         return pd.DataFrame([
-                np.array([self.lambda_]) + alpha2 * se,
-                np.array([self.lambda_]) - alpha2 * se,
-              ], columns=['lambda_'], index=['upper-bound', 'lower-bound'])
+            np.array([self.lambda_]) + alpha2 * se,
+            np.array([self.lambda_]) - alpha2 * se,
+        ], columns=['lambda_'], index=['upper-bound', 'lower-bound'])
 
     @property
     def summary(self):

diff --git a/lifelines/fitters/weibull_fitter.py b/lifelines/fitters/weibull_fitter.py
@@ -194,9 +194,9 @@ def _compute_confidence_bounds_of_parameters(self):
         se = self._compute_standard_errors().ix['se']
         alpha2 = inv_normal_cdf((1. + self.alpha) / 2.)
         return pd.DataFrame([
-                np.array([self.lambda_, self.rho_]) + alpha2 * se,
-                np.array([self.lambda_, self.rho_]) - alpha2 * se,
-               ], columns=['lambda_', 'rho_'], index=['upper-bound', 'lower-bound'])
+            np.array([self.lambda_, self.rho_]) + alpha2 * se,
+            np.array([self.lambda_, self.rho_]) - alpha2 * se,
+        ], columns=['lambda_', 'rho_'], index=['upper-bound', 'lower-bound'])
 
     @property
     def summary(self):

diff --git a/lifelines/generate_datasets.py b/lifelines/generate_datasets.py
@@ -246,8 +246,8 @@ def generate_random_lifetimes(hazard_rates, timelines, size=1, censor=None):
 def generate_observational_matrix(n, d, timelines, constant=False, independent=0, n_binary=0, model="aalen"):
     hz, coeff, covariates = generate_hazard_rates(n, d, timelines, constant=False, independent=0, n_binary=0, model=model)
     R = generate_random_lifetimes(hz, timelines)
-    covariates["event_at"] = R.T
-    return covariates.sort("event_at"), pd.DataFrame(cumulative_integral(coeff.values, timelines), columns=coeff.columns, index=timelines)
+    covariates["event_at"] = R.T[0]
+    return covariates.sort_values(by="event_at"), pd.DataFrame(cumulative_integral(coeff.values, timelines), columns=coeff.columns, index=timelines)
 
 
 def cumulative_integral(fx, x):

diff --git a/lifelines/plotting.py b/lifelines/plotting.py
@@ -216,6 +216,7 @@ def set_kwargs_ax(kwargs):
     if "ax" not in kwargs:
         kwargs["ax"] = plt.figure().add_subplot(111)
 
+
 def set_kwargs_color(kwargs):
     import matplotlib as mpl
     if int(mpl.__version__.split('.')[1]) > 4:
@@ -226,6 +227,7 @@ def set_kwargs_color(kwargs):
         kwargs['color'] = coalesce(kwargs.get('c'), kwargs.get('color'),
                                    next(kwargs["ax"]._get_lines.color_cycle))
 
+
 def set_kwargs_drawstyle(kwargs):
     kwargs['drawstyle'] = kwargs.get('drawstyle', 'steps-post')
 
@@ -304,7 +306,6 @@ def plot(ix=None, iloc=None, flat=False, show_censors=False,
             estimate_ = getattr(cls, estimate)
             confidence_interval_ = getattr(cls, 'confidence_interval_')
 
-
         dataframe_slicer = create_dataframe_slicer(iloc, ix)
 
         # plot censors
@@ -313,15 +314,15 @@ def plot(ix=None, iloc=None, flat=False, show_censors=False,
 
         if show_censors and cls.event_table['censored'].sum() > 0:
             cs = {
-                'marker': '+', 
-                'ms': 12, 
+                'marker': '+',
+                'ms': 12,
                 'mew': 1
             }
             cs.update(censor_styles)
             times = dataframe_slicer(cls.event_table.ix[(cls.event_table['censored'] > 0)]).index.values.astype(float)
             v = cls.predict(times)
             ax.plot(times, v, linestyle='None',
-                              color=colour, **cs)
+                    color=colour, **cs)
 
         # plot estimate
         dataframe_slicer(estimate_).plot(**kwargs)
@@ -330,9 +331,9 @@ def plot(ix=None, iloc=None, flat=False, show_censors=False,
         if ci_show:
             if ci_force_lines:
                 dataframe_slicer(confidence_interval_).plot(linestyle="-", linewidth=1,
-                                                   color=[colour], legend=True,
-                                                   drawstyle=kwargs.get('drawstyle', 'default'),
-                                                   ax=ax, alpha=0.6)
+                                                            color=[colour], legend=True,
+                                                            drawstyle=kwargs.get('drawstyle', 'default'),
+                                                            ax=ax, alpha=0.6)
             else:
                 x = dataframe_slicer(confidence_interval_).index.values.astype(float)
                 lower = dataframe_slicer(confidence_interval_.filter(like='lower')).values[:, 0]

diff --git a/lifelines/utils/__init__.py b/lifelines/utils/__init__.py
@@ -465,12 +465,12 @@ def k_fold_cross_validation(fitters, df, duration_col, event_col=None,
         event_col = 'E'
         df[event_col] = 1.
 
-    df = df.reindex(np.random.permutation(df.index)).sort(event_col)
+    df = df.reindex(np.random.permutation(df.index)).sort_values(event_col)
 
     assignments = np.array((n // k + 1) * list(range(1, k + 1)))
     assignments = assignments[:n]
 
-    testing_columns = df.columns - [duration_col, event_col]
+    testing_columns = df.columns.difference([duration_col, event_col])
 
     for i in range(1, k + 1):
 
@@ -864,6 +864,9 @@ def handle_pairs(truth, pred, first_ix):
         num_correct += correct
         num_tied += tied
 
+    if num_pairs == 0:
+        raise ZeroDivisionError("No admissable pairs in the dataset.")
+
     return (num_correct + num_tied / 2) / num_pairs
 
 
@@ -913,4 +916,6 @@ def concordance_value(time_a, time_b, pred_a, pred_b):
                 paircount += 1.0
                 csum += concordance_value(time_a, time_b, pred_a, pred_b)
 
+    if paircount == 0:
+        raise ZeroDivisionError("No admissable pairs in the dataset.")
     return csum / paircount
diff --git a/lifelines/version.py b/lifelines/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '0.9.1.0'
+__version__ = '0.9.2'
diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py
@@ -302,6 +302,13 @@ def test_concordance_index_returns_same_after_shifting():
     assert utils.concordance_index(T, T_) == utils.concordance_index(T - 5, T_ - 5) == utils.concordance_index(T, T_ - 5) == utils.concordance_index(T - 5, T_)
 
 
+def test_both_concordance_index_function_deal_with_ties_the_same_way():
+    actual_times = np.array([1, 1, 2])
+    predicted_times = np.array([1, 2, 3])
+    obs = np.ones(3)
+    assert fast_cindex(actual_times, predicted_times, obs) == slow_cindex(actual_times, predicted_times, obs) == 1.0 
+
+
 def test_survival_table_from_events_with_non_negative_T_and_no_lagged_births():
     n = 10
     T = np.arange(n)