Merge pull request #293 from CamDavidsonPilon/coxph-baseline

Coxph baseline fix; remove normalize; bump version
CamDavidsonPilon · May 29, 2017 · 0fe0d10 · 0fe0d10
2 parents 6184551 + c9a0ae2
commit 0fe0d10
Show file tree

Hide file tree

Showing 4 changed files with 81 additions and 85 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,12 @@
 ### Changelogs
 
+#### 0.10.0
+ - corrected bug that was returning the wrong baseline survival and hazard values in `CoxPHFitter` when `normalize=True`. 
+ - removed  `normalize` kwarg in `CoxPHFitter`. This was causing lots of confusion for users, and added code complexity. It's really nice to be able to remove it.
+ - correcting column name in `CoxPHFitter.baseline_survival_`
+ - `CoxPHFitter.baseline_cumulative_hazard_` is always centered, to mimic R's `basehaz` API.
+ - new `predict_log_partial_hazards` to `CoxPHFitter`
+
 #### 0.9.4
  - adding `plot_loglogs` to `KaplanMeierFitter`
  - added a (correct) check to see if some columns in a dataset will cause convergence problems.

diff --git a/lifelines/fitters/coxph_fitter.py b/lifelines/fitters/coxph_fitter.py
@@ -33,7 +33,7 @@ class CoxPHFitter(BaseFitter):
         The penalty is 1/2 * penalizer * ||beta||^2.
     """
 
-    def __init__(self, alpha=0.95, tie_method='Efron', normalize=True, penalizer=0.0, strata=None):
+    def __init__(self, alpha=0.95, tie_method='Efron', penalizer=0.0, strata=None):
         if not (0 < alpha <= 1.):
             raise ValueError('alpha parameter must be between 0 and 1.')
         if penalizer < 0:
@@ -42,7 +42,6 @@ def __init__(self, alpha=0.95, tie_method='Efron', normalize=True, penalizer=0.0
             raise NotImplementedError("Only Efron is available atm.")
 
         self.alpha = alpha
-        self.normalize = normalize
         self.tie_method = tie_method
         self.penalizer = penalizer
         self.strata = strata
@@ -295,32 +294,27 @@ def fit(self, df, duration_col, event_col=None,
             E = df[event_col]
             del df[event_col]
 
-        # Store original non-normalized data
         self.data = df if self.strata is None else df.reset_index()
         self._check_values(df)
-
-        if self.normalize:
-            # Need to normalize future inputs as well
-            self._norm_mean = df.mean(0)
-            self._norm_std = df.std(0)
-            df = normalize(df)
+        self._norm_mean = df.mean(0)
+        self._norm_std = df.std(0)
+        df = normalize(df, self._norm_mean, self._norm_std)
 
         E = E.astype(bool)
 
         hazards_ = self._newton_rhaphson(df, T, E, initial_beta=initial_beta,
                                          show_progress=show_progress,
                                          include_likelihood=include_likelihood)
 
-        self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns,
-                                     index=['coef'])
+        self.hazards_ = pd.DataFrame(hazards_.T, columns=df.columns, index=['coef']) / self._norm_std
         self.confidence_intervals_ = self._compute_confidence_intervals()
 
         self.durations = T
         self.event_observed = E
 
-        self.baseline_hazard_ = self._compute_baseline_hazards(df, T, E)
+        self.baseline_hazard_ = self._compute_baseline_hazards(normalize(df, 0, 1 / self._norm_std), T, E)
         self.baseline_cumulative_hazard_ = self.baseline_hazard_.cumsum()
-        self.baseline_survival_ = exp(-self.baseline_cumulative_hazard_)
+        self.baseline_survival_ = self._compute_baseline_survival()
         return self
 
     def _check_values(self, X):
@@ -342,7 +336,7 @@ def _compute_confidence_intervals(self):
                             columns=self.hazards_.columns)
 
     def _compute_standard_errors(self):
-        se = np.sqrt(inv(-self._hessian_).diagonal())
+        se = np.sqrt(inv(-self._hessian_).diagonal()) / self._norm_std
         return pd.DataFrame(se[None, :],
                             index=['se'], columns=self.hazards_.columns)
 
@@ -404,8 +398,6 @@ def predict_partial_hazard(self, X):
             can be in any order. If a numpy array, columns must be in the
             same order as the training data.
 
-        If covariates were normalized during fitting, they are normalized
-        in the same way here.
 
         If X is a dataframe, the order of the columns do not matter. But
         if X is an array, then the column ordering is assumed to be the
@@ -414,17 +406,28 @@ def predict_partial_hazard(self, X):
         Returns the partial hazard for the individuals, partial since the
         baseline hazard is not included. Equal to \exp{\beta X}
         """
-        index = _get_index(X)
+        return exp(self.predict_log_partial_hazard(X))
 
+    def predict_log_partial_hazard(self, X):
+        """
+        X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
+            can be in any order. If a numpy array, columns must be in the
+            same order as the training data.
+
+
+        If X is a dataframe, the order of the columns do not matter. But
+        if X is an array, then the column ordering is assumed to be the
+        same as the training dataset.
+
+        Returns the log of the partial hazard for the individuals, partial since the
+        baseline hazard is not included. Equal to \beta X
+        """
         if isinstance(X, pd.DataFrame):
             order = self.hazards_.columns
             X = X[order]
 
-        if self.normalize:
-            # Assuming correct ordering and number of columns
-            X = normalize(X, self._norm_mean.values, self._norm_std.values)
-
-        return pd.DataFrame(exp(np.dot(X, self.hazards_.T)), index=index)
+        index = _get_index(X)
+        return pd.DataFrame(np.dot(X, self.hazards_.T), index=index)
 
     def predict_log_hazard_relative_to_mean(self, X):
         """
@@ -433,29 +436,31 @@ def predict_log_hazard_relative_to_mean(self, X):
             same order as the training data.
 
         Returns the log hazard relative to the hazard of the mean covariates. This is the behaviour
-        of R's predict.coxph.
+        of R's predict.coxph. Equal to \beta X - \beta \bar{X}
         """
         mean_covariates = self.data.mean(0).to_frame().T
-        return np.log(self.predict_partial_hazard(X) / self.predict_partial_hazard(mean_covariates).squeeze())
+        return self.predict_log_partial_hazard(X) - self.predict_log_partial_hazard(mean_covariates).squeeze()
 
     def predict_cumulative_hazard(self, X):
         """
         X: a (n,d) covariate numpy array or DataFrame. If a DataFrame, columns
             can be in any order. If a numpy array, columns must be in the
             same order as the training data.
+
+        Returns the cumulative hazard of individuals.
         """
         if self.strata:
             cumulative_hazard_ = pd.DataFrame()
             for stratum, stratified_X in X.groupby(self.strata):
-                s_0 = self.baseline_survival_[[stratum]]
+                c_0 = self.baseline_cumulative_hazard_[[stratum]]
                 col = _get_index(stratified_X)
                 v = self.predict_partial_hazard(stratified_X)
-                cumulative_hazard_ = cumulative_hazard_.merge(pd.DataFrame(-np.dot(np.log(s_0), v.T), index=s_0.index, columns=col), how='outer', right_index=True, left_index=True)
+                cumulative_hazard_ = cumulative_hazard_.merge(pd.DataFrame(np.dot(c_0, v.T), index=c_0.index, columns=col), how='outer', right_index=True, left_index=True)
         else:
-            s_0 = self.baseline_survival_
+            c_0 = self.baseline_cumulative_hazard_
             col = _get_index(X)
             v = self.predict_partial_hazard(X)
-            cumulative_hazard_ = pd.DataFrame(-np.dot(np.log(s_0), v.T), columns=col, index=s_0.index)
+            cumulative_hazard_ = pd.DataFrame(np.dot(c_0, v.T), columns=col, index=c_0.index)
 
         return cumulative_hazard_
 
@@ -520,11 +525,17 @@ def _compute_baseline_hazards(self, df, T, E):
             baseline_hazards_ = pd.DataFrame(index=self.durations.unique())
             for stratum in df.index.unique():
                 baseline_hazards_ = baseline_hazards_.merge(
-                                                 self._compute_baseline_hazard(data=df.ix[[stratum]], durations=T.ix[[stratum]], event_observed=E.ix[[stratum]], name=stratum),
-                                                 left_index=True,
-                                                 right_index=True,
-                                                 how='left')
+                    self._compute_baseline_hazard(data=df.ix[[stratum]], durations=T.ix[[stratum]], event_observed=E.ix[[stratum]], name=stratum),
+                    left_index=True,
+                    right_index=True,
+                    how='left')
             return baseline_hazards_.fillna(0)
 
         else:
             return self._compute_baseline_hazard(data=df, durations=T, event_observed=E, name='baseline hazard')
+
+    def _compute_baseline_survival(self):
+        survival_df = exp(-self.baseline_cumulative_hazard_)
+        if self.strata is None:
+            survival_df.columns = ['baseline survival']
+        return survival_df
diff --git a/lifelines/version.py b/lifelines/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '0.9.4'
+__version__ = '0.10.0'
diff --git a/tests/test_estimation.py b/tests/test_estimation.py
@@ -20,7 +20,7 @@
 from lifelines.estimation import CoxPHFitter, AalenAdditiveFitter, KaplanMeierFitter, \
     NelsonAalenFitter, BreslowFlemingHarringtonFitter, ExponentialFitter, \
     WeibullFitter, BaseFitter
-from lifelines.datasets import load_regression_dataset, load_larynx, load_waltons, load_kidney_transplant, load_rossi,\
+from lifelines.datasets import load_larynx, load_waltons, load_kidney_transplant, load_rossi,\
     load_lcd, load_panel_test, load_g3, load_holly_molly_polly
 from lifelines.generate_datasets import generate_hazard_rates, generate_random_lifetimes, cumulative_integral
 from lifelines.utils import concordance_index
@@ -678,13 +678,13 @@ def test_efron_newtons_method(self, data_nus):
         assert np.abs(newton(X, T, E)[0][0] - -0.0335) < 0.0001
 
     def test_fit_method(self, data_nus):
-        cf = CoxPHFitter(normalize=False)
+        cf = CoxPHFitter()
         cf.fit(data_nus, duration_col='t', event_col='E')
         assert np.abs(cf.hazards_.ix[0][0] - -0.0335) < 0.0001
 
     def test_using_dataframes_vs_numpy_arrays(self, data_pred2):
         # First without normalization
-        cf = CoxPHFitter(normalize=False)
+        cf = CoxPHFitter()
         cf.fit(data_pred2, 't', 'E')
 
         X = data_pred2[cf.data.columns]
@@ -694,22 +694,12 @@ def test_using_dataframes_vs_numpy_arrays(self, data_pred2):
         hazards_n = cf.predict_partial_hazard(np.array(X))
         assert np.all(hazards == hazards_n)
 
-        # Now with normalization
-        cf = CoxPHFitter(normalize=True)
-        cf.fit(data_pred2, 't', 'E')
-
-        hazards = cf.predict_partial_hazard(X)
-
-        # Compare with array argument
-        hazards_n = cf.predict_partial_hazard(np.array(X))
-        assert np.all(hazards == hazards_n)
-
     def test_data_normalization(self, data_pred2):
         # During fit, CoxPH copies the training data and normalizes it.
         # Future calls should be normalized in the same way and
         # internal training set should not be saved in a normalized state.
 
-        cf = CoxPHFitter(normalize=True)
+        cf = CoxPHFitter()
         cf.fit(data_pred2, duration_col='t', event_col='E')
 
         # Internal training set
@@ -731,20 +721,17 @@ def test_cox_ph_prediction_monotonicity(self, data_pred2):
         e = data_pred2['E']
         X = data_pred2[['x1', 'x2']]
 
-        for normalize in [True, False]:
-            msg = ("Predict methods should get the same concordance" +
-                   " when {}normalizing".format('' if normalize else 'not '))
-            cf = CoxPHFitter(normalize=normalize)
-            cf.fit(data_pred2, duration_col='t', event_col='E')
+        cf = CoxPHFitter()
+        cf.fit(data_pred2, duration_col='t', event_col='E')
 
-            # Base comparison is partial_hazards
-            ci_ph = concordance_index(t, -cf.predict_partial_hazard(X).values, e)
+        # Base comparison is partial_hazards
+        ci_ph = concordance_index(t, -cf.predict_partial_hazard(X).values, e)
 
-            ci_med = concordance_index(t, cf.predict_median(X).ravel(), e)
-            assert ci_ph == ci_med, msg
+        ci_med = concordance_index(t, cf.predict_median(X).ravel(), e)
+        assert ci_ph == ci_med
 
-            ci_exp = concordance_index(t, cf.predict_expectation(X).ravel(), e)
-            assert ci_ph == ci_exp, msg
+        ci_exp = concordance_index(t, cf.predict_expectation(X).ravel(), e)
+        assert ci_ph == ci_exp
 
     def test_crossval_for_cox_ph_with_normalizing_times(self, data_pred2, data_pred1):
         cf = CoxPHFitter()
@@ -828,7 +815,7 @@ def test_output_against_R(self, rossi):
         cat(round(mod.allison$coefficients, 4), sep=", ")
         """
         expected = np.array([[-0.3794, -0.0574, 0.3139, -0.1498, -0.4337, -0.0849,  0.0915]])
-        cf = CoxPHFitter(normalize=False)
+        cf = CoxPHFitter()
         cf.fit(rossi, duration_col='week', event_col='arrest')
         npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=3)
 
@@ -838,21 +825,20 @@ def test_output_with_strata_against_R(self, rossi):
         r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
                     paro, mar, wexp) + prio, data = rossi)
         """
-        expected = np.array([[-0.335, -0.059, 0.100]])
-        cf = CoxPHFitter(normalize=False)
+        expected = np.array([[-0.3355, -0.0590, 0.1002]])
+        cf = CoxPHFitter()
         cf.fit(rossi, duration_col='week', event_col='arrest', strata=['race', 'paro', 'mar', 'wexp'])
         npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=3)
 
-
     def test_penalized_output_against_R(self, rossi):
         # R code:
         #
         # rossi <- read.csv('.../lifelines/datasets/rossi.csv')
         # mod.allison <- coxph(Surv(week, arrest) ~ ridge(fin, age, race, wexp, mar, paro, prio,
-        #                                                 theta=1.0, scale=FALSE), data=rossi)
+        #                                                 theta=1.0, scale=TRUE), data=rossi)
         # cat(round(mod.allison$coefficients, 4), sep=", ")
-        expected = np.array([[-0.3641, -0.0580, 0.2894, -0.1496, -0.3837, -0.0822, 0.0913]])
-        cf = CoxPHFitter(normalize=False, penalizer=1.0)
+        expected = np.array([[-0.3761, -0.0565, 0.3099, -0.1532, -0.4295, -0.0837, 0.0909]])
+        cf = CoxPHFitter(penalizer=1.0)
         cf.fit(rossi, duration_col='week', event_col='arrest')
         npt.assert_array_almost_equal(cf.hazards_.values, expected, decimal=3)
 
@@ -861,7 +847,7 @@ def test_coef_output_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschbe
         df = load_kidney_transplant(usecols=['time', 'death',
                                              'black_male', 'white_male',
                                              'black_female'])
-        cf = CoxPHFitter(normalize=False)
+        cf = CoxPHFitter()
         cf.fit(df, duration_col='time', event_col='death')
 
         # coefs
@@ -872,7 +858,7 @@ def test_coef_output_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschbe
     def test_se_against_Survival_Analysis_by_John_Klein_and_Melvin_Moeschberger(self):
         # see table 8.1 in Survival Analysis by John P. Klein and Melvin L. Moeschberger, Second Edition
         df = load_larynx()
-        cf = CoxPHFitter(normalize=False)
+        cf = CoxPHFitter()
         cf.fit(df, duration_col='time', event_col='death')
 
         # standard errors
@@ -911,16 +897,16 @@ def test_strata_works_if_only_a_single_element_is_in_the_strata(self):
         cp.fit(df, 'T', 'Status', strata=['Stratum'])
         assert True
 
-    def test_strata_against_r_output(self, rossi):
+    def test_strata_against_R_output(self, rossi):
         """
         > library(survival)
-        > ross = read.csv('rossi.csv')
+        > rossi = read.csv('.../lifelines/datasets/rossi.csv')
         > r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
             paro, mar, wexp) + prio, data = rossi)
         > r$loglik
         """
 
-        cp = CoxPHFitter(normalize=False)
+        cp = CoxPHFitter()
         cp.fit(rossi, 'week', 'arrest', strata=['race', 'paro', 'mar', 'wexp'], include_likelihood=True)
 
         npt.assert_almost_equal(cp.summary['coef'].values, [-0.335, -0.059, 0.100], decimal=3)
@@ -929,26 +915,18 @@ def test_strata_against_r_output(self, rossi):
     def test_hazard_works_as_intended_with_strata_against_R_output(self, rossi):
         """
         > library(survival)
-        > ross = read.csv('rossi.csv')
+        > rossi = read.csv('.../lifelines/datasets/rossi.csv')
         > r = coxph(formula = Surv(week, arrest) ~ fin + age + strata(race,
             paro, mar, wexp) + prio, data = rossi)
-        > basehaz(r, centered=FALSE)
+        > basehaz(r, centered=TRUE)
         """
-        cp = CoxPHFitter(normalize=False)
+        cp = CoxPHFitter()
         cp.fit(rossi, 'week', 'arrest', strata=['race', 'paro', 'mar', 'wexp'])
-        npt.assert_almost_equal(cp.baseline_cumulative_hazard_[(0, 0, 0, 0)].ix[[14, 35, 37, 43, 52]].values, [0.28665890, 0.63524149, 1.01822603, 1.48403930, 1.48403930], decimal=2)
-        npt.assert_almost_equal(cp.baseline_cumulative_hazard_[(0, 0, 0, 1)].ix[[27, 43, 48, 52]].values, [0.35738173, 0.76415714, 1.26635373, 1.26635373], decimal=2)
-
-    def test_predict_log_hazard_relative_to_mean_with_normalization(self, rossi):
-        cox = CoxPHFitter(normalize=True)
-        cox.fit(rossi, 'week', 'arrest')
+        npt.assert_almost_equal(cp.baseline_cumulative_hazard_[(0, 0, 0, 0)].ix[[14, 35, 37, 43, 52]].values, [0.076600555, 0.169748261, 0.272088807, 0.396562717, 0.396562717], decimal=2)
+        npt.assert_almost_equal(cp.baseline_cumulative_hazard_[(0, 0, 0, 1)].ix[[27, 43, 48, 52]].values, [0.095499001, 0.204196905, 0.338393113, 0.338393113], decimal=2)
 
-        # they are equal because the data is normalized, so the mean of the covarites is all 0,
-        # thus exp(beta * 0) == 1, so exp(beta * X)/exp(beta * 0) = exp(beta * X)
-        assert_frame_equal(cox.predict_log_hazard_relative_to_mean(rossi), np.log(cox.predict_partial_hazard(rossi)))
-
-    def test_predict_log_hazard_relative_to_mean_without_normalization(self, rossi):
-        cox = CoxPHFitter(normalize=False)
+    def test_predict_log_hazard_relative_to_mean(self, rossi):
+        cox = CoxPHFitter()
         cox.fit(rossi, 'week', 'arrest')
         log_relative_hazards = cox.predict_log_hazard_relative_to_mean(rossi)
         means = rossi.mean(0).to_frame().T