Apply normalization to future data

If a model normalizes data during fitting, it should normalize data later passed to the predict methods in the same way. As a logical consequence, CoxPH now stores the original (non-normalized) data, together with the normalization variables (mean and standard deviation), instead of the normalized version. I also added some checks to make sure that incoming dataframes contain the same input columns as the fitting data. If the incoming data is an ndarray, the column order is assumed to be correct (as before). The following examples now work, whereas before they would crash or simply return the wrong result: # fulldata has columns x1, x2, t, e cf.predict(fulldata) # shuffleddata has columns x2, t, x1, e, y1, y2, y3 cf.predict(shuffleddata) # Xrev has columns x2, x1 cf.predict(Xrev) Signed-off-by: Jonas Kalderstam <jonas@kalderstam.se>
CamDavidsonPilon · Nov 26, 2014 · 19fb043 · CamDavidsonPilon · Nov 26, 2014 · CamDavidsonPilon
1 parent a9b807d
commit 19fb043
Show file tree

Hide file tree

Showing 3 changed files with 95 additions and 15 deletions.
diff --git a/lifelines/estimation.py b/lifelines/estimation.py
@@ -1012,7 +1012,14 @@ def fit(self, df, duration_col='T', event_col='E',
         E = df[event_col]
         del df[duration_col]
         del df[event_col]
+
+        # Store original non-normalized data
+        self.data = df
+
         if self.normalize:
+            # Need to normalize future inputs as well
+            self._norm_mean = df.mean(0)
+            self._norm_std = df.std(0)
             df = normalize(df)
 
         E = E.astype(bool)
@@ -1026,7 +1033,6 @@ def fit(self, df, duration_col='T', event_col='E',
                                      index=['coef'])
         self.confidence_intervals_ = self._compute_confidence_intervals()
 
-        self.data = df
         self.durations = T
         self.event_observed = E
 
@@ -1107,9 +1113,25 @@ def predict_partial_hazard(self, X):
         """
         X: a (n,d) covariate matrix
 
+        If covariates were normalized during fitting, they are normalized
+        in the same way here.
+
+        If X is a dataframe, the order of the columns does not matter. But
+        if X is an array, then the column ordering is assumed to be the
+        same as the training dataset.
+
         Returns the partial hazard for the individuals, partial since the
         baseline hazard is not included. Equal to \exp{\beta X}
         """
+        # Make sure column ordering is the same as during fitting
+        if isinstance(X, pd.DataFrame):
+            X = X[self.data.columns]
+        # If it's not a dataframe, order is up to user
+
+        if self.normalize:
+            # Assuming correct ordering and number of columns
+            X = normalize(X, self._norm_mean.values, self._norm_std.values)
+
         return exp(np.dot(X, self.hazards_.T))
 
     def predict_cumulative_hazard(self, X):

diff --git a/lifelines/tests/test_suite.py b/lifelines/tests/test_suite.py
@@ -35,6 +35,15 @@
 
 class MiscTests(unittest.TestCase):
 
+    def test_unnormalize(self):
+        df = pd.read_csv('./datasets/larynx.csv')
+        m = df.mean(0)
+        s = df.std(0)
+
+        ndf = normalize(df)
+
+        npt.assert_almost_equal(df.values, unnormalize(ndf, m, s).values)
+
     def test_normalize(self):
         df = pd.read_csv('./datasets/larynx.csv')
         n,d = df.shape
@@ -673,7 +682,7 @@ def test_crossval_for_aalen_add(self):
             expected = 0.85
             msg = "Expected min-mean c-index {:.2f} < {:.2f}"
             self.assertTrue(np.mean(mean_scores) > expected,
-                            msg.format(expected, scores.mean())) 
+                            msg.format(expected, scores.mean()))
 
 class RegressionTests(unittest.TestCase):
 
@@ -939,26 +948,61 @@ def test_fit_method(self):
         cf.fit(data_nus, duration_col='t', event_col='E')
         self.assertTrue(np.abs(cf.hazards_.ix[0][0] - -0.0335) < 0.0001)
 
-    def test_data_sorting(self):
-        # During fit, CoxPH copies the training data and sorts it
-        # Make sure the final concordance matches betweens sorted
-        # and unsorted versions of the data set to verify that
-        # the order was not screwed up
+    def test_column_shuffling(self):
+        # Order of columns should not matter for dataframes
+        cf = CoxPHFitter(normalize=False)
+        cf.fit(data_pred2, 't', 'E')
+
+        # Reversed order
+        X = data_pred2[cf.data.columns]
+        X_reversed = data_pred2[list(reversed(cf.data.columns))]
 
-        cf = CoxPHFitter()
+        # Predictions should be exactly the same
+        hazards = cf.predict_partial_hazard(X)
+        hazards_r = cf.predict_partial_hazard(X_reversed)
+
+        self.assertTrue(np.all(hazards == hazards_r))
+
+        # Should still work with numpy arrays
+        hazards_n = cf.predict_partial_hazard(np.array(X))
+        self.assertTrue(np.all(hazards == hazards_n))
+
+
+        # Again with normalization
+        cf = CoxPHFitter(normalize=True)
+        cf.fit(data_pred2, 't', 'E')
+
+        # Predictions should be exactly the same
+        hazards = cf.predict_partial_hazard(X)
+        hazards_r = cf.predict_partial_hazard(X_reversed)
+
+        self.assertTrue(np.all(hazards == hazards_r))
+
+        # Should still work with numpy arrays
+        hazards_n = cf.predict_partial_hazard(np.array(X))
+        self.assertTrue(np.all(hazards == hazards_n))
+
+
+    def test_data_normalization(self):
+        # During fit, CoxPH copies the training data and normalizes it.
+        # Future calls should be normalized in the same way and
+        # internal training set should not be saved in a normalized state.
+
+        cf = CoxPHFitter(normalize=True)
         cf.fit(data_pred2, duration_col='t', event_col='E')
 
-        # Internal training c-index
+        # Internal training set
         ci_trn = concordance_index(cf.durations,
                                    -cf.predict_partial_hazard(cf.data).ravel(),
                                    cf.event_observed)
-        # Against original order
+        # New data should normalize in the exact same way
         ci_org = concordance_index(data_pred2['t'],
                                    -cf.predict_partial_hazard(data_pred2[['x1', 'x2']]).ravel(),
                                    data_pred2['E'])
 
-        self.assertEqual(ci_org, ci_trn,
-                         "Reordering should not change concordance index for cox!")
+        self.assertEqual(ci_org, ci_trn)
+
+
 
 
     def test_crossval_for_cox_ph_with_normalizing_times(self):

diff --git a/lifelines/utils.py b/lifelines/utils.py
@@ -126,7 +126,7 @@ def survival_table_from_events(durations, event_observed, min_observations,
         0               0         0         0        11
         6               1         1         0         0
         7               2         2         0         0
-        9               3         3         0         0 
+        9               3         3         0         0
         13              3         3         0         0
         15              2         2         0         0
 
@@ -315,8 +315,22 @@ def k_fold_cross_validation(fitter, df, duration_col='T', event_col='E',
 
     return scores
 
-def normalize(X):
-    return (X - X.mean(0))/X.std(0)
+def normalize(X, mean=None, std=None):
+    '''
+    Normalize X. If either mean or std is None, both are computed
+    from X itself, so that X is scaled to have mean 0 and std 1.
+    '''
+    if mean is None or std is None:
+        mean = X.mean(0)
+        std = X.std(0)
+    return (X - mean) / std
+
+def unnormalize(X, mean, std):
+    '''
+    Reverse a normalization. Requires the original mean and
+    standard deviation of the data set.
+    '''
+    return X * std + mean
 
 def epanechnikov_kernel(t, T, bandwidth=1.):
     M = 0.75 * (1 - (t - T) / bandwidth) ** 2