Skip to content

Commit

Permalink
Apply normalization to future data
Browse files Browse the repository at this point in the history
If a model normalizes data during fitting, it should also
normalize data passed to the predict methods later in the same
way. As a logical consequence, CoxPH now stores the original data
(instead of the normalized version) together with the normalization
variables (the per-column mean and standard deviation).

I also added some checks to make sure that incoming dataframes have
the same input columns present as the fitting data. In case the incoming
data is an ndarray, the order is assumed to be correct (as before).

The following examples now work, where previously they would crash or
simply return the wrong result:

    # fulldata has columns x1, x2, t, e
    cf.predict(fulldata)

    # shuffleddata has columns x2, t, x1, e, y1, y2, y3
    cf.predict(shuffleddata)

    # Xrev has columns x2, x1
    cf.predict(Xrev)

Signed-off-by: Jonas Kalderstam <jonas@kalderstam.se>
  • Loading branch information
spacecowboy committed Nov 26, 2014
1 parent a9b807d commit 19fb043
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 15 deletions.
24 changes: 23 additions & 1 deletion lifelines/estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1012,7 +1012,14 @@ def fit(self, df, duration_col='T', event_col='E',
E = df[event_col]
del df[duration_col]
del df[event_col]

# Store original non-normalized data
self.data = df

if self.normalize:
# Need to normalize future inputs as well
self._norm_mean = df.mean(0)
self._norm_std = df.std(0)

This comment has been minimized.

Copy link
@CamDavidsonPilon

CamDavidsonPilon Nov 26, 2014

Owner

nice catch!

df = normalize(df)

E = E.astype(bool)
Expand All @@ -1026,7 +1033,6 @@ def fit(self, df, duration_col='T', event_col='E',
index=['coef'])
self.confidence_intervals_ = self._compute_confidence_intervals()

self.data = df
self.durations = T
self.event_observed = E

Expand Down Expand Up @@ -1107,9 +1113,25 @@ def predict_partial_hazard(self, X):
"""
X: a (n,d) covariate matrix
If covariates were normalized during fitting, they are normalized
in the same way here.
If X is a dataframe, the order of the columns does not matter. But
if X is an array, then the column ordering is assumed to be the
same as the training dataset.
Returns the partial hazard for the individuals, partial since the
baseline hazard is not included. Equal to \\exp{\\beta X}
"""
# Make sure column ordering is the same as during fitting
if isinstance(X, pd.DataFrame):
X = X[self.data.columns]
# If it's not a dataframe, order is up to user

if self.normalize:
# Assuming correct ordering and number of columns
X = normalize(X, self._norm_mean.values, self._norm_std.values)

return exp(np.dot(X, self.hazards_.T))

def predict_cumulative_hazard(self, X):
Expand Down
66 changes: 55 additions & 11 deletions lifelines/tests/test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,15 @@

class MiscTests(unittest.TestCase):

def test_unnormalize(self):
    # Round trip: normalizing and then unnormalizing with the original
    # column means and standard deviations should recover the input data.
    df = pd.read_csv('./datasets/larynx.csv')
    # Capture the statistics before normalizing; unnormalize needs them.
    m = df.mean(0)
    s = df.std(0)

    ndf = normalize(df)

    npt.assert_almost_equal(df.values, unnormalize(ndf, m, s).values)

def test_normalize(self):
df = pd.read_csv('./datasets/larynx.csv')
n,d = df.shape
Expand Down Expand Up @@ -673,7 +682,7 @@ def test_crossval_for_aalen_add(self):
expected = 0.85
msg = "Expected min-mean c-index {:.2f} < {:.2f}"
self.assertTrue(np.mean(mean_scores) > expected,
msg.format(expected, scores.mean()))
msg.format(expected, scores.mean()))

class RegressionTests(unittest.TestCase):

Expand Down Expand Up @@ -939,26 +948,61 @@ def test_fit_method(self):
cf.fit(data_nus, duration_col='t', event_col='E')
self.assertTrue(np.abs(cf.hazards_.ix[0][0] - -0.0335) < 0.0001)

def test_data_sorting(self):
# During fit, CoxPH copies the training data and sorts it
# Make sure the final concordance matches between sorted
# and unsorted versions of the data set to verify that
# the order was not screwed up
def test_column_shuffling(self):
# Order of columns should not matter for dataframes
cf = CoxPHFitter(normalize=False)
cf.fit(data_pred2, 't', 'E')

# Reversed order
X = data_pred2[cf.data.columns]
X_reversed = data_pred2[list(reversed(cf.data.columns))]

cf = CoxPHFitter()
# Predictions should be exactly the same
hazards = cf.predict_partial_hazard(X)
hazards_r = cf.predict_partial_hazard(X_reversed)

self.assertTrue(np.all(hazards == hazards_r))

# Should still work with numpy arrays
hazards_n = cf.predict_partial_hazard(np.array(X))
self.assertTrue(np.all(hazards == hazards_n))


# Again with normalization
cf = CoxPHFitter(normalize=True)
cf.fit(data_pred2, 't', 'E')

# Predictions should be exactly the same
hazards = cf.predict_partial_hazard(X)
hazards_r = cf.predict_partial_hazard(X_reversed)

self.assertTrue(np.all(hazards == hazards_r))

# Should still work with numpy arrays
hazards_n = cf.predict_partial_hazard(np.array(X))
self.assertTrue(np.all(hazards == hazards_n))


def test_data_normalization(self):
# During fit, CoxPH copies the training data and normalizes it.
# Future calls should be normalized in the same way and
# internal training set should not be saved in a normalized state.

cf = CoxPHFitter(normalize=True)
cf.fit(data_pred2, duration_col='t', event_col='E')

# Internal training c-index
# Internal training set
ci_trn = concordance_index(cf.durations,
-cf.predict_partial_hazard(cf.data).ravel(),
cf.event_observed)
# Against original order
# New data should normalize in the exact same way
ci_org = concordance_index(data_pred2['t'],
-cf.predict_partial_hazard(data_pred2[['x1', 'x2']]).ravel(),
data_pred2['E'])

self.assertEqual(ci_org, ci_trn,
"Reordering should not change concordance index for cox!")
self.assertEqual(ci_org, ci_trn)




def test_crossval_for_cox_ph_with_normalizing_times(self):
Expand Down
20 changes: 17 additions & 3 deletions lifelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def survival_table_from_events(durations, event_observed, min_observations,
0 0 0 0 11
6 1 1 0 0
7 2 2 0 0
9 3 3 0 0
9 3 3 0 0
13 3 3 0 0
15 2 2 0 0
Expand Down Expand Up @@ -315,8 +315,22 @@ def k_fold_cross_validation(fitter, df, duration_col='T', event_col='E',

return scores

def normalize(X):
return (X - X.mean(0))/X.std(0)
def normalize(X, mean=None, std=None):

This comment has been minimized.

Copy link
@CamDavidsonPilon

CamDavidsonPilon Nov 26, 2014

Owner

👍

'''
Normalize X using the given mean and std. If either mean or std is
None, both are computed from X itself, so that X is normalized to
have mean 0 and std 1.
'''
if mean is None or std is None:
mean = X.mean(0)
std = X.std(0)
return (X - mean) / std

def unnormalize(X, mean, std):
    '''
    Reverse a normalization.

    Requires the original mean and standard deviation of the data set,
    i.e. the values that were subtracted from, and divided into, the
    data when it was normalized.

    X: the normalized data
    mean: the mean that was subtracted during normalization
    std: the standard deviation the data was divided by

    Returns X * std + mean.
    '''
    return X * std + mean

def epanechnikov_kernel(t, T, bandwidth=1.):
M = 0.75 * (1 - (t - T) / bandwidth) ** 2
Expand Down

0 comments on commit 19fb043

Please sign in to comment.