Commit afbbf33

Merge ceccebb into 70999a0

CamDavidsonPilon committed Sep 2, 2019
2 parents 70999a0 + ceccebb commit afbbf33
Showing 7 changed files with 163 additions and 60 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -4,7 +4,11 @@
#### 0.22.4

##### New features
- - Some performance improvements to parametric regression models.
+ - Some performance improvements to regression models.
+ - lifelines will avoid penalizing the intercept (aka bias) variables in regression models.
+
+ ##### Bug fixes
+ - Fixed issue where `concordance_index` would never exit if NaNs were present in the dataset.

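For context, the intercept change means a penalized fit now shrinks only the genuine covariates. A minimal sketch of the user-facing behavior, assuming the `rossi` dataset that ships with lifelines:

```python
from lifelines import WeibullAFTFitter
from lifelines.datasets import load_rossi

rossi = load_rossi()

# The L2 penalty now applies to covariate coefficients only; the
# automatically-added intercept column is left unpenalized, so
# shrinkage no longer biases the baseline parameters toward zero.
aft = WeibullAFTFitter(penalizer=0.1)
aft.fit(rossi, duration_col="week", event_col="arrest")
print(aft.summary)
```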

#### 0.22.3
154 changes: 105 additions & 49 deletions examples/SaaS churn and piecewise regression models.ipynb

Large diffs are not rendered by default.

23 changes: 16 additions & 7 deletions lifelines/fitters/__init__.py
@@ -1413,9 +1413,6 @@ def _fit(
df = df.astype(float)
self._check_values(df, utils.coalesce(Ts[1], Ts[0]), E, weights, entries)

- _norm_std = df.std(0)
- _norm_std[_norm_std < 1e-8] = 1.0
-
_index = pd.MultiIndex.from_tuples(
sum(([(name, col) for col in columns] for name, columns in self.regressors.items()), [])
)
@@ -1426,7 +1423,13 @@ def _fit(
self._norm_mean_ = df[self.regressors[self._primary_parameter_name]].mean(0)
self._norm_mean_ancillary = df[self.regressors[self._ancillary_parameter_name]].mean(0)

- self._norm_std = pd.Series([_norm_std.loc[variable_name] for _, variable_name in _index], index=_index)
+ _norm_std = df.std(0)
+ self._constant_cols = pd.Series(
+ [(_norm_std.loc[variable_name] < 1e-8) for (_, variable_name) in _index], index=_index
+ )
+ self._norm_std = pd.Series([_norm_std.loc[variable_name] for (_, variable_name) in _index], index=_index)
+ self._norm_std[self._constant_cols] = 1.0
+ _norm_std[_norm_std < 1e-8] = 1.0

_params, self.log_likelihood_, self._hessian_ = self._fit_model(
log_likelihood_function,
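The reordered block above computes the column standard deviations once, flags (near-)constant columns such as the intercept, and clamps their norm to 1.0 so that standardization leaves them untouched. A standalone sketch of that logic, under the same `1e-8` threshold (column names are illustrative):

```python
import pandas as pd

df = pd.DataFrame({"age": [22.0, 35.0, 41.0], "_intercept": [1.0, 1.0, 1.0]})

norm_std = df.std(0)
constant_cols = norm_std < 1e-8  # intercept/bias columns have ~zero spread
norm_std[constant_cols] = 1.0    # dividing by 1.0 is a no-op for those columns

normalized = df / norm_std       # covariates are scaled; the intercept survives as-is
```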
@@ -1466,6 +1469,8 @@ def _log_likelihood(self):

def _add_penalty(self, params, neg_ll):
params, _ = flatten(params)
+ # remove constant cols from being penalized
+ params = params[~self._constant_cols]
if self.penalizer > 0:
penalty = (params ** 2).sum()
else:
@@ -1487,7 +1492,7 @@ def _fit_model(self, likelihood, Ts, Xs, E, weights, entries, show_progress=Fals
initial_point_array, unflatten = flatten(initial_point_dict)

if initial_point is not None and isinstance(initial_point, dict):
- initial_point_array = flatten(initial_point)
+ initial_point_array, _ = flatten(initial_point)  # TODO: test

assert initial_point_array.shape[0] == Xs.size, "initial_point is not the correct shape."

@@ -2577,7 +2582,7 @@ def _create_initial_point(self, Ts, E, entries, weights, Xs):
"""
See https://github.com/CamDavidsonPilon/lifelines/issues/664
"""
- constant_col = (Xs.df.var(0) < 1e-8).idxmax()
+ constant_col = (Xs.df.std(0) < 1e-8).idxmax()

def _transform_ith_param(param):
if param <= 0:
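The switch from `var(0)` to `std(0)` makes the constant-column test consistent with the `_norm_std < 1e-8` check in `_fit`: squaring a small standard deviation can push a merely low-variance column under the threshold. A small illustration with hypothetical data:

```python
import numpy as np
import pandas as pd

X = pd.DataFrame({
    "noisy": 1.0 + 1e-5 * np.random.randn(100),  # nearly, but not truly, constant
    "_intercept": np.ones(100),
})

print((X.var(0) < 1e-8).idxmax())  # may pick "noisy": its variance (~1e-10) is under 1e-8
print((X.std(0) < 1e-8).idxmax())  # picks "_intercept": only a true constant has std < 1e-8
```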
@@ -2614,8 +2619,12 @@ def _transform_ith_param(param):

def _add_penalty(self, params, neg_ll):
params, _ = flatten(params)
- if self.penalizer > 0:
+ # remove intercepts from being penalized
+ params = params[~self._constant_cols]
+ if self.penalizer > 0 and self.l1_ratio > 0:
penalty = self.l1_ratio * anp.abs(params).sum() + 0.5 * (1.0 - self.l1_ratio) * (params ** 2).sum()
+ elif self.penalizer > 0 and self.l1_ratio <= 0:
+ penalty = 0.5 * (params ** 2).sum()
else:
penalty = 0
return neg_ll + self.penalizer * penalty
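The reworked branch is an elastic-net penalty over the non-intercept coefficients, `penalizer * (l1_ratio * ||β||₁ + 0.5 * (1 - l1_ratio) * ||β||₂²)`, degrading gracefully to pure ridge when `l1_ratio <= 0`. A standalone sketch (the function name and arguments are illustrative, not the library's API):

```python
import autograd.numpy as anp

def elastic_net_penalty(params, constant_cols, penalizer, l1_ratio):
    # intercept-like (constant-column) coefficients are excluded from shrinkage
    params = params[~constant_cols]
    if penalizer > 0 and l1_ratio > 0:
        penalty = l1_ratio * anp.abs(params).sum() + 0.5 * (1.0 - l1_ratio) * (params ** 2).sum()
    elif penalizer > 0:
        penalty = 0.5 * (params ** 2).sum()
    else:
        penalty = 0.0
    return penalizer * penalty
```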
5 changes: 3 additions & 2 deletions lifelines/fitters/piecewise_exponential_regression_fitter.py
@@ -45,8 +45,9 @@ def _add_penalty(self, params, neg_ll):
coef_penalty = 0

if self.penalizer > 0:
- for i in range(params_stacked.shape[1] - 1): # assuming the intercept col is the last column...
- coef_penalty = coef_penalty + (params_stacked[:, i]).var()
+ for i in range(params_stacked.shape[1]):
+ if not self._constant_cols[i]:
+ coef_penalty = coef_penalty + (params_stacked[:, i]).var()

return neg_ll + self.penalizer * coef_penalty

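The penalty here discourages a covariate's coefficient from drifting across the piecewise intervals by penalizing the per-column variance of the stacked coefficients; constant (intercept) columns are now skipped by index instead of assuming the intercept is the last column. A rough NumPy sketch with illustrative shapes:

```python
import numpy as np

# one row per piecewise interval, one column per covariate
params_stacked = np.array([[0.5, 0.1, 1.0],
                           [0.7, 0.1, 2.0]])
constant_cols = np.array([False, False, True])  # the last column is the intercept

coef_penalty = sum(
    params_stacked[:, i].var()
    for i in range(params_stacked.shape[1])
    if not constant_cols[i]
)
# only drift in the first two columns is penalized; the intercepts may differ freely
```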
5 changes: 5 additions & 0 deletions lifelines/utils/concordance.py
@@ -263,4 +263,9 @@ def _preprocess_scoring_data(event_times, predicted_scores, event_observed):
if event_observed.shape != event_times.shape:
raise ValueError("Observed events must be 1-dimensional of same length as event times")

# check for NaNs
for a in [event_times, predicted_scores, event_observed]:
if np.isnan(a).any():
raise ValueError("NaNs detected in inputs, please correct or drop.")

return event_times, predicted_scores, event_observed
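With this guard, passing NaNs to `concordance_index` now fails fast with a clear error instead of never terminating. A quick sketch of the new behavior:

```python
import numpy as np
from lifelines.utils import concordance_index

event_times = np.array([5.0, 10.0, np.nan])
predicted_scores = np.array([2.0, 8.0, 9.0])
event_observed = np.array([1, 1, 0])

try:
    concordance_index(event_times, predicted_scores, event_observed)
except ValueError as e:
    print(e)  # "NaNs detected in inputs, please correct or drop."
```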
28 changes: 28 additions & 0 deletions tests/test_estimation.py
@@ -1392,6 +1392,34 @@ def test_BHF_fit_when_KMF_throws_an_error(self):
bfh.fit(observations, entry=births)


class TestParametricRegressionFitter:
@pytest.fixture
def rossi(self):
rossi = load_rossi()
rossi["_int"] = 1.0
return rossi

def test_custom_weibull_model_gives_the_same_data_as_implemented_weibull_model(self, rossi):
class CustomWeibull(ParametricRegressionFitter):

_fitted_parameter_names = ["lambda_", "rho_"]

def _cumulative_hazard(self, params, T, Xs):
lambda_ = anp.exp(anp.dot(Xs["lambda_"], params["lambda_"]))
rho_ = anp.exp(anp.dot(Xs["rho_"], params["rho_"]))

return (T / lambda_) ** rho_

cb = CustomWeibull()
wf = WeibullAFTFitter(fit_intercept=False)

cb.fit(rossi, "week", "arrest", regressors={"lambda_": rossi.columns, "rho_": ["_int"]})
wf.fit(rossi, "week", "arrest")

assert_frame_equal(cb.summary.loc["lambda_"], wf.summary.loc["lambda_"], check_less_precise=2)
npt.assert_allclose(cb.log_likelihood_, wf.log_likelihood_)


class TestRegressionFitters:
@pytest.fixture
def rossi(self):
2 changes: 1 addition & 1 deletion tests/utils/test_utils.py
@@ -961,7 +961,7 @@ def test_sklearn_GridSearchCV_accept_model(self, X, Y):
clf = GridSearchCV(base_model(), grid_params, cv=4)
clf.fit(X, Y)

- assert clf.best_params_ == {"model_ancillary": False, "penalizer": 0.01}
+ assert clf.best_params_ == {"model_ancillary": False, "penalizer": 100.0}
assert clf.predict(X).shape[0] == X.shape[0]

def test_model_can_accept_things_like_strata(self, X, Y):
