Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

handle floating point values more accurately #277

Merged
merged 3 commits into from Aug 5, 2020
Merged

Conversation

StrikerRUS
Copy link
Member

I ran all tests with the increased test dataset fraction (0.6) and compared the results with ones obtained from master (but with the changed dataset splitting routine) to ensure that the inputs are the same in both cases.

diff --git a/tests/e2e/test_e2e.py b/tests/e2e/test_e2e.py
index b2aacb5..2644547 100644
--- a/tests/e2e/test_e2e.py
+++ b/tests/e2e/test_e2e.py
@@ -38,7 +38,7 @@ CLASSIFICATION = pytest.mark.clf
 
 
 # Set of helper functions to make parametrization less verbose.
-def regression(model, test_fraction=0.02):
+def regression(model, test_fraction=0.6):
     return (
         model,
         utils.get_regression_model_trainer(test_fraction),
@@ -46,7 +46,7 @@ def regression(model, test_fraction=0.02):
     )
 
 
-def classification(model, test_fraction=0.02):
+def classification(model, test_fraction=0.6):
     return (
         model,
         utils.get_classification_model_trainer(test_fraction),
@@ -54,7 +54,7 @@ def classification(model, test_fraction=0.02):
     )
 
 
-def classification_binary(model, test_fraction=0.02):
+def classification_binary(model, test_fraction=0.6):
     return (
         model,
         utils.get_binary_classification_model_trainer(test_fraction),
@@ -62,7 +62,7 @@ def classification_binary(model, test_fraction=0.02):
     )
 
 
-def regression_random(model, test_fraction=0.02):
+def regression_random(model, test_fraction=0.6):
     return (
         model,
         utils.get_regression_random_data_model_trainer(test_fraction),
@@ -70,7 +70,7 @@ def regression_random(model, test_fraction=0.02):
     )
 
 
-def classification_random(model, test_fraction=0.02):
+def classification_random(model, test_fraction=0.6):
     return (
         model,
         utils.get_classification_random_data_model_trainer(test_fraction),
@@ -78,7 +78,7 @@ def classification_random(model, test_fraction=0.02):
     )
 
 
-def classification_binary_random(model, test_fraction=0.02):
+def classification_binary_random(model, test_fraction=0.6):
     return (
         model,
         utils.get_classification_binary_random_data_model_trainer(
@@ -87,7 +87,7 @@ def classification_binary_random(model, test_fraction=0.02):
     )
 
 
-def regression_bounded(model, test_fraction=0.02):
+def regression_bounded(model, test_fraction=0.6):
     return (
         model,
         utils.get_bounded_regression_model_trainer(test_fraction),
@@ -228,11 +228,11 @@ STATSMODELS_LINEAR_REGULARIZED_PARAMS = dict(method="elastic_net",
 
         # XGBoost (tree method "hist")
         regression(xgboost.XGBRegressor(**XGBOOST_HIST_PARAMS),
-                   test_fraction=0.2),
+                   test_fraction=0.6),
         classification(xgboost.XGBClassifier(**XGBOOST_HIST_PARAMS),
-                       test_fraction=0.2),
+                       test_fraction=0.6),
         classification_binary(xgboost.XGBClassifier(**XGBOOST_HIST_PARAMS),
-                              test_fraction=0.2),
+                              test_fraction=0.6),
 
         # XGBoost (LINEAR)
         regression(xgboost.XGBRegressor(**XGBOOST_PARAMS_LINEAR)),
diff --git a/tests/utils.py b/tests/utils.py
index 46bae2d..f72b1a3 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -14,7 +14,7 @@ from lightning.impl.base import BaseClassifier as LightBaseClassifier
 from sklearn import datasets
 from sklearn.base import BaseEstimator, RegressorMixin, clone
 from sklearn.ensemble._forest import ForestClassifier
-from sklearn.utils import shuffle
+from sklearn.model_selection import train_test_split
 from sklearn.linear_model._base import LinearClassifierMixin
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.svm import SVC, NuSVC
@@ -70,25 +70,17 @@ class ModelTrainer:
         np.random.seed(seed=7)
         if dataset_name == "boston":
             self.name = "train_model_regression"
-            dataset = datasets.load_boston()
-            self.X, self.y = shuffle(
-                dataset.data, dataset.target, random_state=13)
+            self.X, self.y = datasets.load_boston(True)
         elif dataset_name == "boston_y_bounded":
             self.name = "train_model_regression_bounded"
-            dataset = datasets.load_boston()
-            self.X, self.y = shuffle(
-                dataset.data, dataset.target, random_state=13)
+            self.X, self.y = datasets.load_boston(True)
             self.y = np.arctan(self.y) / np.pi + 0.5  # (0; 1)
         elif dataset_name == "iris":
             self.name = "train_model_classification"
-            dataset = datasets.load_iris()
-            self.X, self.y = shuffle(
-                dataset.data, dataset.target, random_state=13)
+            self.X, self.y = datasets.load_iris(True)
         elif dataset_name == "breast_cancer":
             self.name = "train_model_classification_binary"
-            dataset = datasets.load_breast_cancer()
-            self.X, self.y = shuffle(
-                dataset.data, dataset.target, random_state=13)
+            self.X, self.y = datasets.load_breast_cancer(True)
         elif dataset_name == "regression_rnd":
             self.name = "train_model_regression_random_data"
             N = 1000
@@ -107,9 +99,9 @@ class ModelTrainer:
         else:
             raise ValueError("Unknown dataset name: {}".format(dataset_name))
 
-        offset = int(self.X.shape[0] * (1 - test_fraction))
-        self.X_train, self.y_train = self.X[:offset], self.y[:offset]
-        self.X_test, self.y_test = self.X[offset:], self.y[offset:]
+        (self.X_train, self.X_test,
+         self.y_train, self.y_test) = train_test_split(
+            self.X, self.y, test_size=test_fraction, random_state=13)
 
     @classmethod
     def get_instance(cls, dataset_name, test_fraction=0.02):

image

Seems that at least it doesn't make things worse, but even decreases the number of failed tests sometimes.

@coveralls
Copy link

coveralls commented Jul 26, 2020

Coverage Status

Coverage increased (+0.007%) to 96.536% when pulling aeb4301 on floats_improvement into eef38e7 on master.

@@ -151,7 +151,7 @@ def __init__(self, model):

def _assemble_tree(self, tree):
if "leaf" in tree:
return ast.NumVal(tree["leaf"])
return ast.NumVal(tree["leaf"], dtype=np.float32)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

# all thresholds into float32.
threshold_num_val = ast.NumVal(threshold, dtype=np.float32)

threshold_num_val = ast.NumVal(self._tree.threshold[node_id])
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Refer to #190 (review).

Now threshold matches original type in scikit-learn (double).



def format_float(value):
return np.format_float_positional(value, unique=True, trim="0")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe format_float_scientific would be better: https://numpy.org/doc/stable/reference/generated/numpy.format_float_scientific.html. But I'm not sure how many languages support scientific notation.

Comment on lines +571 to +572
y_pred_executed = np.array(
y_pred_executed, dtype=y_pred_true.dtype, copy=False)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Quite often different packages not only cast input values in predict method, but also return result with different types. For instance, XGBoost always returns float: https://github.com/dmlc/xgboost/blob/12110c900eff0aaa06045ecf717e6c5a36a164d5/python-package/xgboost/core.py#L1373

Comment on lines +133 to +134
if isinstance(estimator, (BaseDecisionTree, BaseForest)):
self.X_test = self.X_test.astype(np.float32, copy=False)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a little concerned that here we treat a symptom, not the cause. By casting the input vector to float32 and then passing it as strings into estimators, we don't exactly reproduce the actual environment, where the casted values will be transformed back to doubles because of the score function signature. What do you think? Am I overthinking this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a little concerned that here we treat a symptom not the cause.

Yeah, you're absolutely right! To fix the root cause we should support multiple floating types in target languages. In this PR I just propose to make our tests a little bit fairer. Native libraries do double -> float conversion and in our tests we perform double -> float -> double. Unfortunately, I'm not a numerical expert but it seems that float -> double is a safe conversion: https://stackoverflow.com/questions/29648271/convert-float-double-float.

Comment on lines +135 to +136
elif isinstance(estimator, BaseLibSVM):
self.X_test = self.X_test.astype(np.float64, copy=False)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment on lines +242 to +244
return np.float64(items[0])
else:
return [float(i) for i in items]
return [np.float64(i) for i in items]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use numpy types across all codebase for the consistency.

@izeigerman
Copy link
Member

This is some amazing investigation (as always)! I'll take a look soon. Thank you 👍

Copy link
Member

@izeigerman izeigerman left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks great overall but I don't understand why numbers in tests shifted so dramatically.

ast.IfExpr(
ast.CompExpr(
ast.FeatureRef(5),
ast.NumVal(6.79699993),
ast.FeatureRef(12),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure I understand why the feature index changed here.

Copy link
Member Author

@StrikerRUS StrikerRUS Aug 4, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, all tests have changed due to the new train/test splitting routine, which is now done by a scikit-learn function (train_test_split). I found it easier to change the random_state param in only one place compared to manual shuffling in multiple places (see the diff in my opening comment).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, stupid me. I overlooked that change. It's a bit harder to compare apples to apples because of this though. Do you by chance remember whether there were any changes to expected values before you updated the splitting logic? I'm trying to identify the impact of this update.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, updating the splitting logic was the first step, as I wanted to play around with random_state. Let's choose an easier way. Let me split this PR into two.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in aeb4301.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you, this is so much better!

Copy link
Member

@izeigerman izeigerman left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Awesome, this looks great. Thanks a lot 👍

@izeigerman izeigerman merged commit 4b3fb61 into master Aug 5, 2020
@izeigerman izeigerman deleted the floats_improvement branch August 5, 2020 16:08
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Development

Successfully merging this pull request may close these issues.

None yet

3 participants