Commit 9bfa6b3
Merge pull request #574 from EducationalTestingService/fix-multinomialnb-loading

Fix loading issue with `MultinomialNB` model files.
desilinguist committed Oct 22, 2019
2 parents 9a81b14 + 1bd88f0 commit 9bfa6b3
Showing 2 changed files with 41 additions and 17 deletions.
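
The diffs below point to the root cause: `_train_setup` previously created `self.scaler` only for non-`MultinomialNB` learners, so a `MultinomialNB` model saved to disk carried no scaler at all, and anything touching that attribute after reloading the model would fail. A minimal, hypothetical sketch of this general failure mode (the `Model` class and its names are illustrative, not SKLL's own code):

    class Model:
        def __init__(self, needs_scaler):
            # The bug pattern: an attribute created only conditionally during
            # training is silently absent on instances that skipped the branch.
            if needs_scaler:
                self.scaler = 'a real scaler would go here'

    model = Model(needs_scaler=False)
    try:
        model.scaler
    except AttributeError as err:
        print(err)  # 'Model' object has no attribute 'scaler'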
34 changes: 17 additions & 17 deletions skll/learner.py
@@ -1464,21 +1464,22 @@ def _train_setup(self, examples):
         self.feat_selector = SelectByMinCount(
             min_count=self._min_feature_count)
 
-        # Create scaler if we weren't passed one and it's necessary
-        if not issubclass(self._model_type, MultinomialNB):
-            if self._feature_scaling != 'none':
-                scale_with_mean = self._feature_scaling in {
-                    'with_mean', 'both'}
-                scale_with_std = self._feature_scaling in {'with_std', 'both'}
-                self.scaler = StandardScaler(copy=True,
-                                             with_mean=scale_with_mean,
-                                             with_std=scale_with_std)
-            else:
-                # Doing this is to prevent any modification of feature values
-                # using a dummy transformation
-                self.scaler = StandardScaler(copy=False,
-                                             with_mean=False,
-                                             with_std=False)
+        # Create a scaler if we weren't passed one and we are asked
+        # to do feature scaling; note that we do not support feature
+        # scaling for `MultinomialNB` learners
+        if (not issubclass(self._model_type, MultinomialNB) and
+                self._feature_scaling != 'none'):
+            scale_with_mean = self._feature_scaling in {'with_mean', 'both'}
+            scale_with_std = self._feature_scaling in {'with_std', 'both'}
+            self.scaler = StandardScaler(copy=True,
+                                         with_mean=scale_with_mean,
+                                         with_std=scale_with_std)
+        else:
+            # Use a dummy transformation to prevent any modification
+            # of the feature values
+            self.scaler = StandardScaler(copy=False,
+                                         with_mean=False,
+                                         with_std=False)
 
     def train(self, examples, param_grid=None, grid_search_folds=3,
               grid_search=True, grid_objective=None,
@@ -1647,8 +1648,7 @@ def train(self, examples, param_grid=None, grid_search_folds=3,
                              'feature values.')
 
         # Scale features if necessary
-        if not issubclass(self._model_type, MultinomialNB):
-            xtrain = self.scaler.fit_transform(xtrain)
+        xtrain = self.scaler.fit_transform(xtrain)
 
         # check whether any feature values are too large
         self._check_max_feature_value(xtrain)
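
The fix hinges on `StandardScaler` being a no-op when both centering and scaling are disabled: `_train_setup` now always assigns a scaler (a dummy one for `MultinomialNB`, whose features must stay non-negative), so `train` can call `fit_transform` unconditionally and saved models always carry a `scaler` attribute. A quick standalone sketch of that no-op behavior (plain scikit-learn, outside SKLL):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    # With with_mean=False and with_std=False, fit_transform() neither
    # centers nor rescales, so the data comes back unchanged and the call
    # is safe to apply unconditionally.
    X = np.array([[1.0, 2.0], [3.0, 4.0]])
    dummy = StandardScaler(copy=False, with_mean=False, with_std=False)
    assert np.array_equal(dummy.fit_transform(X), X)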
24 changes: 24 additions & 0 deletions tests/test_classification.py
@@ -104,6 +104,9 @@ def tearDown():
     for output_file in glob(join(output_dir, 'clf_metrics_objective_overlap*')):
         os.unlink(output_file)
 
+    for output_file in glob(join(output_dir, 'test_multinomialnb_loading*')):
+        os.unlink(output_file)
+
     config_files = [join(config_dir,
                          cfgname) for cfgname in ['test_single_file.cfg',
                                                   'test_single_file_saved_subset.cfg']]
@@ -1739,3 +1742,24 @@ def test_metrics_and_objectives_overlap():
[["f1_score_weighted", "unweighted_kappa", "accuracy"]],
[[], ["accuracy"], ["accuracy", "unweighted_kappa"]]):
yield (check_metrics_and_objectives_overlap, task, metrics, objectives)


def test_multinomialnb_loading():
"""
Make sure we can load MultnomialNB models from disk
"""

output_dir = join(_my_dir, 'output')

learner = Learner('MultinomialNB')
train_fs, test_fs = make_classification_data(num_examples=100, non_negative=True)
learner.train(train_fs, grid_search=False)
model_file = join(output_dir, 'test_multinomialnb_loading.model')
learner.save(model_file)
predictions1 = learner.predict(test_fs)
del learner

learner2 = Learner.from_file(model_file)
predictions2 = learner2.predict(test_fs)

assert_array_equal(predictions1, predictions2)
