Commit bb329d8
Merge acce084 into 3765a86
desilinguist committed Mar 4, 2019
2 parents 3765a86 + acce084 commit bb329d8
Showing 5 changed files with 260 additions and 106 deletions.
79 changes: 66 additions & 13 deletions skll/learner.py
@@ -20,6 +20,7 @@
from collections import Counter, defaultdict
from functools import wraps
from importlib import import_module
+from math import floor, log10
from itertools import combinations
from multiprocessing import cpu_count

@@ -1035,6 +1036,54 @@ def load(self, learner_path):
        del self.__dict__
        self.__dict__ = Learner.from_file(learner_path).__dict__

+    def _convert_coef_array_to_feature_names(self, coef, feature_name_prefix=''):
+        """
+        A helper method used by `model_params` to convert the model
+        coefficients array into a dictionary with feature names as keys
+        and the coefficients as values.
+
+        Parameters
+        ----------
+        coef : np.array
+            A numpy array with the model coefficients.
+        feature_name_prefix : str, optional
+            An optional string that should be prefixed to the feature
+            name, e.g. the name of the class for LogisticRegression
+            or the class pair for SVCs with linear kernels.
+
+        Returns
+        -------
+        res : dict
+            A dictionary of labeled weights.
+        """
+        res = {}
+
+        # if we are doing feature hashing, then we need to make up
+        # the feature names
+        if isinstance(self.feat_vectorizer, FeatureHasher):
+            self.logger.warning("No feature names are available since this model was trained on hashed features.")
+            num_features = len(coef)
+            index_width_in_feature_name = int(floor(log10(num_features))) + 1
+            feature_names = []
+            for idx in range(num_features):
+                index_str = str(idx + 1).zfill(index_width_in_feature_name)
+                feature_names.append('hashed_feature_{}'.format(index_str))
+            feature_indices = range(num_features)
+            vocabulary = dict(zip(feature_names, feature_indices))
+
+        # otherwise we can just use the DictVectorizer vocabulary
+        # to get the feature names
+        else:
+            vocabulary = self.feat_vectorizer.vocabulary_
+
+        # create the final result dictionary with the prefixed
+        # feature names and the corresponding coefficient
+        for feat, idx in iteritems(vocabulary):
+            if coef[idx]:
+                res['{}{}'.format(feature_name_prefix, feat)] = coef[idx]
+
+        return res
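
To make the zero-padded naming scheme above concrete, here is a small standalone sketch; the feature count of 150 is hypothetical:

    from math import floor, log10

    num_features = 150                            # hypothetical count of hashed features
    width = int(floor(log10(num_features))) + 1   # -> 3 digits
    names = ['hashed_feature_{}'.format(str(i + 1).zfill(width))
             for i in range(num_features)]
    # names[0] == 'hashed_feature_001'; names[-1] == 'hashed_feature_150'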

    @property
    def model_params(self):
        """
@@ -1064,16 +1113,19 @@ def model_params(self):
            coef = self.model.coef_
            intercept = {'_intercept_': self.model.intercept_}

-            # convert SVR coefficient format (1 x matrix) to array
+            # convert the SVR coefficients from a matrix to a 1D array,
+            # and also convert from sparse to dense if necessary. The
+            # sparse-to-dense conversion may be unnecessary if we did
+            # feature scaling, since coef would then already be dense.
            if isinstance(self._model, SVR):
-                coef = coef.toarray()[0]
+                if sp.issparse(coef):
+                    coef = coef.toarray()
+                coef = coef[0]

            # inverse transform to get the indices from before feature selection
            coef = coef.reshape(1, -1)
            coef = self.feat_selector.inverse_transform(coef)[0]
-            for feat, idx in iteritems(self.feat_vectorizer.vocabulary_):
-                if coef[idx]:
-                    res[feat] = coef[idx]
+            res = self._convert_coef_array_to_feature_names(coef)

        elif isinstance(self._model, LinearSVC) or isinstance(self._model, LogisticRegression):
            label_list = self.label_list
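
As a hedged illustration of the `inverse_transform` step above, here is a standalone sketch that uses a toy scikit-learn selector rather than SKLL's actual pipeline; all names and values are hypothetical:

    import numpy as np
    from sklearn.feature_selection import SelectKBest, f_regression

    X, y = np.random.rand(20, 5), np.random.rand(20)
    selector = SelectKBest(f_regression, k=3).fit(X, y)

    coef = np.array([0.5, -1.2, 0.7])   # one weight per *selected* feature
    coef = selector.inverse_transform(coef.reshape(1, -1))[0]
    # coef now has length 5, with zeros at the positions of unselected features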
@@ -1084,13 +1136,15 @@
            if len(self.label_list) == 2:
                label_list = self.label_list[-1:]

+            if isinstance(self.feat_vectorizer, FeatureHasher):
+                self.logger.warning("No feature names are available since this model was trained on hashed features.")
+
            for i, label in enumerate(label_list):
                coef = self.model.coef_[i]
                coef = coef.reshape(1, -1)
                coef = self.feat_selector.inverse_transform(coef)[0]
-                for feat, idx in iteritems(self.feat_vectorizer.vocabulary_):
-                    if coef[idx]:
-                        res['{}\t{}'.format(label, feat)] = coef[idx]
+                label_res = self._convert_coef_array_to_feature_names(coef, feature_name_prefix='{}\t'.format(label))
+                res.update(label_res)

            if isinstance(self.model.intercept_, float):
                intercept = {'_intercept_': self.model.intercept_}
@@ -1109,18 +1163,17 @@
# "0 vs 2", ... "0 vs n", "1 vs 2", "1 vs 3", "1 vs n", ... "n-1 vs n".
elif isinstance(self._model, SVC) and self._model.kernel == 'linear':
intercept = {}
if isinstance(self.feat_vectorizer, FeatureHasher):
self.logger.warning("No feature names are available since this model was trained on hashed features.")
for i, class_pair in enumerate(combinations(range(len(self.label_list)), 2)):
coef = self.model.coef_[i]
coef = coef.toarray()
coef = self.feat_selector.inverse_transform(coef)[0]
class1 = self.label_list[class_pair[0]]
class2 = self.label_list[class_pair[1]]
for feat, idx in iteritems(self.feat_vectorizer.vocabulary_):
if coef[idx]:
res['{}-vs-{}\t{}'.format(class1, class2, feat)] = coef[idx]

class_pair_res = self._convert_coef_array_to_feature_names(coef, feature_name_prefix='{}-vs-{}\t'.format(class1, class2))
res.update(class_pair_res)
intercept['{}-vs-{}'.format(class1, class2)] = self.model.intercept_[i]

else:
# not supported
raise ValueError(("{} is not supported by" +
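
As a quick illustration of the one-vs-one ordering described in the comment above, a small sketch with hypothetical labels:

    from itertools import combinations

    labels = ['a', 'b', 'c', 'd']
    pairs = [(labels[i], labels[j])
             for i, j in combinations(range(len(labels)), 2)]
    # [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]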
18 changes: 12 additions & 6 deletions skll/utilities/print_model_weights.py
@@ -67,12 +67,18 @@ def main(argv=None):
    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
-            # Some learners (e.g. LinearSVR) may return a list of intercepts
-            if isinstance(intercept['_intercept_'], np.ndarray):
-                intercept_list = ["%.12f" % i for i in intercept['_intercept_']]
-                print("intercept = {}".format(intercept_list))
-            else:
-                print("intercept = {:.12f}".format(intercept['_intercept_']))
+            # Some learners (e.g. LinearSVR) may return an array of intercepts, but
+            # sometimes that array has length 1, so we don't need to print it
+            # as an array/list. First, let's normalize these cases.
+            model_intercepts = intercept['_intercept_']
+            intercept_is_array = isinstance(model_intercepts, np.ndarray)
+            num_intercepts = len(model_intercepts) if intercept_is_array else 1
+            if intercept_is_array and num_intercepts == 1:
+                model_intercepts = model_intercepts[0]
+                intercept_is_array = False
+
+            # now print out the intercepts
+            print("intercept = {:.12f}".format(model_intercepts))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
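
A hedged, standalone check of the normalization logic above; the sample values are made up:

    import numpy as np

    for value in (3.14, np.array([3.14])):           # hypothetical intercepts
        is_array = isinstance(value, np.ndarray)
        if is_array and len(value) == 1:
            value = value[0]
            is_array = False
        print("intercept = {:.12f}".format(value))   # both cases print a scalar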
89 changes: 43 additions & 46 deletions tests/test_regression.py
@@ -186,18 +186,24 @@ def check_linear_models(name,
    # that we got from make_regression_data. Take the
    # ceiling before comparing since just comparing
    # the ceilings should be enough to make sure nothing
-    # catastrophic happened. Note though that we cannot
-    # test feature weights if we are using feature hashing
-    # since model_params is not defined with a FeatureHasher.
-    if not use_feature_hashing:
-
-        # get the weights for this trained model
-        learned_weights = learner.model_params[0]
-
-        for feature_name in learned_weights:
-            learned_w = math.ceil(learned_weights[feature_name])
-            given_w = math.ceil(weightdict[feature_name])
-            eq_(learned_w, given_w)
+    # catastrophic happened. However, with feature hashing,
+    # the ceilings are sometimes not exactly identical, so
+    # when that check fails, we also check that the rounded
+    # feature values are the same. One of those two equalities
+    # _must_ be satisfied.
+
+    # get the weights for this trained model
+    learned_weights = learner.model_params[0]
+
+    for feature_name in learned_weights:
+        learned_w_ceil = math.ceil(learned_weights[feature_name])
+        given_w_ceil = math.ceil(weightdict[feature_name])
+        learned_w_round = round(learned_weights[feature_name], 0)
+        given_w_round = round(weightdict[feature_name], 0)
+        ceil_equal = learned_w_ceil == given_w_ceil
+        round_equal = learned_w_round == given_w_round
+        either_equal = ceil_equal or round_equal
+        assert either_equal

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)
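
To see why both checks are needed, consider a hedged example with made-up weights that straddle an integer boundary:

    import math

    learned, given = 3.0000001, 2.9999999     # hypothetical near-equal weights
    math.ceil(learned) == math.ceil(given)    # False: 4 != 3
    round(learned, 0) == round(given, 0)      # True: 3.0 == 3.0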
@@ -207,9 +213,7 @@
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
-    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
-    assert_greater(cor, expected_cor_range[0])
-    assert_less(cor, expected_cor_range[1])
+    assert_greater(cor, 0.95)
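
A minimal sketch of this correlation check, with made-up predictions and labels:

    from scipy.stats import pearsonr

    predictions = [1.1, 2.0, 2.9, 4.2]   # hypothetical model outputs
    labels = [1, 2, 3, 4]                # hypothetical gold labels
    cor, _ = pearsonr(predictions, labels)
    assert cor > 0.95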


# the runner function for linear regression models
@@ -272,12 +276,10 @@ def check_non_linear_models(name,
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
-    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
-    assert_greater(cor, expected_cor_range[0])
-    assert_less(cor, expected_cor_range[1])
+    assert_greater(cor, 0.95)


-# the runner function for linear regression models
+# the runner function for non-linear regression models
def test_non_linear_models():

    for (regressor_name,
@@ -319,25 +321,23 @@ def check_tree_models(name,

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
-        expected_feature_importances = ([0.37483895,
-                                         0.08816508,
-                                         0.25379838,
-                                         0.18337128,
-                                         0.09982631] if use_feature_hashing else
+        expected_feature_importances = ([0.730811,
+                                         0.001834,
+                                         0.247603,
+                                         0.015241,
+                                         0.004511] if use_feature_hashing else
                                        [0.08926899,
                                         0.15585068,
                                         0.75488033])
-        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
-        expected_feature_importances = ([0.40195798,
-                                         0.06702903,
-                                         0.25816559,
-                                         0.18185518,
-                                         0.09099222] if use_feature_hashing else
+        expected_feature_importances = ([0.733654,
+                                         0.002528,
+                                         0.245527,
+                                         0.013664,
+                                         0.004627] if use_feature_hashing else
                                        [0.07974267,
                                         0.16121895,
                                         0.75903838])
-        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
@@ -351,8 +351,7 @@
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
-    assert_greater(cor, expected_cor_range[0])
-    assert_less(cor, expected_cor_range[1])
+    assert_greater(cor, 0.95)
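
For reference, a toy example of the tolerance-based comparison used for the feature importances above; the values and tolerances here are invented:

    from numpy.testing import assert_allclose

    # passes: each element agrees within the absolute tolerance
    assert_allclose([0.7308, 0.0018],
                    [0.730811, 0.001834],
                    atol=1e-2, rtol=0)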


# the runner function for tree-based regression models
@@ -396,19 +395,19 @@ def check_ensemble_models(name,
    # make sure that the feature importances are as expected.
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
-            expected_feature_importances = [0.33718443,
-                                            0.07810721,
-                                            0.25621769,
-                                            0.19489766,
-                                            0.13359301]
+            expected_feature_importances = [0.749811,
+                                            0.001373,
+                                            0.23357,
+                                            0.011691,
+                                            0.003554]
        else:
            expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
    else:
-        expected_feature_importances = ([0.471714,
-                                         0.022797,
-                                         0.283377,
-                                         0.170823,
-                                         0.051288] if use_feature_hashing else
+        expected_feature_importances = ([0.735756,
+                                         0.001034,
+                                         0.242734,
+                                         0.015836,
+                                         0.00464] if use_feature_hashing else
                                        [0.082621,
                                         0.166652,
                                         0.750726])
@@ -425,9 +424,7 @@
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
-    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
-    assert_greater(cor, expected_cor_range[0])
-    assert_less(cor, expected_cor_range[1])
+    assert_greater(cor, 0.95)


# the runner function for ensemble regression models
