Commit bb329d8
Merge acce084 into 3765a86
desilinguist committed Mar 4, 2019
2 parents 3765a86 + acce084 commit bb329d8
Showing 5 changed files with 260 additions and 106 deletions.
79 changes: 66 additions & 13 deletions skll/learner.py
@@ -20,6 +20,7 @@
from collections import Counter, defaultdict
from functools import wraps
from importlib import import_module
+from math import floor, log10
from itertools import combinations
from multiprocessing import cpu_count

@@ -1035,6 +1036,54 @@ def load(self, learner_path):
        del self.__dict__
        self.__dict__ = Learner.from_file(learner_path).__dict__

+    def _convert_coef_array_to_feature_names(self, coef, feature_name_prefix=''):
+        """
+        A helper method used by `model_params` to convert the model
+        coefficients array into a dictionary with feature names as keys
+        and the coefficients as values.
+
+        Parameters
+        ----------
+        coef : np.array
+            A numpy array with the model coefficients.
+        feature_name_prefix : str, optional
+            An optional string that should be prefixed to the feature
+            name, e.g. the name of the class for LogisticRegression
+            or the class pair for SVCs with linear kernels.
+
+        Returns
+        -------
+        res : dict
+            A dictionary of labeled weights.
+        """
+        res = {}
+
+        # if we are doing feature hashing, then we need to make up
+        # the feature names
+        if isinstance(self.feat_vectorizer, FeatureHasher):
+            self.logger.warning("No feature names are available since this model was trained on hashed features.")
+            num_features = len(coef)
+            index_width_in_feature_name = int(floor(log10(num_features))) + 1
+            feature_names = []
+            for idx in range(num_features):
+                index_str = str(idx + 1).zfill(index_width_in_feature_name)
+                feature_names.append('hashed_feature_{}'.format(index_str))
+            feature_indices = range(num_features)
+            vocabulary = dict(zip(feature_names, feature_indices))
+
+        # otherwise we can just use the DictVectorizer vocabulary
+        # to get the feature names
+        else:
+            vocabulary = self.feat_vectorizer.vocabulary_
+
+        # create the final result dictionary with the prefixed
+        # feature names and the corresponding coefficient
+        for feat, idx in iteritems(vocabulary):
+            if coef[idx]:
+                res['{}{}'.format(feature_name_prefix, feat)] = coef[idx]
+
+        return res
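
To make the zero-padded naming scheme above concrete, here is a small standalone sketch; the feature count of 150 is hypothetical:

    from math import floor, log10

    num_features = 150                            # hypothetical count of hashed features
    width = int(floor(log10(num_features))) + 1   # -> 3 digits
    names = ['hashed_feature_{}'.format(str(i + 1).zfill(width))
             for i in range(num_features)]
    # names[0] == 'hashed_feature_001'; names[-1] == 'hashed_feature_150'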

    @property
    def model_params(self):
        """
@@ -1064,16 +1113,19 @@ def model_params(self):
            coef = self.model.coef_
            intercept = {'_intercept_': self.model.intercept_}

-            # convert SVR coefficient format (1 x matrix) to array
+            # convert the SVR coefficients from a matrix to a 1D array,
+            # and also convert from sparse to dense if necessary. The
+            # sparse-to-dense conversion may be unnecessary if we did
+            # feature scaling, since coef would then already be dense.
            if isinstance(self._model, SVR):
-                coef = coef.toarray()[0]
+                if sp.issparse(coef):
+                    coef = coef.toarray()
+                coef = coef[0]

            # inverse transform to get the indices from before feature selection
            coef = coef.reshape(1, -1)
            coef = self.feat_selector.inverse_transform(coef)[0]
-            for feat, idx in iteritems(self.feat_vectorizer.vocabulary_):
-                if coef[idx]:
-                    res[feat] = coef[idx]
+            res = self._convert_coef_array_to_feature_names(coef)

        elif isinstance(self._model, LinearSVC) or isinstance(self._model, LogisticRegression):
            label_list = self.label_list
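
As a hedged illustration of the `inverse_transform` step above, here is a standalone sketch that uses a toy scikit-learn selector rather than SKLL's actual pipeline; all names and values are hypothetical:

    import numpy as np
    from sklearn.feature_selection import SelectKBest, f_regression

    X, y = np.random.rand(20, 5), np.random.rand(20)
    selector = SelectKBest(f_regression, k=3).fit(X, y)

    coef = np.array([0.5, -1.2, 0.7])   # one weight per *selected* feature
    coef = selector.inverse_transform(coef.reshape(1, -1))[0]
    # coef now has length 5, with zeros at the positions of unselected features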
@@ -1084,13 +1136,15 @@
            if len(self.label_list) == 2:
                label_list = self.label_list[-1:]

+            if isinstance(self.feat_vectorizer, FeatureHasher):
+                self.logger.warning("No feature names are available since this model was trained on hashed features.")
+
            for i, label in enumerate(label_list):
                coef = self.model.coef_[i]
                coef = coef.reshape(1, -1)
                coef = self.feat_selector.inverse_transform(coef)[0]
-                for feat, idx in iteritems(self.feat_vectorizer.vocabulary_):
-                    if coef[idx]:
-                        res['{}\t{}'.format(label, feat)] = coef[idx]
+                label_res = self._convert_coef_array_to_feature_names(coef, feature_name_prefix='{}\t'.format(label))
+                res.update(label_res)

            if isinstance(self.model.intercept_, float):
                intercept = {'_intercept_': self.model.intercept_}
@@ -1109,18 +1163,17 @@
# "0 vs 2", ... "0 vs n", "1 vs 2", "1 vs 3", "1 vs n", ... "n-1 vs n".
elif isinstance(self._model, SVC) and self._model.kernel == 'linear':
intercept = {}
if isinstance(self.feat_vectorizer, FeatureHasher):
self.logger.warning("No feature names are available since this model was trained on hashed features.")
for i, class_pair in enumerate(combinations(range(len(self.label_list)), 2)):
coef = self.model.coef_[i]
coef = coef.toarray()
coef = self.feat_selector.inverse_transform(coef)[0]
class1 = self.label_list[class_pair[0]]
class2 = self.label_list[class_pair[1]]
for feat, idx in iteritems(self.feat_vectorizer.vocabulary_):
if coef[idx]:
res['{}-vs-{}\t{}'.format(class1, class2, feat)] = coef[idx]

class_pair_res = self._convert_coef_array_to_feature_names(coef, feature_name_prefix='{}-vs-{}\t'.format(class1, class2))
res.update(class_pair_res)
intercept['{}-vs-{}'.format(class1, class2)] = self.model.intercept_[i]

else:
# not supported
raise ValueError(("{} is not supported by" +
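
As a quick illustration of the one-vs-one ordering described in the comment above, a small sketch with hypothetical labels:

    from itertools import combinations

    labels = ['a', 'b', 'c', 'd']
    pairs = [(labels[i], labels[j])
             for i, j in combinations(range(len(labels)), 2)]
    # [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]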
18 changes: 12 additions & 6 deletions skll/utilities/print_model_weights.py
@@ -67,12 +67,18 @@ def main(argv=None):
    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
-            # Some learners (e.g. LinearSVR) may return a list of intercepts
-            if isinstance(intercept['_intercept_'], np.ndarray):
-                intercept_list = ["%.12f" % i for i in intercept['_intercept_']]
-                print("intercept = {}".format(intercept_list))
-            else:
-                print("intercept = {:.12f}".format(intercept['_intercept_']))
+            # Some learners (e.g. LinearSVR) may return an array of intercepts, but
+            # sometimes that array has length 1, so we don't need to print it
+            # as an array/list. First, let's normalize these cases.
+            model_intercepts = intercept['_intercept_']
+            intercept_is_array = isinstance(model_intercepts, np.ndarray)
+            num_intercepts = len(model_intercepts) if intercept_is_array else 1
+            if intercept_is_array and num_intercepts == 1:
+                model_intercepts = model_intercepts[0]
+                intercept_is_array = False
+
+            # now print out the intercepts
+            print("intercept = {:.12f}".format(model_intercepts))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
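
A hedged, standalone check of the normalization logic above; the sample values are made up:

    import numpy as np

    for value in (3.14, np.array([3.14])):           # hypothetical intercepts
        is_array = isinstance(value, np.ndarray)
        if is_array and len(value) == 1:
            value = value[0]
            is_array = False
        print("intercept = {:.12f}".format(value))   # both cases print a scalar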
89 changes: 43 additions & 46 deletions tests/test_regression.py
@@ -186,18 +186,24 @@ def check_linear_models(name,
    # that we got from make_regression_data. Take the
    # ceiling before comparing since just comparing
    # the ceilings should be enough to make sure nothing
-    # catastrophic happened. Note though that we cannot
-    # test feature weights if we are using feature hashing
-    # since model_params is not defined with a FeatureHasher.
-    if not use_feature_hashing:
-
-        # get the weights for this trained model
-        learned_weights = learner.model_params[0]
-
-        for feature_name in learned_weights:
-            learned_w = math.ceil(learned_weights[feature_name])
-            given_w = math.ceil(weightdict[feature_name])
-            eq_(learned_w, given_w)
+    # catastrophic happened. However, with feature hashing,
+    # the ceilings are sometimes not exactly identical, so
+    # when that check fails, we also check that the rounded
+    # feature values are the same. One of those two equalities
+    # _must_ be satisfied.
+
+    # get the weights for this trained model
+    learned_weights = learner.model_params[0]
+
+    for feature_name in learned_weights:
+        learned_w_ceil = math.ceil(learned_weights[feature_name])
+        given_w_ceil = math.ceil(weightdict[feature_name])
+        learned_w_round = round(learned_weights[feature_name], 0)
+        given_w_round = round(weightdict[feature_name], 0)
+        ceil_equal = learned_w_ceil == given_w_ceil
+        round_equal = learned_w_round == given_w_round
+        either_equal = ceil_equal or round_equal
+        assert either_equal

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)
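
To see why both checks are needed, consider a hedged example with made-up weights that straddle an integer boundary:

    import math

    learned, given = 3.0000001, 2.9999999     # hypothetical near-equal weights
    math.ceil(learned) == math.ceil(given)    # False: 4 != 3
    round(learned, 0) == round(given, 0)      # True: 3.0 == 3.0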
@@ -207,9 +213,7 @@
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
-    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
-    assert_greater(cor, expected_cor_range[0])
-    assert_less(cor, expected_cor_range[1])
+    assert_greater(cor, 0.95)
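
A minimal sketch of this correlation check, with made-up predictions and labels:

    from scipy.stats import pearsonr

    predictions = [1.1, 2.0, 2.9, 4.2]   # hypothetical model outputs
    labels = [1, 2, 3, 4]                # hypothetical gold labels
    cor, _ = pearsonr(predictions, labels)
    assert cor > 0.95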


# the runner function for linear regression models
@@ -272,12 +276,10 @@ def check_non_linear_models(name,
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
-    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
-    assert_greater(cor, expected_cor_range[0])
-    assert_less(cor, expected_cor_range[1])
+    assert_greater(cor, 0.95)


-# the runner function for linear regression models
+# the runner function for non-linear regression models
def test_non_linear_models():

    for (regressor_name,
@@ -319,25 +321,23 @@ def check_tree_models(name,

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
-        expected_feature_importances = ([0.37483895,
-                                         0.08816508,
-                                         0.25379838,
-                                         0.18337128,
-                                         0.09982631] if use_feature_hashing else
+        expected_feature_importances = ([0.730811,
+                                         0.001834,
+                                         0.247603,
+                                         0.015241,
+                                         0.004511] if use_feature_hashing else
                                        [0.08926899,
                                         0.15585068,
                                         0.75488033])
-        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
-        expected_feature_importances = ([0.40195798,
-                                         0.06702903,
-                                         0.25816559,
-                                         0.18185518,
-                                         0.09099222] if use_feature_hashing else
+        expected_feature_importances = ([0.733654,
+                                         0.002528,
+                                         0.245527,
+                                         0.013664,
+                                         0.004627] if use_feature_hashing else
                                        [0.07974267,
                                         0.16121895,
                                         0.75903838])
-        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
@@ -351,8 +351,7 @@
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
-    assert_greater(cor, expected_cor_range[0])
-    assert_less(cor, expected_cor_range[1])
+    assert_greater(cor, 0.95)
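
For reference, a toy example of the tolerance-based comparison used for the feature importances above; the values and tolerances here are invented:

    from numpy.testing import assert_allclose

    # passes: each element agrees within the absolute tolerance
    assert_allclose([0.7308, 0.0018],
                    [0.730811, 0.001834],
                    atol=1e-2, rtol=0)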


# the runner function for tree-based regression models
@@ -396,19 +395,19 @@ def check_ensemble_models(name,
    # make sure that the feature importances are as expected.
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
-            expected_feature_importances = [0.33718443,
-                                            0.07810721,
-                                            0.25621769,
-                                            0.19489766,
-                                            0.13359301]
+            expected_feature_importances = [0.749811,
+                                            0.001373,
+                                            0.23357,
+                                            0.011691,
+                                            0.003554]
        else:
            expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
    else:
-        expected_feature_importances = ([0.471714,
-                                         0.022797,
-                                         0.283377,
-                                         0.170823,
-                                         0.051288] if use_feature_hashing else
+        expected_feature_importances = ([0.735756,
+                                         0.001034,
+                                         0.242734,
+                                         0.015836,
+                                         0.00464] if use_feature_hashing else
                                        [0.082621,
                                         0.166652,
                                         0.750726])
@@ -425,9 +424,7 @@
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
-    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
-    assert_greater(cor, expected_cor_range[0])
-    assert_less(cor, expected_cor_range[1])
+    assert_greater(cor, 0.95)


# the runner function for ensemble regression models
