Merge pull request #439 from EducationalTestingService/update-scikit-learn

Update scikit-learn to v0.20.1
desilinguist committed Dec 4, 2018
2 parents fc37bc7 + e78fb28 commit 48e5f1f
Showing 10 changed files with 41 additions and 23 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -25,7 +25,7 @@ before_install:
 - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then export PATH=/home/travis/miniconda2/bin:$PATH; else export PATH=/home/travis/miniconda3/bin:$PATH; fi
 - conda update --yes conda
 install:
-- conda install --yes --channel defaults --channel conda-forge python=$TRAVIS_PYTHON_VERSION numpy scipy beautifulsoup4 six scikit-learn==0.19.1 joblib prettytable python-coveralls ruamel.yaml
+- conda install --yes --channel defaults --channel conda-forge python=$TRAVIS_PYTHON_VERSION numpy scipy beautifulsoup4 six scikit-learn==0.20.1 joblib prettytable python-coveralls ruamel.yaml
 - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then conda install --yes --channel defaults configparser mock; fi
 - if [ ${WITH_PANDAS_AND_SEABORN} == "true" ]; then conda install --yes --channel defaults pandas seaborn; fi
 # Have to use pip for nose-cov because its entry points are not supported by conda yet
4 changes: 2 additions & 2 deletions conda-recipe/skll/meta.yaml
@@ -42,7 +42,7 @@ build:
 requirements:
   build:
     - python
-    - scikit-learn ==0.19.1
+    - scikit-learn ==0.20.1
     - joblib >=0.8
     - setuptools
     - six
@@ -57,7 +57,7 @@ requirements:
 
   run:
     - python
-    - scikit-learn ==0.19.1
+    - scikit-learn ==0.20.1
     - joblib >=0.8
     - six
     - prettytable
2 changes: 1 addition & 1 deletion conda_requirements.txt
@@ -1,4 +1,4 @@
-scikit-learn==0.19.1
+scikit-learn==0.20.1
 six
 PrettyTable
 beautifulsoup4
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
-scikit-learn==0.19.1
+scikit-learn==0.20.1
 six
 PrettyTable
 beautifulsoup4
2 changes: 1 addition & 1 deletion requirements_rtd.txt
@@ -1,7 +1,7 @@
 configparser==3.5.0b2
 logutils
 mock
-scikit-learn==0.19.1
+scikit-learn==0.20.1
 six
 PrettyTable
 beautifulsoup4
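The dependency files above all pin the same scikit-learn version. A quick runtime check like the following (illustrative, not part of the commit) confirms that the installed package matches the pin:

    import sklearn

    # Version string pinned across the requirements files; adjust if the pin moves.
    assert sklearn.__version__ == "0.20.1", sklearn.__version__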
2 changes: 1 addition & 1 deletion skll/data/featureset.py
@@ -414,7 +414,7 @@ def has_labels(self):
         Whether or not this FeatureSet has any finite labels.
         """
         if self.labels is not None:
-            return not (np.issubdtype(self.labels.dtype, float) and
+            return not (np.issubdtype(self.labels.dtype, np.floating) and
                         np.isnan(np.min(self.labels)))
         else:
             return False
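For context: newer NumPy releases deprecated passing the builtin float to np.issubdtype, and np.floating is the abstract type that matches all float dtypes. A minimal sketch of the behavior the updated check relies on (the array here is illustrative, not from the commit):

    import numpy as np

    labels = np.array([1.0, 2.5, np.nan], dtype=np.float32)

    # np.floating matches float16/32/64 alike, unlike the builtin float,
    # which triggers a FutureWarning here under NumPy 1.14+.
    print(np.issubdtype(labels.dtype, np.floating))  # True
    print(np.isnan(np.min(labels)))                  # True, so has_labels is False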
4 changes: 2 additions & 2 deletions skll/experiments.py
@@ -1374,13 +1374,13 @@ def _generate_learning_curve_plots(experiment_name,
     # each of the featuresets
     for fs_name, df_fs in df_melted.groupby('featureset_name'):
         fig = plt.figure();
-        fig.set_size_inches(2.5*num_learners, 2.5*num_metrics);
+        fig.set_size_inches(2.5 * num_learners, 2.5 * num_metrics);
 
         # compute ylimits for this feature set for each objective
         with sns.axes_style('whitegrid', {"grid.linestyle": ':',
                                           "xtick.major.size": 3.0}):
             g = sns.FacetGrid(df_fs, row="metric", col="learner_name",
-                              hue="variable", size=2.5, aspect=1,
+                              hue="variable", height=2.5, aspect=1,
                               margin_titles=True, despine=True, sharex=False,
                               sharey=False, legend_out=False, palette="Set1")
             colors = train_color, test_color = sns.color_palette("Set1")[:2]
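For context: seaborn 0.9 renamed FacetGrid's size parameter to height, which is the API change this hunk tracks. A minimal sketch with illustrative data (the column names mirror the melted data frame above; the values are made up):

    import pandas as pd
    import seaborn as sns

    df = pd.DataFrame({"metric": ["accuracy"] * 4,
                       "learner_name": ["SVC"] * 2 + ["LogisticRegression"] * 2,
                       "variable": ["train", "test"] * 2,
                       "value": [0.91, 0.84, 0.88, 0.83]})

    # seaborn >= 0.9 expects height= for the per-facet size in inches;
    # older releases called the same parameter size=.
    g = sns.FacetGrid(df, row="metric", col="learner_name", hue="variable",
                      height=2.5, aspect=1, margin_titles=True)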
26 changes: 22 additions & 4 deletions skll/learner.py
@@ -854,6 +854,7 @@ def __init__(self, model_type, probability=False, feature_scaling='none',
         if issubclass(self._model_type, SVC):
             self._model_kwargs['cache_size'] = 1000
             self._model_kwargs['probability'] = self.probability
+            self._model_kwargs['gamma'] = 'auto'
             if self.probability:
                 self.logger.warning('Because LibSVM does an internal '
                                     'cross-validation to produce probabilities, '
@@ -866,14 +867,22 @@ def __init__(self, model_type, probability=False, feature_scaling='none',
             self._model_kwargs['n_estimators'] = 500
         elif issubclass(self._model_type, SVR):
             self._model_kwargs['cache_size'] = 1000
+            self._model_kwargs['gamma'] = 'auto'
         elif issubclass(self._model_type, SGDClassifier):
             self._model_kwargs['loss'] = 'log'
+            self._model_kwargs['max_iter'] = None
+            self._model_kwargs['tol'] = None
+        elif issubclass(self._model_type, SGDRegressor):
+            self._model_kwargs['max_iter'] = None
+            self._model_kwargs['tol'] = None
         elif issubclass(self._model_type, RANSACRegressor):
             self._model_kwargs['loss'] = 'squared_loss'
         elif issubclass(self._model_type, (MLPClassifier, MLPRegressor)):
             self._model_kwargs['learning_rate'] = 'invscaling'
             self._model_kwargs['max_iter'] = 500
 
+        elif issubclass(self._model_type, LogisticRegression):
+            self._model_kwargs['solver'] = 'liblinear'
+            self._model_kwargs['multi_class'] = 'auto'
 
         if issubclass(self._model_type,
                       (AdaBoostClassifier, AdaBoostRegressor,
@@ -911,9 +920,18 @@ def __init__(self, model_type, probability=False, feature_scaling='none',
                                    AdaBoostClassifier,
                                    RANSACRegressor)) and ('base_estimator' in model_kwargs):
             base_estimator_name = model_kwargs['base_estimator']
-            base_estimator_kwargs = {} if base_estimator_name in ['LinearRegression',
-                                                                  'MultinomialNB',
-                                                                  'SVR'] else {'random_state': 123456789}
+            if base_estimator_name in ['LinearRegression', 'MultinomialNB']:
+                base_estimator_kwargs = {}
+            elif base_estimator_name in ['SGDClassifier', 'SGDRegressor']:
+                base_estimator_kwargs = {'max_iter': None,
+                                         'tol': None,
+                                         'random_state': 123456789}
+            elif base_estimator_name == 'SVR':
+                base_estimator_kwargs = {'gamma': 'auto'}
+            elif base_estimator_name == 'SVC':
+                base_estimator_kwargs = {'gamma': 'auto', 'random_state': 123456789}
+            else:
+                base_estimator_kwargs = {'random_state': 123456789}
             base_estimator = globals()[base_estimator_name](**base_estimator_kwargs)
             model_kwargs['base_estimator'] = base_estimator
         self._model_kwargs.update(model_kwargs)
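For context: each keyword pinned in the hunks above freezes a default that scikit-learn 0.20 began warning would change in a later release, so SKLL keeps its 0.19-era behavior without FutureWarnings. A condensed sketch of the same pins applied directly under scikit-learn 0.20 (illustrative usage, not SKLL code):

    from sklearn.linear_model import LogisticRegression, SGDClassifier
    from sklearn.svm import SVC, SVR

    # gamma was scheduled to move from 'auto' to 'scale'; pinning 'auto'
    # keeps the old RBF kernel coefficient.
    svc = SVC(gamma='auto', cache_size=1000)
    svr = SVR(gamma='auto', cache_size=1000)

    # 'liblinear' was the pre-0.22 default solver, and multi_class='auto'
    # silences the multinomial-versus-ovr FutureWarning.
    logreg = LogisticRegression(solver='liblinear', multi_class='auto')

    # max_iter=None together with tol=None keeps the legacy stopping rule
    # from 0.19 (later releases removed this combination entirely).
    sgd = SGDClassifier(loss='log', max_iter=None, tol=None)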
2 changes: 1 addition & 1 deletion tests/test_classification.py
@@ -193,7 +193,7 @@ def test_sparse_predict():
                          [(0.45, 0.52), (0.52, 0.5),
                           (0.48, 0.5), (0.49, 0.5),
                           (0.43, 0), (0.53, 0.57),
-                          (0.49, 0.49), (0.48, 0.5)]):
+                          (0.49, 0.49), (0.5, 0.49)]):
         yield check_sparse_predict, learner_name, expected_scores[0], False
         if learner_name != 'MultinomialNB':
             yield check_sparse_predict, learner_name, expected_scores[1], True
18 changes: 9 additions & 9 deletions tests/test_regression.py
@@ -135,7 +135,7 @@ def check_rescaling(name, grid_search=False):
     train_p_std = np.std(train_predictions)
     rescaled_train_p_std = np.std(rescaled_train_predictions)
     assert_less(abs(rescaled_train_p_std - train_y_std),
-               abs(train_p_std - train_y_std))
+                abs(train_p_std - train_y_std))
 
 
 def test_rescaling():
@@ -403,14 +403,14 @@ def check_ensemble_models(name,
         else:
             expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
     else:
-        expected_feature_importances = ([0.204,
-                                         0.172,
-                                         0.178,
-                                         0.212,
-                                         0.234] if use_feature_hashing else
-                                        [0.262,
-                                         0.288,
-                                         0.45])
+        expected_feature_importances = ([0.471714,
+                                         0.022797,
+                                         0.283377,
+                                         0.170823,
+                                         0.051288] if use_feature_hashing else
+                                        [0.082621,
+                                         0.166652,
+                                         0.750726])
 
     feature_importances = learner.model.feature_importances_
     assert_allclose(feature_importances, expected_feature_importances,
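For context: the expected feature importances changed because the underlying tree ensembles changed between scikit-learn 0.19 and 0.20; regenerating fixtures like these amounts to refitting the same seeded model and reading feature_importances_. A rough sketch (the regressor, data, and seed are illustrative assumptions, not the test's fixtures):

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    rng = np.random.RandomState(123456789)
    X = rng.rand(200, 3)
    y = X @ np.array([0.1, 0.2, 0.7]) + 0.01 * rng.randn(200)

    # n_estimators pinned explicitly, matching the SKLL default shown above.
    model = RandomForestRegressor(n_estimators=500, random_state=123456789)
    model.fit(X, y)
    print(model.feature_importances_)  # one weight per column, sums to 1.0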
