Merge branch 'release/0.22.1'

dan-blanchard committed Dec 5, 2013
2 parents 8784263 + f005dbc commit 75351dc
Showing 18 changed files with 100 additions and 64 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
* text=auto
12 changes: 6 additions & 6 deletions .travis.yml
@@ -20,15 +20,15 @@ before_install:
- sudo mkdir /scratch/
- sudo chmod 777 /scratch/
- travis/miniconda.sh -b
- mv travis/.condarc $HOME
- export PATH=/home/travis/anaconda/bin:$PATH
- conda update --yes conda
install:
- conda install --yes pip python=$TRAVIS_PYTHON_VERSION atlas numpy scipy beautiful-soup six scikit-learn
- if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then pip install --use-mirrors configparser; fi
- pip install -r requirements.txt --use-mirrors
- pip install python-coveralls --use-mirrors
- pip install nose-cov --use-mirrors
- if [ $GRIDMAP == "true" ]; then pip install --use-mirrors git+git://github.com/dan-blanchard/drmaa-python gridmap; fi
- conda install --yes pip python=$TRAVIS_PYTHON_VERSION atlas numpy scipy beautiful-soup six scikit-learn joblib prettytable python-coveralls
- if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then conda install --yes configparser futures logutils; fi
- if [ $GRIDMAP == "true" ]; then conda install --yes drmaa gridmap; fi
# Have to use pip for nose-cov because its entry points are not supported by conda yet
- pip install --use-mirrors nose-cov
- sudo rm -rf /dev/shm
- sudo ln -s /run/shm /dev/shm
- python setup.py install
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,3 +4,4 @@ PrettyTable
beautifulsoup4
numpy
scipy
joblib
14 changes: 9 additions & 5 deletions setup.py
@@ -35,11 +35,15 @@ def requirements():
author='Daniel Blanchard',
author_email='dblanchard@ets.org',
license='BSD 3 clause',
packages=['skll'],
scripts=['scripts/filter_megam', 'scripts/generate_predictions',
'scripts/join_megam', 'scripts/megam_to_libsvm',
'scripts/print_model_weights', 'scripts/run_experiment',
'scripts/skll_convert', 'scripts/summarize_results'],
packages=['skll', 'skll.utilities'],
entry_points={'console_scripts': ['filter_megam = skll.utilities.filter_megam:main',
'generate_predictions = skll.utilities.generate_predictions:main',
'join_megam = skll.utilities.join_megam:main',
'megam_to_libsvm = skll.utilities.megam_to_libsvm:main',
'print_model_weights = skll.utilities.print_model_weights:main',
'run_experiment = skll.utilities.run_experiment:main',
'skll_convert = skll.utilities.skll_convert:main',
'summarize_results = skll.utilities.summarize_results:main']},
install_requires=requirements(),
classifiers=['Intended Audience :: Science/Research',
'Intended Audience :: Developers',
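
A note on this packaging change: moving from scripts= to console_scripts entry points means setuptools generates the executables at install time instead of copying script files into the bin directory. A sketch of roughly what the generated skll_convert wrapper does (illustrative only, not the literal generated file):

# Rough equivalent of the setuptools-generated 'skll_convert' executable:
# import the target module, call the named function, and use its return
# value as the process exit code.
import sys
from skll.utilities.skll_convert import main

if __name__ == '__main__':
    sys.exit(main())
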
49 changes: 26 additions & 23 deletions skll/experiments.py
@@ -587,30 +587,39 @@ def _classify_featureset(args):
featureset),
file=log_file)

# check whether a trained model on the same data with the same
# featureset already exists; if so, load it and then use it on test data
modelfile = os.path.join(model_path, '{}.model'.format(job_name))

# load the training and test examples
train_examples = _load_featureset(train_path, featureset, suffix,
label_col=label_col,
ids_to_floats=ids_to_floats,
quiet=quiet, class_map=class_map)
if task == 'cross_validate' or (not os.path.exists(modelfile) or
overwrite):
train_examples = _load_featureset(train_path, featureset, suffix,
label_col=label_col,
ids_to_floats=ids_to_floats,
quiet=quiet, class_map=class_map)
# initialize a classifier object
learner = Learner(learner_name,
probability=probability,
feature_scaling=feature_scaling,
model_kwargs=fixed_parameters,
pos_label_str=pos_label_str,
min_feature_count=min_feature_count)
# load the model if it already exists
else:
if os.path.exists(modelfile) and not overwrite:
print(('\tloading pre-existing {} ' +
'model: {}').format(learner_name, modelfile))
learner = Learner.from_file(modelfile)

# Load test set if there is one
if task == 'evaluate' or task == 'predict':
test_examples = _load_featureset(test_path, featureset, suffix,
label_col=label_col,
ids_to_floats=ids_to_floats,
quiet=quiet, class_map=class_map,
unlabelled=True)

# initialize a classifier object
learner = Learner(learner_name,
probability=probability,
feature_scaling=feature_scaling,
model_kwargs=fixed_parameters,
pos_label_str=pos_label_str,
min_feature_count=min_feature_count)

# check whether a trained model on the same data with the same
# featureset already exists; if so, load it (and the feature
# vocabulary) and then use it on the test data
modelfile = os.path.join(model_path, '{}.model'.format(job_name))

# create a list of dictionaries of the results information
learner_result_dict_base = {'experiment_name': experiment_name,
@@ -639,14 +648,8 @@ def _classify_featureset(args):
param_grid=param_grid,
grid_jobs=grid_search_jobs)
else:
# load the model if it already exists
if os.path.exists(modelfile) and not overwrite:
print(('\tloading pre-existing {} ' +
'model: {}').format(learner_name, modelfile))
learner.load(modelfile)

# if we do not have a saved model, we need to train one.
else:
if not os.path.exists(modelfile) or overwrite:
print(('\tfeaturizing and training new ' +
'{} model').format(learner_name),
file=log_file)
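
The net effect of the reshuffling above is that training data is only read from disk when a model will actually be trained; a reusable model on disk short-circuits the expensive _load_featureset call. A minimal sketch of the resulting control flow, where load_training_data and train_new_learner are hypothetical stand-ins for the real feature-loading and training calls:

import os

from skll.learner import Learner

def get_learner(task, modelfile, overwrite):
    # train from scratch when cross-validating, when no saved model
    # exists, or when the caller asked to overwrite it ...
    if task == 'cross_validate' or not os.path.exists(modelfile) or overwrite:
        train_examples = load_training_data()        # hypothetical stand-in
        learner = train_new_learner(train_examples)  # hypothetical stand-in
    # ... otherwise reuse the saved model and skip loading training data
    else:
        learner = Learner.from_file(modelfile)
    return learner
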
37 changes: 18 additions & 19 deletions skll/learner.py
@@ -18,6 +18,7 @@
from functools import wraps
from multiprocessing import cpu_count

import joblib
import numpy as np
import scipy.sparse as sp
from six import iteritems, itervalues
@@ -443,23 +444,22 @@ def from_file(cls, learner_path):
'''
:returns: New instance of Learner from the pickle at the specified path.
'''
with open(learner_path, "rb") as f:
skll_version, learner = pickle.load(f)
# Check that we've actually loaded a Learner (or sub-class)
if not isinstance(learner, cls):
raise ValueError(('The pickle stored at {} does not contain ' +
'a {} object.').format(learner_path, cls))
# Check that versions are compatible. (Currently, this just checks
# that major versions match)
elif skll_version[0] == VERSION[0]:
return learner
else:
raise Exception(("{} stored in pickle file {} was " +
"created with version {} of SKLL, which is " +
"incompatible with the current version " +
"{}").format(cls, learner_path,
'.'.join(skll_version),
'.'.join(VERSION)))
skll_version, learner = joblib.load(learner_path)
# Check that we've actually loaded a Learner (or sub-class)
if not isinstance(learner, cls):
raise ValueError(('The pickle stored at {} does not contain ' +
'a {} object.').format(learner_path, cls))
# Check that versions are compatible. (Currently, this just checks
# that major versions match)
elif skll_version[0] == VERSION[0]:
return learner
else:
raise ValueError(("{} stored in pickle file {} was " +
"created with version {} of SKLL, which is " +
"incompatible with the current version " +
"{}").format(cls, learner_path,
'.'.join(skll_version),
'.'.join(VERSION)))

@property
def model_type(self):
@@ -548,8 +548,7 @@ def save(self, learner_path):
if not os.path.exists(learner_dir):
os.makedirs(learner_dir)
# write out the files
with open(learner_path, "wb") as f:
pickle.dump((VERSION, self), f, -1)
joblib.dump((VERSION, self), learner_path)

def _create_estimator(self):
'''
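
Swapping pickle for joblib keeps the on-disk contract the same: the file still holds a (VERSION, learner) tuple, but joblib stores the NumPy arrays inside the model more efficiently than a plain pickle. A minimal round-trip sketch, with a stand-in dict in place of a real Learner:

import joblib

VERSION = (0, 22, 1)               # stand-in for skll.version.VERSION
learner = {'weights': [0.5, 2.0]}  # stand-in; SKLL stores a Learner here

# save() now delegates to joblib.dump
joblib.dump((VERSION, learner), 'example.model')

# from_file() unpacks the same tuple and checks major-version compatibility
skll_version, loaded = joblib.load('example.model')
assert skll_version[0] == VERSION[0]
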
Empty file added skll/utilities/__init__.py
9 changes: 8 additions & 1 deletion scripts/filter_megam → skll/utilities/filter_megam.py
@@ -37,7 +37,10 @@
from skll.version import __version__


if __name__ == '__main__':
def main():
'''
Handles command line arguments and gets things started.
'''
# Get command line arguments
parser = argparse.ArgumentParser(description="Filter MegaM file to remove\
features with names in stop\
@@ -98,3 +101,7 @@
print(" ", end='')
print('{} {}'.format(feature, value), end="")
print()


if __name__ == '__main__':
main()
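
The same wrap-everything-in-main() pattern is applied to each former script below. Besides enabling the console_scripts entry points in setup.py, it makes the utilities importable. A sketch of the three ways a refactored utility can now be invoked (note that main() takes no arguments, so argparse inside it always reads sys.argv):

# 1. as the installed console script:  filter_megam [options]
# 2. as a module:                      python -m skll.utilities.filter_megam [options]
# 3. from Python code:
from skll.utilities.filter_megam import main
main()  # argparse inside main() parses sys.argv
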
File renamed without changes.
9 changes: 8 additions & 1 deletion scripts/join_megam → skll/utilities/join_megam.py
@@ -96,7 +96,10 @@ def get_unique_name(feature_name, prev_feature_set, filename):
return new_feature_name


if __name__ == '__main__':
def main():
'''
Handles command line arguments and gets things started.
'''
# Get command line arguments
parser = argparse.ArgumentParser(description="Combine MegaM files that \
contain features for the same\
@@ -213,3 +216,7 @@ def get_unique_name(feature_name, prev_feature_set, filename):
print("# {}".format(curr_filename).encode('utf-8'))
print("{}\t{}".format(class_dict[curr_filename],
feature_dict[curr_filename].strip()).encode('utf-8'))


if __name__ == '__main__':
main()
File renamed without changes.
scripts/print_model_weights → skll/utilities/print_model_weights.py
@@ -36,7 +36,10 @@
from skll.version import __version__


if __name__ == '__main__':
def main():
'''
Handles command line arguments and gets things started.
'''
parser = argparse.ArgumentParser(description="Prints out the weights of a \
given model.",
conflict_handler='resolve',
@@ -63,3 +66,7 @@

for feat, val in sorted(iteritems(weights), key=lambda x: -abs(x[1]))[:k]:
print("{:.12f}\t{}".format(val, feat))


if __name__ == '__main__':
main()
File renamed without changes.
8 changes: 7 additions & 1 deletion scripts/skll_convert → skll/utilities/skll_convert.py
@@ -37,7 +37,10 @@
from skll.version import __version__


if __name__ == '__main__':
def main():
'''
Handles command line arguments and gets things started.
'''
# Get command line arguments
parser = argparse.ArgumentParser(description="Takes an input feature file \
and converts it to another \
@@ -111,3 +114,6 @@
write_feature_file(args.outfile, ids, classes, feature_dicts,
arff_regression=args.arff_regression,
arff_relation=args.arff_relation)

if __name__ == '__main__':
main()
File renamed without changes.
2 changes: 1 addition & 1 deletion skll/version.py
@@ -7,5 +7,5 @@
:organization: ETS
'''

__version__ = '0.22.0'
__version__ = '0.22.1'
VERSION = tuple(int(x) for x in __version__.split('.'))
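
This VERSION tuple is what Learner.from_file() compares when deciding whether a saved model can be loaded: only the first component (the major version) has to match. A small illustration of the check:

__version__ = '0.22.1'
VERSION = tuple(int(x) for x in __version__.split('.'))  # (0, 22, 1)

# from_file() only compares major versions, so any 0.x model loads here,
# while a hypothetical model saved by a 1.0.0 release would raise ValueError
assert VERSION[0] == (0, 21, 5)[0]   # compatible
assert VERSION[0] != (1, 0, 0)[0]    # incompatible
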
8 changes: 2 additions & 6 deletions tests/test_skll.py
@@ -912,9 +912,6 @@ def check_convert_featureset(from_suffix, to_suffix):
# the path to the unmerged feature files
dirpath = os.path.join(_my_dir, 'train', 'test_conversion')

# get the path to the conversion script
converter_path = os.path.abspath(os.path.join(_my_dir, '..', 'scripts', 'skll_convert'))

# get the feature name prefix
feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'), to_suffix.lstrip('.'))

@@ -925,9 +922,8 @@ def check_convert_featureset(from_suffix, to_suffix):
feature, from_suffix))
output_file_path = os.path.join(dirpath, '{}_{}{}'.format(feature_name_prefix,
feature, to_suffix))
convert_cmd = shlex.split('{} --quiet {} {}'.format(converter_path,
input_file_path,
output_file_path))
convert_cmd = shlex.split('skll_convert --quiet {} {}'.format(input_file_path,
output_file_path))
subprocess.check_call(convert_cmd)

# now load and merge all unmerged, converted features in the `to_suffix` format
5 changes: 5 additions & 0 deletions travis/.condarc
@@ -0,0 +1,5 @@
# a condarc file should be placed in $HOME/.condarc

channels:
- https://conda.binstar.org/dan_blanchard
- defaults
