
Replace Boston dataset from examples and tests. #717

Merged 3 commits on Sep 12, 2022
10 changes: 5 additions & 5 deletions doc/run_experiment.rst
@@ -249,7 +249,7 @@ field in each section is provided below, but to summarize:
* A :ref:`list of classifiers/regressors <learners>` to try on your feature
files is required.

- Example configuration files are available `here <https://github.com/EducationalTestingService/skll/tree/main/examples/>`__ under the ``boston``, ``iris``, and ``titanic`` sub-directories.
+ Example configuration files are available `here <https://github.com/EducationalTestingService/skll/tree/main/examples/>`__ under the ``california``, ``iris``, and ``titanic`` sub-directories.

.. _general:

@@ -352,7 +352,7 @@ Regressors:
* ``estimator_sampler_list`` which can be used to specify any feature sampling algorithms for the underlying learners, and
* ``estimator_sampler_parameters`` which can be used to specify any additional parameters for any specified samplers.

- Refer to this `example voting configuration file <https://github.com/EducationalTestingService/skll/blob/main/examples/boston/voting.cfg>`__ to see how these parameters are used.
+ Refer to this `example voting configuration file <https://github.com/EducationalTestingService/skll/blob/main/examples/california/voting.cfg>`__ to see how these parameters are used.
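The sampler options described above can be sketched as a config fragment. The option names (``estimator_sampler_list``, ``estimator_sampler_parameters``) come from the text, but the sampler choices and value syntax here are illustrative assumptions, not a verified file — consult the linked ``voting.cfg`` for a known-good configuration:

```ini
[Input]
learners = ["VotingRegressor"]
# one entry per underlying estimator; an empty string means "no sampler"
# (sampler name and parameters below are illustrative)
estimator_sampler_list = ["RBFSampler", ""]
estimator_sampler_parameters = [{"gamma": 1.0}, {}]
```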

For all regressors *except* ``VotingRegressor``, you can also prepend
``Rescaled`` to the beginning of the full name (e.g., ``RescaledSVR``)
@@ -1075,7 +1075,7 @@ BayesianRidge
DecisionTreeClassifier and DecisionTreeRegressor
.. code-block:: python

- {'max_features': ["auto", None]}
+ {'max_features': ["sqrt", None]}
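The grid change above tracks scikit-learn's deprecation (and later removal) of ``max_features="auto"`` for decision trees; ``"sqrt"`` is the equivalent setting for classifiers. A minimal sketch, outside SKLL, of searching the updated grid directly with scikit-learn (data and settings are illustrative):

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# a small synthetic classification problem
X, y = make_classification(n_samples=100, n_features=8, random_state=0)

# search the same default grid the docs now show: "sqrt" vs. no limit
grid = GridSearchCV(
    DecisionTreeClassifier(random_state=0),
    param_grid={"max_features": ["sqrt", None]},
    cv=3,
)
grid.fit(X, y)
print(grid.best_params_)
```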

ElasticNet
.. code-block:: python
@@ -1286,7 +1286,7 @@ Here's an example of how to use this attribute.
learner1 = Learner('LogisticRegression', pipeline=True)
_ = learner1.train(fs1, grid_search=True, grid_objective='f1_score_macro')

- fs2 = Reader.for_path('examples/boston/train/example_boston_features.jsonlines').read()
+ fs2 = Reader.for_path('examples/california/train/example_california_features.jsonlines').read()
learner2 = Learner('RescaledSVR', feature_scaling='both', pipeline=True)
_ = learner2.train(fs2, grid_search=True, grid_objective='pearson')

Expand All @@ -1299,7 +1299,7 @@ Here's an example of how to use this attribute.
enc.inverse_transform(pipeline1.predict(D1))

# then, the regressor
- D2 = {"f0": 0.09178, "f1": 0.0, "f2": 4.05, "f3": 0.0, "f4": 0.51, "f5": 6.416, "f6": 84.1, "f7": 2.6463, "f8": 5.0, "f9": 296.0, "f10": 16.6, "f11": 395.5, "f12": 9.04}
+ D2 = {"f0": 4.1344, "f1": 36.0, "f2": 4.1, "f3": 0.98, "f4": 1245.0, "f5": 3.0, "f6": 33.9, "f7": -118.32}
pipeline2 = learner2.pipeline
pipeline2.predict(D2)
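Outside SKLL, the same idea can be sketched with a plain scikit-learn pipeline: a ``DictVectorizer`` front end turns feature dictionaries like ``D2`` into a numeric matrix before the estimator sees them. All names and values below are illustrative, and note that plain scikit-learn's vectorizer expects a *list* of dicts rather than a single dict:

```python
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

# tiny illustrative training set of feature dictionaries
train_feats = [{"f0": 1.0, "f1": 2.0}, {"f0": 3.0, "f1": 0.5}]
train_y = [1.5, 2.5]

# vectorizer converts dicts -> numeric array, then the regressor fits on it
pipe = Pipeline([
    ("vectorizer", DictVectorizer(sparse=False)),
    ("estimator", SVR()),
])
pipe.fit(train_feats, train_y)

# predict takes an iterable of feature dicts
pred = pipe.predict([{"f0": 2.0, "f1": 1.0}])
print(pred.shape)  # one prediction for one input dict
```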

4 changes: 2 additions & 2 deletions examples/Tutorial.ipynb
@@ -53,9 +53,9 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "Tutorial.ipynb \u001b[31mmake_boston_example_data.py\u001b[m\u001b[m\n",
+ "Tutorial.ipynb \u001b[31mmake_california_example_data.py\u001b[m\u001b[m\n",
"__init__.py \u001b[31mmake_iris_example_data.py\u001b[m\u001b[m\n",
- "\u001b[34mboston\u001b[m\u001b[m \u001b[31mmake_titanic_example_data.py\u001b[m\u001b[m\n",
+ "\u001b[34mcalifornia\u001b[m\u001b[m \u001b[31mmake_titanic_example_data.py\u001b[m\u001b[m\n",
"\u001b[34miris\u001b[m\u001b[m \u001b[34mtitanic\u001b[m\u001b[m\n"
]
}
@@ -5,9 +5,9 @@ task = cross_validate
[Input]
# this could also be an absolute path instead (and must be if you're not running things in local mode)
train_directory = train
- featuresets = [["example_boston_features"]]
+ featuresets = [["example_california_features"]]
# there is only one set of features to try, with one feature file in it here.
- featureset_names = ["example_boston"]
+ featureset_names = ["example_california"]
# when the feature values are numeric and on different scales,
# it is good to use feature scaling to put the various features on the same scale

@@ -6,9 +6,9 @@ task = evaluate
# this could also be an absolute path instead (and must be if you're not running things in local mode)
train_directory = train
test_directory = test
- featuresets = [["example_boston_features"]]
+ featuresets = [["example_california_features"]]
# there is only one set of features to try, with one feature file in it here.
- featureset_names = ["example_boston"]
+ featureset_names = ["example_california"]
# when the feature values are numeric and on different scales,
# it is good to use feature scaling to put the various features on the same scale
feature_scaling = both
6 changes: 3 additions & 3 deletions examples/boston/voting.cfg → examples/california/voting.cfg
@@ -1,14 +1,14 @@
[General]
- experiment_name = Boston_Voting_Evaluate
+ experiment_name = California_Voting_Evaluate
task = evaluate

[Input]
# this could also be an absolute path instead (and must be if you're not running things in local mode)
train_directory = train
test_directory = test
- featuresets = [["example_boston_features"]]
+ featuresets = [["example_california_features"]]
# there is only one set of features to try, with one feature file in it here.
- featureset_names = ["example_boston"]
+ featureset_names = ["example_california"]
# when the feature values are numeric and on different scales,
# it is good to use feature scaling to put the various features on the same scale
feature_scaling = both
57 changes: 0 additions & 57 deletions examples/make_boston_example_data.py

This file was deleted.

58 changes: 58 additions & 0 deletions examples/make_california_example_data.py
@@ -0,0 +1,58 @@
#!/usr/bin/env python

"""
This is a simple script to download and transform some example data from
sklearn.datasets.

:author: Michael Heilman (mheilman@ets.org)
:author: Aoife Cahill (acahill@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)
:organization: ETS
"""

import json
import os

import numpy as np
import sklearn.datasets
from sklearn.model_selection import train_test_split


def main():
    """
    Download some example data and split it into training and test data.
    The california data set is meant for regression modeling.
    """
    print("Retrieving california data from servers...", end="")
    california = sklearn.datasets.fetch_california_housing()
    print("done")

    # this dataset contains 20,640 samples, which is too many;
    # sample it down to 500 examples
    # (note: `integers` samples with replacement, so an index may repeat)
    rng = np.random.default_rng(42)
    chosen_indices = rng.integers(0, california.target.shape[0], size=500)
    X = california.data[chosen_indices, :]
    Y = california.target[chosen_indices]

    # create example jsonlines dictionaries
    examples = [
        {"id": f"EXAMPLE_{i}", "y": y, "x": {f"f{j}": x_val for j, x_val in enumerate(x)}}
        for i, (x, y) in enumerate(zip(X, Y))
    ]

    (examples_train, examples_test) = train_test_split(examples, test_size=0.33, random_state=42)

    print("Writing training and testing files...", end="")
    for examples, suffix in [(examples_train, "train"), (examples_test, "test")]:
        california_dir = os.path.join("california", suffix)
        if not os.path.exists(california_dir):
            os.makedirs(california_dir)
        jsonlines_path = os.path.join(california_dir, "example_california_features.jsonlines")
        with open(jsonlines_path, "w") as f:
            for ex in examples:
                f.write(f"{json.dumps(ex)}\n")
    print("done")


if __name__ == "__main__":
    main()
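Since the script writes one JSON object per line, the resulting ``.jsonlines`` file can be read back with one ``json.loads`` per line. A small round-trip sketch, using illustrative records in the same ``{"id", "y", "x"}`` shape the script emits:

```python
import json
import tempfile
from pathlib import Path

# illustrative records in the script's output shape
examples = [
    {"id": "EXAMPLE_0", "y": 2.5, "x": {"f0": 4.1, "f1": 36.0}},
    {"id": "EXAMPLE_1", "y": 1.1, "x": {"f0": 2.3, "f1": 18.0}},
]

# write one JSON object per line
path = Path(tempfile.mkdtemp()) / "example_california_features.jsonlines"
with open(path, "w") as f:
    for ex in examples:
        f.write(f"{json.dumps(ex)}\n")

# read it back: one json.loads per line
with open(path) as f:
    loaded = [json.loads(line) for line in f]

print(loaded == examples)  # True
```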
2 changes: 1 addition & 1 deletion skll/experiments/output.py
@@ -104,7 +104,7 @@ def generate_learning_curve_plots(experiment_name,
num_metrics = len(df['metric'].unique())
df_melted = pd.melt(df, id_vars=[c for c in df.columns
                                 if c not in ['train_score_mean', 'test_score_mean']])
- # make sure the "variable" column is cateogrical since it will be
+ # make sure the "variable" column is categorical since it will be
# mapped to hue levels in the learning curve below
df_melted["variable"] = df_melted["variable"].astype("category")
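The ``astype("category")`` step above can be sketched in isolation: ``pd.melt`` stacks the two score columns into a single ``"variable"`` column, which is then made categorical so a plotting library can map it to discrete hue levels. Data values below are illustrative:

```python
import pandas as pd

# two score columns plus an id column, as in the experiment output
df = pd.DataFrame({
    "train_score_mean": [0.9, 0.8],
    "test_score_mean": [0.7, 0.6],
    "metric": ["r2", "r2"],
})

# melt stacks the score columns into "variable"/"value" pairs
df_melted = pd.melt(df, id_vars=["metric"])
df_melted["variable"] = df_melted["variable"].astype("category")
print(df_melted["variable"].dtype)
```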


This file was deleted.

@@ -0,0 +1 @@
[{"experiment_name": "California_Voting_Evaluate", "train_set_name": "train", "train_set_size": 335, "test_set_name": "test", "test_set_size": 165, "featureset": "[\"example_california_features\"]", "featureset_name": "example_california", "shuffle": false, "learner_name": "VotingRegressor", "task": "evaluate", "start_timestamp": "08 Sep 2022 16:46:00.730349", "version": "3.0.0", "feature_scaling": "both", "folds_file": "", "grid_search": true, "grid_objective": "pearson", "grid_search_folds": "3", "min_feature_count": 1, "cv_folds": "None", "using_folds_file": false, "save_cv_folds": true, "save_cv_models": false, "use_folds_file_for_grid_search": true, "stratified_folds": true, "scikit_learn_version": "1.1.2", "end_timestamp": "08 Sep 2022 16:46:01.258898", "total_time": "0:00:00.528549", "result_table": "", "accuracy": "", "pearson": 0.8166881235318064, "score": 0.8166881235318064, "fold": "", "model_params": "{\"estimators\": [[\"SVR\", \"{'vectorizer': DictVectorizer(sparse=False), 'selector': SelectByMinCount(), 'scaler': StandardScaler(), 'estimator': SVR(C=0.01, cache_size=1000)}\"], [\"LinearRegression\", \"{'vectorizer': DictVectorizer(sparse=False), 'selector': SelectByMinCount(), 'scaler': StandardScaler(), 'estimator': LinearRegression()}\"], [\"DecisionTreeRegressor\", \"{'vectorizer': DictVectorizer(sparse=False), 'selector': SelectByMinCount(), 'scaler': StandardScaler(), 'estimator': DecisionTreeRegressor(criterion='poisson', max_features='log2', random_state=123456789)}\"]], \"n_jobs\": null, \"verbose\": false, \"weights\": null}", "descriptive": {"actual": {"min": 0.425, "max": 5.00001, "avg": 2.0123758181818183, "std": 1.0334681372156997}, "predicted": {"min": 0.8218118503116726, "max": 3.7041058491575156, "avg": 1.9474491829550034, "std": 0.643682340272272}}, "additional_scores": {}}]
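The record above is a JSON list with one object per evaluated configuration, so its scores can be pulled out with the standard ``json`` module. A small sketch using a trimmed-down record whose values are copied from the output above (the full record has many more keys):

```python
import json

# trimmed-down version of the evaluation summary shown above
record_json = (
    '[{"experiment_name": "California_Voting_Evaluate", '
    '"pearson": 0.8166881235318064, "score": 0.8166881235318064}]'
)

records = json.loads(record_json)
for rec in records:
    # "score" mirrors the grid objective ("pearson") for this run
    print(f'{rec["experiment_name"]}: pearson = {rec["pearson"]:.3f}')
```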