MDL-59988 analytics: Files marked as used only if valid

- Basic unit test for minimum machine learning backends requirements
- Warning return messages now include not enough data
- Clear models when the predictions processor is changed
- Refined the name of a couple of constants / methods
moodle · Oct 13, 2017 · 325b3bd · 325b3bd
1 parent 9a316f3
commit 325b3bd
Show file tree

Hide file tree

Showing 11 changed files with 165 additions and 34 deletions.
diff --git a/admin/tool/analytics/classes/output/renderer.php b/admin/tool/analytics/classes/output/renderer.php
@@ -159,11 +159,12 @@ public function render_get_predictions_results($trainresults = false, $trainlogs
             if ($trainresults->status == 0) {
                 $output .= $OUTPUT->notification(get_string('trainingprocessfinished', 'tool_analytics'),
                     \core\output\notification::NOTIFY_SUCCESS);
-            } else if ($trainresults->status === \core_analytics\model::NO_DATASET) {
+            } else if ($trainresults->status === \core_analytics\model::NO_DATASET ||
+                    $trainresults->status === \core_analytics\model::NOT_ENOUGH_DATA) {
                 $output .= $OUTPUT->notification(get_string('nodatatotrain', 'tool_analytics'),
                     \core\output\notification::NOTIFY_WARNING);
             } else {
-                $output .= $OUTPUT->notification(get_string('generalerror', 'analytics', $trainresults->status),
+                $output .= $OUTPUT->notification(get_string('generalerror', 'tool_analytics', $trainresults->status),
                     \core\output\notification::NOTIFY_ERROR);
             }
         }
@@ -183,11 +184,12 @@ public function render_get_predictions_results($trainresults = false, $trainlogs
             if ($predictresults->status == 0) {
                 $output .= $OUTPUT->notification(get_string('predictionprocessfinished', 'tool_analytics'),
                     \core\output\notification::NOTIFY_SUCCESS);
-            } else if ($predictresults->status === \core_analytics\model::NO_DATASET) {
+            } else if ($predictresults->status === \core_analytics\model::NO_DATASET ||
+                    $predictresults->status === \core_analytics\model::NOT_ENOUGH_DATA) {
                 $output .= $OUTPUT->notification(get_string('nodatatopredict', 'tool_analytics'),
                     \core\output\notification::NOTIFY_WARNING);
             } else {
-                $output .= $OUTPUT->notification(get_string('generalerror', 'analytics', $predictresults->status),
+                $output .= $OUTPUT->notification(get_string('generalerror', 'tool_analytics', $predictresults->status),
                     \core\output\notification::NOTIFY_ERROR);
             }
         }

diff --git a/analytics/classes/admin_setting_predictor.php b/analytics/classes/admin_setting_predictor.php
@@ -58,6 +58,15 @@ public function write_setting($data) {
             return get_string('errorprocessornotready', 'analytics', $isready);
         }
 
+        $currentvalue = get_config('analytics', 'predictionsprocessor');
+        if (!empty($currentvalue) && $currentvalue != str_replace('\\\\', '\\', $data)) {
+            // Clear all models data.
+            $models = \core_analytics\manager::get_all_models();
+            foreach ($models as $model) {
+                $model->clear();
+            }
+        }
+
         return ($this->config_write($this->name, $data) ? '' : get_string('errorsetting', 'admin'));
     }
 }
diff --git a/analytics/classes/local/target/base.php b/analytics/classes/local/target/base.php
@@ -261,7 +261,7 @@ protected function min_prediction_score() {
     }
 
     /**
-     * Should the model callback be triggered?
+     * This method determines if a prediction is interesting for the model or not.
      *
      * @param mixed $predictedvalue
      * @param float $predictionscore

diff --git a/analytics/classes/local/target/discrete.php b/analytics/classes/local/target/discrete.php
@@ -152,7 +152,10 @@ protected function ignored_predicted_classes() {
     }
 
     /**
-     * Should the model callback be triggered?
+     * This method determines if a prediction is interesting for the model or not.
+     *
+     * This method internally calls ignored_predicted_classes to skip classes
+     * flagged by the target as not important for users.
      *
      * @param mixed $predictedvalue
      * @param float $predictionscore

diff --git a/analytics/classes/local/target/linear.php b/analytics/classes/local/target/linear.php
@@ -84,7 +84,7 @@ public static function get_min_value() {
     }
 
     /**
-     * Should the model callback be triggered?
+     * This method determines if a prediction is interesting for the model or not.
      *
      * @param mixed $predictedvalue
      * @param float $predictionscore

diff --git a/analytics/classes/model.php b/analytics/classes/model.php
@@ -53,12 +53,12 @@ class model {
     /**
      * Model with low prediction accuracy.
      */
-    const EVALUATE_LOW_SCORE = 4;
+    const LOW_SCORE = 4;
 
     /**
      * Not enough data to evaluate the model properly.
      */
-    const EVALUATE_NOT_ENOUGH_DATA = 8;
+    const NOT_ENOUGH_DATA = 8;
 
     /**
      * Invalid analysable for the time splitting method.
@@ -437,7 +437,7 @@ public function update($enabled, $indicators = false, $timesplittingid = '') {
                 $this->model->indicators !== $indicatorsstr) {
 
             // Delete generated predictions before changing the model version.
-            $this->clear_model();
+            $this->clear();
 
             // It needs to be reset as the version changes.
             $this->uniqueid = null;
@@ -474,9 +474,9 @@ public function delete() {
 
         \core_analytics\manager::check_can_manage_models();
 
-        $this->clear_model();
+        $this->clear();
 
-        // Method self::clear_model is already clearing the current model version.
+        // Method self::clear is already clearing the current model version.
         $predictor = \core_analytics\manager::get_predictions_processor();
         $predictor->delete_output_dir($this->get_output_dir(array(), true));
 
@@ -633,6 +633,10 @@ public function train() {
         $result->status = $predictorresult->status;
         $result->info = $predictorresult->info;
 
+        if ($result->status !== self::OK) {
+            return $result;
+        }
+
         $this->flag_file_as_used($samplesfile, 'trained');
 
         // Mark the model as trained if it wasn't.
@@ -717,6 +721,10 @@ public function predict() {
             $result->predictions = $this->format_predictor_predictions($predictorresult);
         }
 
+        if ($result->status !== self::OK) {
+            return $result;
+        }
+
         if ($result->predictions) {
             $samplecontexts = $this->execute_prediction_callbacks($result->predictions, $indicatorcalculations);
         }
@@ -780,15 +788,16 @@ protected function execute_prediction_callbacks($predictions, $indicatorcalculat
 
         // Here we will store all predictions' contexts, this will be used to limit which users will see those predictions.
         $samplecontexts = array();
+        $records = array();
 
         foreach ($predictions as $uniquesampleid => $prediction) {
 
-            if ($this->get_target()->triggers_callback($prediction->prediction, $prediction->predictionscore)) {
+            // The unique sample id contains both the sampleid and the rangeindex.
+            list($sampleid, $rangeindex) = $this->get_time_splitting()->infer_sample_info($uniquesampleid);
 
-                // The unique sample id contains both the sampleid and the rangeindex.
-                list($sampleid, $rangeindex) = $this->get_time_splitting()->infer_sample_info($uniquesampleid);
+            if ($this->get_target()->triggers_callback($prediction->prediction, $prediction->predictionscore)) {
 
-                // Store the predicted values.
+                // Prepare the record to store the predicted values.
                 list($record, $samplecontext) = $this->prepare_prediction_record($sampleid, $rangeindex, $prediction->prediction,
                     $prediction->predictionscore, json_encode($indicatorcalculations[$uniquesampleid]));
 
@@ -990,7 +999,7 @@ public function enable($timesplittingid = false) {
             }
 
             // Delete generated predictions before changing the model version.
-            $this->clear_model();
+            $this->clear();
 
             // It needs to be reset as the version changes.
             $this->uniqueid = null;
@@ -1268,7 +1277,7 @@ protected function get_output_dir($subdirs = array(), $onlymodelid = false) {
             $outputdir = rtrim($CFG->dataroot, '/') . DIRECTORY_SEPARATOR . 'models';
         }
 
-        // Append model id
+        // Append model id.
         $outputdir .= DIRECTORY_SEPARATOR . $this->model->id;
         if (!$onlymodelid) {
             // Append version + subdirs.
@@ -1435,9 +1444,11 @@ private static function indicator_classes($indicators) {
      *
      * @return void
      */
-    private function clear_model() {
+    public function clear() {
         global $DB;
 
+        \core_analytics\manager::check_can_manage_models();
+
         // Delete current model version stored stuff.
         $predictor = \core_analytics\manager::get_predictions_processor();
         $predictor->clear_model($this->get_unique_id(), $this->get_output_dir());

diff --git a/analytics/tests/model_test.php b/analytics/tests/model_test.php
@@ -155,13 +155,10 @@ public function test_clear() {
         $modelversionoutputdir = $this->model->get_output_dir();
         $this->assertTrue(is_dir($modelversionoutputdir));
 
-        // Update to an empty time splitting method to force clear_model execution.
-        $this->model->update(1, false, '');
+        // Call model::clear directly; the model version output directory should no longer exist afterwards.
+        $this->model->clear();
         $this->assertFalse(is_dir($modelversionoutputdir));
 
-        // Restore previous time splitting method.
-        $this->model->enable('\core\analytics\time_splitting\no_splitting');
-
         // Check that most of the stuff got deleted.
         $this->assertEquals(1, $DB->count_records('analytics_models', array('id' => $this->modelobj->id)));
         $this->assertEquals(1, $DB->count_records('analytics_models_log', array('modelid' => $this->modelobj->id)));

diff --git a/analytics/tests/prediction_test.php b/analytics/tests/prediction_test.php
@@ -273,6 +273,96 @@ public function provider_ml_training_and_prediction() {
         return $this->add_prediction_processors($cases);
     }
 
+    /**
+     * Test the status codes returned by the system classifiers.
+     *
+     * This test checks that all mlbackend plugins in the system are able to return proper status codes
+     * even under weird situations.
+     *
+     * @dataProvider provider_ml_classifiers_return
+     * @param string $success Expected training outcome: 'yes', 'no' or 'maybe'
+     * @param int $nsamples Total number of samples written to the dataset, must be divisible by count($classes)
+     * @param int[] $classes Target classes for which samples are generated
+     * @param string $predictionsprocessorclass
+     * @return void
+     */
+    public function test_ml_classifiers_return($success, $nsamples, $classes, $predictionsprocessorclass) {
+        $this->resetAfterTest();
+
+        $predictionsprocessor = \core_analytics\manager::get_predictions_processor($predictionsprocessorclass, false);
+        if ($predictionsprocessor->is_ready() !== true) {
+            $this->markTestSkipped('Skipping ' . $predictionsprocessorclass . ' as the predictor is not ready.');
+        }
+
+        if ($nsamples % count($classes) != 0) {
+            throw new \coding_exception('The number of samples should be divisible by the number of classes');
+        }
+        $samplesperclass = $nsamples / count($classes);
+
+        // Metadata (we pass 2 classes even if $classes only provides samples of 1 class, as we want to test
+        // what the backend does in this case).
+        $dataset = "nfeatures,targetclasses,targettype" . PHP_EOL;
+        $dataset .= "3,\"[0,1]\",\"discrete\"" . PHP_EOL;
+
+        // Headers.
+        $dataset .= "feature1,feature2,feature3,target" . PHP_EOL;
+        foreach ($classes as $class) {
+            for ($i = 0; $i < $samplesperclass; $i++) {
+                $dataset .= "1,0,1,$class" . PHP_EOL;
+            }
+        }
+
+        $trainingfile = array(
+            'contextid' => \context_system::instance()->id,
+            'component' => 'analytics',
+            'filearea' => 'labelled',
+            'itemid' => 123,
+            'filepath' => '/',
+            'filename' => 'whocares.csv'
+        );
+        $fs = get_file_storage();
+        $dataset = $fs->create_file_from_string($trainingfile, $dataset);
+
+        // Training should work correctly if at least 1 sample of each class is included.
+        $dir = make_request_directory();
+        $result = $predictionsprocessor->train_classification('whatever', $dataset, $dir);
+
+        switch ($success) {
+            case 'yes':
+                $this->assertEquals(\core_analytics\model::OK, $result->status);
+                break;
+            case 'no':
+                $this->assertNotEquals(\core_analytics\model::OK, $result->status);
+                break;
+            case 'maybe':
+            default:
+                // We just check that an object is returned so we don't have an empty check,
+                // what we really want to check is that an exception was not thrown.
+                $this->assertInstanceOf(\stdClass::class, $result);
+        }
+    }
+
+    /**
+     * Data provider for test_ml_classifiers_return.
+     *
+     * We can not be very specific here as test_ml_classifiers_return only checks that
+     * mlbackend plugins behave as expected and properly control backend errors, even
+     * under weird situations.
+     *
+     * Each case lists the expected outcome ('yes', 'no' or 'maybe'), the number of samples
+     * and the classes the samples belong to, in that order.
+     *
+     * @return array
+     */
+    public function provider_ml_classifiers_return() {
+        // Using verbose options as the first argument for readability.
+        $cases = array(
+            '1-samples' => array('maybe', 1, [0]),
+            '2-samples-same-class' => array('maybe', 2, [0]),
+            '2-samples-different-classes' => array('yes', 2, [0, 1]),
+            '4-samples-different-classes' => array('yes', 4, [0, 1])
+        );
+
+        // We need to test all system prediction processors.
+        return $this->add_prediction_processors($cases);
+    }
 
     /**
      * Basic test to check that prediction processors work as expected.
@@ -426,8 +516,8 @@ public function provider_ml_test_evaluation() {
                 'expectedresults' => array(
                     // The course duration is too long to be processed on a weekly basis.
                     '\core\analytics\time_splitting\weekly' => \core_analytics\model::NO_DATASET,
-                    '\core\analytics\time_splitting\single_range' => \core_analytics\model::EVALUATE_LOW_SCORE,
-                    '\core\analytics\time_splitting\quarters' => \core_analytics\model::EVALUATE_LOW_SCORE,
+                    '\core\analytics\time_splitting\single_range' => \core_analytics\model::LOW_SCORE,
+                    '\core\analytics\time_splitting\quarters' => \core_analytics\model::LOW_SCORE,
                 )
             ),
             'good' => array(

diff --git a/lang/en/analytics.php b/lang/en/analytics.php
@@ -82,7 +82,7 @@
 $string['onlycli'] = 'Analytics processes execution via command line only';
 $string['onlycliinfo'] = 'Analytics processes like evaluating models, training machine learning algorithms or getting predictions can take some time, they will run as cron tasks and they can be forced via command line. Disable this setting if you want your site managers to be able to run these processes manually via web interface';
 $string['predictionsprocessor'] = 'Predictions processor';
-$string['predictionsprocessor_help'] = 'Prediction processors are the machine learning backends that process the datasets generated by calculating models\' indicators and targets.';
+$string['predictionsprocessor_help'] = 'A predictions processor is the machine-learning backend that processes the datasets generated by calculating models\' indicators and targets. All trained algorithms and predictions will be deleted if you change to another predictions processor.';
 $string['processingsitecontents'] = 'Processing site contents';
 $string['successfullyanalysed'] = 'Successfully analysed';
 $string['timesplittingmethod'] = 'Time-splitting method';

diff --git a/lib/mlbackend/php/classes/processor.php b/lib/mlbackend/php/classes/processor.php
@@ -129,16 +129,27 @@ public function train_classification($uniqueid, \stored_file $dataset, $outputdi
             $samples[] = array_slice($sampledata, 0, $metadata['nfeatures']);
             $targets[] = intval($data[$metadata['nfeatures']]);
 
-            if (count($samples) === self::BATCH_SIZE) {
+            $nsamples = count($samples);
+            if ($nsamples === self::BATCH_SIZE) {
                 // Training in batches to avoid running out of memory.
 
                 $classifier->partialTrain($samples, $targets, array(0, 1));
                 $samples = array();
                 $targets = array();
             }
+            if (empty($morethan1sample) && $nsamples > 1) {
+                $morethan1sample = true;
+            }
         }
         fclose($fh);
 
+        if (empty($morethan1sample)) {
+            $resultobj = new \stdClass();
+            $resultobj->status = \core_analytics\model::NO_DATASET;
+            $resultobj->info = array();
+            return $resultobj;
+        }
+
         // Train the remaining samples.
         if ($samples) {
             $classifier->partialTrain($samples, $targets, array(0, 1));
@@ -288,7 +299,7 @@ public function evaluate_classification($uniqueid, $maxdeviation, $niterations,
         }
         if (!empty($notenoughdata)) {
             $resultobj = new \stdClass();
-            $resultobj->status = \core_analytics\model::EVALUATE_NOT_ENOUGH_DATA;
+            $resultobj->status = \core_analytics\model::NOT_ENOUGH_DATA;
             $resultobj->score = 0;
             $resultobj->info = array(get_string('errornotenoughdata', 'mlbackend_php'));
             return $resultobj;
@@ -350,15 +361,15 @@ protected function get_evaluation_result_object(\stored_file $dataset, $phis, $m
 
         // If each iteration results varied too much we need more data to confirm that this is a valid model.
         if ($modeldev > $maxdeviation) {
-            $resultobj->status = $resultobj->status + \core_analytics\model::EVALUATE_NOT_ENOUGH_DATA;
+            $resultobj->status = $resultobj->status + \core_analytics\model::NOT_ENOUGH_DATA;
             $a = new \stdClass();
             $a->deviation = $modeldev;
             $a->accepteddeviation = $maxdeviation;
             $resultobj->info[] = get_string('errornotenoughdatadev', 'mlbackend_php', $a);
         }
 
         if ($resultobj->score < \core_analytics\model::MIN_SCORE) {
-            $resultobj->status = $resultobj->status + \core_analytics\model::EVALUATE_LOW_SCORE;
+            $resultobj->status = $resultobj->status + \core_analytics\model::LOW_SCORE;
             $a = new \stdClass();
             $a->score = $resultobj->score;
             $a->minscore = \core_analytics\model::MIN_SCORE;