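Modularizes ml_for_analytics (extracts _prepare_for_training), adds a verbose flag with progress messages, and switches regressor scoring back to utils.rmse_scoring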

ClimbsRocks · Aug 17, 2016 · e0f451d
1 parent 98b0d4a
commit e0f451d
Showing 1 changed file with 23 additions and 9 deletions.
diff --git a/auto_ml/predictor.py b/auto_ml/predictor.py
@@ -95,9 +95,9 @@ def train(self, raw_training_data, user_input_func=None, optimize_entire_pipelin
             scoring = make_scorer(brier_score_loss, greater_is_better=True)
 
         else:
-            scoring = None
-            # scoring = 'mean_squared_error'
-            # scoring = utils.rmse_scoring
+            # scoring = None
+            # # scoring = 'mean_squared_error'
+            scoring = utils.rmse_scoring
 
         # We will be performing GridSearchCV every time, even if the space we are searching over is null
         gs = GridSearchCV(
@@ -144,9 +144,7 @@ def _get_estimator_names(self, ml_for_analytics=False):
         else:
             raise('TypeError: type_of_algo must be either "classifier" or "regressor".')
 
-
-    def ml_for_analytics(self, raw_training_data, user_input_func=None, optimize_entire_pipeline=False, optimize_final_model=False, write_gs_param_results_to_file=True, perform_feature_selection=True):
-
+    def _prepare_for_training(self, raw_training_data, write_gs_param_results_to_file=True):
         if write_gs_param_results_to_file:
             gs_param_file_name = 'most_recent_pipeline_grid_search_result.csv'
             try:
@@ -190,20 +188,33 @@ def ml_for_analytics(self, raw_training_data, user_input_func=None, optimize_ent
                 indices_to_delete = set(indices_to_delete)
                 X = [row for idx, row in enumerate(X) if idx not in indices_to_delete]
 
+        return X, y, gs_param_file_name
+
+    def ml_for_analytics(self, raw_training_data, user_input_func=None, optimize_entire_pipeline=False, optimize_final_model=False, write_gs_param_results_to_file=True, perform_feature_selection=True, verbose=True):
+
+        X, y, gs_param_file_name = self._prepare_for_training(raw_training_data, write_gs_param_results_to_file)
+        if verbose:
+            print('Successfully performed basic preparations and y-value cleaning')
+
         ppl = self._construct_pipeline(user_input_func, optimize_final_model=optimize_final_model, ml_for_analytics=True, perform_feature_selection=perform_feature_selection)
 
+        if verbose:
+            print('Successfully constructed the pipeline')
+
         estimator_names = self._get_estimator_names(ml_for_analytics=True)
 
         if self.type_of_algo == 'classifier':
             # scoring = 'roc_auc'
             scoring = make_scorer(brier_score_loss, greater_is_better=True)
             self._scorer = scoring
         else:
-            scoring = None
-            # scoring = 'mean_squared_error'
-            # scoring = utils.rmse_scoring
+            # scoring = None
+            # # scoring = 'mean_squared_error'
+            scoring = utils.rmse_scoring
             self._scorer = scoring
 
+        if verbose:
+            print('Created estimator_names and scoring')
 
         for model_name in estimator_names:
 
@@ -225,6 +236,9 @@ def ml_for_analytics(self, raw_training_data, user_input_func=None, optimize_ent
                 scoring=scoring
             )
 
+            if verbose:
+                print('About to fit the GridSearchCV on the pipeline for the model ' + model_name)
+
             gs.fit(X, y)
             self.trained_pipeline = gs.best_estimator_