switches back to using sklearn for prediction intervals
ClimbsRocks committed Dec 8, 2017
1 parent d8859ab commit 2c61a49
Showing 2 changed files with 23 additions and 24 deletions.
auto_ml/predictor.py: 13 additions & 6 deletions
@@ -175,8 +175,11 @@ def _construct_pipeline(self, model_name='LogisticRegression', trained_pipeline=

         if prediction_interval is not False:
             params = {}
-            params['objective'] = 'quantile'
+            params['loss'] = 'quantile'
             params['alpha'] = prediction_interval
+            params['n_estimators'] = 100
+            params['learning_rate'] = 0.15
+            params.update(self.prediction_interval_params)
             training_prediction_intervals = True

         elif feature_learning == False:
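For context on the parameter swap above: 'objective' is LightGBM's name for this setting, while scikit-learn's GradientBoostingRegressor expects loss='quantile' with alpha naming the target quantile. A minimal standalone sketch of the approach these params configure, using the Boston data the tests rely on (available as sklearn.datasets.load_boston in the scikit-learn of this era); the hyperparameters mirror the defaults above and are not tuned recommendations:

    # One scikit-learn model per quantile; alpha picks the quantile the loss targets.
    from sklearn.datasets import load_boston
    from sklearn.ensemble import GradientBoostingRegressor

    X, y = load_boston(return_X_y=True)

    lower = GradientBoostingRegressor(loss='quantile', alpha=0.05, n_estimators=100, learning_rate=0.15).fit(X, y)
    upper = GradientBoostingRegressor(loss='quantile', alpha=0.95, n_estimators=100, learning_rate=0.15).fit(X, y)

    # Row-wise [lower, upper] prediction intervals for the first five rows
    intervals = list(zip(lower.predict(X[:5]), upper.predict(X[:5])))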
@@ -308,7 +311,7 @@ def _consolidate_pipeline(self, transformation_pipeline, final_model=None):

         return trained_pipeline_without_feature_selection

-    def set_params_and_defaults(self, X_df, user_input_func=None, optimize_final_model=None, write_gs_param_results_to_file=True, perform_feature_selection=None, verbose=True, X_test=None, y_test=None, ml_for_analytics=True, take_log_of_y=None, model_names=None, perform_feature_scaling=True, calibrate_final_model=False, _scorer=None, scoring=None, verify_features=False, training_params=None, grid_search_params=None, compare_all_models=False, cv=2, feature_learning=False, fl_data=None, optimize_feature_learning=False, train_uncertainty_model=None, uncertainty_data=None, uncertainty_delta=None, uncertainty_delta_units=None, calibrate_uncertainty=False, uncertainty_calibration_settings=None, uncertainty_calibration_data=None, uncertainty_delta_direction='both', advanced_analytics=True, analytics_config=None, prediction_intervals=None, predict_intervals=None, ensemble_config=None, trained_transformation_pipeline=None, transformed_X=None, transformed_y=None, return_transformation_pipeline=False, X_test_already_transformed=False, skip_feature_responses=None):
+    def set_params_and_defaults(self, X_df, user_input_func=None, optimize_final_model=None, write_gs_param_results_to_file=True, perform_feature_selection=None, verbose=True, X_test=None, y_test=None, ml_for_analytics=True, take_log_of_y=None, model_names=None, perform_feature_scaling=True, calibrate_final_model=False, _scorer=None, scoring=None, verify_features=False, training_params=None, grid_search_params=None, compare_all_models=False, cv=2, feature_learning=False, fl_data=None, optimize_feature_learning=False, train_uncertainty_model=None, uncertainty_data=None, uncertainty_delta=None, uncertainty_delta_units=None, calibrate_uncertainty=False, uncertainty_calibration_settings=None, uncertainty_calibration_data=None, uncertainty_delta_direction='both', advanced_analytics=True, analytics_config=None, prediction_intervals=None, predict_intervals=None, ensemble_config=None, trained_transformation_pipeline=None, transformed_X=None, transformed_y=None, return_transformation_pipeline=False, X_test_already_transformed=False, skip_feature_responses=None, prediction_interval_params=None):

         self.user_input_func = user_input_func
         self.optimize_final_model = optimize_final_model
@@ -416,6 +419,10 @@ def set_params_and_defaults(self, X_df, user_input_func=None, optimize_final_mod

         if predict_intervals is not None and prediction_intervals is None:
             prediction_intervals = predict_intervals

+        if prediction_interval_params is None:
+            self.prediction_interval_params = {}
+        else:
+            self.prediction_interval_params = prediction_interval_params

         if prediction_intervals is None:
             self.calculate_prediction_intervals = False
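A hypothetical usage sketch of the new prediction_interval_params argument at train time; the data and column_descriptions follow the tests in this commit, and n_estimators and max_depth are standard GradientBoostingRegressor parameters:

    # Sketch: overriding the default quantile-model hyperparameters.
    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)
    ml_predictor.train(
        df_boston_train,
        predict_intervals=[0.05, 0.95],
        prediction_interval_params={'n_estimators': 200, 'max_depth': 5}
    )

Because params.update(self.prediction_interval_params) runs after the defaults are set in _construct_pipeline, user-supplied keys win over the built-in values; note that passing 'alpha' here would override the per-percentile alpha for every interval model.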
@@ -595,9 +602,9 @@ def fit_feature_learning_and_transformation_pipeline(self, X_df, fl_data, y):
         return X_df


-    def train(self, raw_training_data, user_input_func=None, optimize_final_model=None, write_gs_param_results_to_file=True, perform_feature_selection=None, verbose=True, X_test=None, y_test=None, ml_for_analytics=True, take_log_of_y=None, model_names=None, perform_feature_scaling=None, calibrate_final_model=False, _scorer=None, scoring=None, verify_features=False, training_params=None, grid_search_params=None, compare_all_models=False, cv=2, feature_learning=False, fl_data=None, optimize_feature_learning=False, train_uncertainty_model=False, uncertainty_data=None, uncertainty_delta=None, uncertainty_delta_units=None, calibrate_uncertainty=False, uncertainty_calibration_settings=None, uncertainty_calibration_data=None, uncertainty_delta_direction=None, advanced_analytics=None, analytics_config=None, prediction_intervals=None, predict_intervals=None, ensemble_config=None, trained_transformation_pipeline=None, transformed_X=None, transformed_y=None, return_transformation_pipeline=False, X_test_already_transformed=False, skip_feature_responses=None):
+    def train(self, raw_training_data, user_input_func=None, optimize_final_model=None, write_gs_param_results_to_file=True, perform_feature_selection=None, verbose=True, X_test=None, y_test=None, ml_for_analytics=True, take_log_of_y=None, model_names=None, perform_feature_scaling=None, calibrate_final_model=False, _scorer=None, scoring=None, verify_features=False, training_params=None, grid_search_params=None, compare_all_models=False, cv=2, feature_learning=False, fl_data=None, optimize_feature_learning=False, train_uncertainty_model=False, uncertainty_data=None, uncertainty_delta=None, uncertainty_delta_units=None, calibrate_uncertainty=False, uncertainty_calibration_settings=None, uncertainty_calibration_data=None, uncertainty_delta_direction=None, advanced_analytics=None, analytics_config=None, prediction_intervals=None, predict_intervals=None, ensemble_config=None, trained_transformation_pipeline=None, transformed_X=None, transformed_y=None, return_transformation_pipeline=False, X_test_already_transformed=False, skip_feature_responses=None, prediction_interval_params=None):

-        self.set_params_and_defaults(raw_training_data, user_input_func=user_input_func, optimize_final_model=optimize_final_model, write_gs_param_results_to_file=write_gs_param_results_to_file, perform_feature_selection=perform_feature_selection, verbose=verbose, X_test=X_test, y_test=y_test, ml_for_analytics=ml_for_analytics, take_log_of_y=take_log_of_y, model_names=model_names, perform_feature_scaling=perform_feature_scaling, calibrate_final_model=calibrate_final_model, _scorer=_scorer, scoring=scoring, verify_features=verify_features, training_params=training_params, grid_search_params=grid_search_params, compare_all_models=compare_all_models, cv=cv, feature_learning=feature_learning, fl_data=fl_data, optimize_feature_learning=False, train_uncertainty_model=train_uncertainty_model, uncertainty_data=uncertainty_data, uncertainty_delta=uncertainty_delta, uncertainty_delta_units=uncertainty_delta_units, calibrate_uncertainty=calibrate_uncertainty, uncertainty_calibration_settings=uncertainty_calibration_settings, uncertainty_calibration_data=uncertainty_calibration_data, uncertainty_delta_direction=uncertainty_delta_direction, prediction_intervals=prediction_intervals, predict_intervals=predict_intervals, ensemble_config=ensemble_config, trained_transformation_pipeline=trained_transformation_pipeline, transformed_X=transformed_X, transformed_y=transformed_y, return_transformation_pipeline=return_transformation_pipeline, X_test_already_transformed=X_test_already_transformed, skip_feature_responses=skip_feature_responses)
+        self.set_params_and_defaults(raw_training_data, user_input_func=user_input_func, optimize_final_model=optimize_final_model, write_gs_param_results_to_file=write_gs_param_results_to_file, perform_feature_selection=perform_feature_selection, verbose=verbose, X_test=X_test, y_test=y_test, ml_for_analytics=ml_for_analytics, take_log_of_y=take_log_of_y, model_names=model_names, perform_feature_scaling=perform_feature_scaling, calibrate_final_model=calibrate_final_model, _scorer=_scorer, scoring=scoring, verify_features=verify_features, training_params=training_params, grid_search_params=grid_search_params, compare_all_models=compare_all_models, cv=cv, feature_learning=feature_learning, fl_data=fl_data, optimize_feature_learning=False, train_uncertainty_model=train_uncertainty_model, uncertainty_data=uncertainty_data, uncertainty_delta=uncertainty_delta, uncertainty_delta_units=uncertainty_delta_units, calibrate_uncertainty=calibrate_uncertainty, uncertainty_calibration_settings=uncertainty_calibration_settings, uncertainty_calibration_data=uncertainty_calibration_data, uncertainty_delta_direction=uncertainty_delta_direction, prediction_intervals=prediction_intervals, predict_intervals=predict_intervals, ensemble_config=ensemble_config, trained_transformation_pipeline=trained_transformation_pipeline, transformed_X=transformed_X, transformed_y=transformed_y, return_transformation_pipeline=return_transformation_pipeline, X_test_already_transformed=X_test_already_transformed, skip_feature_responses=skip_feature_responses, prediction_interval_params=prediction_interval_params)

         if verbose:
             print('Welcome to auto_ml! We\'re about to go through and make sense of your data using machine learning, and give you a production-ready pipeline to get predictions with.\n')
@@ -647,7 +654,7 @@ def train(self, raw_training_data, user_input_func=None, optimize_final_model=No

             # TODO: parallelize these!
             interval_predictors = []
             for percentile in self.prediction_intervals:
-                interval_predictor = self.train_ml_estimator(['LGBMRegressor'], self._scorer, X_df, y, prediction_interval=percentile)
+                interval_predictor = self.train_ml_estimator(['GradientBoostingRegressor'], self._scorer, X_df, y, prediction_interval=percentile)
                 predictor_tup = ('interval_{}'.format(percentile), interval_predictor)
                 interval_predictors.append(predictor_tup)
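The 'interval_{}'.format(percentile) labels attached here match the column names read back out of the predict_intervals output in the tests below (e.g. row['interval_0.05']). A toy illustration of the mapping, with stand-in models rather than the real trained pipelines:

    # Stand-in illustration of the per-percentile naming scheme; in auto_ml the
    # second tuple element is a full trained pipeline, not a string.
    prediction_intervals = [0.05, 0.95]
    interval_predictors = [('interval_{}'.format(p), 'model_for_{}'.format(p))
                           for p in prediction_intervals]
    # -> [('interval_0.05', 'model_for_0.05'), ('interval_0.95', 'model_for_0.95')]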

@@ -1189,7 +1196,7 @@ def create_gs_params(self, model_name):
     def train_ml_estimator(self, estimator_names, scoring, X_df, y, feature_learning=False, prediction_interval=False):

         if prediction_interval is not False:
-            estimator_names = ['LGBMRegressor']
+            # estimator_names = ['GradientBoostingRegressor']
             trained_final_model = self.fit_single_pipeline(X_df, y, estimator_names[0], feature_learning=feature_learning, prediction_interval=prediction_interval)

         # Use Case 1: Super straightforward: just train a single, non-optimized model
tests/core_tests/test_prediction_intervals.py: 10 additions & 18 deletions
@@ -28,8 +28,6 @@ def test_predict_uncertainty_true():
         , 'CHAS': 'categorical'
     }

-    df_boston_train, uncertainty_data = train_test_split(df_boston_train, test_size=0.5)
-
     ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

     ml_predictor.train(df_boston_train, predict_intervals=True)
@@ -81,34 +79,32 @@ def test_prediction_intervals_actually_work():
         , 'CHAS': 'categorical'
     }

-    df_boston_train, uncertainty_data = train_test_split(df_boston_train, test_size=0.5)
-
     ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

-    ml_predictor.train(df_boston_train, predict_intervals=[0.1, 0.9])
+    ml_predictor.train(df_boston_train, predict_intervals=[0.05, 0.95])

     df_boston_test = df_boston_test.reset_index(drop=True)
     intervals = ml_predictor.predict_intervals(df_boston_test)
     actuals = df_boston_test.MEDV

     count_under = 0
     count_over = 0
-    print(intervals)
+    # print(intervals)
     for idx, row in intervals.iterrows():
         actual = actuals.iloc[idx]

-        if actual < row['interval_0.1']:
+        if actual < row['interval_0.05']:
             count_under += 1
-        if actual > row['interval_0.9']:
+        if actual > row['interval_0.95']:
             count_over += 1

     len_intervals = len(intervals)

     pct_under = count_under * 1.0 / len_intervals
     pct_over = count_over * 1.0 / len_intervals
     # There's a decent bit of noise since this is such a small dataset
-    assert pct_under < 0.3
-    assert pct_over < 0.3
+    assert pct_under < 0.15
+    assert pct_over < 0.1


 def test_prediction_intervals_lets_the_user_specify_number_of_intervals():
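For context on the tightened asserts above: with 0.05 and 0.95 quantile models, a well-calibrated predictor should leave roughly 5% of actuals below the lower bound and 5% above the upper one, so thresholds of 0.15 and 0.1 leave headroom for noise on the small Boston dataset. A sketch of the same coverage computation factored into a helper, assuming the DataFrame layout and column names the test uses:

    # Per-side miss rates for predicted intervals; both should sit near 0.05
    # when the 0.05/0.95 quantile models are well calibrated.
    def miss_rates(intervals, actuals, lower_col='interval_0.05', upper_col='interval_0.95'):
        n = float(len(intervals))
        pct_under = (actuals.values < intervals[lower_col].values).sum() / n
        pct_over = (actuals.values > intervals[upper_col].values).sum() / n
        return pct_under, pct_over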
@@ -121,8 +117,6 @@ def test_prediction_intervals_lets_the_user_specify_number_of_intervals():
         , 'CHAS': 'categorical'
     }

-    df_boston_train, uncertainty_data = train_test_split(df_boston_train, test_size=0.5)
-
     ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

     ml_predictor.train(df_boston_train, predict_intervals=True, prediction_intervals=[.2])
@@ -142,8 +136,6 @@ def test_predict_intervals_should_fail_if_not_trained():
         , 'CHAS': 'categorical'
     }

-    df_boston_train, uncertainty_data = train_test_split(df_boston_train, test_size=0.5)
-
     ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

     ml_predictor.train(df_boston_train)
@@ -167,10 +159,10 @@ def test_predict_intervals_takes_in_custom_intervals():
         , 'CHAS': 'categorical'
     }

-    df_boston_train, uncertainty_data = train_test_split(df_boston_train, test_size=0.5)
-
     ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

+    # df_boston_train = pd.concat([df_boston_train, df_boston_train, df_boston_train])
+
     ml_predictor.train(df_boston_train, predict_intervals=[0.4, 0.6])

     custom_intervals = ml_predictor.predict_intervals(df_boston_test, return_type='list')
@@ -207,6 +199,7 @@ def test_predict_intervals_takes_in_custom_intervals():
     default_intervals = ml_predictor.predict_intervals(df_boston_test, return_type='list')

     # This is a super flaky test, because we've got such a small datasize, and we're trying to get distributions from it
+    len_intervals = len(custom_intervals)
     num_failures = 0
     for idx, custom_row in enumerate(custom_intervals):
         default_row = default_intervals[idx]
@@ -218,5 +211,4 @@
             print('{} should be lower than {}'.format(custom_row[1], default_row[1]))
             num_failures += 1

-    len_intervals = len(custom_intervals)
-    assert num_failures < 0.25 * len_intervals
+    assert num_failures < 0.18 * len_intervals
