code clean up
ClimbsRocks committed Sep 17, 2016
1 parent 7419a71 commit 5d56897
Showing 1 changed file with 13 additions and 18 deletions.
31 changes: 13 additions & 18 deletions auto_ml/predictor.py
@@ -74,14 +74,14 @@ def _validate_input_col_descriptions(self, column_descriptions):
raise ValueError('In your column_descriptions, please make sure exactly one column has the value "output", which is the value we will be training models to predict.')


- def _construct_pipeline(self, user_input_func=None, model_name='LogisticRegression', perform_feature_selection=True, impute_missing_values=True, ml_for_analytics=True, perform_feature_scaling=True):
+ def _construct_pipeline(self, model_name='LogisticRegression', impute_missing_values=True, perform_feature_scaling=True):

pipeline_list = []

if len(self.subpredictors) > 0:
pipeline_list.append(('subpredictors', utils.AddSubpredictorPredictions(trained_subpredictors=self.subpredictors)))
- if user_input_func is not None:
- pipeline_list.append(('user_func', FunctionTransformer(func=user_input_func, pass_y=False, validate=False) ))
+ if self.user_input_func is not None:
+ pipeline_list.append(('user_func', FunctionTransformer(func=self.user_input_func, pass_y=False, validate=False) ))

if len(self.date_cols) > 0:
pipeline_list.append(('date_feature_engineering', date_feature_engineering.FeatureEngineer(date_cols=self.date_cols)))
@@ -94,7 +94,7 @@ def _construct_pipeline(self, user_input_func=None, model_name='LogisticRegressi

pipeline_list.append(('dv', DictVectorizer(sparse=True, sort=True)))

- if perform_feature_selection:
+ if self.perform_feature_selection:
# pipeline_list.append(('pca', TruncatedSVD()))
pipeline_list.append(('feature_selection', utils.FeatureSelectionTransformer(type_of_estimator=self.type_of_estimator, feature_selection_model='SelectFromModel') ))

@@ -103,7 +103,7 @@ def _construct_pipeline(self, user_input_func=None, model_name='LogisticRegressi

final_model = utils.get_model_from_name(model_name)

- pipeline_list.append(('final_model', utils.FinalModelATC(model=final_model, model_name=model_name, type_of_estimator=self.type_of_estimator, ml_for_analytics=ml_for_analytics)))
+ pipeline_list.append(('final_model', utils.FinalModelATC(model=final_model, model_name=model_name, type_of_estimator=self.type_of_estimator, ml_for_analytics=self.ml_for_analytics)))

constructed_pipeline = Pipeline(pipeline_list)
return constructed_pipeline
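
For reference, a minimal sketch of the shape of pipeline this method assembles, now reading the user-supplied function from self.user_input_func instead of taking it as an argument. The FeatureSelectionTransformer and FinalModelATC wrappers are auto_ml's own, so plain scikit-learn stand-ins are used here and the row data is hypothetical.

    # Sketch only: scikit-learn stand-ins for auto_ml's own pipeline steps.
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import FunctionTransformer
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LogisticRegression

    def add_ratio_feature(rows):
        # Example user_input_func: arbitrary row-level feature engineering
        # that runs inside the pipeline at both training and prediction time.
        for row in rows:
            row['fare_per_age'] = row.get('fare', 0.0) / max(row.get('age', 1.0), 1.0)
        return rows

    pipeline = Pipeline([
        ('user_func', FunctionTransformer(func=add_ratio_feature, validate=False)),
        ('dv', DictVectorizer(sparse=True, sort=True)),
        ('feature_selection', SelectFromModel(LogisticRegression(penalty='l1', solver='liblinear'))),
        ('final_model', LogisticRegression(solver='liblinear')),
    ])

    # pipeline.fit(list_of_row_dicts, y) then runs each named step in order.
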
@@ -177,7 +177,6 @@ def _prepare_for_training(self, raw_training_data):
y = y_ints
except:
pass
- pass
else:
indices_to_delete = []
y_floats = []
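
The classifier branch above casts labels to ints when they happen to be numeric and otherwise leaves them untouched; the duplicated pass was the only casualty of the cleanup here. A tiny standalone sketch of that behavior, assuming the fallback really is a silent no-op:

    # Sketch of the classifier label cleaning: cast to int when possible,
    # otherwise keep the labels exactly as given (assumed silent fallback).
    def clean_classifier_labels(y):
        try:
            return [int(val) for val in y]
        except (TypeError, ValueError):
            return y

    print(clean_classifier_labels(['0', '1', '1']))  # [0, 1, 1]
    print(clean_classifier_labels(['cat', 'dog']))   # ['cat', 'dog']
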
@@ -203,6 +202,7 @@ def _prepare_for_training(self, raw_training_data):

return X, y

+
def _make_sub_column_descriptions(self, column_descriptions, sub_name):
# TODO: make this work for multiple subpredictors. right now it will grab all 'regressor' or 'classifier' values at once, instead of only grabbing on.
subpredictor_types = set(['classifier', 'regressor'])
@@ -226,6 +226,7 @@ def _make_sub_column_descriptions(self, column_descriptions, sub_name):

return dup_descs, sub_type_of_estimator

+
def make_sub_x_and_y_test(self, X_test, sub_name):
vals_to_ignore = set([None, float('nan'), float('Inf'), 'ignore'])
clean_X_test = []
@@ -272,7 +273,6 @@ def _train_subpredictor(self, sub_name, X_subpredictors, sub_idx, sub_model_name
self.subpredictors[sub_idx] = ml_predictor


-
def train(self, raw_training_data, user_input_func=None, optimize_entire_pipeline=False, optimize_final_model=False, write_gs_param_results_to_file=True, perform_feature_selection=True, verbose=True, X_test=None, y_test=None, print_training_summary_to_viewer=True, ml_for_analytics=True, only_analytics=False, compute_power=3, take_log_of_y=True, model_names=None, add_cluster_prediction=False, num_weak_estimators=0):

self.user_input_func = user_input_func
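
For orientation, this is roughly how a caller exercises train() with a user_input_func. The train() signature comes straight from this file; the constructor arguments and the treatment of columns not listed in column_descriptions are assumptions based on the project's README of this era, and the data is synthetic.

    # Schematic usage of Predictor.train() with a user_input_func
    # (constructor arguments assumed from the README; data is synthetic).
    import random
    from auto_ml import Predictor

    def add_interaction_terms(rows):
        for row in rows:
            row['sqft_times_beds'] = row.get('sqft', 0) * row.get('beds', 0)
        return rows

    training_rows = []
    for _ in range(200):
        sqft = random.randint(500, 3500)
        beds = random.randint(1, 5)
        price = 150 * sqft + 10000 * beds + random.randint(-5000, 5000)
        training_rows.append({'sqft': sqft, 'beds': beds, 'price': price})

    column_descriptions = {'price': 'output'}
    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    # user_input_func is stored on self and wired into the pipeline as a
    # FunctionTransformer step, so it also runs at prediction time.
    ml_predictor.train(training_rows, user_input_func=add_interaction_terms)
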
@@ -362,18 +362,12 @@ def train(self, raw_training_data, user_input_func=None, optimize_entire_pipelin
if verbose:
print('Successfully performed basic preparations and y-value cleaning')

- # ppl = self._construct_pipeline(user_input_func, perform_feature_selection=perform_feature_selection, ml_for_analytics=self.ml_for_analytics)
-
- # if verbose:
- # print('Successfully constructed the pipeline')
-
if model_names:
estimator_names = model_names
else:
estimator_names = self._get_estimator_names()

if self.type_of_estimator == 'classifier':
- # scoring = make_scorer(brier_score_loss, greater_is_better=True)
scoring = utils.brier_score_loss_wrapper
self._scorer = scoring
else:
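
utils.brier_score_loss_wrapper is the project's own helper. The deleted make_scorer(brier_score_loss, greater_is_better=True) line would have scored raw predictions and treated a loss as something to maximize, which is one reason a hand-rolled wrapper is a reasonable choice here. A hypothetical sketch of such a wrapper, assuming it follows scikit-learn's (estimator, X, y) scorer signature:

    # Hypothetical Brier-score scorer with the (estimator, X, y) signature
    # GridSearchCV expects; not the project's utils.brier_score_loss_wrapper.
    from sklearn.metrics import brier_score_loss

    def brier_scorer(estimator, X, y):
        positive_class_probas = estimator.predict_proba(X)[:, 1]
        # Brier loss is lower-is-better; negate it so grid search can maximize.
        return -1.0 * brier_score_loss(y, positive_class_probas)

    # GridSearchCV(pipeline, params, scoring=brier_scorer) would then use it.
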
@@ -404,19 +398,16 @@ def train(self, raw_training_data, user_input_func=None, optimize_entire_pipelin
del self.X_test
del self.grid_search_pipelines

- def perform_grid_search_by_model_names(self, estimator_names, scoring, X, y):

+ def perform_grid_search_by_model_names(self, estimator_names, scoring, X, y):

for model_name in estimator_names:
- ppl = self._construct_pipeline(self.user_input_func, model_name=model_name, perform_feature_selection=self.perform_feature_selection, ml_for_analytics=self.ml_for_analytics)
+ ppl = self._construct_pipeline(model_name=model_name)

self.grid_search_params = self._construct_pipeline_search_params(user_defined_model_names=estimator_names)

self.grid_search_params['final_model__model_name'] = [model_name]

- # This must be set here, rather than in construct_pipeline.
- # self.grid_search_params['final_model__model'] = utils.get_model_from_name(model_name)
-
if self.optimize_final_model or self.compute_power >= 5:
raw_search_params = utils.get_search_params(model_name)
for param_name, param_list in raw_search_params.items():
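
The grid_search_params keys follow scikit-learn's step-name__parameter convention, which is how 'final_model__model_name' above reaches the FinalModelATC step. A self-contained sketch of that routing with plain scikit-learn pieces (stand-in estimators, synthetic data):

    # Sketch of the '<step name>__<parameter>' routing GridSearchCV performs;
    # 'final_model__C' here plays the role 'final_model__model_name' plays above.
    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    ppl = Pipeline([
        ('dv', DictVectorizer(sparse=True)),
        ('final_model', LogisticRegression(solver='liblinear')),
    ])

    search_params = {'final_model__C': [0.1, 1.0, 10.0]}

    rows = [{'a': i, 'b': i % 3} for i in range(30)]
    y = [i % 2 for i in range(30)]

    gs = GridSearchCV(ppl, search_params, cv=3)
    gs.fit(rows, y)
    print(gs.best_params_)
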
@@ -518,6 +509,7 @@ def _get_xgb_feat_importances(self, clf):
for feature in sorted_feature_infos[-50:]:
print(feature[0] + ': ' + str(round(feature[1] / sum_of_all_feature_importances, 4)))

+
def _print_ml_analytics_results_random_forest(self):
print('\n\nHere are the results from our ' + self.trained_pipeline.named_steps['final_model'].model_name)
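
Both analytics printers follow the same pattern: sort the importances, keep the largest 50, and print them (the XGBoost variant above also normalizes by the total). A generic sketch of that pattern, using the scikit-learn-style feature_importances_ attribute as a stand-in for the XGBoost-specific lookup:

    # Generic sketch of the 'sort, keep the top 50, print' pattern used by the
    # two analytics printers (normalization as in the XGBoost variant above).
    def print_top_feature_importances(feature_names, importances, top_n=50):
        total = float(sum(importances)) or 1.0  # guard against an all-zero sum
        pairs = sorted(zip(feature_names, importances), key=lambda pair: pair[1])
        for name, score in pairs[-top_n:]:
            print(name + ': ' + str(round(score / total, 4)))

    # e.g. print_top_feature_importances(trained_feature_names, clf.feature_importances_)
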

@@ -539,6 +531,7 @@ def _print_ml_analytics_results_random_forest(self):
for feature in sorted_feature_infos[-50:]:
print(feature[0] + ': ' + str(round(feature[1], 4)))

+
def _get_trained_feature_names(self):
if self.trained_pipeline.named_steps.get('feature_selection', False):
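
A sketch of what recovering the trained feature names involves once a 'feature_selection' step is present: take DictVectorizer's names and keep the ones the selector retained. This assumes plain scikit-learn components with a get_support() mask; auto_ml's FeatureSelectionTransformer may expose this differently.

    # Assumed shape of the feature-name recovery (plain scikit-learn pieces;
    # DictVectorizer.get_feature_names() is get_feature_names_out() in newer releases).
    def get_selected_feature_names(trained_pipeline):
        all_names = trained_pipeline.named_steps['dv'].get_feature_names()
        selector = trained_pipeline.named_steps.get('feature_selection')
        if selector is None:
            return all_names
        mask = selector.get_support()  # boolean mask over the DictVectorizer columns
        return [name for name, keep in zip(all_names, mask) if keep]
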

@@ -618,6 +611,7 @@ def predict(self, prediction_data):
predicted_vals[idx] = math.exp(val)
return predicted_vals

+
def predict_proba(self, prediction_data):

# TODO(PRESTON): investigate if we need to handle input of a single dictionary differently than a list of dictionaries.
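
The math.exp loop in predict() above pairs with the take_log_of_y option in train(): if the targets were log-transformed for fitting, raw predictions come back on the log scale and need exponentiating before they are returned. A minimal round-trip sketch:

    # Minimal sketch of the take_log_of_y round trip.
    import math

    y_raw = [100.0, 250.0, 900.0]
    y_for_training = [math.log(val) for val in y_raw]   # applied before fitting

    predicted_log_vals = y_for_training                 # stand-in for model output
    predicted_vals = [math.exp(val) for val in predicted_log_vals]
    assert all(abs(a - b) < 1e-9 for a, b in zip(predicted_vals, y_raw))
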
@@ -638,6 +632,7 @@ def score(self, X_test, y_test):
else:
return self.trained_pipeline.score(X_test, y_test)

+
def save(self, file_name='auto_ml_saved_pipeline.pkl', verbose=True):
with open(file_name, 'wb') as open_file_name:
pickle.dump(self.trained_pipeline, open_file_name, protocol=pickle.HIGHEST_PROTOCOL)
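
The counterpart to save() is a plain pickle load; the file name below is save()'s default shown above.

    # Loading a pipeline written by save() and reusing it for predictions.
    import pickle

    with open('auto_ml_saved_pipeline.pkl', 'rb') as open_file_name:
        trained_pipeline = pickle.load(open_file_name)

    # predictions = trained_pipeline.predict(list_of_row_dicts)
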
