from collections import OrderedDict
import datetime
import math
import multiprocessing
import os
import random
import sys
import types
import warnings
from deap.base import Toolbox
import dill
import pathos
import numpy as np
import pandas as pd
from tabulate import tabulate
# Ultimately, we (the authors of auto_ml) are responsible for building a project that's robust against warnings.
# The classes of warnings below are ones we've deemed acceptable. The user should be able to sit at a high level of abstraction, and not be bothered with the internals of how we're handling these things.
# Ignore all DeprecationWarnings. We'll fix these ourselves as necessary.
warnings.filterwarnings("ignore", category=DeprecationWarning)
pd.options.mode.chained_assignment = None # default='warn'
import scipy
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import mean_squared_error, brier_score_loss, make_scorer, accuracy_score
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from auto_ml import DataFrameVectorizer
from auto_ml import utils
from auto_ml import utils_categorical_ensembling
from auto_ml import utils_data_cleaning
from auto_ml import utils_ensembling
from auto_ml import utils_feature_selection
from auto_ml import utils_model_training
from auto_ml import utils_models
from auto_ml import utils_scaling
from auto_ml import utils_scoring
from evolutionary_search import EvolutionaryAlgorithmSearchCV
xgb_installed = False
try:
import xgboost as xgb
xgb_installed = True
except ImportError:
pass
# def _pickle_method(m):
# if m.im_self is None:
# return getattr, (m.im_class, m.im_func.func_name)
# else:
# return getattr, (m.im_self, m.im_func.func_name)
# try:
# import copy_reg
# copy_reg.pickle(types.MethodType, _pickle_method)
# except:
# import copyreg
# copyreg.pickle(types.MethodType, _pickle_method)
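# Example usage (a hedged sketch based on the project README; the dataframe and column
# names below are illustrative, not part of this module):
#
#     column_descriptions = {'MEDV': 'output', 'CHAS': 'categorical'}
#     ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)
#     ml_predictor.train(df_train)
#     predictions = ml_predictor.predict(df_test)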
class Predictor(object):
def __init__(self, type_of_estimator, column_descriptions, verbose=True, name=None):
if type_of_estimator.lower() in ['regressor','regression', 'regressions', 'regressors', 'number', 'numeric', 'continuous']:
self.type_of_estimator = 'regressor'
elif type_of_estimator.lower() in ['classifier', 'classification', 'categorizer', 'categorization', 'categories', 'labels', 'labeled', 'label']:
self.type_of_estimator = 'classifier'
else:
print('Invalid value for "type_of_estimator". Please pass in either "regressor" or "classifier". You passed in: ' + type_of_estimator)
raise ValueError('Invalid value for "type_of_estimator". Please pass in either "regressor" or "classifier". You passed in: ' + type_of_estimator)
self.column_descriptions = column_descriptions
self.verbose = verbose
self.trained_pipeline = None
self._scorer = None
self.date_cols = []
# Later on, if this is a regression problem, we will possibly take the natural log of our y values for training, but we will still want to return the predictions in their normal scale (not the natural log values)
self.took_log_of_y = False
self.take_log_of_y = False
self._validate_input_col_descriptions()
self.name = name
def _validate_input_col_descriptions(self):
found_output_column = False
self.cols_to_ignore = []
expected_vals = set(['categorical', 'text', 'nlp'])
for key, value in self.column_descriptions.items():
value = value.lower()
self.column_descriptions[key] = value
if value == 'output':
self.output_column = key
found_output_column = True
elif value == 'date':
self.date_cols.append(key)
elif value == 'ignore':
self.cols_to_ignore.append(key)
elif value in expected_vals:
pass
else:
raise ValueError('We are not sure how to process this column of data: ' + str(value) + '. Please pass in "output", "categorical", "ignore", "nlp", or "date".')
if found_output_column is False:
print('Here is the column_descriptions that was passed in:')
print(self.column_descriptions)
raise ValueError('In your column_descriptions, please make sure exactly one column has the value "output", which is the value we will be training models to predict.')
# We will be adding one new categorical variable for each date col
# Be sure to add it here so the rest of the pipeline knows to handle it as a categorical column
for date_col in self.date_cols:
self.column_descriptions[date_col + '_day_part'] = 'categorical'
self.cols_to_ignore = set(self.cols_to_ignore)
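# Example (illustrative column names): given
#     column_descriptions = {'price': 'output', 'sale_date': 'date', 'city': 'categorical'}
# this method records 'sale_date' in self.date_cols, and registers the derived
# 'sale_date_day_part' column as categorical so the rest of the pipeline knows how to handle it.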
# We use _construct_pipeline at both the start and end of our training.
# At the start, it constructs the pipeline from scratch
# At the end, it takes FeatureSelection out after we've used it to restrict DictVectorizer, and adds final_model back in if we did grid search on it
def _construct_pipeline(self, model_name='LogisticRegression', trained_pipeline=None, final_model=None, feature_learning=False, final_model_step_name='final_model', prediction_interval=False, keep_cat_features=False, is_hp_search=False):
pipeline_list = []
if self.user_input_func is not None:
if trained_pipeline is not None:
pipeline_list.append(('user_func', trained_pipeline.named_steps['user_func']))
elif self.transformation_pipeline is None:
print('Including the user_input_func in the pipeline! Please remember to return X, and not modify the length or order of X at all.')
print('Your function will be called as the first step of the pipeline at both training and prediction times.')
pipeline_list.append(('user_func', FunctionTransformer(func=self.user_input_func, validate=False)))
# These parts will be included no matter what.
if trained_pipeline is not None:
pipeline_list.append(('basic_transform', trained_pipeline.named_steps['basic_transform']))
else:
pipeline_list.append(('basic_transform', utils_data_cleaning.BasicDataCleaning(column_descriptions=self.column_descriptions)))
if self.perform_feature_scaling is True:
if trained_pipeline is not None:
pipeline_list.append(('scaler', trained_pipeline.named_steps['scaler']))
else:
if model_name[:12] == 'DeepLearning':
min_percentile = 0.0
max_percentile = 1.0
pipeline_list.append(('scaler', utils_scaling.CustomSparseScaler(self.column_descriptions, truncate_large_values=True)))
else:
pipeline_list.append(('scaler', utils_scaling.CustomSparseScaler(self.column_descriptions)))
if trained_pipeline is not None:
pipeline_list.append(('dv', trained_pipeline.named_steps['dv']))
else:
pipeline_list.append(('dv', DataFrameVectorizer.DataFrameVectorizer(sparse=True, column_descriptions=self.column_descriptions, keep_cat_features=keep_cat_features)))
if self.perform_feature_selection == True:
if trained_pipeline is not None:
# This is the step we are trying to remove from the trained_pipeline, since it has already been combined with dv using dv.restrict
pass
else:
pipeline_list.append(('feature_selection', utils_feature_selection.FeatureSelectionTransformer(type_of_estimator=self.type_of_estimator, column_descriptions=self.column_descriptions, feature_selection_model='SelectFromModel') ))
if trained_pipeline is not None:
# First, check and see if we have any steps with some version of keyword matching on something like 'intermediate_model_predictions' or 'feature_learning_model' or 'ensemble_model' or something like that in them.
# add all of those steps
# then try to add in the final_model that was passed in as a param
# if it's none, then we've already added in the final model with our keyword matching above!
for step in trained_pipeline.steps:
step_name = step[0]
if step_name[-6:] == '_model':
pipeline_list.append((step_name, trained_pipeline.named_steps[step_name]))
# Handling the case where we have run gscv on just the final model itself, and we now need to integrate it back into the rest of the pipeline
if final_model is not None:
pipeline_list.append((final_model_step_name, final_model))
else:
try:
    training_features = self._get_trained_feature_names()
except Exception:
    training_features = None
training_prediction_intervals = False
params = None
if prediction_interval is not False:
params = {}
params['objective'] = 'quantile'
params['alpha'] = prediction_interval
training_prediction_intervals = True
elif feature_learning == False:
# Do not pass in our training_params for the feature_learning model
params = self.training_params
final_model = utils_models.get_model_from_name(model_name, training_params=params)
pipeline_list.append(('final_model', utils_model_training.FinalModelATC(model=final_model, type_of_estimator=self.type_of_estimator, ml_for_analytics=self.ml_for_analytics, name=self.name, _scorer=self._scorer, feature_learning=feature_learning, uncertainty_model=self.need_to_train_uncertainty_model, training_prediction_intervals=training_prediction_intervals, column_descriptions=self.column_descriptions, training_features=training_features, keep_cat_features=keep_cat_features, is_hp_search=is_hp_search, X_test=self.X_test, y_test=self.y_test)))
constructed_pipeline = utils.ExtendedPipeline(pipeline_list, keep_cat_features=keep_cat_features)
return constructed_pipeline
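# For reference, the constructed pipeline follows this fixed step order (bracketed steps are
# optional, depending on the settings above):
#     [user_func] -> basic_transform -> [scaler] -> dv -> [feature_selection] -> [*_model steps] -> final_model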
def _get_estimator_names(self):
if self.type_of_estimator == 'regressor':
base_estimators = ['GradientBoostingRegressor']
if self.compare_all_models != True:
return base_estimators
else:
base_estimators.append('RANSACRegressor')
base_estimators.append('RandomForestRegressor')
base_estimators.append('LinearRegression')
base_estimators.append('AdaBoostRegressor')
base_estimators.append('ExtraTreesRegressor')
return base_estimators
elif self.type_of_estimator == 'classifier':
base_estimators = ['GradientBoostingClassifier']
if self.compare_all_models != True:
return base_estimators
else:
base_estimators.append('LogisticRegression')
base_estimators.append('RandomForestClassifier')
return base_estimators
else:
raise TypeError('type_of_estimator must be either "classifier" or "regressor".')
def _prepare_for_training(self, X):
# We accept input as either a DataFrame, or as a list of dictionaries. Internally, we use DataFrames. So if the user gave us a list, convert it to a DataFrame here.
if isinstance(X, list):
X_df = pd.DataFrame(X)
del X
else:
X_df = X
# To keep this as light in memory as possible, immediately remove any columns that the user has already told us should be ignored
if len(self.cols_to_ignore) > 0:
X_df = utils.safely_drop_columns(X_df, self.cols_to_ignore)
# Having duplicate columns can really screw things up later. Remove them here, with user logging to tell them what we're doing
X_df = utils.drop_duplicate_columns(X_df)
# If we're writing training results to file, create the new empty file name here
if self.write_gs_param_results_to_file:
self.gs_param_file_name = 'most_recent_pipeline_grid_search_result.csv'
try:
    os.remove(self.gs_param_file_name)
except OSError:
    pass
# Remove the output column from the dataset, and store it into the y variable
y = list(X_df[self.output_column])
X_df = X_df.drop(self.output_column, axis=1)
# Drop all rows that have an empty value for our output column
# User logging so they can adjust if they pass in a bunch of bad values:
X_df, y = utils.drop_missing_y_vals(X_df, y, self.output_column)
# If this is a classifier, try to turn all the y values into proper ints
# Some classifiers play more nicely if you give them category labels as ints rather than strings, so we'll make our jobs easier here if we can.
if self.type_of_estimator == 'classifier':
# The entire column must be turned into floats. If any value fails, don't convert anything in the column to floats
try:
y_ints = []
for val in y:
y_ints.append(int(val))
y = y_ints
except (ValueError, TypeError):
    pass
else:
# If this is a regressor, turn all the values into floats if possible, and remove this row if they cannot be turned into floats
indices_to_delete = []
y_floats = []
bad_vals = []
for idx, val in enumerate(y):
try:
float_val = utils_data_cleaning.clean_val(val)
y_floats.append(float_val)
except ValueError as err:
indices_to_delete.append(idx)
bad_vals.append(val)
y = y_floats
# Even more verbose logging here since these values are not just missing, they're strings for a regression problem
if len(indices_to_delete) > 0:
print('The y values given included some bad values that the machine learning algorithms will not be able to train on.')
print('The rows at these indices have been deleted because their y value could not be turned into a float:')
print(indices_to_delete)
print('These were the bad values')
print(bad_vals)
X_df = X_df.drop(X_df.index[indices_to_delete])
return X_df, y
def _consolidate_pipeline(self, transformation_pipeline, final_model=None):
# First, restrict our DictVectorizer or DataFrameVectorizer
# This goes through and has DV only output the items that have passed our support mask
# This has a number of benefits: speeds up computation, reduces memory usage, and combines several transforms into a single, easy step
# It also significantly reduces the size of dv.vocabulary_ which can get quite large
try:
feature_selection = transformation_pipeline.named_steps['feature_selection']
feature_selection_mask = feature_selection.support_mask
transformation_pipeline.named_steps['dv'].restrict(feature_selection_mask)
except KeyError:
pass
# We have overloaded our _construct_pipeline method to work both to create a new pipeline from scratch at the start of training, and to go through a trained pipeline in exactly the same order and steps to take a dedicated FeatureSelection model out of an already trained pipeline
# In this way, we ensure that we only have to maintain a single centralized piece of logic for the correct order a pipeline should follow
trained_pipeline_without_feature_selection = self._construct_pipeline(trained_pipeline=transformation_pipeline, final_model=final_model)
return trained_pipeline_without_feature_selection
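# For reference, DictVectorizer.restrict is a standard scikit-learn API; a standalone sketch
# with toy data (unrelated to auto_ml):
#
#     from sklearn.feature_extraction import DictVectorizer
#     from sklearn.feature_selection import SelectKBest, chi2
#     v = DictVectorizer()
#     X = v.fit_transform([{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}])
#     support = SelectKBest(chi2, k=2).fit(X, [0, 1])
#     v.restrict(support.get_support())  # v now only outputs the selected features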
def set_params_and_defaults(self, X_df, user_input_func=None, optimize_final_model=None, write_gs_param_results_to_file=True, perform_feature_selection=None, verbose=True, X_test=None, y_test=None, ml_for_analytics=True, take_log_of_y=None, model_names=None, perform_feature_scaling=True, calibrate_final_model=False, _scorer=None, scoring=None, verify_features=False, training_params=None, grid_search_params=None, compare_all_models=False, cv=2, feature_learning=False, fl_data=None, optimize_feature_learning=False, train_uncertainty_model=None, uncertainty_data=None, uncertainty_delta=None, uncertainty_delta_units=None, calibrate_uncertainty=False, uncertainty_calibration_settings=None, uncertainty_calibration_data=None, uncertainty_delta_direction='both', advanced_analytics=True, analytics_config=None, prediction_intervals=None, predict_intervals=None, ensemble_config=None, trained_transformation_pipeline=None, transformed_X=None, transformed_y=None, return_transformation_pipeline=False, X_test_already_transformed=False):
self.user_input_func = user_input_func
self.optimize_final_model = optimize_final_model
self.write_gs_param_results_to_file = write_gs_param_results_to_file
self.ml_for_analytics = ml_for_analytics
if X_test is not None:
X_test, y_test = utils.drop_missing_y_vals(X_test, y_test, self.output_column)
self.X_test = X_test
self.y_test = y_test
self.X_test_already_transformed = X_test_already_transformed
if self.type_of_estimator == 'regressor':
self.take_log_of_y = take_log_of_y
self.compare_all_models = compare_all_models
# We expect model_names to be a list of strings
if isinstance(model_names, str):
# If the user passes in a single string, put it in a list
self.model_names = [model_names]
else:
self.model_names = model_names
# If the user did not pass in a valid value for model_names (either None, or a list whose only item is None), fall back to our defaults
if self.model_names is None or (len(self.model_names) == 1 and self.model_names[0] is None):
self.model_names = self._get_estimator_names()
if 'DeepLearningRegressor' in self.model_names or 'DeepLearningClassifier' in self.model_names:
if perform_feature_scaling is None or perform_feature_scaling == True:
self.perform_feature_scaling = True
else:
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
print('Heard that we should not perform feature_scaling, but we should train a Deep Learning model. Note that feature_scaling is typically useful and frequently essential for deep learning. We STRONGLY suggest not setting perform_feature_scaling=False.')
warnings.warn('It is a best practice, and often necessary for training, to perform_feature_scaling while doing Deep Learning.')
self.perform_feature_scaling = perform_feature_scaling
else:
self.perform_feature_scaling = perform_feature_scaling
self.calibrate_final_model = calibrate_final_model
self.scoring = scoring
if training_params is None:
self.training_params = {}
else:
self.training_params = training_params
self.user_gs_params = grid_search_params
if self.user_gs_params is not None:
self.optimize_final_model = True
self.cv = cv
if ensemble_config is None:
self.ensemble_config = []
else:
self.ensemble_config = ensemble_config
self.calibrate_uncertainty = calibrate_uncertainty
self.uncertainty_calibration_data = uncertainty_calibration_data
if uncertainty_delta_direction is None:
uncertainty_delta_direction = 'both'
self.uncertainty_delta_direction = uncertainty_delta_direction.lower()
if self.uncertainty_delta_direction not in ['both', 'directional']:
raise ValueError('Please pass in either "both" or "directional" for uncertainty_delta_direction')
if uncertainty_calibration_settings is None:
self.uncertainty_calibration_settings = {
'num_buckets': 10
, 'percentiles': [25, 50, 75]
}
else:
self.uncertainty_calibration_settings = uncertainty_calibration_settings
if advanced_analytics is None:
self.advanced_analytics = True
else:
self.advanced_analytics = advanced_analytics
default_analytics_config = {
'percent_rows': 0.1
, 'min_rows': 10000
, 'cols_to_ignore': []
, 'file_name': 'auto_ml_analytics_results_' + self.output_column + '.csv'
, 'col_std_multiplier': 0.5
}
if analytics_config is None:
self.analytics_config = default_analytics_config
else:
updated_analytics_config = default_analytics_config.copy()
updated_analytics_config.update(analytics_config)  # dict.update mutates in place and returns None
self.analytics_config = updated_analytics_config
self.perform_feature_selection = perform_feature_selection
# Let the user pass in 'prediction_intervals' and 'predict_intervals' interchangeably
if predict_intervals is not None and prediction_intervals is None:
prediction_intervals = predict_intervals
if prediction_intervals is None:
self.calculate_prediction_intervals = False
else:
if isinstance(prediction_intervals, bool):
    self.calculate_prediction_intervals = prediction_intervals
else:
    # A non-bool value lets the user pass in their own bounds, rather than having to use our default 5% and 95% bounds
    self.calculate_prediction_intervals = True
if prediction_intervals == True:
self.prediction_intervals = [0.05, 0.95]
else:
self.prediction_intervals = prediction_intervals
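# Example (a hedged sketch): these two calls are equivalent ways to request custom interval bounds:
#     ml_predictor.train(df_train, predict_intervals=[0.1, 0.9])
#     ml_predictor.train(df_train, prediction_intervals=[0.1, 0.9])
# while predict_intervals=True uses the default [0.05, 0.95] bounds.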
self.train_uncertainty_model = train_uncertainty_model
if self.train_uncertainty_model == True and self.type_of_estimator == 'classifier':
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
print('Right now uncertainty predictions are only supported for regressors. The ".predict_proba()" method of classifiers is a reasonable workaround if you are looking for uncertainty predictions for a classifier')
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
raise ValueError('train_uncertainty_model is only supported for regressors')
self.need_to_train_uncertainty_model = train_uncertainty_model
self.uncertainty_data = uncertainty_data
# TODO: more input validation for calibrate_uncertainty
# make sure we have all the base features in place before taking in the advanced settings
# make sure people include num_buckets and 'percentiles' in their uc_settings
# make sure the uc_data has the output column we need for the base predictor
if uncertainty_delta is not None:
if uncertainty_delta_units is None:
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
print('We received an uncertainty_delta, but do not know the units this is measured in. Please pass in one of ["absolute", "percentage"]')
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
raise ValueError('We received a value for uncertainty_delta, but the data passed in for uncertainty_delta_units is missing')
self.uncertainty_delta = uncertainty_delta
self.uncertainty_delta_units = uncertainty_delta_units
else:
self.uncertainty_delta = 'std'
self.uncertainty_delta_units = 'absolute'
if self.train_uncertainty_model == True and self.uncertainty_data is None:
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
print('Saw that train_uncertainty_model is True, but there is no data passed in for uncertainty_data, which is needed to train the uncertainty estimator')
warnings.warn('Please pass in uncertainty_data which is the dataset that will be used to train the uncertainty estimator.')
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
raise ValueError('The data passed in for uncertainty_data is missing')
self.optimize_feature_learning = optimize_feature_learning
self.feature_learning = feature_learning
if self.feature_learning == True:
if fl_data is None:
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
print('Saw that feature_learning is True, but there is no data passed in for fl_data, which is needed to train the feature_learning estimator')
warnings.warn('Please pass in fl_data which is the dataset that will be used to train the feature_learning estimator.')
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
raise ValueError('The data passed in for fl_data is missing')
self.fl_data = fl_data
if self.perform_feature_scaling == False:
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
print('Heard that we should not perform feature_scaling, but we should perform feature_learning. Note that feature_scaling is typically useful for deep learning, which is what we use for feature_learning. If you want a little more model accuracy from the feature_learning step, consider not passing in perform_feature_scaling=False')
warnings.warn('Consider allowing auto_ml to perform_feature_scaling in conjunction with feature_learning')
if self.perform_feature_selection == True:
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
print('We are not currently supporting perform_feature_selection with this release of feature_learning. We will override perform_feature_selection to False and continue with training.')
warnings.warn('perform_feature_selection=True is not currently supported with feature_learning.')
self.perform_feature_selection = False
if (isinstance(X_df, pd.DataFrame) and X_df.equals(fl_data)) or (isinstance(X_df, list) and X_df == fl_data):
print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
print('You must pass in different data for fl_data and your training data. This is true both philosophically (you are much more likely to overfit if fl_data == training_data), and logistically (we remove the y column from both datasets, which will throw an error)')
print('If you are looking for a quick and easy way of splitting the data, use scikit-learn\'s train_test_split: df_train, fl_data = train_test_split(df_train, test_size=0.33) ')
print('Or, if you insist on using the same dataset for both, you must at least copy it:')
print('ml_predictor.train(df_train, feature_learning=True, fl_data=df_train.copy())')
warnings.warn('Your fl_data and df_train must be different datasets. Use train_test_split, or at least copy the data for your fl_data')
if trained_transformation_pipeline is None:
self.transformation_pipeline = None
else:
print('We will be using the previously trained transformation pipeline you passed in')
print('Be cautious when passing in a trained transformation pipeline- make sure that it is trained on exactly the same data.')
self.transformation_pipeline = trained_transformation_pipeline
if transformed_X is not None and transformed_y is None:
print('Please pass in both a transformed_X and transformed_y')
raise ValueError('Please pass in transformed_y if you are passing in transformed_X')
self.return_transformation_pipeline = return_transformation_pipeline
# We are taking in scoring here to deal with the unknown behavior around multilabel classification below
def _clean_data_and_prepare_for_training(self, data, scoring):
X_df, y = self._prepare_for_training(data)
if self.take_log_of_y:
y = [math.log(val) for val in y]
self.took_log_of_y = True
self.X_df = X_df
self.y = y
# Unless the user has told us to, don't perform feature selection unless we have a pretty decent amount of data
if self.perform_feature_selection is None:
if len(X_df.columns) < 50 or len(X_df) < 100000:
self.perform_feature_selection = False
else:
self.perform_feature_selection = True
self.set_scoring(y, scoring=scoring)
return X_df, y
def set_scoring(self, y, scoring=None):
# TODO: we're not using ClassificationScorer for multilabel classification. Why?
# Probably has to do with self.scoring vs self._scorer = scoring
if self.type_of_estimator == 'classifier':
if len(set(y)) > 2 and self.scoring is None:
self.scoring = 'accuracy_score'
else:
scoring = utils_scoring.ClassificationScorer(self.scoring)
self._scorer = scoring
else:
scoring = utils_scoring.RegressionScorer(self.scoring)
self._scorer = scoring
def fit_feature_learning_and_transformation_pipeline(self, X_df, fl_data, y):
fl_data_cleaned, fl_y = self._clean_data_and_prepare_for_training(fl_data, self.scoring)
# Only import this if we have to, because it takes a while to import in some environments
from keras.models import Model
len_X_df = len(X_df)
combined_training_data = pd.concat([X_df, fl_data_cleaned], axis=0)
combined_y = y + fl_y
if self.type_of_estimator == 'classifier':
fl_estimator_names = ['DeepLearningClassifier']
elif self.type_of_estimator == 'regressor':
fl_estimator_names = ['DeepLearningRegressor']
# For performance reasons, I believe it is critical to have only one transformation pipeline, no matter how many estimators we eventually build on top of it. Getting predictions from a trained estimator is typically quite quick; we can easily get predictions from 10 trained models in a production-ready amount of time. But the transformation pipeline is not so quick that we can afford to duplicate it 10 times.
combined_transformed_data = self.fit_transformation_pipeline(combined_training_data, combined_y, fl_estimator_names)
fl_indices = [i for i in range(len_X_df, combined_transformed_data.shape[0])]
fl_data_transformed = combined_transformed_data[fl_indices]
# fit a train_final_estimator
feature_learning_step = self.train_ml_estimator(fl_estimator_names, self._scorer, fl_data_transformed, fl_y, feature_learning=True)
# Split off the final layer/find a way to get the output from the penultimate layer
fl_model = feature_learning_step.model
feature_output_model = Model(inputs=fl_model.model.input, outputs=fl_model.model.get_layer('penultimate_layer').output)
feature_learning_step.model = feature_output_model
# Add those to the list in our DV so we know what to do with them for analytics purposes
feature_learning_names = []
for idx in range(10):
feature_learning_names.append('feature_learning_' + str(idx + 1))
# TODO:
self.transformation_pipeline.named_steps['dv'].feature_names_ += feature_learning_names
# add the estimator to the end of our transformation pipeline
self.transformation_pipeline = self._construct_pipeline(trained_pipeline=self.transformation_pipeline, final_model=feature_learning_step, final_model_step_name='feature_learning_model')
# Pass our already-transformed X_df just through the feature_learning_step.transform. This avoids duplicate computation
indices = [i for i in range(len_X_df)]
X_df_transformed = combined_transformed_data[indices]
X_df = feature_learning_step.transform(X_df_transformed)
return X_df
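# For reference, the layer-splitting above uses the standard Keras functional API; a standalone
# sketch (model and layer names are illustrative):
#
#     from keras.models import Model
#     feature_extractor = Model(inputs=full_model.input,
#                               outputs=full_model.get_layer('penultimate_layer').output)
#     learned_features = feature_extractor.predict(X)  # activations of the penultimate layer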
def train(self, raw_training_data, user_input_func=None, optimize_final_model=None, write_gs_param_results_to_file=True, perform_feature_selection=None, verbose=True, X_test=None, y_test=None, ml_for_analytics=True, take_log_of_y=None, model_names=None, perform_feature_scaling=True, calibrate_final_model=False, _scorer=None, scoring=None, verify_features=False, training_params=None, grid_search_params=None, compare_all_models=False, cv=2, feature_learning=False, fl_data=None, optimize_feature_learning=False, train_uncertainty_model=False, uncertainty_data=None, uncertainty_delta=None, uncertainty_delta_units=None, calibrate_uncertainty=False, uncertainty_calibration_settings=None, uncertainty_calibration_data=None, uncertainty_delta_direction=None, advanced_analytics=None, analytics_config=None, prediction_intervals=None, predict_intervals=None, ensemble_config=None, trained_transformation_pipeline=None, transformed_X=None, transformed_y=None, return_transformation_pipeline=False, X_test_already_transformed=False):
self.set_params_and_defaults(raw_training_data, user_input_func=user_input_func, optimize_final_model=optimize_final_model, write_gs_param_results_to_file=write_gs_param_results_to_file, perform_feature_selection=perform_feature_selection, verbose=verbose, X_test=X_test, y_test=y_test, ml_for_analytics=ml_for_analytics, take_log_of_y=take_log_of_y, model_names=model_names, perform_feature_scaling=perform_feature_scaling, calibrate_final_model=calibrate_final_model, _scorer=_scorer, scoring=scoring, verify_features=verify_features, training_params=training_params, grid_search_params=grid_search_params, compare_all_models=compare_all_models, cv=cv, feature_learning=feature_learning, fl_data=fl_data, optimize_feature_learning=False, train_uncertainty_model=train_uncertainty_model, uncertainty_data=uncertainty_data, uncertainty_delta=uncertainty_delta, uncertainty_delta_units=uncertainty_delta_units, calibrate_uncertainty=calibrate_uncertainty, uncertainty_calibration_settings=uncertainty_calibration_settings, uncertainty_calibration_data=uncertainty_calibration_data, uncertainty_delta_direction=uncertainty_delta_direction, prediction_intervals=prediction_intervals, predict_intervals=predict_intervals, ensemble_config=ensemble_config, trained_transformation_pipeline=trained_transformation_pipeline, transformed_X=transformed_X, transformed_y=transformed_y, return_transformation_pipeline=return_transformation_pipeline, X_test_already_transformed=X_test_already_transformed)
if verbose:
print('Welcome to auto_ml! We\'re about to go through and make sense of your data using machine learning, and give you a production-ready pipeline to get predictions with.\n')
print('If you have any issues, or new feature ideas, let us know at https://github.com/ClimbsRocks/auto_ml')
if transformed_X is None:
X_df, y = self._clean_data_and_prepare_for_training(raw_training_data, scoring)
if self.transformation_pipeline is None:
if self.feature_learning == True:
X_df = self.fit_feature_learning_and_transformation_pipeline(X_df, fl_data, y)
else:
# If the user passed in a valid value for model_names (not None, and not a list where the only thing is None)
if self.model_names is not None and not (len(self.model_names) == 1 and self.model_names[0] is None):
estimator_names = self.model_names
else:
estimator_names = self._get_estimator_names()
X_df = self.fit_transformation_pipeline(X_df, y, estimator_names)
else:
X_df = self.transformation_pipeline.transform(X_df)
else:
X_df, y = utils.drop_missing_y_vals(transformed_X, transformed_y, self.output_column)
self.set_scoring(y)
if self.X_test is not None and self.X_test_already_transformed == False:
self.X_test = self.transformation_pipeline.transform(self.X_test)
# This is our main logic for how we train the final model
self.trained_final_model = self.train_ml_estimator(self.model_names, self._scorer, X_df, y)
if self.ensemble_config is not None and len(self.ensemble_config) > 0:
self._train_ensemble(X_df, y)
if self.need_to_train_uncertainty_model == True:
self._create_uncertainty_model(uncertainty_data, scoring, y, uncertainty_calibration_data)
# Calibrate the probability predictions from our final model
if self.calibrate_final_model is True:
self.trained_final_model.model = self._calibrate_final_model(self.trained_final_model.model, X_test, y_test)
if self.calculate_prediction_intervals is True:
# TODO: parallelize these!
interval_predictors = []
for percentile in self.prediction_intervals:
interval_predictor = self.train_ml_estimator(['LGBMRegressor'], self._scorer, X_df, y, prediction_interval=percentile)
predictor_tup = ('interval_{}'.format(percentile), interval_predictor)
interval_predictors.append(predictor_tup)
self.trained_final_model.interval_predictors = interval_predictors
self.trained_pipeline = self._consolidate_pipeline(self.transformation_pipeline, self.trained_final_model)
# verify_features is not enabled by default. It adds a significant amount to the file size of the saved pipelines.
# If you are interested in submitting a PR to reduce the saved file size, there are definitely some optimizations you can make!
if verify_features == True:
self._prepare_for_verify_features()
# Delete values that we no longer need that are just taking up space.
del self.X_test
del self.y_test
del self.X_test_already_transformed
del X_df
if self.return_transformation_pipeline:
return self.transformation_pipeline
return self
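# Example calls (a hedged sketch; dataframe names are illustrative):
#     # Train while scoring against a holdout set, and compare several model families:
#     ml_predictor.train(df_train, X_test=df_test, y_test=y_test, compare_all_models=True)
#     # Or keep the fitted transformation pipeline for reuse on a later .train() call:
#     t_pipeline = ml_predictor.train(df_train, return_transformation_pipeline=True)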
def _create_uncertainty_model(self, uncertainty_data, scoring, y, uncertainty_calibration_data):
# 1. Add base_prediction to our dv for analytics purposes
# Note that we will have to be cautious that things all happen in the exact same order as we expand what we do post-DV over time
# Adding this one directly- we don't want dfv to transform it necessarily, we just want it there for getting feature_names later for printing feature_importances
self.transformation_pipeline.named_steps['dv'].feature_names_.append('base_prediction')
# 2. Get predictions from our base predictor on our uncertainty data
uncertainty_data, y_uncertainty = self._clean_data_and_prepare_for_training(uncertainty_data, scoring)
uncertainty_data_transformed = self.transformation_pipeline.transform(uncertainty_data)
base_predictions = self.trained_final_model.predict(uncertainty_data_transformed)
base_predictions = [[val] for val in base_predictions]
base_predictions = np.array(base_predictions)
uncertainty_data_transformed = scipy.sparse.hstack([uncertainty_data_transformed, base_predictions], format='csr')
# 2A. Grab the user's definition of uncertainty, and create the output values 'is_uncertain_prediction'
# post-mvp: allow the user to pass in stuff like 1.5*std
if self.uncertainty_delta == 'std':
y_std = np.std(y)
self.uncertainty_delta = 0.5 * y_std
is_uncertain_predictions = self.define_uncertain_predictions(base_predictions, y_uncertainty)
analytics_results = pd.Series(is_uncertain_predictions)
print('\n\nHere is the percent of values in our uncertainty training data that are classified as uncertain:')
percent_uncertain = sum(is_uncertain_predictions) * 1.0 / len(is_uncertain_predictions)
print(percent_uncertain)
if percent_uncertain == 1.0:
print('Using the current definition, all rows are classified as uncertain')
print('Here is our current definition:')
print('self.uncertainty_delta')
print(self.uncertainty_delta)
print('self.uncertainty_delta_units')
print(self.uncertainty_delta_units)
print('And here is a summary of our predictions:')
print(pd.Series(y_uncertainty).describe(include='all'))
warnings.warn('All predictions in our uncertainty training data are classified as uncertain. Please redefine uncertainty so there is a mix of certain and uncertain predictions to train an uncertainty model.')
return self
# 3. train our uncertainty predictor
uncertainty_estimator_names = ['GradientBoostingClassifier']
self.trained_uncertainty_model = self.train_ml_estimator(uncertainty_estimator_names, self._scorer, uncertainty_data_transformed, is_uncertain_predictions)
# 4. grab the entire uncertainty FinalModelATC object, and put it as a property on our base predictor's FinalModelATC (something like .trained_uncertainty_model). It's important to grab this entire object, for all of the edge-case handling we've built in.
self.trained_final_model.uncertainty_model = self.trained_uncertainty_model
if self.calibrate_uncertainty == True:
uncertainty_calibration_data_transformed = self.transformation_pipeline.transform(self.uncertainty_calibration_data)
uncertainty_calibration_predictions = self.trained_final_model.predict_uncertainty(uncertainty_calibration_data_transformed)
actuals = list(uncertainty_calibration_data[self.output_column])
predictions = uncertainty_calibration_predictions['base_prediction']
deltas = predictions - actuals
uncertainty_calibration_predictions['actual_deltas'] = deltas
probas = uncertainty_calibration_predictions.uncertainty_prediction
num_buckets = self.uncertainty_calibration_settings['num_buckets']
# If we have overlapping bucket definitions, pandas will drop those duplicates, but won't drop the duplicate labels
# So we'll try bucketing one time, then get the actual number of bins from that
try:
bucket_results, bins = pd.qcut(probas, q=num_buckets, retbins=True, duplicates='drop')
except TypeError:
bucket_results, bins = pd.qcut(probas, q=num_buckets, retbins=True)
# now that we know the actual number of bins, we can create our labels, then use those to create our final set of buckets
bucket_labels = list(range(1, len(bins)))
try:
bucket_results = pd.qcut(probas, q=num_buckets, labels=bucket_labels, duplicates='drop')
except TypeError:
bucket_results = pd.qcut(probas, q=num_buckets, labels=bucket_labels)
uncertainty_calibration_predictions['bucket_num'] = bucket_results
uc_results = OrderedDict()
for bucket in bucket_labels:
dataset = uncertainty_calibration_predictions[uncertainty_calibration_predictions['bucket_num'] == bucket]
deltas = dataset['actual_deltas']
uc_results[bucket] = OrderedDict()
uc_results[bucket]['bucket_num'] = bucket
# FUTURE: add in rmse and maybe something like median_ae
# FUTURE: add in max_value for each bucket_num
uc_results[bucket]['max_proba'] = np.max(dataset['uncertainty_prediction'])
for perc in self.uncertainty_calibration_settings['percentiles']:
delta_at_percentile = np.percentile(deltas, perc)
uc_results[bucket]['percentile_' + str(perc) + '_delta'] = delta_at_percentile
# make the max_proba of our last bucket_num 1
uc_results[bucket_labels[-1]]['max_proba'] = 1
print('Here are the uncertainty_calibration results, for each bucket of predicted probabilities')
for num in uc_results:
print(uc_results[num])
self.trained_final_model.uc_results = uc_results
self.need_to_train_uncertainty_model = False
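# For reference, the duplicates='drop' behavior of pd.qcut (standalone pandas sketch):
#     buckets, bins = pd.qcut([0.1, 0.1, 0.1, 0.9], q=4, retbins=True, duplicates='drop')
# Repeated values can collapse quantile edges, so fewer than q bins may come back; that is why
# bucket_labels is derived from len(bins) after a first bucketing pass above.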
def _prepare_for_verify_features(self):
# Save the features we used for training to our FinalModelATC instance.
# This lets us provide useful information to the user when they call .predict(data, verbose=True)
trained_feature_names = self._get_trained_feature_names()
self.trained_pipeline.set_params(final_model__training_features=trained_feature_names)
# We will need to know which columns are categorical/ignored/nlp when verifying features
self.trained_pipeline.set_params(final_model__column_descriptions=self.column_descriptions)
def _calibrate_final_model(self, trained_model, X_test, y_test):
if X_test is None or y_test is None:
print('X_test or y_test was not present while trying to calibrate the final model')
print('Please pass in both X_test and y_test to calibrate the final model')
print('Skipping model calibration')
return trained_model
print('Now calibrating the final model so the probability predictions line up with the observed probabilities in the X_test and y_test datasets you passed in.')
print('Note: the validation scores printed above are truly validation scores: they were scored before the model was calibrated to this data.')
print('However, now that we are calibrating on the X_test and y_test data you gave us, it is no longer accurate to call this data validation data, since the model is being calibrated to it. As such, you must now report a validation score on a different dataset, or report the validation score used above before the model was calibrated to X_test and y_test. ')
if len(X_test) < 1000:
calibration_method = 'sigmoid'
else:
calibration_method = 'isotonic'
calibrated_classifier = CalibratedClassifierCV(trained_model, method=calibration_method, cv='prefit')
# We need to make sure X_test has been processed the exact same way y_test has.
X_test_processed = self.transformation_pipeline.transform(X_test)
try:
calibrated_classifier = calibrated_classifier.fit(X_test_processed, y_test)
except TypeError as e:
if scipy.sparse.issparse(X_test_processed):
X_test_processed = X_test_processed.toarray()
calibrated_classifier = calibrated_classifier.fit(X_test_processed, y_test)
else:
raise(e)
return calibrated_classifier
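# For reference, prefit calibration in scikit-learn (a standalone sketch; sklearn versions from
# this project's era accept cv='prefit'):
#     calibrated = CalibratedClassifierCV(fitted_classifier, method='sigmoid', cv='prefit')
#     calibrated.fit(X_holdout, y_holdout)  # must be data the model was not trained on
#     probas = calibrated.predict_proba(X_new)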
def fit_single_pipeline(self, X_df, y, model_name, feature_learning=False, prediction_interval=False):
full_pipeline = self._construct_pipeline(model_name=model_name, feature_learning=feature_learning, prediction_interval=prediction_interval, keep_cat_features=self.transformation_pipeline.keep_cat_features)
ppl = full_pipeline.named_steps['final_model']
if self.verbose:
print('\n\n********************************************************************************************')
if self.name is not None:
print(self.name)
if prediction_interval is not False:
print('About to fit a {} quantile regressor to predict the prediction_interval for the {}th percentile'.format(model_name, int(prediction_interval * 100)))
else:
print('About to fit the pipeline for the model ' + model_name + ' to predict ' + self.output_column)
print('Started at:')
start_time = datetime.datetime.now().replace(microsecond=0)
print(start_time)
ppl.fit(X_df, y)
if self.verbose:
print('Finished training the pipeline!')
print('Total training time:')
print(datetime.datetime.now().replace(microsecond=0) - start_time)
# Don't report feature_responses (or nearly anything else) if this is just the feature_learning stage
# That saves a considerable amount of time
if feature_learning == False:
self.print_results(model_name, ppl, X_df, y)
return ppl
# We have broken our model training into separate components. The first component is always going to be fitting a transformation pipeline. The great part about separating the feature transformation step is that now we can perform other work on the final step, and not have to repeat the sometimes time-consuming step of the transformation pipeline.
# NOTE: if included, we will be fitting a feature selection step here. This can get messy later on with ensembling if we end up training on different y values.
def fit_transformation_pipeline(self, X_df, y, model_names):
keep_cat_features = True
for model_name in model_names:
keep_cat_features = keep_cat_features and model_name in ['LGBMRegressor', 'LGBMClassifier', 'CatBoostRegressor', 'CatBoostClassifier']
self.keep_cat_features = keep_cat_features
ppl = self._construct_pipeline(model_name=model_names[0], keep_cat_features=self.keep_cat_features)
ppl.steps.pop()
# We are intentionally overwriting X_df here to try to save some memory space
X_df = ppl.fit_transform(X_df, y)
self.transformation_pipeline = self._consolidate_pipeline(ppl)
return X_df
def create_feature_responses(self, model, X_transformed, y, top_features=None):
print('Calculating feature responses, for advanced analytics.')
if top_features is None:
top_features = self._get_trained_feature_names()
# figure out how many rows to keep
orig_row_count = X_transformed.shape[0]
orig_column_count = X_transformed.shape[1]
# If we have fewer than 10000 rows, use all of them, regardless of user input
# This approach only works if there is a decent number of rows, so we put some safeguards in place to keep the user from getting results that are too misleading
row_multiplier = 1
if orig_column_count > 1000:
row_multiplier = 0.25
if orig_row_count <= 10000:
num_rows_to_use = orig_row_count
if row_multiplier < 1:
X, ignored_X, y, ignored_y = train_test_split(X_transformed, y, train_size=row_multiplier )
else:
X = X_transformed
else:
percent_row_count = int(self.analytics_config['percent_rows'] * orig_row_count)
num_rows_to_use = min(orig_row_count, percent_row_count, 10000)
num_rows_to_use = int(num_rows_to_use * row_multiplier)
X, ignored_X, y, ignored_y = train_test_split(X_transformed, y, train_size=num_rows_to_use)
if scipy.sparse.issparse(X):
X = X.toarray()
# Get our baseline predictions
if self.type_of_estimator == 'regressor':
base_predictions = model.predict(X)
elif self.type_of_estimator == 'classifier':
base_predictions = model.predict_proba(X)
base_predictions = [x[1] for x in base_predictions]
feature_names = self._get_trained_feature_names()
all_results = []
for col_idx, col_name in enumerate(feature_names):
if col_name not in top_features:
continue
col_result = {}
col_result['Feature Name'] = col_name
if col_name[:4] != 'nlp_' and '=' not in col_name and self.column_descriptions.get(col_name, False) != 'categorical':
col_std = np.nanstd(X[:, col_idx])
col_delta = self.analytics_config['col_std_multiplier'] * col_std
col_result['Delta'] = col_delta
# TODO: min_delta
# get the unique vals
# sort them
# go through and find the min_delta between consecutive vals
# make sure col_std is greater than min_delta
# if it is not, set col_std to min_delta
# Increment the values of this column by the std
X[:, col_idx] += col_delta
if self.type_of_estimator == 'regressor':
predictions = model.predict(X)
elif self.type_of_estimator == 'classifier':
predictions = model.predict_proba(X)
predictions = [x[1] for x in predictions]
deltas = []
for pred_idx, pred in enumerate(predictions):
delta = pred - base_predictions[pred_idx]
deltas.append(delta)
col_result['FR_Incrementing'] = np.mean(deltas)
absolute_prediction_deltas = np.absolute(deltas)
col_result['FRI_abs'] = np.mean(absolute_prediction_deltas)
median_prediction = np.median(absolute_prediction_deltas)
col_result['FRI_MAD'] = median_prediction
X[:, col_idx] -= 2 * col_delta
if self.type_of_estimator == 'regressor':
predictions = model.predict(X)
elif self.type_of_estimator == 'classifier':
predictions = model.predict_proba(X)
predictions = [x[1] for x in predictions]
deltas = []
for pred_idx, pred in enumerate(predictions):
delta = pred - base_predictions[pred_idx]
deltas.append(delta)
col_result['FR_Decrementing'] = np.mean(deltas)
absolute_prediction_deltas = np.absolute(deltas)
col_result['FRD_abs'] = np.mean(absolute_prediction_deltas)
median_prediction = np.median(absolute_prediction_deltas)
col_result['FRD_MAD'] = median_prediction
# Put the column back to its original state
X[:, col_idx] += col_delta
all_results.append(col_result)
df_all_results = pd.DataFrame(all_results)
return df_all_results
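# Worked example of interpreting the output (illustrative numbers): if incrementing a feature by
# 0.5 * its std raises the mean prediction by 0.02, that feature's row will show
# FR_Incrementing = 0.02. FRI_abs and FRI_MAD summarize the magnitude of the per-row changes,
# which can exceed the signed mean when the feature's effect is non-monotonic across rows.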
def print_results(self, model_name, model, X, y):
if self.ml_for_analytics and model_name in ('LogisticRegression', 'RidgeClassifier', 'LinearRegression', 'Ridge'):
df_model_results = self._print_ml_analytics_results_linear_model(model)
sorted_model_results = df_model_results.sort_values(by='Coefficients', ascending=False)
sorted_model_results = sorted_model_results.reset_index(drop=True)
# only grab the top 100 features from X
top_features = set(sorted_model_results.head(n=100)['Feature Name'])
feature_responses = self.create_feature_responses(model, X, y, top_features)
self._join_and_print_analytics_results(feature_responses, sorted_model_results, sort_field='Coefficients')
elif self.ml_for_analytics and model_name in ['RandomForestClassifier', 'RandomForestRegressor', 'XGBClassifier', 'XGBRegressor', 'GradientBoostingRegressor', 'GradientBoostingClassifier', 'LGBMRegressor', 'LGBMClassifier', 'CatBoostRegressor', 'CatBoostClassifier']:
try:
df_model_results = self._print_ml_analytics_results_random_forest(model)
sorted_model_results = df_model_results.sort_values(by='Importance', ascending=False)