In [2]:
!pip install xgboost

Keyring is skipped due to an exception: 'keyring.backends'
[0m

In [2]:
import sys
import os

# Hardcode so we can use in any notebook
module_path = "/root/HAIP/services"
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
# from sharepoint_transfer.src import main as sharepoint_transfer_svc
from data_consolidation.src import main as data_consolidation_svc
from feature_engineering.src import main as feature_engineering_svc
from model_selection.src import main as model_selection_svc
from outcome_analysis.src import main as outcome_analysis_svc
from model_training.src import main as model_training_svc

## Set Measure Specific Settings

These will augment and override settings in "/root/HAIP/notebooks/services/config.json"

In [4]:
settings = {
    # Measure specific options
    "SHAREPOINT_MEASURE_DIR": "SEP-1 Clean",
    "MEASURE_SPECIFIC_FILENAME": "SEP-1.csv",
    "FULL_MEASURE_S3_PREFIX": "SEP-1",
    "filter_measure": "SEP_1", # make sure this is _ not -
    
    # Feature Engineering options
    "save_modeling_dataset_path": '/root/HAIP/data/SEP-1.csv',
    "backfill_prov_mean": False, # set to false to not backfill prov mean
    "backfill_lag": False, # set to false to not backfill lag
    "lag_to_add": 2, # add lag1/lag2
}

## Move Sharepoint files to S3

This is not possible in SageMaker Studio because we are trying to figure out Client ID/Secret for the Sharepoint REST API. For now, run the service locally using your own Office365 username/password

## Data Consolidation

Make sure the measure `settings` are changed.


Call `data_consolidation_main.main(settings)`


This will download the full-measure-data in S3, load it into a single data frame, normalize the columns, filter to only include `filter_measure` and then upload the result to S3.


Result csv and metadata are uploaded to https://s3.console.aws.amazon.com/s3/buckets/haip-measure-specific-data?region=us-east-1&tab=objects

In [6]:
# data_consolidation_svc.main(settings)

Downloading from haip-full-measure-data: SEP-1/01_2017.csv to tmp/split/01_2017.csv
Downloading from haip-full-measure-data: SEP-1/Timely and Effective Care - Hospital_01_2015.csv to tmp/split/Timely and Effective Care - Hospital_01_2015.csv
Downloading from haip-full-measure-data: SEP-1/Timely and Effective Care - Hospital_01_2018.csv to tmp/split/Timely and Effective Care - Hospital_01_2018.csv
Downloading from haip-full-measure-data: SEP-1/Timely and Effective Care - Hospital_01_2020.csv to tmp/split/Timely and Effective Care - Hospital_01_2020.csv
Downloading from haip-full-measure-data: SEP-1/Timely and Effective Care - Hospital_03_2019.csv to tmp/split/Timely and Effective Care - Hospital_03_2019.csv
Downloading from haip-full-measure-data: SEP-1/Timely and Effective Care - Hospital_04_2015.csv to tmp/split/Timely and Effective Care - Hospital_04_2015.csv
Downloading from haip-full-measure-data: SEP-1/Timely and Effective Care - Hospital_04_2017.csv to tmp/split/Timely and Effect

## Feature Engineering

Call `feature_engineering_main.main(settings)`

This will download the consolidated measure-specific-data in S3, normalize the columns, and add the new features, then upload the result to S3.
Results should also be saved locally automatically, but that is not yet implemented. For now download from the link below and upload the file to SageMaker Studio.


Result csv and metadata are uploaded to https://s3.console.aws.amazon.com/s3/buckets/haip-modeling-dataset?region=us-east-1&tab=objects

In [7]:
# feature_engineering_svc.main(settings)

Downloading from haip-measure-specific-data: SEP-1.csv to tmp/SEP-1.csv
Normalizing Provider Ids
Normalizing Scores
Normalizing Measure Start Date
Normalizing Measure End Date
Dropping NaN rows for columns score
Dropping NaN rows for columns measure_start_date
Dropping NaN rows for columns measure_end_date
Adding  lag1
Adding  lag2
backfill_lag is false, skipping fill
backfill_prov_mean is false, skipping fill
   provider_id  score  lag1  lag2  year  quarter  lag_diff  prov_mean  \
0       100001   18.0   NaN   NaN  2017        3       NaN        NaN   
1       100001   19.0  18.0   NaN  2017        4       1.0  18.000000   
2       100001   18.0  19.0  18.0  2018        1      -1.0  18.500000   
3       100001   26.0  18.0  19.0  2018        2       8.0  18.333333   
4       100001   32.0  26.0  18.0  2018        3       6.0  20.250000   
5       100001   36.0  32.0  26.0  2018        4       4.0  22.600000   
6       100001   45.0  36.0  32.0  2019        1       9.0  24.833333   
7 

## Model Training

### XGBoost Regression

In [5]:
xgbr_training_settings = {
    "run_save_path": "model_runs/xgboost_regression",
    "model_dataset_path": settings.get('save_modeling_dataset_path'),
    "model_type": "XGBRegression", 
    "xgbr_parameters": {
        'eta': [[0.05, 0.1, 0.2, 0.3]], # [[0.05, 0.1, 0.2, 0.3]]
        'gamma': [range(0, 3)],  # [range(0, 2)]
        'max_depth': [range(5, 10)],  # [range(5, 9)]
        'min_child_weight': [range(3, 10)],  # [range(3, 9)]
        'subsample': [[1]],
        'alpha': [[0, 1, 2]]
    },
    "drop_cols": ['provider_id', 'lag2', 'lag_diff', 'prov_mean_diff', 'year', 'quarter'], # use just lag1, prov_mean
}

settings = {**settings, **xgbr_training_settings}

In [None]:
trainer = model_training_svc.get_model_trainer(settings)
candidate = trainer.tune()

Fitting 50 folds for each of 1260 candidates, totalling 63000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   22.7s
[Parallel(n_jobs=-1)]: Done 224 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 448 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 1088 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 1504 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 1984 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 2528 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done 3808 tasks      | elapsed: 18.9min
[Parallel(n_jobs=-1)]: Done 4544 tasks      | elapsed: 22.4min
[Parallel(n_jobs=-1)]: Done 5344 tasks      | elapsed: 26.5min
[Parallel(n_jobs=-1)]: Done 6208 tasks      | elapsed: 28.8min
[Parallel(n_jobs=-1)]: Done 7136 tasks      | elapsed: 31.6min
[Parallel(n_jobs=-1)]: Done 8128 tasks      |

[0]	validation_0-rmse:55.34928
[1]	validation_0-rmse:52.64477
[2]	validation_0-rmse:50.07874
[3]	validation_0-rmse:47.64412
[4]	validation_0-rmse:45.33471
[5]	validation_0-rmse:43.14394
[6]	validation_0-rmse:41.06600
[7]	validation_0-rmse:39.09546


[Parallel(n_jobs=-1)]: Done 63000 out of 63000 | elapsed: 158.7min finished


[8]	validation_0-rmse:37.22784
[9]	validation_0-rmse:35.45755
[10]	validation_0-rmse:33.78019
[11]	validation_0-rmse:32.19119
[12]	validation_0-rmse:30.68607
[13]	validation_0-rmse:29.26074
[14]	validation_0-rmse:27.91170
[15]	validation_0-rmse:26.63586
[16]	validation_0-rmse:25.42908
[17]	validation_0-rmse:24.28811
[18]	validation_0-rmse:23.20994
[19]	validation_0-rmse:22.19239
[20]	validation_0-rmse:21.23169
[21]	validation_0-rmse:20.32548
[22]	validation_0-rmse:19.47052
[23]	validation_0-rmse:18.66508
[24]	validation_0-rmse:17.90782
[25]	validation_0-rmse:17.19560
[26]	validation_0-rmse:16.52556
[27]	validation_0-rmse:15.89703
[28]	validation_0-rmse:15.30732
[29]	validation_0-rmse:14.75489
[30]	validation_0-rmse:14.23712
[31]	validation_0-rmse:13.75400
[32]	validation_0-rmse:13.30181
[33]	validation_0-rmse:12.88044
[34]	validation_0-rmse:12.48735
[35]	validation_0-rmse:12.12174
[36]	validation_0-rmse:11.78107
[37]	validation_0-rmse:11.46540
[38]	validation_0-rmse:11.17210
[39]	valid

In [8]:
candidate

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_eta,param_gamma,param_max_depth,param_min_child_weight,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,split10_test_score,split11_test_score,split12_test_score,split13_test_score,split14_test_score,split15_test_score,split16_test_score,split17_test_score,split18_test_score,split19_test_score,split20_test_score,split21_test_score,split22_test_score,split23_test_score,split24_test_score,split25_test_score,split26_test_score,split27_test_score,split28_test_score,split29_test_score,split30_test_score,split31_test_score,split32_test_score,split33_test_score,split34_test_score,split35_test_score,split36_test_score,split37_test_score,split38_test_score,split39_test_score,split40_test_score,split41_test_score,split42_test_score,split43_test_score,split44_test_score,split45_test_score,split46_test_score,split47_test_score,split48_test_score,split49_test_score,mean_test_score,std_test_score,rank_test_score
0,7.934652,2.260836,0.058230,0.021454,0,0.05,0,5,3,1,"{'alpha': 0, 'eta': 0.05, 'gamma': 0, 'max_dep...",-8.140461,-8.187289,-7.798636,-8.083535,-8.168523,-8.105336,-7.863071,-8.119880,-8.231062,-8.059890,-8.376515,-8.127560,-8.071932,-8.081451,-7.695405,-8.007006,-8.285622,-7.907081,-7.957414,-8.230015,-7.891311,-7.968098,-8.073309,-8.300922,-8.140089,-8.075180,-8.358170,-8.090919,-7.933867,-7.919118,-8.277072,-8.149328,-7.963385,-8.132658,-7.800621,-8.277230,-7.959003,-7.891425,-8.231524,-7.957405,-8.049130,-7.915347,-8.200578,-8.040045,-8.135330,-8.179151,-7.801004,-7.853494,-8.403825,-8.187932,-8.073083,0.162404,50
1,7.737218,2.365366,0.055222,0.022342,0,0.05,0,5,4,1,"{'alpha': 0, 'eta': 0.05, 'gamma': 0, 'max_dep...",-8.134936,-8.189322,-7.798141,-8.080913,-8.170604,-8.126913,-7.857103,-8.121669,-8.232268,-8.051750,-8.376571,-8.124515,-8.085525,-8.077131,-7.701693,-8.008120,-8.292365,-7.915758,-7.969376,-8.224557,-7.886506,-7.953926,-8.081201,-8.303375,-8.136875,-8.079429,-8.358950,-8.089005,-7.925996,-7.921735,-8.270957,-8.154412,-7.967916,-8.129335,-7.803165,-8.276365,-7.957649,-7.902183,-8.233311,-7.969103,-8.051440,-7.917257,-8.210047,-8.040222,-8.127976,-8.185869,-7.799837,-7.849204,-8.401081,-8.172778,-8.073927,0.162008,80
2,7.887402,2.291543,0.061908,0.025236,0,0.05,0,5,5,1,"{'alpha': 0, 'eta': 0.05, 'gamma': 0, 'max_dep...",-8.140580,-8.181684,-7.792750,-8.083266,-8.165784,-8.122398,-7.858600,-8.116951,-8.233043,-8.055811,-8.371094,-8.126733,-8.080252,-8.088395,-7.701666,-8.007395,-8.291373,-7.913790,-7.970750,-8.230482,-7.871483,-7.956593,-8.083835,-8.304557,-8.145853,-8.078533,-8.359797,-8.076881,-7.927938,-7.916463,-8.280425,-8.161659,-7.965914,-8.130713,-7.804054,-8.278198,-7.961724,-7.908751,-8.238087,-7.956462,-8.053737,-7.913613,-8.208201,-8.049040,-8.129020,-8.184305,-7.805758,-7.857608,-8.408219,-8.181091,-8.074626,0.162829,113
3,8.095480,1.954047,0.061581,0.022887,0,0.05,0,5,6,1,"{'alpha': 0, 'eta': 0.05, 'gamma': 0, 'max_dep...",-8.140229,-8.187224,-7.795494,-8.088616,-8.168381,-8.123915,-7.863423,-8.118351,-8.226206,-8.049789,-8.371340,-8.126704,-8.081499,-8.086804,-7.702174,-8.011038,-8.291852,-7.896814,-7.972258,-8.210368,-7.880010,-7.961358,-8.076121,-8.307373,-8.147387,-8.076378,-8.356090,-8.088198,-7.910803,-7.927843,-8.261253,-8.157634,-7.962296,-8.124818,-7.805223,-8.277492,-7.965026,-7.902994,-8.244582,-7.963137,-8.055343,-7.909575,-8.205197,-8.047565,-8.137979,-8.184167,-7.798365,-7.854351,-8.415545,-8.186268,-8.074057,0.162733,86
4,8.321019,2.479329,0.065201,0.027717,0,0.05,0,5,7,1,"{'alpha': 0, 'eta': 0.05, 'gamma': 0, 'max_dep...",-8.144709,-8.185712,-7.798005,-8.069653,-8.169084,-8.115467,-7.863531,-8.117221,-8.228422,-8.044804,-8.378979,-8.122973,-8.086254,-8.081348,-7.699243,-8.006353,-8.284858,-7.904341,-7.966617,-8.234182,-7.872736,-7.963649,-8.068237,-8.313432,-8.139981,-8.070936,-8.361944,-8.084782,-7.919909,-7.924907,-8.267061,-8.161134,-7.963017,-8.128295,-7.808789,-8.278084,-7.960544,-7.894024,-8.245524,-7.956977,-8.056070,-7.904726,-8.204962,-8.045207,-8.136977,-8.183615,-7.794958,-7.858325,-8.420406,-8.176118,-8.073342,0.164194,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1255,1.477933,0.054986,0.013142,0.002837,2,0.3,2,9,5,1,"{'alpha': 2, 'eta': 0.3, 'gamma': 2, 'max_dept...",-8.196248,-8.272733,-7.855485,-8.137968,-8.203892,-8.159380,-7.962641,-8.148788,-8.310103,-8.127814,-8.433777,-8.179867,-8.141224,-8.132996,-7.746550,-8.127000,-8.362300,-7.975302,-8.009316,-8.248365,-7.951513,-8.026495,-8.139680,-8.370000,-8.196813,-8.135817,-8.440400,-8.158255,-8.013895,-7.967233,-8.363103,-8.203307,-8.000107,-8.175757,-7.877585,-8.335227,-8.040630,-7.964188,-8.285061,-8.020632,-8.094814,-7.995439,-8.299385,-8.112335,-8.200520,-8.232119,-7.850985,-7.912539,-8.470973,-8.232798,-8.135987,0.162943,1237
1256,1.491880,0.052690,0.012946,0.000586,2,0.3,2,9,6,1,"{'alpha': 2, 'eta': 0.3, 'gamma': 2, 'max_dept...",-8.208954,-8.267534,-7.844866,-8.149640,-8.217564,-8.161272,-7.934400,-8.162844,-8.306099,-8.108685,-8.428537,-8.179795,-8.143463,-8.126227,-7.742545,-8.085706,-8.345264,-8.001431,-7.988073,-8.273902,-7.949537,-7.998334,-8.119418,-8.368896,-8.199095,-8.142415,-8.428150,-8.139511,-7.999082,-7.968488,-8.359768,-8.216512,-8.012148,-8.190075,-7.867680,-8.327935,-8.016136,-7.963853,-8.291235,-8.012852,-8.092413,-7.973336,-8.256540,-8.109392,-8.162366,-8.235329,-7.843417,-7.922100,-8.482467,-8.224210,-8.130990,0.164749,1213
1257,1.498222,0.057667,0.013168,0.000643,2,0.3,2,9,7,1,"{'alpha': 2, 'eta': 0.3, 'gamma': 2, 'max_dept...",-8.204619,-8.273095,-7.871577,-8.168999,-8.220816,-8.149996,-7.929505,-8.154533,-8.310768,-8.110686,-8.409528,-8.156209,-8.135071,-8.133701,-7.737648,-8.088048,-8.327232,-7.965132,-8.002346,-8.253070,-7.930513,-8.001682,-8.100154,-8.361815,-8.200830,-8.138093,-8.418655,-8.127627,-7.991793,-7.971107,-8.339937,-8.191599,-8.007522,-8.178135,-7.866672,-8.325078,-8.044666,-7.953887,-8.284646,-7.998840,-8.083640,-7.961496,-8.267674,-8.103034,-8.189148,-8.253644,-7.856745,-7.906414,-8.479112,-8.228871,-8.127312,0.163067,1195
1258,1.493556,0.061440,0.013165,0.000901,2,0.3,2,9,8,1,"{'alpha': 2, 'eta': 0.3, 'gamma': 2, 'max_dept...",-8.217006,-8.271803,-7.876116,-8.138927,-8.213860,-8.144654,-7.934754,-8.155461,-8.289721,-8.083686,-8.398205,-8.155836,-8.135185,-8.145474,-7.734433,-8.113357,-8.335855,-7.962481,-7.999588,-8.246506,-7.931372,-8.016381,-8.112808,-8.355244,-8.183973,-8.130456,-8.400259,-8.135193,-8.009396,-7.967949,-8.343861,-8.216043,-8.015527,-8.175326,-7.863199,-8.323776,-8.027530,-7.939253,-8.298517,-8.021787,-8.105238,-7.963807,-8.256188,-8.089363,-8.169080,-8.217244,-7.831526,-7.922026,-8.480090,-8.207232,-8.125251,0.160960,1175


### Isolation Forest

In [6]:
if_training_settings = {
    "run_save_path": "model_runs/isolation_forest",
    "model_dataset_path": settings.get('save_modeling_dataset_path'),
    "model_type": "Isolation Forest", 
    "if_parameters": {
        # Features to include
        'features_included': [['lag_diff', 'prov_mean_diff']],
        # The number of base estimators in the ensemble.
        'n_estimators': [50, 100, 200, 300],
        # The number of samples to draw from X to train each base estimator.
        'max_samples': ['auto', 200, 175, 128],
        # The number of features to draw from X to train each base estimator.
        'max_features': [1],
        # If True, individual trees are fit on random subsets of the training data sampled with replacement.
        # If False, sampling without replacement is performed.
        'bootstrap': [False, True]
    },
    "drop_cols": ['provider_id', 'lag2', 'year', 'quarter'], # use just lag1, prov_mean
    "shift_range": range(30, 50) # 30-50 for SEP-01
}

settings = {**settings, **if_training_settings}

In [7]:
trainer = model_training_svc.get_model_trainer(settings)
candidate = trainer.tune()

Testing Model #5
Testing Model #10
Testing Model #15
Testing Model #20
Testing Model #25
Testing Model #30
Model tuning complete!


In [9]:
candidate

Unnamed: 0,model_id,features,model_params,auroc_total,unweighted_aat_score_0.5,weighted_aat_score_0.5,auroc_threshold_0.5,unweighted_aat_score_0.51,weighted_aat_score_0.51,auroc_threshold_0.51,unweighted_aat_score_0.52,weighted_aat_score_0.52,auroc_threshold_0.52,unweighted_aat_score_0.53,weighted_aat_score_0.53,auroc_threshold_0.53,unweighted_aat_score_0.54,weighted_aat_score_0.54,auroc_threshold_0.54,unweighted_aat_score_0.55,weighted_aat_score_0.55,auroc_threshold_0.55,unweighted_aat_score_0.56,weighted_aat_score_0.56,auroc_threshold_0.56,unweighted_aat_score_0.57,weighted_aat_score_0.57,auroc_threshold_0.57,unweighted_aat_score_0.58,weighted_aat_score_0.58,auroc_threshold_0.58,unweighted_aat_score_0.59,weighted_aat_score_0.59,auroc_threshold_0.59,unweighted_aat_score_0.6,weighted_aat_score_0.6,auroc_threshold_0.6,unweighted_aat_score_0.61,weighted_aat_score_0.61,auroc_threshold_0.61,unweighted_aat_score_0.62,weighted_aat_score_0.62,auroc_threshold_0.62,unweighted_aat_score_0.63,weighted_aat_score_0.63,auroc_threshold_0.63,unweighted_aat_score_0.64,weighted_aat_score_0.64,auroc_threshold_0.64,unweighted_aat_score_0.65,weighted_aat_score_0.65,auroc_threshold_0.65
0,1,"['lag_diff', 'prov_mean_diff']","{'bootstrap': False, 'features_included': ['la...",0.994794,0.997115,0.997409,0.91351,0.996305,0.996691,0.924655,0.995315,0.995806,0.935982,0.99348,0.99418,0.943365,0.991415,0.992299,0.949032,0.9894,0.990485,0.95394,0.98756,0.988799,0.95819,0.985655,0.987075,0.962372,0.98389,0.985479,0.965443,0.981635,0.983434,0.96775,0.97921,0.981251,0.96984,0.976245,0.978552,0.971037,0.97348,0.976076,0.972032,0.96949,0.972494,0.971947,0.96383,0.967343,0.970598,0.95731,0.961434,0.96902
1,2,"['lag_diff', 'prov_mean_diff']","{'bootstrap': False, 'features_included': ['la...",0.995566,0.997685,0.997898,0.908675,0.997195,0.997459,0.921678,0.996665,0.996982,0.932865,0.996045,0.996434,0.94155,0.995145,0.995645,0.94987,0.99406,0.994681,0.955773,0.992245,0.993072,0.96043,0.9897,0.990791,0.964355,0.98782,0.989074,0.967653,0.986035,0.987469,0.970313,0.98362,0.98528,0.972062,0.98104,0.982947,0.973523,0.97781,0.980043,0.97431,0.97334,0.976009,0.973915,0.968415,0.971595,0.97318,0.96302,0.966743,0.97182
2,3,"['lag_diff', 'prov_mean_diff']","{'bootstrap': False, 'features_included': ['la...",0.995761,0.997545,0.997771,0.916465,0.997095,0.997354,0.927632,0.99661,0.99692,0.938207,0.996065,0.996434,0.946052,0.995375,0.995815,0.953055,0.994555,0.995084,0.958698,0.993035,0.993766,0.963072,0.99099,0.991944,0.96682,0.988525,0.989749,0.96873,0.98586,0.987318,0.970965,0.983565,0.985231,0.972785,0.98083,0.98276,0.973723,0.97749,0.979741,0.974475,0.973155,0.975816,0.9742,0.96902,0.97212,0.973475,0.96319,0.966883,0.97214
3,4,"['lag_diff', 'prov_mean_diff']","{'bootstrap': False, 'features_included': ['la...",0.99596,0.997615,0.997837,0.917468,0.997135,0.997407,0.92918,0.99672,0.997038,0.939203,0.99619,0.996555,0.946755,0.995495,0.995946,0.953588,0.994565,0.995136,0.95899,0.99345,0.994173,0.963263,0.99142,0.992359,0.96705,0.98879,0.990009,0.96925,0.98629,0.98774,0.97131,0.98387,0.985557,0.972998,0.981225,0.983152,0.973983,0.978265,0.980516,0.97481,0.97446,0.97704,0.974695,0.97058,0.97358,0.97418,0.96539,0.968974,0.973048
4,5,"['lag_diff', 'prov_mean_diff']","{'bootstrap': False, 'features_included': ['la...",0.995357,0.99767,0.99789,0.907332,0.997115,0.997393,0.920192,0.99634,0.996714,0.931185,0.995405,0.995881,0.940427,0.994075,0.994711,0.947832,0.99226,0.993104,0.953402,0.98995,0.991015,0.95781,0.988325,0.989505,0.9616,0.98666,0.988008,0.965662,0.985055,0.986573,0.968032,0.981865,0.983725,0.969352,0.978255,0.980472,0.969935,0.974455,0.976968,0.970817,0.970785,0.973652,0.97096,0.966925,0.970127,0.970707,0.963095,0.966694,0.969943
5,6,"['lag_diff', 'prov_mean_diff']","{'bootstrap': False, 'features_included': ['la...",0.995839,0.99798,0.998196,0.904173,0.997575,0.997835,0.91933,0.997095,0.9974,0.929732,0.99658,0.996945,0.940142,0.995855,0.996306,0.947228,0.99487,0.995423,0.953873,0.993595,0.994296,0.959565,0.991125,0.992094,0.963773,0.988865,0.990039,0.966813,0.98689,0.98826,0.969565,0.985385,0.986908,0.971705,0.98297,0.984756,0.97316,0.9786,0.980776,0.973625,0.97477,0.977291,0.973962,0.970905,0.973807,0.973663,0.965485,0.96889,0.97231
6,7,"['lag_diff', 'prov_mean_diff']","{'bootstrap': False, 'features_included': ['la...",0.995927,0.997865,0.998067,0.90648,0.997525,0.997761,0.92009,0.99699,0.997278,0.93126,0.996445,0.996784,0.939625,0.99583,0.996239,0.947327,0.994975,0.995482,0.95413,0.993735,0.994381,0.959963,0.99215,0.992957,0.964172,0.98958,0.990672,0.967005,0.987135,0.98845,0.969307,0.985365,0.986865,0.97161,0.98276,0.984496,0.972877,0.97955,0.981598,0.973638,0.975765,0.978144,0.97421,0.97184,0.974666,0.97374,0.96715,0.970465,0.972895
7,8,"['lag_diff', 'prov_mean_diff']","{'bootstrap': False, 'features_included': ['la...",0.995957,0.99793,0.998137,0.909848,0.99758,0.997822,0.92155,0.997085,0.997376,0.933245,0.996695,0.997031,0.9418,0.99599,0.996396,0.94966,0.99506,0.995566,0.955965,0.994005,0.994651,0.960695,0.992025,0.992913,0.96473,0.989485,0.990636,0.967395,0.987025,0.988391,0.969555,0.98489,0.986451,0.971463,0.982175,0.984023,0.972633,0.978855,0.981012,0.973325,0.974895,0.977379,0.973545,0.971555,0.974384,0.973592,0.96736,0.970625,0.972735
8,9,"['lag_diff', 'prov_mean_diff']","{'bootstrap': False, 'features_included': ['la...",0.995457,0.99764,0.997885,0.910522,0.99707,0.997376,0.923492,0.996345,0.996739,0.933878,0.99539,0.995893,0.941908,0.99399,0.994649,0.94865,0.991935,0.992832,0.95415,0.989985,0.991054,0.95869,0.988235,0.989477,0.963368,0.986385,0.987798,0.966272,0.98346,0.985183,0.968487,0.97931,0.981417,0.96902,0.97572,0.978061,0.969905,0.97317,0.975759,0.97092,0.96939,0.972396,0.97114,0.9651,0.968537,0.970878,0.96021,0.964074,0.9697
9,10,"['lag_diff', 'prov_mean_diff']","{'bootstrap': False, 'features_included': ['la...",0.99577,0.99793,0.998164,0.90002,0.997535,0.997807,0.914072,0.997005,0.99733,0.926788,0.9962,0.996619,0.936207,0.995355,0.995859,0.945137,0.994355,0.994988,0.951943,0.992435,0.993286,0.95768,0.990545,0.99157,0.961495,0.988915,0.990105,0.96581,0.987085,0.988447,0.9687,0.98491,0.986521,0.970865,0.98203,0.983927,0.972132,0.97821,0.980521,0.972747,0.974505,0.977116,0.97335,0.97111,0.974058,0.973485,0.96568,0.969206,0.972132


### Ensamble Model

## Model Selection

Make sure `model_run_data_path` is set in settings to the tuning result path.
Make sure `model_type` is set in settings to `XGBRegression`, `Isolation Forest`, or `Ensemble`

Call `model_selector = model_selection_main.get_model_selector(settings)` to get the appropriate selector.
Then call `model_selector.get_top_models()` to get the top models

To get the candidate model that matches one of the top_model:
     Call `model_selector.get_candidate_model_by_index(INDEX)` to get the candidate model
     Call `model_selector.get_candidate_model_params_by_index(INDEX)` to get the candidate model params

### XG Boost Regression

For `model_run_data_path` right click the file in side nav and choose `Copy Path`. Paste that and add a `/root/` before it.

In [9]:
xg_boost_regression_settings = {
    "model_run_data_path": '/root/HAIP/measures/sep-01/model_runs/xgboost_regression/tuning_results_run_2022-11-22-17h28m.csv',    
    "model_type": "XGBRegression"
}


settings = {**settings, **xg_boost_regression_settings}

In [10]:
model_selector = model_selection_svc.get_model_selector(settings)
model_selector.get_top_models()

Unnamed: 0,mean_test_score,std_test_score,rank_test_score,param_alpha,param_eta,param_gamma,param_max_depth,param_min_child_weight,param_subsample,params
881,-8.0717,0.1632,1,2,0.05,1,5,9,1,"{'alpha': 2, 'eta': 0.05, 'gamma': 1, 'max_dep..."
6,-8.0718,0.1636,2,0,0.05,0,5,9,1,"{'alpha': 0, 'eta': 0.05, 'gamma': 0, 'max_dep..."
840,-8.0718,0.1633,3,2,0.05,0,5,3,1,"{'alpha': 2, 'eta': 0.05, 'gamma': 0, 'max_dep..."
949,-8.0719,0.1628,5,2,0.1,0,5,7,1,"{'alpha': 2, 'eta': 0.1, 'gamma': 0, 'max_dept..."
41,-8.0719,0.1636,4,0,0.05,1,5,9,1,"{'alpha': 0, 'eta': 0.05, 'gamma': 1, 'max_dep..."
875,-8.072,0.1632,6,2,0.05,1,5,3,1,"{'alpha': 2, 'eta': 0.05, 'gamma': 1, 'max_dep..."
910,-8.072,0.1632,7,2,0.05,2,5,3,1,"{'alpha': 2, 'eta': 0.05, 'gamma': 2, 'max_dep..."
455,-8.072,0.1633,10,1,0.05,1,5,3,1,"{'alpha': 1, 'eta': 0.05, 'gamma': 1, 'max_dep..."
35,-8.072,0.1622,9,0,0.05,1,5,3,1,"{'alpha': 0, 'eta': 0.05, 'gamma': 1, 'max_dep..."
496,-8.072,0.1646,8,1,0.05,2,5,9,1,"{'alpha': 1, 'eta': 0.05, 'gamma': 2, 'max_dep..."


In [11]:
selected_model_index = 881
model_selector.get_candidate_model_by_index(selected_model_index)

mean_fit_time         7.8975
std_fit_time          2.4792
mean_score_time       0.0649
std_score_time        0.0322
param_alpha                2
                       ...  
split48_test_score   -8.4154
split49_test_score   -8.1826
mean_test_score      -8.0717
std_test_score        0.1632
rank_test_score            1
Name: 881, Length: 64, dtype: object

In [12]:
model_selector.get_candidate_model_params_by_index(selected_model_index)

{'alpha': 2,
 'eta': 0.05,
 'gamma': 1,
 'max_depth': 5,
 'min_child_weight': 9,
 'subsample': 1}

### Isolation Forest

For `model_run_data_path` right click the file in side nav and choose `Copy Path`. Paste that and add a `/root/` before it.

In [5]:
if_settings = {
    "model_run_data_path": '/root/HAIP/measures/sep-01/model_runs/isolation_forest/tuning_results_run_2022-11-23-18h38m.csv',    
    "model_type": "Isolation Forest"
}


settings = {**settings, **if_settings}

In [6]:
model_selector = model_selection_svc.get_model_selector(settings)
model_selector.get_top_models()

Unnamed: 0,bootstrap,features_included,max_features,max_samples,n_estimators,model_id,auroc_total,auroc_threshold_0.5,auroc_threshold_0.51,auroc_threshold_0.52,auroc_threshold_0.53,auroc_threshold_0.54,auroc_threshold_0.55,auroc_threshold_0.56,auroc_threshold_0.57,auroc_threshold_0.58,auroc_threshold_0.59,auroc_threshold_0.6,auroc_threshold_0.61,auroc_threshold_0.62,auroc_threshold_0.63,auroc_threshold_0.64,auroc_threshold_0.65
3,False,"[lag_diff, prov_mean_diff]",1,auto,300,4,0.996,0.9175,0.9292,0.9392,0.9468,0.9536,0.959,0.9633,0.967,0.9692,0.9713,0.973,0.974,0.9748,0.9747,0.9742,0.973
7,False,"[lag_diff, prov_mean_diff]",1,200,300,8,0.996,0.9098,0.9216,0.9332,0.9418,0.9497,0.956,0.9607,0.9647,0.9674,0.9696,0.9715,0.9726,0.9733,0.9735,0.9736,0.9727
26,True,"[lag_diff, prov_mean_diff]",1,175,200,27,0.996,0.9011,0.9149,0.9275,0.9374,0.9455,0.9523,0.9585,0.9626,0.9659,0.9686,0.9709,0.9723,0.9733,0.9737,0.9736,0.9733
6,False,"[lag_diff, prov_mean_diff]",1,200,200,7,0.9959,0.9065,0.9201,0.9313,0.9396,0.9473,0.9541,0.96,0.9642,0.967,0.9693,0.9716,0.9729,0.9736,0.9742,0.9737,0.9729
19,True,"[lag_diff, prov_mean_diff]",1,auto,300,20,0.9959,0.917,0.9281,0.9382,0.9464,0.9525,0.9585,0.9631,0.9668,0.9695,0.9714,0.9731,0.9741,0.9746,0.9743,0.9737,0.9726
2,False,"[lag_diff, prov_mean_diff]",1,auto,200,3,0.9958,0.9165,0.9276,0.9382,0.9461,0.9531,0.9587,0.9631,0.9668,0.9687,0.971,0.9728,0.9737,0.9745,0.9742,0.9735,0.9721
5,False,"[lag_diff, prov_mean_diff]",1,200,100,6,0.9958,0.9042,0.9193,0.9297,0.9401,0.9472,0.9539,0.9596,0.9638,0.9668,0.9696,0.9717,0.9732,0.9736,0.974,0.9737,0.9723
9,False,"[lag_diff, prov_mean_diff]",1,175,100,10,0.9958,0.9,0.9141,0.9268,0.9362,0.9451,0.9519,0.9577,0.9615,0.9658,0.9687,0.9709,0.9721,0.9727,0.9733,0.9735,0.9721
23,True,"[lag_diff, prov_mean_diff]",1,200,300,24,0.9958,0.9092,0.922,0.9328,0.9415,0.9494,0.9558,0.9604,0.9647,0.9674,0.9698,0.9716,0.9728,0.9734,0.9734,0.9732,0.9723
29,True,"[lag_diff, prov_mean_diff]",1,128,100,30,0.9958,0.9022,0.9149,0.9246,0.9341,0.943,0.9494,0.9546,0.9591,0.9628,0.9661,0.9689,0.9709,0.9717,0.9723,0.9728,0.9723


In [7]:
selected_model_id = 30
model_selector.get_candidate_model_by_index(selected_model_id)

Unnamed: 0,bootstrap,features_included,max_features,max_samples,n_estimators,model_id,auroc_total,auroc_threshold_0.5,auroc_threshold_0.51,auroc_threshold_0.52,auroc_threshold_0.53,auroc_threshold_0.54,auroc_threshold_0.55,auroc_threshold_0.56,auroc_threshold_0.57,auroc_threshold_0.58,auroc_threshold_0.59,auroc_threshold_0.6,auroc_threshold_0.61,auroc_threshold_0.62,auroc_threshold_0.63,auroc_threshold_0.64,auroc_threshold_0.65
29,True,"[lag_diff, prov_mean_diff]",1,128,100,30,0.9958,0.9022,0.9149,0.9246,0.9341,0.943,0.9494,0.9546,0.9591,0.9628,0.9661,0.9689,0.9709,0.9717,0.9723,0.9728,0.9723


In [8]:
model_selector.get_candidate_model_params_by_index(selected_model_id)

{'bootstrap': True,
 'features_included': ['lag_diff', 'prov_mean_diff'],
 'max_features': 1,
 'max_samples': 128,
 'n_estimators': 100}

### Ensemble (Not yet Implemented)

For `model_run_data_path` right click the file in side nav and choose `Copy Path`. Paste that and add a `/root/` before it.

In [15]:
# ensemble_settings = {
#     "model_run_data_path": '',    
#     "model_type": "Ensemble"
# }


# settings = {**settings, **ensemble_settings}

In [16]:
# model_selector = model_selection_svc.get_model_selector(settings)
# model_selector.get_top_models()

if_final_fit_settings = {
    "model_type": "Isolation Forest",
    "model_dataset_path": settings.get('save_modeling_dataset_path'),
    "if_final_params": if_model_selector.get_candidate_model_params_by_index(if_model_id),
    "drop_cols": ['provider_id', 'lag2'],
}

del if_final_fit_settings['if_final_params']['features_included']

settings = {**settings, **if_final_fit_settings}## TEMP

In [9]:
if_final_fit_settings = {
    "model_type": "Isolation Forest",
    "model_dataset_path": settings.get('save_modeling_dataset_path'),
    "if_final_params": if_model_selector.get_candidate_model_params_by_index(if_model_id),
    "drop_cols": ['provider_id', 'lag2'],
}

del if_final_fit_settings['if_final_params']['features_included']

settings = {**settings, **if_final_fit_settings}

NameError: name 'if_model_selector' is not defined

## Outcome Analysis

In [17]:
outcome_analysis_settings = {
    # "modeling_dataset_path": "/root/HAIP/notebooks/op-10/data/OP-10.csv",
    "modeling_dataset_path": "/root/HAIP/notebooks/tmp/filtered_sep_01_dataset.csv",
    # either set model_pkl_path to the pkl location
    "if_model_pkl_path": "/root/HAIP/notebooks/sep-01-models/isolation_forest/candidate_models/run_2022-10-28-02h31m/model_id36.pkl",
    # OR set model here
    "if_model": None,
    "if_outlier_prediction_threshold": 0.59,

    "xgbr_x_cols": ['year', 'quarter', 'lag1'],
    "xgbr_y_cols": ['score'],
    "xgbr_outlier_prediction_threshold": 15,
    "xgbr_final_params": {
        'alpha': 2,
        'eta': 0.1,
        'gamma': 2,
        'max_depth': 5,
        'min_child_weight': 8,
        'subsample': 0.8999999999999999
    },
}


settings = {**settings, **outcome_analysis_settings}

In [18]:
xgb_modeling_dataset = outcome_analysis_svc.xgboost_regression(settings)
xgb_modeling_dataset

Unnamed: 0,score,lag1,y_quarter,providerId,outlier,predicted_score,lag1_diff,model
0,62,62.0,2017Q3,10001,False,62.008629,0.0,XGBoost Regression
1,61,62.0,2017Q4,10001,False,62.136074,-1.0,XGBoost Regression
2,60,61.0,2018Q1,10001,False,62.264008,-1.0,XGBoost Regression
3,61,60.0,2018Q2,10001,False,61.337521,1.0,XGBoost Regression
4,62,61.0,2018Q3,10001,False,62.609524,1.0,XGBoost Regression
...,...,...,...,...,...,...,...,...
42290,66,72.0,2021Q3,670128,False,70.838768,-6.0,XGBoost Regression
42291,91,91.0,2020Q3,670131,False,87.310577,0.0,XGBoost Regression
42292,87,91.0,2020Q4,670131,False,87.929871,-4.0,XGBoost Regression
42293,86,87.0,2021Q1,670131,False,84.335396,-1.0,XGBoost Regression


In [19]:
if_modeling_dataset = outcome_analysis_svc.isolation_forest(settings)
if_modeling_dataset

Unnamed: 0,score,lag1,y_quarter,providerId,outlier,lag1_diff,model
0,62,62.0,2017Q3,10001,False,0.0,Isolation Forest
1,61,62.0,2017Q4,10001,False,-1.0,Isolation Forest
2,60,61.0,2018Q1,10001,False,-1.0,Isolation Forest
3,61,60.0,2018Q2,10001,False,1.0,Isolation Forest
4,62,61.0,2018Q3,10001,False,1.0,Isolation Forest
...,...,...,...,...,...,...,...
42290,66,72.0,2021Q3,670128,False,-6.0,Isolation Forest
42291,91,91.0,2020Q3,670131,True,0.0,Isolation Forest
42292,87,91.0,2020Q4,670131,True,-4.0,Isolation Forest
42293,86,87.0,2021Q1,670131,True,-1.0,Isolation Forest


In [20]:
comp = outcome_analysis_svc.get_comparison(xgb_modeling_dataset, if_modeling_dataset)
comp

Unnamed: 0,score,lag1,y_quarter,providerId,outlier,predicted_score,lag1_diff,model,provider_mean_score
0,62,62.0,2017Q3,10001,False,62.008629,0.0,XGBoost Regression,59.142857
1,61,62.0,2017Q4,10001,False,62.136074,-1.0,XGBoost Regression,59.142857
2,60,61.0,2018Q1,10001,False,62.264008,-1.0,XGBoost Regression,59.142857
3,61,60.0,2018Q2,10001,False,61.337521,1.0,XGBoost Regression,59.142857
4,62,61.0,2018Q3,10001,False,62.609524,1.0,XGBoost Regression,59.142857
...,...,...,...,...,...,...,...,...,...
84585,66,72.0,2021Q3,670128,False,,-6.0,Isolation Forest,79.500000
84586,91,91.0,2020Q3,670131,True,,0.0,Isolation Forest,83.500000
84587,87,91.0,2020Q4,670131,True,,-4.0,Isolation Forest,83.500000
84588,86,87.0,2021Q1,670131,True,,-1.0,Isolation Forest,83.500000


In [21]:
##################