In [2]:
import sys
import os

# Directory that contains the HAIP service packages imported below.
# Defaults to the absolute path used so far (an absolute path keeps this cell
# usable from any notebook directory); set the HAIP_SERVICES_PATH environment
# variable to point at a different checkout without editing the notebook.
# `or` (rather than a .get default) also falls back when the variable is empty.
module_path = os.environ.get("HAIP_SERVICES_PATH") or "/root/HAIP/notebooks/services"

# Only append once so re-running the cell does not grow sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
# from sharepoint_transfer.src import main as sharepoint_transfer_svc
from data_consolidation.src import main as data_consolidation_svc
from feature_engineering.src import main as feature_engineering_svc
from model_selection.src import main as model_selection_svc
from outcome_analysis.src import main as outcome_analysis_svc

## Set Measure Specific Settings

These will augment and override settings in "/root/HAIP/notebooks/services/config.json"

In [4]:
# Measure-level configuration for this notebook (OP-10). Every key is a plain
# string, so the keyword form of dict() builds exactly the same mapping.
settings = dict(
    # --- Measure specific options ---
    SHAREPOINT_MEASURE_DIR="OP-10 Clean",
    MEASURE_SPECIFIC_FILENAME="OP-10.csv",
    FULL_MEASURE_S3_PREFIX="OP-10",
    filter_measure="OP_10",  # underscore here, not a hyphen
    # --- Feature Engineering options ---
    save_modeling_dataset_path="/root/HAIP/notebooks/op-10/data/OP-10.csv",
    backfill_prov_mean=True,  # False turns off prov-mean backfilling
    backfill_lag=True,  # False turns off lag backfilling
    lag_to_add=2,  # adds lag1 and lag2
)

## Move Sharepoint files to S3

This step cannot yet be run from SageMaker Studio because we are still working out the Client ID/Secret needed for the SharePoint REST API. For now, run the service locally using your own Office365 username/password.

## Data Consolidation

Make sure the measure-specific `settings` above have been updated for the measure you are processing.


Call `data_consolidation_svc.main(settings)`


This will download the full-measure data from S3, load it into a single data frame, normalize the columns, filter it to only include `filter_measure`, and then upload the result to S3.


Result csv and metadata are uploaded to https://s3.console.aws.amazon.com/s3/buckets/haip-measure-specific-data?region=us-east-1&tab=objects

In [5]:
# data_consolidation_svc.main(settings)

## Feature Engineering

Call `feature_engineering_svc.main(settings)`

This will download the consolidated measure-specific data from S3, normalize the columns, add the new features, and then upload the result to S3.
Results should also be saved locally automatically, but that is not yet implemented. For now, download the file from the link below and upload it to SageMaker Studio.


Result csv and metadata are uploaded to https://s3.console.aws.amazon.com/s3/buckets/haip-modeling-dataset?region=us-east-1&tab=objects

In [6]:
# Run the feature-engineering service with the measure `settings` defined above.
# Per the notes above, it reads the consolidated measure data from S3, adds the
# new features and uploads the result; progress is logged to the cell output.
feature_engineering_svc.main(settings)

Downloading from haip-measure-specific-data: OP-10.csv to tmp/OP-10.csv
Normalizing Provider Ids
Normalizing Scores
Normalizing Measure Start Date
Normalizing Measure End Date
Dropping NaN rows for columns score
Dropping NaN rows for columns measure_start_date
Dropping NaN rows for columns measure_end_date
Adding  lag1
Adding  lag2
Lag filling   lag1
Lag filling   lag2
Prov mean filling
    provider_id  score  lag1  lag2  year  quarter  lag_diff  prov_mean  \
0         10001    4.6   4.6   4.6  2013        2       0.0   4.600000   
1         10001    6.5   4.6   4.6  2014        2       1.9   4.600000   
2         10001    6.4   6.5   4.6  2015        2      -0.1   5.550000   
3         10001    9.0   6.4   6.5  2016        2       2.6   5.833333   
4         10001    6.5   9.0   6.4  2017        2      -2.5   6.625000   
5         10001    8.0   6.5   9.0  2018        2       1.5   6.600000   
6         10001    7.2   8.0   6.5  2019        2      -0.8   6.833333   
7         10001   

## Model Selection

Make sure `model_run_data_path` is set in settings to the tuning result path.
Make sure `model_type` is set in settings to `XGBRegression`, `Isolation Forest`, or `Ensemble`

Call `model_selector = model_selection_svc.get_model_selector(settings)` to get the appropriate selector.
Then call `model_selector.get_top_models()` to get the top models.

To get the candidate model that matches one of the top models:
- Call `model_selector.get_candidate_model_by_index(INDEX)` to get the candidate model
- Call `model_selector.get_candidate_model_params_by_index(INDEX)` to get the candidate model params

### XGBoost Regression

For `model_run_data_path` right click the file in side nav and choose `Copy Path`. Paste that and add a `/root/` before it.

In [7]:
# Model-selection options for the XGBoost regression tuning run:
# where the tuning-results CSV lives and which selector type to build.
xg_boost_regression_settings = dict(
    model_run_data_path="/root/HAIP/notebooks/op-10/xgboost regression/xgbr_tuning_results_run_2022-11-08-20h37m.csv",
    model_type="XGBRegression",
)


# Layer the XGBoost selection options over the shared settings; where a key
# exists in both, the XGBoost value wins. A new dict is bound to `settings`.
settings = dict(settings, **xg_boost_regression_settings)

In [8]:
# Build the selector for the `model_type` currently in `settings`
# ("XGBRegression" after the merge above), then display its top models.
model_selector = model_selection_svc.get_model_selector(settings)
model_selector.get_top_models()

Unnamed: 0,mean_test_score,std_test_score,rank_test_score,param_alpha,param_eta,param_gamma,param_max_depth,param_min_child_weight,param_subsample,params
2,-5.8555,0.3707,1,0,0.3,1,6,6,1,"{'alpha': 0, 'eta': 0.3, 'gamma': 1, 'max_dept..."
0,-5.8596,0.3714,2,0,0.3,0,6,6,1,"{'alpha': 0, 'eta': 0.3, 'gamma': 0, 'max_dept..."
4,-5.8647,0.3726,3,1,0.3,0,6,6,1,"{'alpha': 1, 'eta': 0.3, 'gamma': 0, 'max_dept..."
6,-5.8648,0.3726,4,1,0.3,1,6,6,1,"{'alpha': 1, 'eta': 0.3, 'gamma': 1, 'max_dept..."
5,-5.866,0.3694,5,1,0.3,0,7,6,1,"{'alpha': 1, 'eta': 0.3, 'gamma': 0, 'max_dept..."
1,-5.8669,0.3795,6,0,0.3,0,7,6,1,"{'alpha': 0, 'eta': 0.3, 'gamma': 0, 'max_dept..."
3,-5.8679,0.3791,7,0,0.3,1,7,6,1,"{'alpha': 0, 'eta': 0.3, 'gamma': 1, 'max_dept..."
7,-5.8683,0.3717,8,1,0.3,1,7,6,1,"{'alpha': 1, 'eta': 0.3, 'gamma': 1, 'max_dept..."


In [9]:
# Chosen XGBoost candidate, identified by its row label in the top-models table
# (the first column), not by rank: in the saved output, label 4 has
# rank_test_score 3.
selected_model_index = 4
# Display the full tuning-result record for that candidate.
model_selector.get_candidate_model_by_index(selected_model_index)

mean_fit_time                                                        0.5056
std_fit_time                                                         0.0125
mean_score_time                                                      0.0075
std_score_time                                                       0.0002
param_alpha                                                               1
param_eta                                                               0.3
param_gamma                                                               0
param_max_depth                                                           6
param_min_child_weight                                                    6
param_subsample                                                           1
params                    {'alpha': 1, 'eta': 0.3, 'gamma': 0, 'max_dept...
split0_test_score                                                   -5.8626
split1_test_score                                                   -5.4838
split2_test_

In [10]:
# Display the hyper-parameters of the candidate selected above.
model_selector.get_candidate_model_params_by_index(selected_model_index)

{'alpha': 1,
 'eta': 0.3,
 'gamma': 0,
 'max_depth': 6,
 'min_child_weight': 6,
 'subsample': 1}

### Isolation Forest

For `model_run_data_path` right click the file in side nav and choose `Copy Path`. Paste that and add a `/root/` before it.

In [11]:
# Model-selection options for the Isolation Forest tuning run:
# where the tuning-results CSV lives and which selector type to build.
# NOTE(review): the path points into `sep-01-models`, not an OP-10 folder —
# confirm this is the intended tuning run for this notebook.
if_settings = dict(
    model_run_data_path="/root/HAIP/notebooks/sep-01-models/isolation_forest/candidate_models/tuning_results_run_2022-10-28-02h31m.csv",
    model_type="Isolation Forest",
)


# Layer the Isolation Forest selection options over the shared settings; where
# a key exists in both (e.g. `model_type`), the Isolation Forest value wins.
settings = dict(settings, **if_settings)

In [12]:
# Rebuild the selector for the `model_type` currently in `settings`
# ("Isolation Forest" after the merge above); this rebinds `model_selector`.
model_selector = model_selection_svc.get_model_selector(settings)
# Display the top models for this tuning run.
model_selector.get_top_models()

Unnamed: 0,bootstrap,features_included,max_features,max_samples,n_estimators,model_id,auroc_total,auroc_threshold_0.5,auroc_threshold_0.51,auroc_threshold_0.52,auroc_threshold_0.53,auroc_threshold_0.54,auroc_threshold_0.55,auroc_threshold_0.56,auroc_threshold_0.57,auroc_threshold_0.58,auroc_threshold_0.59,auroc_threshold_0.6,auroc_threshold_0.61,auroc_threshold_0.62,auroc_threshold_0.63,auroc_threshold_0.64,auroc_threshold_0.65
240,True,"[score, lag1, year, quarter]",1.0,auto,50,241,0.9746,0.7061,0.8049,0.8779,0.9294,0.954,0.9665,0.9742,0.9085,0.7145,0.5823,0.5472,0.5262,0.5109,0.5082,0.5044,0.5
48,False,"[score, lag1, year, quarter]",1.0,auto,50,49,0.9733,0.7206,0.814,0.8838,0.9291,0.9539,0.9659,0.9735,0.8625,0.6746,0.5823,0.5471,0.5265,0.5122,0.5081,0.5044,0.5
52,False,"[score, lag1, year, quarter]",1.0,200,50,53,0.9695,0.7147,0.8026,0.8673,0.9184,0.9449,0.9613,0.9203,0.8545,0.7011,0.5984,0.561,0.5354,0.5149,0.5075,0.5038,0.5
244,True,"[score, lag1, year, quarter]",1.0,200,50,245,0.9694,0.7182,0.8049,0.8692,0.9199,0.9465,0.9619,0.9211,0.8549,0.7014,0.6035,0.5608,0.5353,0.5149,0.5077,0.5039,0.5
35,False,"[score, lag1, year, quarter]",0.5,auto,300,36,0.968,0.771,0.8238,0.8627,0.8909,0.9129,0.9303,0.9438,0.9543,0.9632,0.9695,0.8962,0.7656,0.6369,0.5875,0.5499,0.5307
227,True,"[score, lag1, year, quarter]",0.5,auto,300,228,0.9671,0.77,0.8227,0.863,0.8902,0.9124,0.9303,0.944,0.9546,0.9632,0.9691,0.9108,0.7597,0.6251,0.5777,0.5468,0.5295
53,False,"[score, lag1, year, quarter]",1.0,200,100,54,0.9666,0.7493,0.8331,0.8879,0.925,0.9446,0.958,0.9481,0.877,0.7779,0.6311,0.5762,0.55,0.5273,0.5143,0.5066,0.5021
245,True,"[score, lag1, year, quarter]",1.0,200,100,246,0.9666,0.75,0.8354,0.8892,0.9254,0.945,0.9584,0.948,0.8765,0.7802,0.6346,0.5764,0.5517,0.5272,0.5142,0.5066,0.5021
39,False,"[score, lag1, year, quarter]",0.5,200,300,40,0.9658,0.7603,0.8097,0.8566,0.8859,0.908,0.9262,0.9411,0.9527,0.9622,0.9675,0.9227,0.7429,0.6433,0.5823,0.5471,0.5247
231,True,"[score, lag1, year, quarter]",0.5,200,300,232,0.9654,0.7593,0.8072,0.8544,0.8846,0.9061,0.9241,0.9399,0.9519,0.9615,0.967,0.8907,0.7352,0.6437,0.5825,0.542,0.5246


In [13]:
# Chosen Isolation Forest candidate. Unlike the XGBoost section, the value
# passed here is the candidate's `model_id`: in the saved output, 36 returns the
# row whose label is 35 and whose model_id is 36.
selected_model_id = 36
# Display the tuning-result row for that candidate.
model_selector.get_candidate_model_by_index(selected_model_id)

Unnamed: 0,bootstrap,features_included,max_features,max_samples,n_estimators,model_id,auroc_total,auroc_threshold_0.5,auroc_threshold_0.51,auroc_threshold_0.52,auroc_threshold_0.53,auroc_threshold_0.54,auroc_threshold_0.55,auroc_threshold_0.56,auroc_threshold_0.57,auroc_threshold_0.58,auroc_threshold_0.59,auroc_threshold_0.6,auroc_threshold_0.61,auroc_threshold_0.62,auroc_threshold_0.63,auroc_threshold_0.64,auroc_threshold_0.65
35,False,"[score, lag1, year, quarter]",0.5,auto,300,36,0.968,0.771,0.8238,0.8627,0.8909,0.9129,0.9303,0.9438,0.9543,0.9632,0.9695,0.8962,0.7656,0.6369,0.5875,0.5499,0.5307


In [14]:
# Display the hyper-parameters of the candidate selected above.
# NOTE(review): the saved output of this cell shows max_samples=200 and
# n_estimators=50, whereas the row displayed for model_id 36 shows
# max_samples=auto and n_estimators=300 — confirm both lookups resolve the same
# candidate (or that the saved output is not stale) before using these params.
model_selector.get_candidate_model_params_by_index(selected_model_id)

{'bootstrap': False,
 'features_included': ['score', 'lag1', 'year', 'quarter'],
 'max_features': 0.5,
 'max_samples': 200,
 'n_estimators': 50}

### Ensemble (Not yet Implemented)

For `model_run_data_path` right click the file in side nav and choose `Copy Path`. Paste that and add a `/root/` before it.

In [15]:
# ensemble_settings = {
#     "model_run_data_path": '',    
#     "model_type": "Ensemble"
# }


# settings = {**settings, **ensemble_settings}

In [16]:
# model_selector = model_selection_svc.get_model_selector(settings)
# model_selector.get_top_models()

## Outcome Analysis

In [17]:
# Options consumed by the outcome-analysis calls further down.
# NOTE(review): the dataset and pickle paths below point at SEP-01 artefacts
# although this notebook configures OP-10 — confirm this is intentional.
# OP-10 alternative for modeling_dataset_path:
#   /root/HAIP/notebooks/op-10/data/OP-10.csv
# NOTE(review): xgbr_final_params differs from the params displayed for the
# XGBoost candidate selected earlier in this notebook — confirm which set is
# meant to be final.
outcome_analysis_settings = dict(
    modeling_dataset_path="/root/HAIP/notebooks/tmp/filtered_sep_01_dataset.csv",
    # Isolation Forest: provide the pickled model location ...
    if_model_pkl_path="/root/HAIP/notebooks/sep-01-models/isolation_forest/candidate_models/run_2022-10-28-02h31m/model_id36.pkl",
    # ... OR supply the model object directly here
    if_model=None,
    if_outlier_prediction_threshold=0.59,
    # XGBoost regression: feature columns, target column, threshold and params
    xgbr_x_cols=["year", "quarter", "lag1"],
    xgbr_y_cols=["score"],
    xgbr_outlier_prediction_threshold=15,
    xgbr_final_params=dict(
        alpha=2,
        eta=0.1,
        gamma=2,
        max_depth=5,
        min_child_weight=8,
        subsample=0.8999999999999999,
    ),
)


# Layer the outcome-analysis options over the shared settings; where a key
# exists in both, the outcome-analysis value wins.
settings = dict(settings, **outcome_analysis_settings)

In [18]:
# Run the XGBoost regression outcome analysis with the merged `settings`.
# In the saved output the result is a table with one row per score, including
# `predicted_score` and a boolean `outlier` column.
xgb_modeling_dataset = outcome_analysis_svc.xgboost_regression(settings)
# Bare last expression so the notebook renders the table.
xgb_modeling_dataset

Unnamed: 0,score,lag1,y_quarter,providerId,outlier,predicted_score,lag1_diff,model
0,62,62.0,2017Q3,10001,False,62.008629,0.0,XGBoost Regression
1,61,62.0,2017Q4,10001,False,62.136074,-1.0,XGBoost Regression
2,60,61.0,2018Q1,10001,False,62.264008,-1.0,XGBoost Regression
3,61,60.0,2018Q2,10001,False,61.337521,1.0,XGBoost Regression
4,62,61.0,2018Q3,10001,False,62.609524,1.0,XGBoost Regression
...,...,...,...,...,...,...,...,...
42290,66,72.0,2021Q3,670128,False,70.838768,-6.0,XGBoost Regression
42291,91,91.0,2020Q3,670131,False,87.310577,0.0,XGBoost Regression
42292,87,91.0,2020Q4,670131,False,87.929871,-4.0,XGBoost Regression
42293,86,87.0,2021Q1,670131,False,84.335396,-1.0,XGBoost Regression


In [19]:
# Run the Isolation Forest outcome analysis with the same merged `settings`.
# In the saved output the result has the same row layout as the XGBoost table
# but without a `predicted_score` column.
if_modeling_dataset = outcome_analysis_svc.isolation_forest(settings)
# Bare last expression so the notebook renders the table.
if_modeling_dataset

Unnamed: 0,score,lag1,y_quarter,providerId,outlier,lag1_diff,model
0,62,62.0,2017Q3,10001,False,0.0,Isolation Forest
1,61,62.0,2017Q4,10001,False,-1.0,Isolation Forest
2,60,61.0,2018Q1,10001,False,-1.0,Isolation Forest
3,61,60.0,2018Q2,10001,False,1.0,Isolation Forest
4,62,61.0,2018Q3,10001,False,1.0,Isolation Forest
...,...,...,...,...,...,...,...
42290,66,72.0,2021Q3,670128,False,-6.0,Isolation Forest
42291,91,91.0,2020Q3,670131,True,0.0,Isolation Forest
42292,87,91.0,2020Q4,670131,True,-4.0,Isolation Forest
42293,86,87.0,2021Q1,670131,True,-1.0,Isolation Forest


In [20]:
# Combine the two per-model results for side-by-side comparison.
# In the saved output the result holds rows from both models (see the `model`
# column), adds `provider_mean_score`, and leaves `predicted_score` empty for
# the Isolation Forest rows.
comp = outcome_analysis_svc.get_comparison(xgb_modeling_dataset, if_modeling_dataset)
# Bare last expression so the notebook renders the table.
comp

Unnamed: 0,score,lag1,y_quarter,providerId,outlier,predicted_score,lag1_diff,model,provider_mean_score
0,62,62.0,2017Q3,10001,False,62.008629,0.0,XGBoost Regression,59.142857
1,61,62.0,2017Q4,10001,False,62.136074,-1.0,XGBoost Regression,59.142857
2,60,61.0,2018Q1,10001,False,62.264008,-1.0,XGBoost Regression,59.142857
3,61,60.0,2018Q2,10001,False,61.337521,1.0,XGBoost Regression,59.142857
4,62,61.0,2018Q3,10001,False,62.609524,1.0,XGBoost Regression,59.142857
...,...,...,...,...,...,...,...,...,...
84585,66,72.0,2021Q3,670128,False,,-6.0,Isolation Forest,79.500000
84586,91,91.0,2020Q3,670131,True,,0.0,Isolation Forest,83.500000
84587,87,91.0,2020Q4,670131,True,,-4.0,Isolation Forest,83.500000
84588,86,87.0,2021Q1,670131,True,,-1.0,Isolation Forest,83.500000


In [21]:
##################