### Bootstrap Feature Importance Computation for the Institutional Claims Medicaid Model
- This notebook captures the steps involved in computing feature importance scores across 500 cross-validation folds.
- It also records the number of times each feature was ranked higher than a noise variable, for both the numerical and the categorical features.

In [1]:
# Importing the required libraries
import os
import sys
import warnings
import numpy as np
import pandas as pd

# Silence all warnings for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so it also hides warnings that point at
# real problems (e.g. pandas chained-assignment or deprecation warnings);
# consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Directories holding the dependency Python files. They are spliced into
# sys.path directly after its first entry, in the order listed here.
dependency_dirs = [
    "/home/ec2-user/SageMaker/GitHub Repos/cipy/cipy/",
    "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/",
    "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/",
    "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/institutional_claims/02_step",
]
sys.path[1:1] = dependency_dirs

# Display the resulting search path
sys.path

['/home/ec2-user/SageMaker/envs/wqp_env/lib/python3.6',
 '/home/ec2-user/SageMaker/GitHub Repos/cipy/cipy/',
 '/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/',
 '/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/',
 '/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/institutional_claims/02_step',
 '',
 '/home/ec2-user/SageMaker/envs/wqp_env/lib/python3.6/site-packages',
 '/home/ec2-user/SageMaker/envs/wqp_env/lib/python3.6/lib-dynload',
 '/home/ec2-user/anaconda3/envs/JupyterSystemEnv/lib/python3.6',
 '/home/ec2-user/SageMaker/envs/wqp_env/lib/python3.6/site-packages/IPython/extensions',
 '/home/ec2-user/.ipython']

In [3]:
# Star-importing the helpers from wqp_methods; get_folds, pickl and clip_right_tail2 used below presumably come from here (TODO: confirm and switch to explicit imports).
from wqp_methods import *

In [4]:
# Importing the BootstrapFeatImp class used to compute the feature importance scores
from bootstrap_feat_imp import BootstrapFeatImp

In [7]:
# Column that holds the claim filing indicator code
ref_col = "claim_filing_ind_code3"

# Folder containing the preprocessed Institutional Claims datasets
csv_path = "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/01_data_preprocessing/preprocessed_datasets/"

# Read the institutional claims dataset and add the event_flag column
# (set to True on every row) that the survival-analysis target needs.
inst_claims = pd.read_csv(
    os.path.join(csv_path, "ic_preprocessed_dataset_2021-06-06.csv")
).assign(event_flag=True)

In [28]:
# Categorical columns used for the categorical feature-importance run.
# The order of the entries is significant only in that it fixes column order.
final_cat_cols = [
    # Description_* columns
    "Description_OccurrenceInformation_1",
    "Description_OccurrenceInformation_2",
    # HCPCSCode_* columns
    "HCPCSCode_Medicine_Services_and_Procedures",
    "HCPCSCode_Pathology_and_Laboratory_Procedures",
    "HCPCSCode_Surgery",
    # RevCode_* columns
    "RevCode_Emergency_Room",
    "RevCode_Laboratory",
    "RevCode_Medical/Surgical_Supplies_and_Devices",
    "RevCode_Operating_Room_Services",
    "RevCode_Other_Imaging_Services",
    "RevCode_Pharmacy",
    "RevCode_Pharmacy_-_Extension_of_025X",
    "RevCode_Physical_Therapy",
    "RevCode_Radiology_Diagnostic",
    # admission_* columns
    "admission_source_code",
    "admission_type_code",
    # ccsr_category_1_code_* columns
    "ccsr_category_1_code_ExternalCauseOfInjury_HI_1",
    "ccsr_category_1_code_OtherDiagnosisInformation_HI_1",
    "ccsr_category_1_code_OtherDiagnosisInformation_HI_2",
    "ccsr_category_1_code_OtherDiagnosisInformation_HI_3",
    "ccsr_category_1_code_Patient’SReasonForVisit_1",
    "ccsr_category_1_code_PrincipalDiagnosis_1",
    # claim_* columns
    "claim_creation_hour",
    "claim_creation_weekday",
    "claim_freq_type_code",
    "claim_has_admission_dthr",
    "claim_has_operating_physician_id",
    "claim_has_payer_claim_control_number",
    "claim_has_referring_provider_id",
    # other_* columns
    "other_hcpcscodes",
    "other_ndccodes",
    "other_revcodes",
    # payer_* columns
    "payer_name",
    "payer_resp_seq_num",
    # subscriber_* columns
    "subscriber_city",
    "subscriber_gender",
    "subscriber_pat_gender",
]

In [29]:
# Load the feature information after preprocessing - 01 Notebook output
filtered_columns_data = pickl("/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/institutional_claims/02_step/inst_claims_cat_num_filtered_feature_details.pickle")

# Claim filing indicator code whose model features are derived in this notebook
claim_filing_indicator_code = "Medicaid"

# Keep only the rows for this claim filing indicator code.
# reset_index() without drop=True also keeps the previous index as a column.
cfic_df = inst_claims[inst_claims[ref_col] == claim_filing_indicator_code].reset_index()

# Clip the right tail of the response_time values for this claim filing indicator
# code (0.98 is passed as the cut-off; presumably a quantile - see clip_right_tail2).
cfic_df = clip_right_tail2(cfic_df, 0.98)

# Structured survival target built from the event_flag and response_time values
y = Surv.from_arrays(cfic_df["event_flag"].values, cfic_df["response_time"].values)

# Categorical features: the hand-maintained final_cat_cols list is used here instead
# of filtered_columns_data[claim_filing_indicator_code]["final_cat_cols"].
cat_cols = final_cat_cols
# .copy() makes cat_df an independent frame, so the in-place edits below never act
# on a selection of cfic_df (the pattern behind pandas' SettingWithCopyWarning).
cat_df = cfic_df[cat_cols].copy()

# Combining 'RevCode_Pharmacy' & 'RevCode_Pharmacy_-_Extension_of_025X':
# a "Yes" in the extension column is carried over, then the extension column is dropped.
mask1 = cat_df["RevCode_Pharmacy_-_Extension_of_025X"] == "Yes"
cat_df.loc[mask1, "RevCode_Pharmacy"] = "Yes"
del cat_df["RevCode_Pharmacy_-_Extension_of_025X"]

# Label-encode every categorical column (values are cast to str first; the encoder
# is refit on each column, so codes are per-column).
le = LabelEncoder()
encoded_cat_df = cat_df.apply(lambda col: le.fit_transform(col.astype(str)), axis=0, result_type="expand")

# Noise variable for the categorical data: Poisson(5) draws with a fixed seed
np.random.seed(0)
encoded_cat_df["noise"] = np.random.poisson(5, size=len(encoded_cat_df))

# Numerical features come from the pickled feature details; copy for the same
# reason as cat_df, because a column is added right below.
num_cols = filtered_columns_data[claim_filing_indicator_code]["final_num_cols"]
num_df = cfic_df[num_cols].copy()

# Noise variable for the numerical data: standard-normal draws with a fixed seed
np.random.seed(0)
num_df["noise"] = np.random.normal(0, 1, len(num_df))

In [9]:
# Reuse the folds cached by an earlier run when the pickle file is present;
# otherwise collect the folds from 50 get_folds calls and cache the combined list.
cv_folds_cache = "inst_medicaid_cv_folds.pickle"

if os.path.isfile(cv_folds_cache):
    cv_folds = pickl(cv_folds_cache)
else:
    cv_folds = []
    for _repeat in range(50):
        cv_folds.extend(get_folds(cfic_df))

    pickl(cv_folds_cache, cv_folds)

***Begin get_folds method***
fold  #train  #test  timeToResponse   timeToResponseC   Tresponse_tst  Tresponse_shared
0 3836 418 6.1005 6.1005 418 0
1 3864 390 5.7974 5.7974 390 0
2 3838 416 6.0841 6.0841 416 0
3 3809 445 6.6135 6.6135 445 0
4 3784 470 6.6 6.6 470 0
5 3799 455 6.1165 6.1165 455 0
6 3804 450 7.2711 7.2711 450 0
7 3860 394 7.0761 7.0761 394 0
8 3861 393 6.7405 6.7405 393 0
9 3831 423 6.1442 6.1442 423 0
***End get_folds method***
***Begin get_folds method***
fold  #train  #test  timeToResponse   timeToResponseC   Tresponse_tst  Tresponse_shared
0 3789 465 6.3763 6.3763 465 0
1 3839 415 5.9735 5.9735 415 0
2 3782 472 7.1949 7.1949 472 0
3 3846 408 6.4338 6.4338 408 0
4 3834 420 5.9214 5.9214 420 0
5 3831 423 6.2199 6.2199 423 0
6 3804 450 6.8756 6.8756 450 0
7 3832 422 6.0403 6.0403 422 0
8 3865 389 6.9306 6.9306 389 0
9 3864 390 6.5538 6.5538 390 0
***End get_folds method***
***Begin get_folds method***
fold  #train  #test  timeToResponse   timeToResponseC   Tresponse_tst

6 3825 429 6.8392 6.8392 429 0
7 3863 391 5.9386 5.9386 391 0
8 3807 447 6.6085 6.6085 447 0
9 3846 408 6.451 6.451 408 0
***End get_folds method***
***Begin get_folds method***
fold  #train  #test  timeToResponse   timeToResponseC   Tresponse_tst  Tresponse_shared
0 3823 431 6.6659 6.6659 431 0
1 3783 471 6.1401 6.1401 471 0
2 3824 430 6.5651 6.5651 430 0
3 3831 423 6.7234 6.7234 423 0
4 3844 410 6.5732 6.5732 410 0
5 3825 429 6.4009 6.4009 429 0
6 3788 466 6.2382 6.2382 466 0
7 3854 400 6.225 6.225 400 0
8 3864 390 6.9026 6.9026 390 0
9 3850 404 6.2277 6.2277 404 0
***End get_folds method***
***Begin get_folds method***
fold  #train  #test  timeToResponse   timeToResponseC   Tresponse_tst  Tresponse_shared
0 3857 397 6.1839 6.1839 397 0
1 3829 425 7.0047 7.0047 425 0
2 3847 407 6.2654 6.2654 407 0
3 3822 432 6.2361 6.2361 432 0
4 3779 475 6.3263 6.3263 475 0
5 3819 435 6.2851 6.2851 435 0
6 3834 420 5.5952 5.5952 420 0
7 3860 394 7.236 7.236 394 0
8 3810 444 6.4032 6.4032 444 0
9 382

fold  #train  #test  timeToResponse   timeToResponseC   Tresponse_tst  Tresponse_shared
0 3870 384 6.151 6.151 384 0
1 3849 405 6.7704 6.7704 405 0
2 3835 419 6.2601 6.2601 419 0
3 3839 415 5.494 5.494 415 0
4 3806 448 6.2188 6.2188 448 0
5 3810 444 6.3018 6.3018 444 0
6 3785 469 7.6567 7.6567 469 0
7 3834 420 6.6762 6.6762 420 0
8 3828 426 7.1056 7.1056 426 0
9 3830 424 5.8137 5.8137 424 0
***End get_folds method***
***Begin get_folds method***
fold  #train  #test  timeToResponse   timeToResponseC   Tresponse_tst  Tresponse_shared
0 3812 442 5.8665 5.8665 442 0
1 3811 443 6.3431 6.3431 443 0
2 3860 394 6.2462 6.2462 394 0
3 3789 465 6.7656 6.7656 465 0
4 3842 412 6.8956 6.8956 412 0
5 3817 437 6.2243 6.2243 437 0
6 3840 414 6.5242 6.5242 414 0
7 3863 391 5.9258 5.9258 391 0
8 3842 412 6.4029 6.4029 412 0
9 3810 444 7.3221 7.3221 444 0
***End get_folds method***
***Begin get_folds method***
fold  #train  #test  timeToResponse   timeToResponseC   Tresponse_tst  Tresponse_shared
0 3835 4

In [10]:
# Number of CV folds available for the bootstrap run
# (when built rather than loaded from cache, they come from 50 get_folds calls).
len(cv_folds)

500

In [11]:
# Build the BootstrapFeatImp helper from the numerical frame, the label-encoded
# categorical frame, the survival target and the list of CV folds.
# NOTE(review): presumably the object keeps the inputs it is given here, so this
# cell should be re-run whenever num_df / encoded_cat_df are rebuilt - confirm
# against bootstrap_feat_imp.
btstrp_featimp = BootstrapFeatImp(num_df, encoded_cat_df, y, cv_folds)

In [12]:
# Numerical Features Importance
# Returns two frames: the importance details and the per-feature noise comparison.
# Long-running: the recorded run took about 1.5 hours for 500 iterations.
numfeat_df, numfeat_noise_df = btstrp_featimp.run_numfeat_wrapper()

100%|██████████| 500/500 [1:30:30<00:00, 10.86s/it]


In [13]:
# Persist both numerical-feature result frames as CSV files in the current
# working directory; index=False means the row index is not written.
numfeat_df.to_csv("inst_medicaid_numfeat_df.csv", index=False)
numfeat_noise_df.to_csv("inst_medicaid_numfeat_noise_df.csv", index=False)

In [14]:
# Display the noise comparison for the numerical features
# (ge_noise is presumably the number of folds in which the feature ranked at or
# above the noise variable - confirm against bootstrap_feat_imp).
numfeat_noise_df

Unnamed: 0,feature_name,ge_noise
12,count_of_holidays,500
2,days_taken_for_claim_filing,472
0,total_claim_charge_amount,390
5,count_of_lineitems,306
1,units_sum_quantity,301
3,patient_age,285
8,count_of_total_dx_codes,227
7,LOS,201
9,count_of_OccurrenceInformation,190
11,count_of_ExternalCauseOfInjury_HI,190


In [30]:
# Categorical Features Importance
# Runs on the btstrp_featimp object created earlier in the notebook and returns
# the importance details plus the per-feature noise comparison.
# NOTE(review): the saved execution counts show this cell ran after the
# feature-preparation cells were re-executed, while btstrp_featimp came from an
# earlier execution. If the object keeps its own reference to the categorical
# frame, the stored results may reflect the older column set - re-run from a
# fresh kernel to confirm.
catfeat_df, catfeat_noise_df = btstrp_featimp.run_catfeat_wrapper()

100%|██████████| 500/500 [24:02<00:00,  2.88s/it]


In [31]:
# Persist both categorical-feature result frames as CSV files (with a _v2 suffix)
# in the current working directory; index=False means the row index is not written.
catfeat_df.to_csv("inst_medicaid_catfeat_df_v2.csv", index=False)
catfeat_noise_df.to_csv("inst_medicaid_catfeat_noise_df_v2.csv", index=False)

In [32]:
# Display the noise comparison for the categorical features
# (ge_noise is presumably the number of folds in which the feature ranked at or
# above the noise variable - confirm against bootstrap_feat_imp).
catfeat_noise_df

Unnamed: 0,feature_name,ge_noise
26,claim_creation_weekday,500
13,RevCode_Radiology_Diagnostic,419
9,RevCode_Other_Imaging_Services,345
35,payer_name,316
31,claim_has_referring_provider_id,281
36,payer_resp_seq_num,276
6,RevCode_Laboratory,271
39,subscriber_pat_gender,268
37,subscriber_city,263
16,ccsr_category_1_code_ExternalCauseOfInjury_HI_1,261
