### Bootstrap Feature Imporatance Computation for Institutional Claims Commercial Model.
- This notebook captures the steps involved in computing feature importance scores across 500 cross-validation folds.
- Further this notebook also holds the information on the number of times a feature was ranked higher than noise variable - for both numerical and categorical features data.

In [12]:
# Importing the required libraries
import os
import sys
import warnings
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

In [13]:
# Adding dependency python files to sys path
sys.path[1:1] = ["/home/ec2-user/SageMaker/GitHub Repos/cipy/cipy/"]
sys.path[2:2] = ["/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/"]
sys.path[3:3] = ["/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/"]
sys.path[4:4] = ["/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/institutional_claims/02_step"]

sys.path

['/home/ec2-user/SageMaker/envs/wqp_env/lib/python3.6',
 '/home/ec2-user/SageMaker/GitHub Repos/cipy/cipy/',
 '/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/',
 '/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/',
 '/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/institutional_claims/02_step',
 '/home/ec2-user/SageMaker/GitHub Repos/cipy/cipy/',
 '/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/',
 '/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/',
 '/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/institutional_claims/02_step',
 '',
 '/home/ec2-user/SageMaker/envs/wqp_env/lib/python3.6/site-packages',
 '/home/ec2-user/SageMaker/envs/wqp_env/lib/python3.6/lib-dynload',
 '/home/ec2-user/anaconda3/envs/JupyterSystemEnv/lib/python3.6',
 '/home/ec2-user/SageMaker/envs/wqp_env/lib/python

In [14]:
# Importing methods from wqp_methods path file, especially get_cvfolds will be leveraged in this notebook.
from wqp_methods import *

In [15]:
# Importing BootStrapFeatImp class for computing Feature Importance Scores
from bootstrap_feat_imp import BootstrapFeatImp

In [16]:
# Reading the Institutional claims dataset
ref_col= "claim_filing_ind_code3"

# csv_path for Institutional Claims Data Path
csv_path = "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/01_data_preprocessing/preprocessed_datasets/"

# Loading the institutional claims dataset into inst_claims
inst_claims = pd.read_csv(csv_path+"ic_preprocessed_dataset_2021-06-06.csv")

# Creating the event_flag - for survival analysis.
inst_claims["event_flag"] = True

In [17]:
# Load the feature information after preprocessing - 01 Notebook output
filtered_columns_data = pickl("/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/institutional_claims/02_step/inst_claims_cat_num_filtered_feature_details.pickle")

# Derive Final Model Features for..
claim_filing_indicator_code = "Commercial Insurance Co."
    
# Retrieving claim_filing_indicator_code specific data
cfic_df = inst_claims[inst_claims[ref_col]==claim_filing_indicator_code].reset_index()

# Clipping the response_time values as per claim_filing_indicator_code specific percentiles
cfic_df = clip_right_tail2(cfic_df, 0.99)

# Creating the structured array with event_flag and response_time values.
y = Surv.from_arrays(cfic_df["event_flag"].values, cfic_df["response_time"].values)

# Retrieving categorical features..
cat_cols = filtered_columns_data[claim_filing_indicator_code]["final_cat_cols"]
cat_df = cfic_df[cat_cols]

# Encoding categorical features..
le = LabelEncoder()
encoded_cat_df = cat_df.apply(lambda col: le.fit_transform(col.astype(str)), axis=0, result_type="expand")

# Adding noise variable for categorical data
np.random.seed(0)
encoded_cat_df["noise"] = np.random.poisson(5, size=len(encoded_cat_df))

# Retrieving numerical features..
num_cols = filtered_columns_data[claim_filing_indicator_code]["final_num_cols"]
num_df = cfic_df[num_cols]

# Adding noise variable for numerical data
np.random.seed(0)
num_df["noise"] = np.random.normal(0, 1, len(num_df))

In [18]:
cv_folds = []

if os.path.isfile("inst_commercial_cv_folds.pickle"):
    cv_folds=pickl("inst_commercial_cv_folds.pickle")

else:
    for i in range(50):
        cv_folds += get_folds(cfic_df)
        
    pickl("inst_commercial_cv_folds.pickle", cv_folds)

***Begin get_folds method***
fold  #train  #test  timeToResponse   timeToResponseC   Tresponse_tst  Tresponse_shared
0 779 83 11.289 11.289 83 0
1 774 88 10.08 10.08 88 0
2 776 86 8.8372 8.8372 86 0
3 752 110 13.191 13.191 110 0
4 783 79 12.797 12.797 79 0
5 785 77 10.688 10.688 77 0
6 777 85 10.071 10.071 85 0
7 777 85 11.729 11.729 85 0
8 792 70 10.143 10.143 70 0
9 763 99 12.505 12.505 99 0
***End get_folds method***
***Begin get_folds method***
fold  #train  #test  timeToResponse   timeToResponseC   Tresponse_tst  Tresponse_shared
0 776 86 10.686 10.686 86 0
1 777 85 11.341 11.341 85 0
2 782 80 12.375 12.375 80 0
3 764 98 12.357 12.357 98 0
4 773 89 9.5843 9.5843 89 0
5 781 81 13.407 13.407 81 0
6 771 91 11.264 11.264 91 0
7 775 87 10.23 10.23 87 0
8 774 88 10.375 10.375 88 0
9 785 77 10.636 10.636 77 0
***End get_folds method***
***Begin get_folds method***
fold  #train  #test  timeToResponse   timeToResponseC   Tresponse_tst  Tresponse_shared
0 780 82 9.5 9.5 82 0
1 766 96 10.26 

fold  #train  #test  timeToResponse   timeToResponseC   Tresponse_tst  Tresponse_shared
0 780 82 10.122 10.122 82 0
1 779 83 11.494 11.494 83 0
2 771 91 12.264 12.264 91 0
3 769 93 12.366 12.366 93 0
4 778 84 11.417 11.417 84 0
5 772 90 10.411 10.411 90 0
6 776 86 9.1047 9.1047 86 0
7 771 91 11.868 11.868 91 0
8 775 87 11.575 11.575 87 0
9 787 75 11.387 11.387 75 0
***End get_folds method***
***Begin get_folds method***
fold  #train  #test  timeToResponse   timeToResponseC   Tresponse_tst  Tresponse_shared
0 763 99 14.313 14.313 99 0
1 772 90 11.578 11.578 90 0
2 785 77 11.597 11.597 77 0
3 784 78 10.91 10.91 78 0
4 774 88 10.955 10.955 88 0
5 783 79 10.19 10.19 79 0
6 773 89 11.135 11.135 89 0
7 782 80 11.425 11.425 80 0
8 784 78 8.5385 8.5385 78 0
9 758 104 10.837 10.837 104 0
***End get_folds method***
***Begin get_folds method***
fold  #train  #test  timeToResponse   timeToResponseC   Tresponse_tst  Tresponse_shared
0 773 89 9.764 9.764 89 0
1 775 87 11.391 11.391 87 0
2 785 77 9.7

In [19]:
len(cv_folds)

500

In [20]:
# Initializing BootStrapFeatImp object
btstrp_featimp = BootstrapFeatImp(num_df, encoded_cat_df, y, cv_folds)

In [21]:
# Numerical Features Importance
numfeat_df, numfeat_noise_df = btstrp_featimp.run_numfeat_wrapper()

100%|██████████| 500/500 [06:27<00:00,  1.29it/s]


In [22]:
# Saving the details..
numfeat_df.to_csv("inst_commercial_numfeat_df.csv", index=False)
numfeat_noise_df.to_csv("inst_commercial_numfeat_noise_df.csv", index=False)

In [26]:
numfeat_noise_df

Unnamed: 0,feature_name,ge_noise
3,patient_age,414
2,days_taken_for_claim_filing,400
9,count_of_holidays,373
0,total_claim_charge_amount,367
12,count_of_other_subscribers,360
6,count_of_total_dx_codes,346
1,units_sum_quantity,344
8,count_of_ExternalCauseOfInjury_HI,333
7,units_median_quantity,320
4,LOS,312


In [23]:
# Categorical Features Importance
catfeat_df, catfeat_noise_df = btstrp_featimp.run_catfeat_wrapper()

100%|██████████| 500/500 [09:57<00:00,  1.19s/it]


In [24]:
catfeat_df.to_csv("inst_commercial_catfeat_df.csv", index=False)
catfeat_noise_df.to_csv("inst_commercial_catfeat_noise_df.csv", index=False)

In [27]:
catfeat_noise_df

Unnamed: 0,feature_name,ge_noise
38,payer_city,494
39,payer_name,493
40,payer_state,466
22,claim_creation_month,463
24,claim_creation_weekday,432
25,claim_filing_ind_code,425
31,facility_code_value,422
23,claim_creation_quarter,391
26,claim_freq_type_code,379
28,claim_has_payer_claim_control_number,375
