### Bootstrap Feature Importance Computation for Institutional Claims Medicaid Model.
- This notebook captures the steps involved in computing feature importance scores across 500 cross-validation folds.
- Further, this notebook also records the number of times each feature was ranked higher than the noise variable, for both the numerical and the categorical feature data.

In [None]:
# Importing the required libraries
import os
import sys
import warnings
import numpy as np
import pandas as pd

# NOTE(review): this silences every warning for the rest of the session,
# including any pandas warnings raised by the cells below. Consider narrowing
# the filter to specific categories while debugging.
warnings.filterwarnings("ignore")

In [None]:
# Make the project's helper modules importable.
# The directories are inserted at positions 1..4 of sys.path, in the order
# listed, so they are searched right after the first existing entry.
extra_module_dirs = [
    "/home/ec2-user/SageMaker/GitHub Repos/cipy/cipy/",
    "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/",
    "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/",
    "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/institutional_claims/02_step",
]
for position, module_dir in enumerate(extra_module_dirs, start=1):
    sys.path.insert(position, module_dir)

# Display the resulting search path as the cell output
sys.path

In [None]:
# Star-import the helper functions from the wqp_methods module.
# The cells below call get_folds, pickl and clip_right_tail2 (and use Surv and
# LabelEncoder) without importing them anywhere else, so they are presumably
# all supplied by this import — TODO confirm against wqp_methods.
from wqp_methods import *

In [None]:
# Importing the BootstrapFeatImp class for computing Feature Importance Scores
from bootstrap_feat_imp import BootstrapFeatImp

In [None]:
# --- Load the Institutional Claims dataset ---

# Column compared against the claim filing indicator code when the
# payer-specific rows are selected further down.
ref_col = "claim_filing_ind_code3"

# Directory that holds the preprocessed Institutional Claims datasets
csv_path = "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/01_data_preprocessing/preprocessed_datasets/"

# Read the preprocessed snapshot into inst_claims
dataset_file = "ic_preprocessed_dataset_2021-06-06.csv"
inst_claims = pd.read_csv(csv_path + dataset_file)

# event_flag is set to True on every row; it is the event indicator passed to
# the survival-analysis target later in the notebook.
inst_claims["event_flag"] = True

In [None]:
# Categorical feature columns used for the Medicaid model.
final_cat_cols = [
    # Description_* columns
    "Description_OccurrenceInformation_1",
    "Description_OccurrenceInformation_2",
    # HCPCSCode_* columns
    "HCPCSCode_Medicine_Services_and_Procedures",
    "HCPCSCode_Pathology_and_Laboratory_Procedures",
    "HCPCSCode_Surgery",
    # RevCode_* columns
    "RevCode_Emergency_Room",
    "RevCode_Laboratory",
    "RevCode_Medical/Surgical_Supplies_and_Devices",
    "RevCode_Operating_Room_Services",
    "RevCode_Other_Imaging_Services",
    "RevCode_Pharmacy",
    "RevCode_Pharmacy_-_Extension_of_025X",
    "RevCode_Physical_Therapy",
    "RevCode_Radiology_Diagnostic",
    # admission_* columns
    "admission_source_code",
    "admission_type_code",
    # ccsr_category_1_code_* columns
    "ccsr_category_1_code_ExternalCauseOfInjury_HI_1",
    "ccsr_category_1_code_OtherDiagnosisInformation_HI_1",
    "ccsr_category_1_code_OtherDiagnosisInformation_HI_2",
    "ccsr_category_1_code_OtherDiagnosisInformation_HI_3",
    "ccsr_category_1_code_Patient’SReasonForVisit_1",
    "ccsr_category_1_code_PrincipalDiagnosis_1",
    # claim_* columns
    "claim_creation_hour",
    "claim_creation_weekday",
    "claim_freq_type_code",
    "claim_has_admission_dthr",
    "claim_has_operating_physician_id",
    "claim_has_payer_claim_control_number",
    "claim_has_referring_provider_id",
    # other_* columns
    "other_hcpcscodes",
    "other_ndccodes",
    "other_revcodes",
    # payer_* columns
    "payer_name",
    "payer_resp_seq_num",
    # subscriber_* columns
    "subscriber_city",
    "subscriber_gender",
    "subscriber_pat_gender",
]

In [None]:
# Load the feature information after preprocessing - 01 Notebook output.
# NOTE(review): `pickl` is a project helper (not defined in this notebook);
# called with a single path it is used here as a loader. If it unpickles the
# file, make sure the pickle comes from a trusted source.
filtered_columns_data = pickl("/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/institutional_claims/02_step/inst_claims_cat_num_filtered_feature_details.pickle")

# Derive Final Model Features for the Medicaid claim filing indicator code.
claim_filing_indicator_code = "Medicaid"

# Retrieving claim_filing_indicator_code specific data.
# reset_index() is called without drop=True, so the previous row labels are
# kept as an extra "index" column in cfic_df.
cfic_df = inst_claims[inst_claims[ref_col] == claim_filing_indicator_code].reset_index()

# Clipping the response_time values as per claim_filing_indicator_code specific percentiles
# NOTE(review): 0.98 is presumably the upper percentile used for clipping — confirm in clip_right_tail2.
cfic_df = clip_right_tail2(cfic_df, 0.98)

# Creating the structured array with event_flag and response_time values.
y = Surv.from_arrays(cfic_df["event_flag"].values, cfic_df["response_time"].values)

# Retrieving categorical features. The hard-coded final_cat_cols list is used
# here instead of filtered_columns_data[claim_filing_indicator_code]["final_cat_cols"].
cat_cols = final_cat_cols
# Take an explicit copy: the frame is modified below, and modifying a column
# selection of cfic_df directly is the chained-assignment pattern pandas warns about.
cat_df = cfic_df[cat_cols].copy()

# Combining 'RevCode_Pharmacy' & 'RevCode_Pharmacy_-_Extension_of_025X':
# rows flagged "Yes" in the extension column are flagged "Yes" in
# RevCode_Pharmacy, then the extension column is dropped.
mask1 = cat_df["RevCode_Pharmacy_-_Extension_of_025X"] == "Yes"
cat_df.loc[mask1, "RevCode_Pharmacy"] = "Yes"
cat_df = cat_df.drop(columns=["RevCode_Pharmacy_-_Extension_of_025X"])

# Encoding categorical features. Each column is cast to str and encoded
# independently; the single encoder is refit for every column, so after this
# line `le` only reflects the last column that was processed.
le = LabelEncoder()
encoded_cat_df = cat_df.apply(lambda col: le.fit_transform(col.astype(str)), axis=0, result_type="expand")

# Adding noise variable for categorical data (seeded for reproducibility)
np.random.seed(0)
encoded_cat_df["noise"] = np.random.poisson(5, size=len(encoded_cat_df))

# Retrieving numerical features. Copied for the same reason as cat_df:
# a new column is added to the frame below.
num_cols = filtered_columns_data[claim_filing_indicator_code]["final_num_cols"]
num_df = cfic_df[num_cols].copy()

# Adding noise variable for numerical data (seeded for reproducibility)
np.random.seed(0)
num_df["noise"] = np.random.normal(0, 1, len(num_df))

In [None]:
# Build the cross-validation folds once and cache them on disk; later runs
# reuse the cached folds instead of regenerating them.
cv_folds_file = "inst_medicaid_cv_folds.pickle"

if os.path.isfile(cv_folds_file):
    # Cached folds exist: load them
    cv_folds = pickl(cv_folds_file)
else:
    # No cache yet: call get_folds 50 times and collect all returned folds
    # NOTE(review): the notebook header mentions 500 folds, so get_folds
    # presumably returns 10 folds per call — confirm.
    cv_folds = []
    for _ in range(50):
        cv_folds.extend(get_folds(cfic_df))

    # Save the folds for subsequent runs
    pickl(cv_folds_file, cv_folds)

In [None]:
# Display how many cross-validation folds were loaded or generated
len(cv_folds)

In [None]:
# Initializing the BootstrapFeatImp object with the numerical features
# (including their noise column), the encoded categorical features (including
# their noise column), the survival target y and the cross-validation folds.
btstrp_featimp = BootstrapFeatImp(num_df, encoded_cat_df, y, cv_folds)

In [None]:
# Numerical Features Importance
# Returns two objects; both are written to CSV in the next cell.
# NOTE(review): presumably numfeat_df holds the importance scores and
# numfeat_noise_df the feature-vs-noise ranking counts — confirm in BootstrapFeatImp.
numfeat_df, numfeat_noise_df = btstrp_featimp.run_numfeat_wrapper()

In [None]:
# Persist the numerical feature results as CSV files (row index omitted)
numfeat_outputs = {
    "inst_medicaid_numfeat_df.csv": numfeat_df,
    "inst_medicaid_numfeat_noise_df.csv": numfeat_noise_df,
}
for out_name, frame in numfeat_outputs.items():
    frame.to_csv(out_name, index=False)

In [None]:
# Display the noise-comparison results for the numerical features
numfeat_noise_df

In [None]:
# Categorical Features Importance
# Returns two objects; both are written to CSV in the next cell.
# NOTE(review): presumably catfeat_df holds the importance scores and
# catfeat_noise_df the feature-vs-noise ranking counts — confirm in BootstrapFeatImp.
catfeat_df, catfeat_noise_df = btstrp_featimp.run_catfeat_wrapper()

In [None]:
# Persist the categorical feature results as CSV files (row index omitted)
catfeat_outputs = {
    "inst_medicaid_catfeat_df_v2.csv": catfeat_df,
    "inst_medicaid_catfeat_noise_df_v2.csv": catfeat_noise_df,
}
for out_name, frame in catfeat_outputs.items():
    frame.to_csv(out_name, index=False)

In [None]:
# Display the noise-comparison results for the categorical features
catfeat_noise_df