### Bootstrap Feature Importance Computation for Professional Claims Commercial Model.
- This notebook captures the steps involved in computing feature importance scores across 500 cross-validation folds.
- This notebook also records the number of times each feature was ranked higher than the noise variable, for both the numerical and the categorical features.

In [None]:
# Importing the required libraries
import sys
import warnings
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

In [None]:
# Make the project-local modules importable by inserting their directories,
# in this order, right after the first entry of sys.path.
extra_import_dirs = [
    "/home/ec2-user/SageMaker/GitHub Repos/cipy/cipy/",
    "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/",
    "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/",
    "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/professional_claims/02_step/",
]
sys.path[1:1] = extra_import_dirs

sys

In [None]:
# Importing the helper methods from the wqp_methods file; get_folds in particular is used below to build the cross-validation folds.
from wqp_methods import *

In [None]:
# Importing the BootstrapFeatImp class for computing Feature Importance Scores
from bootstrap_feat_imp import BootstrapFeatImp

In [None]:
# Column used below to filter the claims by claim filing indicator code.
ref_col = "claim_filing_ind_code3"

# Directory holding the preprocessed datasets, and the professional claims
# file to load from it.
csv_path = "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/01_data_preprocessing/preprocessed_datasets/"
dataset_file = "pc_preprocessed_dataset_2021-06-01.csv"

# Reading the preprocessed Professional claims dataset.
prof_claims = pd.read_csv(csv_path + dataset_file)

# Survival analysis needs an event indicator; every claim is flagged True here.
prof_claims["event_flag"] = True

In [None]:
# Feature details saved by the preprocessing step (01 notebook output). For
# each claim filing indicator code it holds the final categorical and
# numerical column lists used further down.
filtered_columns_data = pickl("prof_claims_cat_num_filtered_feature_details.pickle")

# Segment this notebook derives the final model features for.
claim_filing_indicator_code = "Commercial Insurance Co."

# Keep only the rows of that segment. reset_index() is called without
# drop=True, so the previous row labels are kept as an "index" column.
segment_mask = prof_claims[ref_col] == claim_filing_indicator_code
cfic_df = prof_claims[segment_mask].reset_index()

# Trim the right tail at the 0.99 level (presumably on response_time;
# confirm against clip_right_tail2 in wqp_methods).
cfic_df = clip_right_tail2(cfic_df, 0.99)

# Survival target built from the event_flag and response_time values.
observed_events = cfic_df["event_flag"].values
response_times = cfic_df["response_time"].values
y = Surv.from_arrays(observed_events, response_times)

# Final categorical features for the selected claim filing indicator code.
cat_cols = filtered_columns_data[claim_filing_indicator_code]["final_cat_cols"]
cat_df = cfic_df[cat_cols]

# Label-encode every categorical column on its own; values are cast to str
# before encoding. The single encoder instance is refitted for each column.
le = LabelEncoder()


def _label_encode(column):
    """Return the integer codes of one column after casting it to str."""
    return le.fit_transform(column.astype(str))


encoded_cat_df = cat_df.apply(_label_encode, axis=0, result_type="expand")

# Noise benchmark for the categorical features: seeded Poisson(5) draws,
# one per row.
np.random.seed(0)
encoded_cat_df["noise"] = np.random.poisson(5, size=encoded_cat_df.shape[0])

# Final numerical features for the selected claim filing indicator code.
# An explicit copy is taken so that adding the noise column below writes to an
# independent frame instead of to a selection of cfic_df. Assigning into the
# bare selection is the pattern pandas reports as SettingWithCopyWarning, and
# that warning is hidden here by the notebook-wide warnings filter.
num_cols = filtered_columns_data[claim_filing_indicator_code]["final_num_cols"]
num_df = cfic_df[num_cols].copy()

# Noise benchmark for the numerical features: seeded standard-normal draws,
# one per row.
np.random.seed(0)
num_df["noise"] = np.random.normal(0, 1, len(num_df))

In [None]:
# Cross-validation folds are cached on disk: reuse the pickle when it exists,
# otherwise build the folds and store them for later runs.
cv_folds_file = "prof_commercial_cv_folds.pickle"
cv_folds = []

if not os.path.isfile(cv_folds_file):
    # 50 repetitions of get_folds; each call's folds are appended to the list
    # (presumably 10 folds per call, giving the 500 folds mentioned in the
    # notebook header; confirm against get_folds).
    for repetition in range(50):
        cv_folds.extend(get_folds(cfic_df))

    pickl(cv_folds_file, cv_folds)

else:
    cv_folds = pickl(cv_folds_file)

In [None]:
# Sanity check: total number of folds loaded or built above (the notebook header mentions 500).
len(cv_folds)

In [None]:
# Initializing the BootstrapFeatImp object with, in order: the numerical
# features (including their noise column), the label-encoded categorical
# features (including their noise column), the survival target y and the
# list of cross-validation folds.
btstrp_featimp = BootstrapFeatImp(num_df, encoded_cat_df, y, cv_folds)

In [None]:
# Numerical Features Importance
# The wrapper returns two objects. Going by the notebook header, numfeat_df
# presumably holds the importance scores across folds and numfeat_noise_df the
# number of times each feature outranked the noise variable. Confirm against
# BootstrapFeatImp.run_numfeat_wrapper.
numfeat_df, numfeat_noise_df = btstrp_featimp.run_numfeat_wrapper()

In [None]:
# Display the second result of the numerical wrapper (presumably the feature-vs-noise ranking counts).
numfeat_noise_df

In [None]:
# Categorical Features Importance
# The wrapper returns two objects. Going by the notebook header, catfeat_df
# presumably holds the importance scores across folds and catfeat_noise_df the
# number of times each feature outranked the noise variable. Confirm against
# BootstrapFeatImp.run_catfeat_wrapper.
catfeat_df, catfeat_noise_df = btstrp_featimp.run_catfeat_wrapper()

In [None]:
# Display the second result of the categorical wrapper (presumably the feature-vs-noise ranking counts).
catfeat_noise_df

In [None]:
# Persist the four result frames as CSV files in the working directory,
# without writing the row index. Numerical results are written first, then
# the categorical ones.
csv_outputs = {
    "prof_commercial_numfeat_df.csv": numfeat_df,
    "prof_commercial_numfeat_noise_df.csv": numfeat_noise_df,
    "prof_commercial_catfeat_df.csv": catfeat_df,
    "prof_commercial_catfeat_noise_df.csv": catfeat_noise_df,
}
for csv_name, result_frame in csv_outputs.items():
    result_frame.to_csv(csv_name, index=False)