In [None]:
import sys
import json
import pandas as pd
import warnings

# NOTE(review): no category or module filter is given, so this suppresses every
# warning for the rest of the kernel session — confirm that is intended.
warnings.filterwarnings("ignore")

In [None]:
# Make the cipy checkout and this project's directory importable (the project
# directory is presumably where `wqp_methods` lives — confirm).
# The entries go to positions 1 and 2, i.e. right after sys.path[0].
# The membership check keeps the cell idempotent: re-running it does not stack
# duplicate entries in sys.path.
for position, extra_path in enumerate(
    [
        "/home/ec2-user/SageMaker/GitHub Repos/cipy/cipy/",
        "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/",
    ],
    start=1,
):
    if extra_path not in sys.path:
        sys.path.insert(position, extra_path)

In [None]:
# Display the module search path to confirm the directories inserted above are present
sys.path

In [None]:
from wqp_methods import *

In [None]:
# File-system locations used by this notebook.
# All three live under the same project directory, so the shared prefix is defined once.
project_root = "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/"

# Directory holding the preprocessed professional claims CSV files
csv_path = project_root + "01_data_preprocessing/preprocessed_datasets/"

# Directory holding the JSON files that list the numerical and categorical column names
# (despite the name, this is a directory prefix, not a single file)
feature_file = project_root + "02_feat_sel_and_imp/professional_claims/json_files/"

# Directory for this step's outputs: the filtered categorical & numerical columns and
# the categorical & numerical statistics for each claim_filing_indicator_code
output_path = project_root + "02_feat_sel_and_imp/professional_claims/02_step/"

In [None]:
# Read the preprocessed professional claims dataset into prof_claims
prof_claims_csv = csv_path + "pc_preprocessed_dataset_2021-06-01.csv"
prof_claims = pd.read_csv(prof_claims_csv)

In [None]:
# Read the list of categorical column names from its JSON file
cat_cols_json = feature_file + "pc_cat_cols.json"
with open(cat_cols_json) as cat_cols_fh:
    cat_list = json.load(cat_cols_fh)

In [None]:
# Read the list of numerical column names from its JSON file
num_cols_json = feature_file + "pc_num_cols.json"
with open(num_cols_json) as num_cols_fh:
    num_list = json.load(num_cols_fh)

In [None]:
# Column whose distinct values define the groups; a model will be created per value
ref_col = "claim_filing_ind_code3"

# Collect the distinct claim filing indicator codes present in the dataset
ref_col_values = prof_claims[ref_col]
claim_filing_indicator_codes = set(ref_col_values)

In [None]:
# Second argument passed to clip_right_tail2 in the filtering loop, keyed by claim
# filing indicator code (presumably the upper quantile at which response_time is
# clipped — confirm against wqp_methods)
right_tail_clipping_dict = {
    "Medicaid": 0.98,
    "Commercial Insurance Co.": 0.99,
    "Medicare Part B": 0.99,
}

In [None]:
# For every claim filing indicator code with enough observations, filter the
# categorical and numerical columns and collect the retained column names, the
# filter trackers and the per-column statistics.
prof_claims_filtered_features_dict = {}

# A code is considered only if it has more than this many observations
min_observations = 200

# Quantile used when a code has no entry in right_tail_clipping_dict
# (matches the 99th percentile used for most configured codes)
default_right_tail_quantile = 0.99

# A set has no stable iteration order (string hashes are randomised per process),
# so iterate in sorted order to make the printed log and the key order of the
# result reproducible. key=str keeps the sort working if NaN sits next to strings.
for claim_filing_indicator_code in sorted(claim_filing_indicator_codes, key=str):
    print("Claim Filing Indicator Code: ", claim_filing_indicator_code)

    # Retrieving data specific to claim_filing_indicator_code
    cfic_df = prof_claims[prof_claims[ref_col] == claim_filing_indicator_code].reset_index(drop=True)

    # Skip codes that do not have more than min_observations rows
    if cfic_df.shape[0] <= min_observations:
        print("\t Claim filing indicator code {} has {} or fewer observations, hence we're not considering this for modeling".format(claim_filing_indicator_code, min_observations))
        continue

    # Shape of claim_filing_indicator_code specific data
    print("\t Dataset shape: {}".format(cfic_df.shape))

    # Look up the code-specific clipping quantile; fall back to the default with a
    # visible notice instead of aborting the whole loop with a KeyError
    if claim_filing_indicator_code in right_tail_clipping_dict:
        right_tail_quantile = right_tail_clipping_dict[claim_filing_indicator_code]
    else:
        right_tail_quantile = default_right_tail_quantile
        print("\t No right tail clipping value configured for {}, using default {}".format(claim_filing_indicator_code, default_right_tail_quantile))

    # Clipping the right tail (per the original note: response_time values above
    # the given percentile — confirm against clip_right_tail2 in wqp_methods)
    cfic_df = clip_right_tail2(cfic_df, right_tail_quantile)

    # Retrieving Categorical Columns
    cat_df = cfic_df[cat_list]

    # Filter Categorical Columns (per the original note: based on missingness, cardinality etc.)
    cat_stats, filtered_cat_cols_tracker = filter_cat_cols(cat_df)

    # The retained categorical columns are the index labels of cat_stats
    final_cat_cols = list(cat_stats.index)

    # Retrieving Numerical Columns
    num_df = cfic_df[num_list]

    # Filter Numerical Columns (per the original note: based on missingness and variance threshold etc.)
    num_stats, filtered_num_cols_tracker = filter_numeric_cols(num_df)

    # The retained numerical columns are the index labels of num_stats
    final_num_cols = list(num_stats.index)

    prof_claims_filtered_features_dict[claim_filing_indicator_code] = {
        "filtered_cat_cols_tracker": filtered_cat_cols_tracker,
        "final_cat_cols": final_cat_cols,
        "filtered_num_cols_tracker": filtered_num_cols_tracker,
        "final_num_cols": final_num_cols,
        "cat_stats": cat_stats,
        "num_stats": num_stats,
    }

In [None]:
# Display the full result dict built by the loop above (one entry per retained code,
# including the cat_stats / num_stats objects)
prof_claims_filtered_features_dict

In [None]:
# Saving prof_claims_filtered_features_dict to a pickle file via the pickl helper.
# The target lives in this step's output directory, so build it from output_path
# instead of repeating the absolute path.
pickl(output_path + "prof_claims_cat_num_filtered_feature_details.pickle", prof_claims_filtered_features_dict)