In [None]:
import sys
import pickle
import warnings
import pandas as pd
from datetime import date
import s3fs
import boto3

# NOTE(review): this suppresses every warning for the whole kernel session
# (including pandas deprecation warnings); consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Make the project's preprocessing scripts importable by inserting their
# folder at position 1 of the module search path.
sys.path.insert(
    1,
    "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/01_data_preprocessing/preprocessing_scripts",
)

In [None]:
from inst_claims_preprocessing_methods import InstClaimsPreprocessing

In [None]:
# Helper object whose methods perform every preprocessing step in this notebook
icp = InstClaimsPreprocessing()

In [None]:
# Folder holding the JSON artifacts (column lists, config, code dictionaries) read below
json_path = "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/01_data_preprocessing/preprocessing_artifacts/institutional_artifacts/"

In [None]:
# Columns that together identify a claim uniquely
ref_cols = icp.read_json(path=json_path, file_name="ref_cols.json")["ref_cols"]

# Additional columns used downstream
cols_for_sa = icp.read_json(path=json_path, file_name="cols_for_sa.json")["cols_for_sa"]

# Mapping passed to DataFrame.rename(columns=...) when the claim-level columns are renamed
rename_cols = icp.read_json(path=json_path, file_name="rename_cols.json")

In [None]:
# preprocessing_config.json holds the paths to the reference files read in the next cell
# (payer claim counts, revenue codes, CCSR/CCS, NDC, DRG, condition/value/occurrence codes, HCPCS)
config = icp.read_json(path=json_path, file_name="preprocessing_config.json")

In [None]:
# Claims extract (837I) loaded from S3.
# NOTE(review): read_pickle can execute arbitrary code while unpickling -- only load this
# object from a trusted bucket. The path is hard-coded rather than taken from `config`.
claims_data = pd.read_pickle("s3://cerner-mlecosystem-processing-us-west-2-dsprod/manual-uploads/hdx/Extracts Backup/Pickle files/837I_paired_version2.pkl")

# Reference tables read with pandas' default dtype inference.
# NOTE(review): unlike the tables further down, these are not read with dtype=str; confirm
# that code columns used as merge keys (e.g. "Revenue Codes") keep their leading zeros.
payers_claims_counts = pd.read_csv(config["payer_claim_counts"])
revenue_codes = pd.read_csv(config["rev_codes_path"])
dx_ccsr = pd.read_csv(config["ccsr_path"])  # CCSR categories, applied to the ICD-10-CM code columns
dx_ccs = pd.read_csv(config["ccs_path"])  # CCS descriptions, applied to the ICD-10-PCS code columns

# Reference tables read entirely as strings
ndc_codes = pd.read_csv(config["ndc_codes_path"], dtype=str)

# "Description" is renamed so it can be told apart from the other tables' description columns
drg_codes = pd.read_csv(config["drg_codes_path"], dtype=str).rename(columns={"Description": "Drg_Description"})

cond_codes = pd.read_csv(config["cond_codes_path"], dtype=str)

# The long-description / auxiliary columns below are dropped right after loading
value_codes = pd.read_csv(config["val_codes_path"], dtype=str).drop(columns=["Long_description"])
occur_codes = pd.read_csv(config["occur_info_path"], dtype=str).drop(columns=["Long_Description"])
hcpcs_codes = pd.read_csv(config["hcpcs_codes_path"], dtype=str).drop(columns=["Category", "Types", "Code"])
occur_span_codes = pd.read_csv(config["occur_span_path"], dtype=str).drop(columns=["Long_description"])

In [None]:
# presumably prefixes the values of the "Section" column with "HCPCSCode_" -- confirm in add_prefix_strings
hcpcs_codes = icp.add_prefix_strings(hcpcs_codes, "HCPCSCode_", ["Section"])

In [None]:
# presumably prefixes the values of the "Concept_Name" column with "NDCCode_" -- confirm in add_prefix_strings
ndc_codes = icp.add_prefix_strings(ndc_codes, "NDCCode_", ["Concept_Name"])

In [None]:
# presumably prefixes the values of the three listed columns with "RevCode_" -- confirm in add_prefix_strings
revenue_codes = icp.add_prefix_strings(revenue_codes, "RevCode_", ["Category", "Description", "Major Category Description"])

### Creating count_of_lineitems feature

In [None]:
# Shape of claims_data before the feature is added
rows_count1, cols_count1 = claims_data.shape

# Add the count_of_lineitems feature to claims_data
claims_data = icp.get_count_of_lineitems(claims_data, ref_cols)

# Shape of claims_data after the feature is added
rows_count2, cols_count2 = claims_data.shape

# The number of observations must stay the same ...
assert rows_count2 == rows_count1, "Error! The number of observations are not constant as expected."

# ... and exactly one new column must have appeared
assert cols_count2 == cols_count1 + 1, "Error! count_of_lineitems feature was not created as expected."

### Validating count_of_lineitems feature:

#### Case-1: count_of_lineitems shouldn't be zero.

In [None]:
# Flag rows whose line-item count is zero; none are expected
zero_mask = claims_data["count_of_lineitems"].eq(0)
assert not zero_mask.any(), "Error! There are claims with zero count_of_lineitems in dataset"

#### Case-2: Randomly select a claim file and check if the number of line items is equal to the count_of_lineitems value or not.

In [None]:
# Spot check: pick one claim at random (one row per distinct ref_cols combination)
# and verify that its number of line-item rows equals its count_of_lineitems value.
random_sample = claims_data[ref_cols].drop_duplicates(keep="first").sample(1)
print(f"Claim Submitter Id is:{random_sample[ref_cols[3]].unique()[0]}")

# Key values of the sampled claim. The names deliberately avoid `date` and `time`:
# binding `date` here would overwrite the `datetime.date` class imported at the top
# of the notebook, which the export cell relies on.
ref_ident = random_sample[ref_cols[0]].values[0]
claim_date = random_sample[ref_cols[1]].values[0]
claim_time = random_sample[ref_cols[2]].values[0]
claim_submitter_id = random_sample[ref_cols[3]].values[0]

# One boolean mask per key column; only the first four ref_cols are used
mask1 = claims_data[ref_cols[0]] == ref_ident
mask2 = claims_data[ref_cols[1]] == claim_date
mask3 = claims_data[ref_cols[2]] == claim_time
mask4 = claims_data[ref_cols[3]] == claim_submitter_id

# All line-item rows that belong to the sampled claim
random_sample_df = claims_data[mask1 & mask2 & mask3 & mask4]
assert (random_sample_df.shape[0] == random_sample_df["count_of_lineitems"].unique()[0]), "Error! The number of line items and count_of_lineitems are not matching."

In [None]:
# NOTE(review): no parentheses -- if get_numeric_cols is a method, this cell only displays
# the bound method rather than the column list; confirm whether a call was intended.
icp.get_numeric_cols

### Creating the below aggregate features:
- sum:
    - days_sum_quantity
    - units_sum_quantity
- mean:
    - days_mean_quantity
    - units_mean_quantity
- median:
    - days_median_quantity
    - units_median_quantity

In [None]:
# Shape of claims_data before the aggregate features are added
rows_count1, cols_count1 = claims_data.shape

# Add the six aggregate (sum / mean / median) features to claims_data
claims_data = icp.get_agg_features(claims_data, ref_cols, "DA")

# Shape of claims_data after the aggregate features are added
rows_count2, cols_count2 = claims_data.shape

# The number of observations must stay the same ...
assert rows_count2 == rows_count1, "Error! The number of observations are not constant as expected."

# ... and exactly six new columns must have appeared
assert cols_count2 == cols_count1 + 6, "Error! Aggregate features were not created as expected."

In [None]:
# Register the six aggregate features in the numeric columns list
agg_feature_cols = [
    "days_sum_quantity", "units_sum_quantity",
    "days_mean_quantity", "units_mean_quantity",
    "days_median_quantity", "units_median_quantity",
]
icp.append_to_numeric_cols(agg_feature_cols)

### Creating "count_of_other_subscribers" feature

In [None]:
# Shape of claims_data before the feature is added
rows_count1, cols_count1 = claims_data.shape

# Add the count_of_other_subscribers feature
claims_data = icp.get_count_of_other_subscribers(claims_data)

# Shape of claims_data after the feature is added
rows_count2, cols_count2 = claims_data.shape

# The number of observations must stay the same ...
assert rows_count2 == rows_count1, "Error! The number of observations are not constant as expected."

# ... and exactly one new column must have appeared
assert cols_count2 == cols_count1 + 1, "Error! count_of_other_subscribers feature was not created as expected."

### Creating "count_of_invest_dev_exemp" feature

In [None]:
# Shape of claims_data before the feature is added
rows_count1, cols_count1 = claims_data.shape

# Add the count_of_invest_dev_exemp feature
claims_data = icp.get_count_of_invest_dev_exemp(claims_data)

# Shape of claims_data after the feature is added
rows_count2, cols_count2 = claims_data.shape

# The number of observations must stay the same ...
assert rows_count2 == rows_count1, "Error! The number of observations are not constant as expected."

# ... and exactly one new column must have appeared
assert cols_count2 == cols_count1 + 1, "Error! count_of_invest_dev_exemp feature was not created as expected."

### Reducing all categories with value_counts() less than 1% of the total under "revenue codes" into "other_revcodes"

In [None]:
# Shape of claims_data before the reduced revenue-code column is added
rows_count1, cols_count1 = claims_data.shape

# Service-line revenue code column; the result goes into "reduced_serviceline_revcode"
# with "other_revcodes" as the catch-all label.
# NOTE(review): the literal 1 is presumably the 1% frequency threshold from the heading -- confirm in reduce_unique_values
column_name = "Loop2400_SV2-Segment-InstitutionalServiceLine_SV201-Product/ServiceID-234"
claims_data = icp.reduce_unique_values(claims_data, column_name, "reduced_serviceline_revcode", 1, "other_revcodes")

# Shape of claims_data after the reduced revenue-code column is added
rows_count2, cols_count2 = claims_data.shape

# The number of observations must stay the same ...
assert rows_count2 == rows_count1, "Error! The number of observations are not constant as expected."

# ... and exactly one new column must have appeared
assert cols_count2 == cols_count1 + 1, "Error! reduced_serviceline_revcode column was not created as expected."

### Reducing all categories with value_counts() less than 1% of the total under "HCPCS codes" into "other_hcpcscodes"

In [None]:
# Shape of claims_data before the reduced HCPCS-code column is added
rows_count1, cols_count1 = claims_data.shape

# Service-line HCPCS code column; the result goes into "reduced_serviceline_hcpcscode"
# with "other_hcpcscodes" as the catch-all label.
# NOTE(review): the literal 1 is presumably the 1% frequency threshold from the heading -- confirm in reduce_unique_values
column_name = "Loop2400_SV2-Segment-InstitutionalServiceLine_SV202-COMPOSITEMEDICALPROCEDUREIDENTIFIER-Composite-C003_02-Product/ServiceID-234"
claims_data = icp.reduce_unique_values(claims_data, column_name, "reduced_serviceline_hcpcscode", 1, "other_hcpcscodes")

# Shape of claims_data after the reduced HCPCS-code column is added
rows_count2, cols_count2 = claims_data.shape

# The number of observations must stay the same
assert (rows_count1 == rows_count2), "Error! The number of observations are not constant as expected."

# Exactly one new column must have appeared; the message names the column this cell creates
# (it previously referred to count_of_lineitems, a copy-paste leftover)
assert (cols_count1 + 1 == cols_count2), "Error! reduced_serviceline_hcpcscode column was not created as expected."

### Reducing all categories with value_counts() less than 1% of the total under "ndc codes" into "other_ndccodes"

In [None]:
# Shape of claims_data before the reduced NDC-code column is added
rows_count1, cols_count1 = claims_data.shape

# Drug identification (NDC) code column; the result goes into "reduced_serviceline_ndccode"
# with "other_ndccodes" as the catch-all label.
# NOTE(review): the literal 1 is presumably the 1% frequency threshold from the heading -- confirm in reduce_unique_values
column_name = "Loop2410_LIN-Segment-DrugIdentification_LIN03-Product/ServiceID-234"
claims_data = icp.reduce_unique_values(claims_data, column_name, "reduced_serviceline_ndccode", 1, "other_ndccodes")

# Shape of claims_data after the reduced NDC-code column is added
rows_count2, cols_count2 = claims_data.shape

# The number of observations must stay the same
assert (rows_count1 == rows_count2), "Error! The number of observations are not constant as expected."

# Exactly one new column must have appeared; the message names the column this cell creates
# (it previously referred to count_of_lineitems, a copy-paste leftover)
assert (cols_count1 + 1 == cols_count2), "Error! reduced_serviceline_ndccode column was not created as expected."

### Converting "revenue codes" in "reduced_serviceline_revcode" column into their corresponding "categories"

In [None]:
# Attach the revenue-code category to every line item (left join keeps unmatched line items)
revcode_categories = revenue_codes[["Revenue Codes", "Category"]]
claims_data = claims_data.merge(
    revcode_categories,
    how="left",
    left_on="reduced_serviceline_revcode",
    right_on="Revenue Codes",
)

# Codes collapsed into "other_revcodes" have no lookup entry, so label their category explicitly
mask = claims_data["reduced_serviceline_revcode"].eq("other_revcodes")
claims_data.loc[mask, "Category"] = "other_revcodes"

In [None]:
# Display the shape after the revenue-code merge
claims_data.shape

In [None]:
# Roll the line-level "Category" values up to claim level
column_name = "Category"
claims_data = icp.serviceline_to_claimlevel(claims_data, ref_cols, column_name)

# Remove the "nan_value" column when the roll-up produced one
# (presumably it stands for line items without a category -- confirm in serviceline_to_claimlevel)
claims_data = claims_data.drop(columns=["nan_value"], errors="ignore")

In [None]:
# Display the shape after the revenue-code roll-up
claims_data.shape

### Converting "hcpcs codes" in "reduced_serviceline_hcpcscode" column into their corresponding "categories"

In [None]:
# Attach the HCPCS lookup columns to every line item (left join keeps unmatched line items)
claims_data = claims_data.merge(
    hcpcs_codes,
    how="left",
    left_on="reduced_serviceline_hcpcscode",
    right_on="CPT Codes",
)

# Codes collapsed into "other_hcpcscodes" have no lookup entry, so label their section explicitly
mask = claims_data["reduced_serviceline_hcpcscode"].eq("other_hcpcscodes")
claims_data.loc[mask, "Section"] = "other_hcpcscodes"

In [None]:
# Roll the line-level "Section" values up to claim level
column_name = "Section"
claims_data = icp.serviceline_to_claimlevel(claims_data, ref_cols, column_name)

# Remove the "nan_value" column when the roll-up produced one
# (presumably it stands for line items without a section -- confirm in serviceline_to_claimlevel)
claims_data = claims_data.drop(columns=["nan_value"], errors="ignore")

In [None]:
# Display the shape after the HCPCS roll-up
claims_data.shape

### Converting "ndc codes" in "reduced_serviceline_ndccode" column into their corresponding "concept names"

In [None]:
# Attach the NDC lookup columns to every line item (left join keeps unmatched line items)
claims_data = claims_data.merge(
    ndc_codes,
    how="left",
    left_on="reduced_serviceline_ndccode",
    right_on="NDC_Code",
)

# Codes collapsed into "other_ndccodes" have no lookup entry, so label their concept name explicitly
mask = claims_data["reduced_serviceline_ndccode"].eq("other_ndccodes")
claims_data.loc[mask, "Concept_Name"] = "other_ndccodes"

In [None]:
# Roll the line-level "Concept_Name" values up to claim level
column_name = "Concept_Name"
claims_data = icp.serviceline_to_claimlevel(claims_data, ref_cols, column_name)

# Remove the "nan_value" column when the roll-up produced one
# (presumably it stands for line items without a concept name -- confirm in serviceline_to_claimlevel)
claims_data = claims_data.drop(columns=["nan_value"], errors="ignore")

In [None]:
# Display the shape after the NDC roll-up
claims_data.shape

### Retrieving Claim Level Data alone

In [None]:
# Claim-level columns produced by the roll-ups above: every column whose name contains
# one of the three prefixes, gathered prefix by prefix in column order
additional_cols = [
    col
    for prefix in ("RevCode_", "HCPCSCode_", "NDCCode_")
    for col in claims_data.columns
    if prefix in col
]

# The catch-all columns carry no prefix, so they are listed explicitly.
# NOTE(review): these three are assumed to exist in claims_data -- confirm for extracts without rare codes
additional_cols.extend(["other_revcodes", "other_hcpcscodes", "other_ndccodes"])

# Register them in the categorical columns list
icp.append_to_categorical_cols(additional_cols)

In [None]:
# Retrieving claim level data alone, restricted to the key, extra and rolled-up columns.
# NOTE(review): the earlier comment said duplicates are removed "retaining the first occurrence",
# but keep=False below drops EVERY row whose ref_cols key occurs more than once, so such
# claims are removed entirely -- confirm this is intended.
inst_claims_data = icp.get_claim_level_data(claims_data, ref_cols+cols_for_sa+additional_cols)
inst_claims_data = inst_claims_data.drop_duplicates(ref_cols, keep=False)

In [None]:
# Display claim-level shape next to the line-level shape
inst_claims_data.shape, claims_data.shape

### Renaming columns

In [None]:
# Renaming claim level columns using the mapping loaded from rename_cols.json
inst_claims_data = inst_claims_data.rename(columns=rename_cols)

#### Correcting payerName and claimFilingIndCode combinations
#### JIRA: https://jira2.cerner.com/browse/INTELLIGEN-2314

In [None]:
# Correcting the payer_name and claim_filing_indicator_code combinations
inst_claims_data = icp.correct_payer_claimfiling_combinations(inst_claims_data)

In [None]:
# Validate that no payer is left with more than one claim_filing_indicator_code value
columns = ["payer_name", "claim_filing_ind_code2"]

# Number of claims per (payer, filing code) pair, then spread into a payer x filing-code table
pair_counts = inst_claims_data[columns].groupby(columns).size().reset_index(name="count")
temp_df = pair_counts.pivot_table(index="payer_name", columns="claim_filing_ind_code2")

# Keep only the second element (the filing code) of each two-part column label
temp_df.columns = [code for _, code in temp_df.columns]

# Evaluate each payer row with the helper; any truthy result marks an uncorrected payer
correction_mask = temp_df.apply(icp.has_multiple_claimfilingindicators, axis=1)

assert (sum(correction_mask) == 0), "Error! payer_name and claim_filing_ind_code combinations are not corrected as expected."

In [None]:
# Map every ICD-10-CM code segment through the CCSR table
icd10cm_segments = [
    "PrincipalDiagnosis",
    "AdmittingDiagnosis",
    "Patient’SReasonForVisit",
    "ExternalCauseOfInjury_HI",
    "OtherDiagnosisInformation_HI",
]
for segment in icd10cm_segments:
    inst_claims_data = icp.code_mapping(inst_claims_data, dx_ccsr, segment, "ICD_10_CM_CODE", "ccsr_category_1_code_")

# Map every ICD-10-PCS procedure segment through the CCS table
for segment in ("PrincipalProcedureInformation", "OtherProcedureInformation_HI"):
    inst_claims_data = icp.code_mapping(inst_claims_data, dx_ccs, segment, "ICD_10_PCS_CODE", "ccs_description_")

In [None]:
# Convert the code values of the segments below into their descriptions.
# Each entry: (lookup table, segment, code column in the lookup, output prefix)
segment_mappings = [
    (drg_codes, "DiagnosisRelatedGroup", "DRG_Code", "Drg_Description_"),
    (cond_codes, "ConditionInformation", "code_value", "description_"),
    (occur_span_codes, "OccurrenceSpanInformation", "Occurrence_span_code", "Description_"),
    (occur_codes, "OccurrenceInformation", "Occurrence_code", "Description_"),
    (value_codes, "ValueInformation", "Value_codes", "Description_"),
]
for lookup_table, segment, code_column, prefix in segment_mappings:
    inst_claims_data = icp.code_mapping(inst_claims_data, lookup_table, segment, code_column, prefix)

In [None]:
# Register every column created by the code mappings (CCSR, CCS, DRG, descriptions) as categorical.
# DataFrame.filter(regex=...) keeps each column whose name matches the pattern anywhere.
str_pattern = r"ccsr_category_1_code_|ccs_description_|Drg_Description_|description_|Description_"

count_columns = inst_claims_data.filter(regex=str_pattern, axis=1).columns.tolist()
icp.append_to_categorical_cols(count_columns)

In [None]:
# Creating segment-wise counts of CCSR categories, with repetition and without repetition
inst_claims_data = icp.count_dx_codes(inst_claims_data)

In [None]:
# Register every column whose name contains "count_of" (this also matches count_of_unique_*) as numeric
count_columns = inst_claims_data.filter(regex="count_of", axis=1).columns.tolist()
icp.append_to_numeric_cols(count_columns)

In [None]:
# Code-value -> description dictionaries, all read from the artifacts folder
def _read_code_map(file_name):
    """Read one JSON artifact from json_path via icp.read_json."""
    return icp.read_json(path=json_path, file_name=file_name)


entity_type_codes = _read_code_map("entity_type.json")
gender_codes = _read_code_map("gender_codes.json")
newborn_codes = _read_code_map("newborn_codes.json")
payer_resp_codes = _read_code_map("payer_resp_codes.json")
report_type_codes = _read_code_map("report_type_codes.json")
inpat_outpat_codes = _read_code_map("inpat_outpat_codes.json")
delay_reason_codes = _read_code_map("delay_reason_codes.json")
claim_filing_codes = _read_code_map("claim_filing_codes.json")
facility_code_values = _read_code_map("facility_code_values.json")
patient_status_codes = _read_code_map("patient_status_codes.json")
admission_type_codes = _read_code_map("admission_type_codes.json")
admission_source_codes = _read_code_map("admission_source_codes.json")
transaction_type_codes = _read_code_map("transaction_type_codes.json")
individual_relatn_codes = _read_code_map("individual_relatn_codes.json")
report_transmission_codes = _read_code_map("report_transmission_codes.json")
release_of_information_codes = _read_code_map("release_of_information_codes.json")
transaction_set_purpose_codes = _read_code_map("transaction_set_purpose_codes.json")
yes_no_condition_response_codes = _read_code_map("yes_or_no_condition_response_codes.json")
epsdt_referral_codes = _read_code_map("epsdt_referral_code_qualifiers.json")
provider_accept_assignment_codes = _read_code_map("provider_accept_assignment_codes.json")

In [None]:
# Replace code values by their descriptions, one column at a time: (column, dictionary)
code_conversions = [
    ("subscriber_gender", gender_codes),
    ("subscriber_entity_type", entity_type_codes),
    ("patient_gender", gender_codes),
    ("payer_resp_seq_num", payer_resp_codes),
    ("delay_reason_code", delay_reason_codes),
    ("ind_reltn_code", individual_relatn_codes),
    ("transaction_type_code", transaction_type_codes),
    ("ind_reltn_code2", individual_relatn_codes),
    ("claim_filing_ind_code", claim_filing_codes),
    ("claim_filing_ind_code2", claim_filing_codes),
    ("facility_code_value", facility_code_values),
    ("patient_status_code", patient_status_codes),
    ("transaction_set_purpose_code", transaction_set_purpose_codes),
    ("release_of_information_code", release_of_information_codes),
]
for code_column, code_map in code_conversions:
    inst_claims_data = icp.convert_codes(inst_claims_data, code_column, code_map)

# Newborn / admission type / admission source codes are handled together by a dedicated method
inst_claims_data = icp.newborn_codes(inst_claims_data, newborn_codes, admission_type_codes, admission_source_codes)

inst_claims_data = icp.convert_codes(inst_claims_data, "provider_accept_assignment_code", provider_accept_assignment_codes)

In [None]:
# Register payer_name and the converted code columns as categorical.
# The claim_filing_ind_code columns are not part of this list.
columns_list = [
    "payer_name",
    "subscriber_gender", "subscriber_entity_type", "patient_gender",
    "payer_resp_seq_num", "delay_reason_code",
    "ind_reltn_code", "transaction_type_code", "ind_reltn_code2",
    "facility_code_value", "patient_status_code",
    "transaction_set_purpose_code", "release_of_information_code",
    "provider_accept_assignment_code",
]

icp.append_to_categorical_cols(columns_list)

In [None]:
# Apply fill_missing_values to each facility column together with its billing-provider counterpart
# (presumably missing facility values are taken from the billing provider -- confirm in fill_missing_values)
for facility_col, provider_col in [
    ("facility_name", "billing_provider_name"),
    ("facility_city", "billing_provider_city"),
    ("facility_state", "billing_provider_state"),
]:
    inst_claims_data = icp.fill_missing_values(inst_claims_data, facility_col, provider_col)

In [None]:
# Register the facility and billing-provider columns used by fill_missing_values as categorical
columns_list = [
    "facility_name", "facility_city", "facility_state",
    "billing_provider_name", "billing_provider_city", "billing_provider_state",
]

icp.append_to_categorical_cols(columns_list)

In [None]:
# Known misspellings in the facility_name column, mapped to their corrected spelling
correction_dictionary = {
    "WRAY COMMUNITY DSITRICT HOSPITAL": "WRAY COMMUNITY DISTRICT HOSPITAL",
    "WRAYCLINIC": "WRAY CLINIC",
}

inst_claims_data = icp.data_corrections(inst_claims_data, "facility_name", correction_dictionary)

In [None]:
# Compare subscriber / facility / billing-provider location with the payer's location,
# writing each comparison into a new "are_..._in_same_..." column
location_comparisons = [
    ("subscriber_state", "payer_state", "are_subscriber_payer_in_same_state"),
    ("subscriber_city", "payer_city", "are_subscriber_payer_in_same_city"),
    ("facility_state", "payer_state", "are_facility_payer_in_same_state"),
    ("facility_city", "payer_city", "are_facility_payer_in_same_city"),
    ("billing_provider_state", "payer_state", "are_billing_provider_payer_in_same_state"),
    ("billing_provider_city", "payer_city", "are_billing_provider_payer_in_same_city"),
]
for left_col, right_col, flag_col in location_comparisons:
    inst_claims_data = icp.compare_columns(inst_claims_data, left_col, right_col, flag_col)

In [None]:
# Register the compared location columns and the resulting comparison columns as categorical
columns_list = [
    "subscriber_state", "subscriber_city",
    "payer_state", "payer_city",
    "are_subscriber_payer_in_same_state", "are_subscriber_payer_in_same_city",
    "are_facility_payer_in_same_state", "are_facility_payer_in_same_city",
    "are_billing_provider_payer_in_same_state", "are_billing_provider_payer_in_same_city",
]

icp.append_to_categorical_cols(columns_list)

In [None]:
# Merge each subscriber column with its patient counterpart into one combined column
subscriber_patient_merges = [
    ("subscriber_gender", "patient_gender", "subscriber_pat_gender"),
    ("subscriber_city", "patient_city", "subscriber_pat_city"),
    ("subscriber_state", "patient_state", "subscriber_pat_state"),
    ("ind_reltn_code", "ind_reltn_code2", "subscriber_pat_reltn"),
]
for first_col, second_col, merged_col in subscriber_patient_merges:
    inst_claims_data = icp.merge_two_cols(inst_claims_data, first_col, second_col, merged_col)

# Patient age is derived from the two dates of birth and the claim creation date
inst_claims_data = icp.compute_patient_age(inst_claims_data, "subscriber_dob", "patient_dob", "claim_creation_date")

In [None]:
# Register the gender and relationship columns (inputs and merged results) as categorical
columns_list = [
    "subscriber_gender", "patient_gender", "subscriber_pat_gender",
    "ind_reltn_code", "ind_reltn_code2", "subscriber_pat_reltn",
]
icp.append_to_categorical_cols(columns_list)

# Register patient_age as numeric
icp.append_to_numeric_cols(["patient_age"])

In [None]:
# Source column -> name of the flag column derived from it (dict order is the call order).
# NOTE(review): "claim_has_priorior_authorization" looks like a typo, but the same name is
# registered as categorical further down, so it is kept unchanged here.
flag_columns = {
    "prior_authorization": "claim_has_priorior_authorization",
    "subscriber_group_name": "claim_has_subscriber_group_name",
    "demo_project_id": "claim_has_demo_project_id",
    "discharge_hour": "claim_has_discharge_hour",
    "admission_date_hour": "claim_has_admission_dthr",
    "statement_dates": "claim_has_statement_dates",
    "referral_number": "claim_has_referral_number",
    "service_authorization_exception_code": "claim_has_service_auth_exception",
    "auto_accident_state": "claim_has_auto_accident_state",
    "attending_provider_id": "claim_has_attending_provider_id",
    "medical_record_number": "claim_has_medical_record_number",
    "rendering_provider_id": "claim_has_rendering_provider_id",
    "referring_provider_id": "claim_has_referring_provider_id",
    "repricer_received_date": "claim_has_repricer_received_date",
    "operating_physician_id": "claim_has_operating_physician_id",
    "subscriber_group_or_policy_number": "claim_has_subscriber_group_or_policy_number",
    "peer_review_org_approval": "claim_has_peer_review_org_approval",
    "payer_claim_control_number": "claim_has_payer_claim_control_number",
    "other_operating_physician_id": "claim_has_other_operating_physician_id",
}
for source_col, flag_col in flag_columns.items():
    inst_claims_data = icp.create_flag_column(inst_claims_data, source_col, flag_col)

In [None]:
# Register the flag columns (and some of their source columns) as categorical.
# Where a source column is registered it precedes its flag; entries on a line of their
# own are flags whose source column is not registered here.
columns_list = [
    "prior_authorization", "claim_has_priorior_authorization",
    "subscriber_group_name", "claim_has_subscriber_group_name",
    "claim_has_demo_project_id",
    "claim_has_discharge_hour",
    "claim_has_admission_dthr",
    "claim_has_statement_dates",
    "claim_has_referral_number",
    "service_authorization_exception_code", "claim_has_service_auth_exception",
    "claim_has_auto_accident_state",
    "attending_provider_id", "claim_has_attending_provider_id",
    "claim_has_medical_record_number",
    "rendering_provider_id", "claim_has_rendering_provider_id",
    "referring_provider_id", "claim_has_referring_provider_id",
    "claim_has_repricer_received_date",
    "operating_physician_id", "claim_has_operating_physician_id",
    "subscriber_group_or_policy_number", "claim_has_subscriber_group_or_policy_number",
    "peer_review_org_approval", "claim_has_peer_review_org_approval",
    "payer_claim_control_number", "claim_has_payer_claim_control_number",
    "other_operating_physician_id", "claim_has_other_operating_physician_id",
    "subscriber_pat_reltn",
]

icp.append_to_categorical_cols(columns_list)

In [None]:
# Derive time variables from the "statement_dates" column
# (presumably this is what creates "LOS" and "days_taken_for_claim_filing" -- confirm in get_time_variables)
inst_claims_data = icp.get_time_variables(inst_claims_data, "statement_dates")

# Appending time_variables to numeric_columns list
icp.append_to_numeric_cols(["LOS", "days_taken_for_claim_filing"])

In [None]:
# Add date-derived features (which columns are created is defined in get_date_features)
inst_claims_data = icp.get_date_features(inst_claims_data)

In [None]:
# Response time = whole days between claim creation and the response
response_ts = pd.to_datetime(inst_claims_data["response_date"])
creation_ts = pd.to_datetime(inst_claims_data["claim_creation_date"])
inst_claims_data["response_time"] = (response_ts - creation_ts).dt.days

In [None]:
# Years spanned by the data and the holidays that fall within them
years = icp.get_min_max_years(inst_claims_data)
holidays_list = icp.get_holidays_list(years)

# Holiday count per claim, computed from its creation date
creation_dates = inst_claims_data["claim_creation_date"]
inst_claims_data["count_of_holidays"] = creation_dates.apply(
    lambda created_on: icp.create_holiday_count_variable(years, holidays_list, created_on)
)

# Register count_of_holidays as numeric
icp.append_to_numeric_cols(["count_of_holidays"])

In [None]:
# Filing-indicator labels folded into another label; every other value is kept as is
reassign_dict = {"Other Federal Program": "Commercial Insurance Co.", "Medicare Part B": "Medicare Part A"}

# claim_filing_ind_code3 is claim_filing_ind_code2 with the exact matches above replaced
inst_claims_data["claim_filing_ind_code3"] = inst_claims_data["claim_filing_ind_code2"].replace(reassign_dict)

In [None]:
# Remaining claim columns that are used as-is: register them in the matching list
categorical_columns = [
    "billing_provider_taxonomy_code",
    "claim_freq_type_code",
    "yes_no_condition_resp_code",
    "admission_type_code",
    "admission_source_code",
]
numeric_columns = ["total_claim_charge_amount", "patient_estimated_amount"]

icp.append_to_categorical_cols(categorical_columns)
icp.append_to_numeric_cols(numeric_columns)

In [None]:
# Display the distribution of the reassigned filing-indicator labels
inst_claims_data["claim_filing_ind_code3"].value_counts()

In [None]:
# Imported again here so that `date` is the datetime class even if an earlier cell rebound the name
from datetime import date

# Write the claim-level dataset to a CSV stamped with today's date (YYYY-MM-DD)
output_dir = "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/01_data_preprocessing/preprocessed_datasets/"
output_file = f"{output_dir}ic_preprocessed_dataset_{date.today()}.csv"
inst_claims_data.to_csv(output_file, index=False)

In [None]:
# Persist the numeric / categorical column lists for the feature-selection stage
json_dir = "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/institutional_claims/json_files/"
num_file_path = json_dir + "ic_num_cols.json"
cat_file_path = json_dir + "ic_cat_cols.json"

icp.save_cols_list(num_file_path, cat_file_path)