In [None]:
import itertools
import os
import pickle
import sys
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd
import sksurv
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import concordance_index_censored
from sksurv.metrics import integrated_brier_score
from sksurv.util import Surv


# Install a blanket "ignore" filter so that no warning is shown from this point
# on. NOTE(review): this also hides warnings that may matter (deprecations,
# dtype or convergence warnings) — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Splice two extra directories (the cipy checkout and the
# work_queue_prioritization_v2 project folder) into the module search path,
# at positions 1 and 2, so modules located there can be imported below.
for position, extra_dir in enumerate(
    (
        "/home/ec2-user/SageMaker/GitHub Repos/cipy/cipy/",
        "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2",
    ),
    start=1,
):
    sys.path.insert(position, extra_dir)

# Show the resulting search path as the cell output
sys.path

In [None]:
# Project-local helpers. NOTE(review): the wildcard hides where names come
# from. `clip_right_tail2` and `get_folds`, which are used further down, are
# not defined anywhere in this notebook and presumably come from this module —
# confirm and import them explicitly.
from wqp_methods import *

In [None]:
# All locations used by this notebook live under the same project folder.
_project_root = "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/"
_inst_claims_dir = _project_root + "02_feat_sel_and_imp/institutional_claims/"

# Directory holding the preprocessed institutional claims CSV data
csv_path = _project_root + "01_data_preprocessing/preprocessed_datasets/"

# Directory of the JSON files with the numerical and categorical column names
feature_file = _inst_claims_dir + "json_files/"

# Directory where the filtered categorical & numerical columns and their
# statistics for each claim_filing_indicator_code are saved
output_path = _inst_claims_dir + "02_step/"

In [None]:
# Read the preprocessed institutional claims CSV into `inst_claims` and add an
# `event_flag` column that is True for every row.
ic_csv_file = csv_path + "ic_preprocessed_dataset_2021-06-06.csv"
inst_claims = pd.read_csv(ic_csv_file).assign(event_flag=True)

In [None]:
# Location of the pickled output of the recursive-feature-elimination step
rfe_feat_path = "/home/ec2-user/SageMaker/Users/SP056963/work_queue_prioritization_v2/02_feat_sel_and_imp/institutional_claims/03_step/ic_rfe_output.pickle"

# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# this file must only ever come from a trusted source.
with open(rfe_feat_path, mode="rb") as rfe_file:
    rfe_feat_op = pickle.load(rfe_file)

In [None]:
# Final features as of 12th December, 2020.
# Please note that these may change once we finalize the hyper-parameters.

# Numerical feature columns used for the Medicaid model
medicaid_num_feat = [
    "days_sum_quantity",
    "units_median_quantity",
]

# Categorical feature columns used for the Medicaid model
medicaid_cat_feat = [
    "claim_creation_weekday",
    "claim_creation_dayofmonth",
]

In [None]:
# Keep only the claims of one payer, pass them through `clip_right_tail2`
# (outlier removal in the response time values), and build the CV folds.
payer_name = "Medicaid"
payer_rows = inst_claims["claim_filing_ind_code3"] == payer_name
df = clip_right_tail2(
    inst_claims.loc[payer_rows].reset_index(drop=True),
    0.98,  # presumably the quantile at which the right tail is cut — confirm in wqp_methods
)

# The response time is stored as float from here on
df["response_time"] = df["response_time"].astype("float")

# `folds` is later handed to GridSearchCV as its `cv` argument
folds = get_folds(df)

In [None]:
# Label-encode the categorical features. The defaultdict creates one
# LabelEncoder per column on first access and keeps it in `encoder_dict`, so
# the fitted encoders stay available after this cell (they are pickled at the
# end of the notebook).
# `defaultdict` and `LabelEncoder` come from the import cell at the top.
encoder_dict = defaultdict(LabelEncoder)
cat_df = df[medicaid_cat_feat].astype("str").apply(
    lambda column: encoder_dict[column.name].fit_transform(column)
)

# Numerical features are used unchanged
num_df = df[medicaid_num_feat]

# Model matrix: numerical columns first, then the encoded categorical columns
rsf_df = pd.concat([num_df, cat_df], axis=1)

# Structured survival target: the event indicator is True for every row
# (no censored observations) and `response_time` is the time value.
y = Surv.from_arrays(np.repeat(True, len(df)), df["response_time"].values)

In [None]:
# Hyper-parameter values explored by the grid search; every combination of the
# listed values is evaluated.
param_grid = dict(
    n_estimators=[20, 50, 100, 200],
    max_features=["sqrt", "log2"],
    max_depth=[5, 10, 15, 20],
    min_samples_split=[8, 10],
    min_samples_leaf=[4, 5],
)

In [None]:
def c_index_scorer(estimator, X, y):
    """Score a fitted survival estimator with its own ``score`` method.

    For ``RandomSurvivalForest`` that method returns the concordance index, so
    this is the same metric GridSearchCV would use by default. Wrapping it in
    a named scorer only changes how the columns of ``cv_results_`` are named.
    """
    return estimator.score(X, y)


# Create the GridSearch object.
# The scorer is registered under the name "c_index" so that `cv_results_`
# exposes `mean_train_c_index`, `mean_test_c_index` and `rank_test_c_index`,
# which are the column names the cells below read. With a scoring dict,
# `refit` must name the metric used to select the best model.
grid_search_cv = GridSearchCV(
    estimator=RandomSurvivalForest(oob_score=True),
    param_grid=param_grid,
    scoring={"c_index": c_index_scorer},
    refit="c_index",
    cv=folds,
    n_jobs=-1,  # use every available core
    verbose=10,
    return_train_score=True,  # needed for the train-vs-test comparison below
)

In [None]:
# Run the grid search: the forest is fitted for each parameter combination on
# each fold of the feature matrix `rsf_df` against the survival target `y`.
grid_search_cv.fit(X=rsf_df, y=y)

In [None]:
# Collect the cross-validation results of every parameter combination in a
# DataFrame and preview the first rows.
# `None` disables truncation of wide cells such as the `params` dicts; the old
# `-1` spelling is deprecated and rejected by recent pandas versions.
pd.set_option("display.max_colwidth", None)
results = pd.DataFrame(grid_search_cv.cv_results_)
results.head()

In [None]:
# Best Test C-Index Parameters:
# `best_index_` is the row of `cv_results_` that GridSearchCV ranked first.
# Selecting by it avoids depending on how the rank column is named, and it
# replaces the former `mask = results[...] = 0`, which was a chained
# assignment that overwrote the rank column with 0 instead of comparing it.
mask = results.index == grid_search_cv.best_index_
print("Best Parameters: ", results.loc[mask, "params"])

In [None]:
import matplotlib.pyplot as plt

# cv_results_ names its columns after the scorer: "*_c_index" when a scorer
# called "c_index" was passed to GridSearchCV, "*_score" with default scoring.
metric = "c_index" if "mean_test_c_index" in results.columns else "score"

# Mean train vs. mean test score for every parameter combination (one point
# per row of `results`), to spot combinations that overfit.
fig, ax = plt.subplots()
ax.plot(results.index, results["mean_train_" + metric], label="Train Mean C-Index")
ax.plot(results.index, results["mean_test_" + metric], label="Test Mean C-Index")
ax.set_xlabel("Parameter combination (row of results)")
ax.set_ylabel("Mean C-Index")

ax.legend()
plt.show()

In [None]:
# Parameters of the best-ranked combination, kept in `params`.
# `best_index_` is the row of `cv_results_` that GridSearchCV ranked first;
# the former `mask = results[...] = 0` was a chained assignment that set both
# `mask` and the whole rank column to 0 instead of comparing against the rank.
mask = results.index == grid_search_cv.best_index_
params = results.loc[mask, "params"]
print(params)

In [None]:
# Persist the artefacts of this run next to the notebook, one pickle each,
# named "<prefix><payer_name>.pickle".

# `rsf` was never defined in this notebook. NOTE(review): assumed to be the
# model refitted by the grid search with the best parameters — confirm.
rsf = grid_search_cv.best_estimator_

artefacts = {
    # NOTE(review): `cat_df` is already label-encoded here, so TD and ED hold
    # the same columns in a different order — confirm this is intended.
    "TD_005010X223A2_": pd.concat([cat_df, num_df], axis=1),
    "ED_005010X223A2_": rsf_df,        # model matrix used for fitting
    "LE_005010X223A2_": encoder_dict,  # fitted LabelEncoder per categorical column
    "005010X223A2_": rsf,              # fitted random survival forest
    "RE_005010X223A2_": results,       # cross-validation results table
}

for prefix, artefact in artefacts.items():
    # The context manager closes (and thereby flushes) each file right away;
    # the bare open(...) calls used before left the handles open.
    with open(prefix + payer_name + ".pickle", "wb") as output_file:
        pickle.dump(artefact, output_file)