# Feature Selection for Claim Approval

We are predicting whether a claim will be approved or not; a claim that is not approved subsequently requires an ePA.

In [178]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import matplotlib.pyplot as plt

# Configure seaborn in a single call. `sns.set` is an alias of `sns.set_theme`,
# so calling it afterwards with only `rc` would reset the style to the default
# "darkgrid" and silently discard the "whitegrid" choice.
sns.set_theme(style="whitegrid", rc={'figure.figsize': (8.0, 6.0)})

In [179]:
# Load the training set, using the CSV column at position 1 as the index and
# asking pandas to parse that index as dates.
data = pd.read_csv(
    '../../data/training/train.csv',
    index_col=1,
    parse_dates=True,
)

If we are predicting whether or not there is an ePA, we need to drop the ePA information.

In [180]:
# ePA-related columns that are removed before modelling.
pa_columns = [
    'correct_diagnosis',
    'tried_and_failed',
    'contraindication',
    'pa_approved',
    'reject_code',
]

We can also drop the IDs: they are useful as indices but not as features for classification.

In [181]:
# Identifier columns (including the unnamed leftover CSV column) to remove.
id_columns = [
    'dim_pa_id',
    'dim_date_id',
    'dim_claim_id',
    'Unnamed: 0',
]

We can also drop the year, since this is not cyclical.

In [182]:
# Date-derived columns to remove (currently only the year).
date_columns = [
    'calendar_year',
]

In [183]:
# Remove the ePA, identifier and year columns in one pass.
columns_to_drop = pa_columns + id_columns + date_columns
data = data.drop(columns=columns_to_drop)

In [184]:
# Preview the first five rows of the remaining columns.
data.head(n=5)

Unnamed: 0_level_0,calendar_month,calendar_day,day_of_week,is_weekday,is_workday,is_holiday,bin,drug,pharmacy_claim_approved
date_val,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2019-06-18,6,18,3,1,1,0,999001,C,1
2019-02-07,2,7,5,1,1,0,999001,B,1
2017-01-19,1,19,5,1,1,0,417614,B,0
2019-04-01,4,1,2,1,1,0,417614,B,0
2017-11-07,11,7,3,1,1,0,417380,A,0


In [185]:
def encode(data, feature, prefix=''):
    """One-hot encode a single column and return a new DataFrame.

    Each distinct value ``v`` of ``data[feature]`` becomes an indicator column
    named ``prefix + '_' + str(v)``, appended after the existing columns in the
    order produced by ``pd.get_dummies``. The original ``feature`` column is
    removed from the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``feature``. It is not modified.
    feature : str
        Name of the column to encode.
    prefix : str, optional
        Text placed before the underscore in each new column name. With the
        default ``''`` the new names start with an underscore (e.g. ``'_A'``).

    Returns
    -------
    pandas.DataFrame
        A new frame without ``feature`` and with one indicator column per
        distinct value. Rows whose value is missing get no indicator set.
    """
    # Build the indicator matrix once instead of once per category.
    dummies = pd.get_dummies(data[feature])

    # `drop` returns a new frame, so the caller's DataFrame is left untouched.
    encoded = data.drop(columns=[feature])
    for level in dummies.columns:
        encoded[prefix + '_' + str(level)] = dummies[level]
    return encoded

In [186]:
# One-hot encode the categorical columns: `drug` keeps its own name as the
# prefix, while `bin` is encoded with the `payer` prefix.
for column, label in (('drug', 'drug'), ('bin', 'payer')):
    data = encode(data, column, label)

In [187]:
# Preview the first five rows after one-hot encoding.
data.head(n=5)

Unnamed: 0_level_0,calendar_month,calendar_day,day_of_week,is_weekday,is_workday,is_holiday,pharmacy_claim_approved,drug_A,drug_B,drug_C,payer_417380,payer_417614,payer_417740,payer_999001
date_val,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-06-18,6,18,3,1,1,0,1,0,0,1,0,0,0,1
2019-02-07,2,7,5,1,1,0,1,0,1,0,0,0,0,1
2017-01-19,1,19,5,1,1,0,0,0,1,0,0,1,0,0
2019-04-01,4,1,2,1,1,0,0,0,1,0,0,1,0,0
2017-11-07,11,7,3,1,1,0,0,1,0,0,1,0,0,0


## RandomForest Feature Importance

In [189]:
# Separate the label column from the feature matrix.
target = 'pharmacy_claim_approved'
y = data[target]
X = data.drop(columns=[target])

In [190]:
from sklearn.ensemble import RandomForestClassifier

Quick optimization, so that we have a decent model from which to calculate the feature importances.

In [191]:
from sklearn.ensemble import RandomForestClassifier

### Feature Importance Results

In [195]:
# Random forests are stochastic (bootstrap samples, random feature subsets).
# Fix the seed so the importance scores are reproducible when the notebook is
# re-run from a fresh kernel.
forest = RandomForestClassifier(random_state=42)

forest.fit(X, y)

RandomForestClassifier()

In [197]:
# Pair every feature name with the importance the random forest assigned to it,
# then show the features from most to least important.
score_df = pd.DataFrame(
    {
        'feature': X.columns,
        'importance_score': forest.feature_importances_,
    }
)

score_df.sort_values(by='importance_score', ascending=False)

Unnamed: 0,feature,importance_score
12,payer_999001,0.313673
10,payer_417614,0.170023
11,payer_417740,0.11178
8,drug_C,0.111718
6,drug_A,0.110845
9,payer_417380,0.103069
7,drug_B,0.071891
1,calendar_day,0.004347
0,calendar_month,0.001829
2,day_of_week,0.000745


### Cross-Checking with Extra Trees

In [198]:
from sklearn.ensemble import ExtraTreesClassifier

In [201]:
# Extra-trees picks split thresholds at random. Fix the seed so the importance
# scores are reproducible when the notebook is re-run from a fresh kernel.
extra = ExtraTreesClassifier(random_state=42)

extra.fit(X, y)

ExtraTreesClassifier()

In [203]:
# Pair every feature name with the importance the extra-trees model assigned to
# it, then show the features from most to least important.
score_df = pd.DataFrame(
    {
        'feature': X.columns,
        'importance_score': extra.feature_importances_,
    }
)

score_df.sort_values(by='importance_score', ascending=False)

Unnamed: 0,feature,importance_score
12,payer_999001,0.31419
10,payer_417614,0.206536
9,payer_417380,0.112818
6,drug_A,0.11141
8,drug_C,0.090554
11,payer_417740,0.081452
7,drug_B,0.079499
1,calendar_day,0.002142
0,calendar_month,0.001
2,day_of_week,0.000358
