### PAs rejection feature selection ###

This notebook contains methods for:

- Data processing and cleaning
- Feature selection using the following methods:
    - Correlation
    - Linear SVC
    - Logistic Regression
    - Random Forest
    - Extra Tree

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### Data processing and cleaning ####

In [2]:
## Import data
claims = pd.read_csv('../Data/dim_claims.csv')
dates = pd.read_csv('../Data/dim_date.csv')
pa = pd.read_csv('../Data/dim_pa.csv')
bridge = pd.read_csv('../Data/bridge.csv')

In [3]:
## Marge data
data = claims.merge(bridge, on='dim_claim_id')
data = data.merge(dates, on='dim_date_id')
data = data.merge(pa, on='dim_pa_id')

In [4]:
## Drop not relevant columns
data.drop(columns=['dim_claim_id', 'pharmacy_claim_approved', 'dim_pa_id', 'dim_date_id'], 
                  axis=1, inplace=True)

In [5]:
## One-hot encoding variables
data = pd.get_dummies(data, columns=['drug', 'bin', 'reject_code'], drop_first=False)

In [6]:
## Divide X and y data
X = data[['calendar_month', 'calendar_day', 'day_of_week', 'is_weekday', 'is_workday', 'is_holiday',
          'correct_diagnosis', 'tried_and_failed', 'contraindication', 'drug_A', 'drug_B', 'drug_C',
          'bin_417380', 'bin_417614', 'bin_417740', 'bin_999001', 'reject_code_70.0', 
          'reject_code_75.0', 'reject_code_76.0']]
y = data['pa_approved']

In [7]:
#Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=.2,
                                                   random_state=5438,
                                                   shuffle=True,
                                                   stratify=y)

**Feature selection methods:**
- **Correlation**

In [8]:
y_Train = pd.DataFrame(y_train)
corr = pd.concat([y_Train, X_train], axis=1, keys=['y_Train', 'X_train']).corr().loc[ 'X_train', 'y_Train'].sort_values('pa_approved',ascending=False)
corr

Unnamed: 0,pa_approved
reject_code_75.0,0.388569
reject_code_76.0,0.144627
bin_999001,0.140282
tried_and_failed,0.12549
bin_417380,0.065973
drug_A,0.064607
correct_diagnosis,0.03644
drug_B,0.03446
is_workday,0.001061
is_weekday,0.000936


- **Linear SVC**

In [9]:
#Import LinearSVC
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

In [10]:
lsvc = LinearSVC(C=1, penalty="l1", dual=False, max_iter=1000)
lsvc.fit(X_train, y_train)

LinearSVC(C=1, dual=False, penalty='l1')

In [11]:
score_df = pd.DataFrame({'feature':X_train.columns,
                            'importance_score': lsvc.coef_.ravel()}).sort_values('importance_score',ascending=False)
score_df

Unnamed: 0,feature,importance_score
17,reject_code_75.0,1.058738
18,reject_code_76.0,0.586124
7,tried_and_failed,0.338523
9,drug_A,0.254661
15,bin_999001,0.192618
6,correct_diagnosis,0.120853
12,bin_417380,0.115575
10,drug_B,0.039951
13,bin_417614,0.014926
3,is_weekday,0.010011


- **Logistic Regression**

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
logr = LogisticRegression(C=1, penalty="l1", solver='liblinear')
logr.fit(X_train, y_train)

LogisticRegression(C=1, penalty='l1', solver='liblinear')

In [14]:
score_df = pd.DataFrame({'feature':X_train.columns,
                            'importance_score': logr.coef_.ravel()}).sort_values('importance_score',ascending=False)
score_df

Unnamed: 0,feature,importance_score
17,reject_code_75.0,3.139265
18,reject_code_76.0,1.504848
9,drug_A,0.937501
7,tried_and_failed,0.919675
12,bin_417380,0.864774
15,bin_999001,0.720342
10,drug_B,0.43537
6,correct_diagnosis,0.326923
3,is_weekday,0.16466
2,day_of_week,0.004655


- **Random Forest**

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

In [17]:
importance = score_df = pd.DataFrame({'feature':X.columns,
                                      'importance_score': rf.feature_importances_})

importance.sort_values('importance_score',ascending=False)

Unnamed: 0,feature,importance_score
16,reject_code_70.0,0.22501
1,calendar_day,0.166112
17,reject_code_75.0,0.143274
8,contraindication,0.117493
0,calendar_month,0.074823
7,tried_and_failed,0.047053
2,day_of_week,0.032378
11,drug_C,0.028571
9,drug_A,0.027709
13,bin_417614,0.026835


- **Extra Trees**

In [18]:
from sklearn.ensemble import ExtraTreesClassifier

In [19]:
extra = ExtraTreesClassifier()

extra.fit(X_train, y_train)

ExtraTreesClassifier()

In [20]:
score_df = pd.DataFrame({'feature':X_train.columns,
                            'importance_score': extra.feature_importances_})

score_df.sort_values('importance_score',ascending=False)

Unnamed: 0,feature,importance_score
16,reject_code_70.0,0.265674
17,reject_code_75.0,0.14365
8,contraindication,0.130915
1,calendar_day,0.122777
0,calendar_month,0.055175
7,tried_and_failed,0.050148
9,drug_A,0.03029
11,drug_C,0.029846
13,bin_417614,0.027075
18,reject_code_76.0,0.026258
