### Claims rejection prediction ###

This notebook contains methods for:
- Data processing and cleaning
- Feature selection using the following methods:
    - Correlation
    - Linear SVC
    - Logistic Regression
    - Random Forest
    - Extra Tree 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### Data Processing and cleaning

In [2]:
claims = pd.read_csv('Data/dim_claims.csv')
dates = pd.read_csv('Data/dim_date.csv')
pa = pd.read_csv('Data/dim_pa.csv')
bridge = pd.read_csv('Data/bridge.csv')

In [3]:
data = claims.merge(bridge, on='dim_claim_id')
data = data.merge(dates, on='dim_date_id')

In [4]:
data.head()

Unnamed: 0,dim_claim_id,bin,drug,reject_code,pharmacy_claim_approved,dim_pa_id,dim_date_id,date_val,calendar_year,calendar_month,calendar_day,day_of_week,is_weekday,is_workday,is_holiday
0,1,417380,A,75.0,0,1.0,1,2017-01-01,2017,1,1,1,0,0,1
1,2,999001,A,,1,,1,2017-01-01,2017,1,1,1,0,0,1
2,3,417740,A,76.0,0,2.0,1,2017-01-01,2017,1,1,1,0,0,1
3,4,999001,A,,1,,1,2017-01-01,2017,1,1,1,0,0,1
4,5,417740,A,,1,,1,2017-01-01,2017,1,1,1,0,0,1


In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [6]:
df = data[['drug', 'bin', 'calendar_month', 
          'calendar_day', 'day_of_week', 'is_weekday', 'is_workday', 'is_holiday']]

X = pd.get_dummies(df, columns=['drug', 'bin'], drop_first=False)

X.head()

Unnamed: 0,calendar_month,calendar_day,day_of_week,is_weekday,is_workday,is_holiday,drug_A,drug_B,drug_C,bin_417380,bin_417614,bin_417740,bin_999001
0,1,1,1,0,0,1,1,0,0,1,0,0,0
1,1,1,1,0,0,1,1,0,0,0,0,0,1
2,1,1,1,0,0,1,1,0,0,0,0,1,0
3,1,1,1,0,0,1,1,0,0,0,0,0,1
4,1,1,1,0,0,1,1,0,0,0,0,1,0


In [7]:
y = data['pharmacy_claim_approved']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=.2,
                                                   random_state=5438,
                                                   shuffle=True,
                                                   stratify=y)

#### Feature selection methods:

- **Correlation**

In [9]:
y_Train = pd.DataFrame(y_train)

In [41]:
corr = pd.concat([y_Train, X_train], axis=1, keys=['y_Train', 'X_train']).corr().loc[ 'X_train', 'y_Train'].sort_values('pharmacy_claim_approved',ascending=False)
corr


Unnamed: 0,pharmacy_claim_approved
bin_999001,0.616084
drug_C,0.061885
calendar_day,0.001103
is_workday,-0.000252
is_holiday,-0.000306
is_weekday,-0.000595
calendar_month,-0.001346
day_of_week,-0.002046
drug_A,-0.017728
drug_B,-0.039766


- **Linear SVC**

In [12]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

In [19]:
lsvc = LinearSVC(C=1, penalty="l1", dual=False)
lsvc.fit(X_train, y_train)

LinearSVC(C=1, dual=False, penalty='l1')

In [42]:
score_df = pd.DataFrame({'feature':X_train.columns,
                            'importance_score': lsvc.coef_.ravel()}).sort_values('importance_score',ascending=False)
score_df

Unnamed: 0,feature,importance_score
12,bin_999001,0.571776
8,drug_C,0.284005
6,drug_A,0.157312
7,drug_B,0.104976
3,is_weekday,0.001351
1,calendar_day,-1.1e-05
0,calendar_month,-1.8e-05
4,is_workday,-0.000243
2,day_of_week,-0.000791
5,is_holiday,-0.015323


- **Logistic Regression**

In [36]:
from sklearn.linear_model import LogisticRegression

In [39]:
logr = LogisticRegression(C=1, penalty="l1", solver='liblinear')
logr.fit(X_train, y_train)

LogisticRegression(C=1, penalty='l1', solver='liblinear')

In [43]:
score_df = pd.DataFrame({'feature':X_train.columns,
                            'importance_score': logr.coef_.ravel()}).sort_values('importance_score',ascending=False)
score_df

Unnamed: 0,feature,importance_score
12,bin_999001,1.741458
8,drug_C,0.602064
6,drug_A,0.161483
3,is_weekday,0.023919
1,calendar_day,-4.5e-05
0,calendar_month,-0.000121
2,day_of_week,-0.00277
4,is_workday,-0.019927
7,drug_B,-0.022091
5,is_holiday,-0.071938


- **Random Forest**

In [32]:
from sklearn.ensemble import RandomForestClassifier

In [69]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

In [70]:
importance = score_df = pd.DataFrame({'feature':X.columns,
                                      'importance_score': rf.feature_importances_})

importance.sort_values('importance_score',ascending=False)

Unnamed: 0,feature,importance_score
12,bin_999001,0.342663
10,bin_417614,0.174922
8,drug_C,0.110398
9,bin_417380,0.104399
11,bin_417740,0.094759
6,drug_A,0.088725
7,drug_B,0.078209
1,calendar_day,0.003643
0,calendar_month,0.001559
2,day_of_week,0.000656


- **Extra Trees**

In [143]:
from sklearn.ensemble import ExtraTreesClassifier

In [147]:
extra = ExtraTreesClassifier()

extra.fit(X_train, y_train)

ExtraTreesClassifier()

In [148]:
score_df = pd.DataFrame({'feature':X_train.columns,
                            'importance_score': extra.feature_importances_})

score_df.sort_values('importance_score',ascending=False)

Unnamed: 0,feature,importance_score
12,bin_999001,0.336119
10,bin_417614,0.187855
9,bin_417380,0.112252
6,drug_A,0.100028
11,bin_417740,0.094141
8,drug_C,0.085417
7,drug_B,0.081224
1,calendar_day,0.001824
0,calendar_month,0.000813
2,day_of_week,0.000297
