# Quick Model for Classifying Rejection.
I am going to give XGBoost a quick go at classifying whether a drug has been rejected.

### Import Libraries

In [4]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

### Import data and split into train/test groups

In [5]:
# Read the raw transactions, drop exact duplicate rows, then hold out 20% of
# the remaining rows as a test set using a seeded shuffle.
# NOTE(review): the path is absolute and user-specific; consider making it configurable.
csv_path = '/home/ruggiec/Downloads/pharmacy_tx.csv'
cmm_data = pd.read_csv(csv_path)
cmm_data_no_dups = cmm_data.drop_duplicates()
cmm_train, cmm_test = train_test_split(
    cmm_data_no_dups,
    test_size=0.2,
    shuffle=True,
    random_state=614,
)

### Feature Engineering

In [6]:
def is_it_generic(text):
    """Return 1 when the first whitespace-separated word of ``text`` is 'generic', else 0."""
    first_word = text.split()[0]
    return int(first_word == 'generic')
    
def drug_name(text):
    """Return the second whitespace-separated word of ``text``."""
    words = text.split()
    return words[1]

def pharm_split(text):
    """Return, as an int, the segment of ``text`` between its first '#' and the next '#' (or the end)."""
    segments = text.split('#')
    return int(segments[1])

In [7]:
# Engineer model features on the training frame.
# Work on an explicit copy: cmm_train was derived from another frame by
# train_test_split, and adding columns to it directly can trigger pandas'
# SettingWithCopyWarning.
cmm_train = cmm_train.copy()

# Drug-derived columns: generic flag (1/0) and the drug's name token.
cmm_train['generic'] = cmm_train['drug'].apply(is_it_generic)
cmm_train['name'] = cmm_train['drug'].apply(drug_name)

# 1 where the 'group' value is missing, 0 otherwise.
cmm_train['private'] = cmm_train['group'].isna().astype(int)

# Per-drug count of non-null tx_date values, broadcast back to every row.
# A single column is selected so a Series (not a one-column DataFrame) is assigned.
cmm_train['popularity'] = cmm_train.groupby('name')['tx_date'].transform('count')

# Target as an integer, plus the per-drug sum of that target.
# NOTE(review): rejected_count is computed from the target; it is not in any
# of the feature lists in this notebook and would leak the label if added to one.
cmm_train['rejected'] = cmm_train['rejected'].astype(int)
cmm_train['rejected_count'] = cmm_train.groupby('name')['rejected'].transform('sum')

# Numeric pharmacy id parsed from the text after '#'.
cmm_train['pharm_num'] = cmm_train['pharmacy'].apply(pharm_split)

# Calendar features derived from the transaction date.
cmm_train['tx_date'] = pd.to_datetime(cmm_train['tx_date'])
cmm_train['weekday'] = cmm_train['tx_date'].dt.day_of_week
cmm_train['day_num'] = cmm_train['tx_date'].dt.day_of_year

In [8]:
# Widest feature set: calendar columns, pharmacy number, encoded claim fields
# and the private/generic flags.
full_features = [
    'weekday', 'day_num', 'pharm_num',
    'diagnosis_encoded', 'bin_encoded', 'pcn_encoded',
    'private', 'generic', 'name_encoded',
]

#### Encoding Categorical Variables with Label Encoder

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
# Integer-encode the drug names; the fitted encoder is kept so codes can be
# mapped back to names later.
le_name = LabelEncoder()
cmm_train['name_encoded'] = le_name.fit_transform(cmm_train['name'])

Names are transformed to integer codes above. Below, we can transform an integer code back to its name.

In [12]:
# Map the integer code 19 back to the drug name it stands for.
le_name.inverse_transform([19])

array(['foxivelule'], dtype=object)

In [13]:
# Integer-encode the diagnosis codes, keeping the fitted encoder for decoding.
le_diagnosis = LabelEncoder()
cmm_train['diagnosis_encoded'] = le_diagnosis.fit_transform(cmm_train['diagnosis'])

In [14]:
# Map the integer code 66 back to its diagnosis value.
le_diagnosis.inverse_transform([66])

array(['M31.63'], dtype=object)

In [15]:
# Integer-encode the 'bin' column, keeping the fitted encoder for decoding.
le_bin = LabelEncoder()
cmm_train['bin_encoded'] = le_bin.fit_transform(cmm_train['bin'])

In [16]:
# Map the integer code 2 back to its original 'bin' value.
le_bin.inverse_transform([2])

array([322463])

In [17]:
# Integer-encode the 'pcn' column, keeping the fitted encoder for decoding.
le_pcn = LabelEncoder()
cmm_train['pcn_encoded'] = le_pcn.fit_transform(cmm_train['pcn'])

In [18]:
# Map the integer code 4 back to its original 'pcn' value.
le_pcn.inverse_transform([4])

array(['3O71UTS'], dtype=object)

### What do we have?

In [19]:
# List every column on the training frame, including the engineered ones.
cmm_train.columns

Index(['tx_date', 'pharmacy', 'diagnosis', 'drug', 'bin', 'pcn', 'group',
       'rejected', 'patient_pay', 'generic', 'name', 'private', 'popularity',
       'rejected_count', 'pharm_num', 'weekday', 'day_num', 'name_encoded',
       'diagnosis_encoded', 'bin_encoded', 'pcn_encoded'],
      dtype='object')

In [20]:
# Preview the first rows of the training frame.
cmm_train.head()

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,generic,...,private,popularity,rejected_count,pharm_num,weekday,day_num,name_encoded,diagnosis_encoded,bin_encoded,pcn_encoded
3962160,2022-04-23,Pharmacy #29,M31.63,branded mamate,322463,3O71UTS,,0,25.75,0,...,1,23539,4480,29,5,113,38,66,2,4
1161541,2022-02-06,Pharmacy #7,Z20.23,branded vivafastat,725700,327CKV,IOEAN1DWVV3Y,0,180.52,0,...,0,43263,14072,7,6,37,75,127,8,2
10643305,2022-10-12,Pharmacy #4,U41.19,generic foxivelule,322463,3O71UTS,,0,12.22,1,...,1,367822,0,4,2,285,19,101,2,4
10629491,2022-10-11,Pharmacy #31,K87.68,generic ribosatharin,664344,YFVIA,AJK5MZ25T9IA,0,12.37,1,...,0,121908,8960,31,1,284,56,59,5,44
10928718,2022-10-18,Pharmacy #6,G99.93,generic simarol,96934,S76J7V6,,0,13.93,1,...,1,144501,7380,6,1,291,61,37,0,34


In [21]:
# Report how imbalanced the target is: rejected count, total count, and share.
n_rejected = cmm_train['rejected'].sum()
n_claims = len(cmm_train)
print('The number of rejected claims is', n_rejected)
print('The total number of claims is', n_claims)
print('The percentage of rejected claims is', n_rejected/n_claims*100, '%')

The number of rejected claims is 839691
The total number of claims is 10049476
The percentage of rejected claims is 8.355569981957268 %


The rejected category is imbalanced so some consideration should be given to the metrics used to score the classifier.

In [35]:
# Nested feature sets, each one extending the previous:
#   base  -> encoded claim fields plus the private/generic flags
#   pharm -> base plus the pharmacy number
#   day   -> pharm plus the weekday
#   full  -> pharm plus the weekday and the day of year
base_features = [
    'diagnosis_encoded', 'bin_encoded', 'pcn_encoded',
    'private', 'generic', 'name_encoded',
]
pharm_features = ['pharm_num'] + base_features
day_features = ['weekday'] + pharm_features
full_features = ['weekday', 'day_num'] + pharm_features

### Importing the HistGradientBoostingClassifier
This should be faster than XGBoost for large datasets.

Using cross validation with recall, precision, and accuracy for metrics

In [23]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, precision_score, accuracy_score

In [24]:
# 5-fold splitter with a fixed seed so fold membership is the same on every run.
kf = KFold(n_splits=5, shuffle=True, random_state=614)
# Seed the classifier too: HistGradientBoostingClassifier uses randomness for
# binning subsampling and for the early-stopping validation split, so without
# a seed the reported metrics are not reproducible.
clf = HistGradientBoostingClassifier(random_state=614)

In [25]:
# Cross-validate clf on the full (imbalanced) training frame with base_features.
# metrics holds one row per fold; columns are recall, precision, accuracy.
metrics = np.zeros((5, 3))
X_cv = cmm_train[base_features]
y_cv = cmm_train['rejected']
for fold, (tt_index, ho_index) in enumerate(kf.split(cmm_train)):
    clf.fit(X_cv.iloc[tt_index], y_cv.iloc[tt_index])
    predictions = clf.predict(X_cv.iloc[ho_index])
    y_holdout = y_cv.iloc[ho_index]
    metrics[fold] = (
        recall_score(y_holdout, predictions),
        precision_score(y_holdout, predictions),
        accuracy_score(y_holdout, predictions),
    )

In [26]:
# Average each metric column over the folds and report it.
avg_recall, avg_precision, avg_accuracy = metrics.mean(axis=0)
print('average recall is', avg_recall)
print('average precision is', avg_precision)
print('average accuracy is', avg_accuracy)

average recall is 5.237346940798134e-05
average precision is 0.16261081402257874
average accuracy is 0.916422906003724


The first go at classification of rejected claims failed to give any useful information

High accuracy with low precision and recall implies most things are being classified as not rejected

### Let's try Down Sampling

In [27]:
from sklearn.utils import resample

In [28]:
# Downsample the majority class: draw, without replacement, as many
# non-rejected claims as there are rejected claims.
# random_state is fixed so the same rows are drawn on every run; without it the
# downsampled frame, and every metric computed from it, changes between runs.
temp = resample(cmm_train[cmm_train['rejected']==0],
                replace=False,
                n_samples=len(cmm_train[cmm_train['rejected']==1]),
                random_state=614)

In [29]:
# Balanced frame: every rejected claim followed by the sampled non-rejected ones.
rejected_rows = cmm_train[cmm_train['rejected']==1]
downsample = pd.concat([rejected_rows, temp])

In [30]:
# Display the balanced frame (the rendered output shows the first and last rows only).
downsample

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,generic,...,private,popularity,rejected_count,pharm_num,weekday,day_num,name_encoded,diagnosis_encoded,bin_encoded,pcn_encoded
8270129,2022-08-14,Pharmacy #15,P07.55,branded semufolic,664344,YFVIA,AJK5MZ25T9IA,1,0.00,0,...,0,237813,41930,15,6,226,60,79,5,44
7009332,2022-07-14,Pharmacy #14,Y51.55,generic sorine,160389,RB7UU,RS5RB3YA,1,0.00,1,...,0,115708,13891,14,3,195,63,124,1,31
1316687,2022-02-10,Pharmacy #37,Z20.23,branded vivafastat,160389,RB7UU,RS5RB3YA,1,0.00,0,...,0,43263,14072,37,3,41,75,127,1,31
9652363,2022-09-17,Pharmacy #7,I38.43,branded colifunene,725700,,DYGBI610ZY,1,0.00,0,...,0,179278,14364,7,5,260,7,49,8,48
7415161,2022-07-23,Pharmacy #43,H36.57,branded semufolic,725700,9C5MOR3,S2QKZ0OFNWS6X,1,0.00,0,...,0,237813,41930,43,5,204,60,41,8,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10182774,2022-10-02,Pharmacy #21,H60.83,generic tovane,691847,N098KI,6SP1DG,0,15.55,1,...,0,243513,0,21,6,275,72,45,6,25
5314398,2022-05-29,Pharmacy #28,H36.57,branded antimab,322463,3O71UTS,,0,19.46,0,...,1,207603,619,28,6,149,0,41,2,4
1954597,2022-02-26,Pharmacy #6,K87.68,generic ribosatharin,725700,1UQC,,0,12.41,1,...,1,121908,8960,6,5,57,56,59,8,0
12196724,2022-11-17,Pharmacy #2,I68.27,branded prazinib,664344,YFVIA,AJK5MZ25T9IA,0,17.11,0,...,0,919151,0,2,3,321,52,52,5,44


In [31]:
# Row count of the balanced frame (rejected rows plus the sampled non-rejected rows).
len(downsample)

1679382

In [32]:
# Cross-validate clf on the downsampled frame with base_features.
# metrics_down holds one row per fold; columns are recall, precision, accuracy.
metrics_down = np.zeros((5, 3))
X_down = downsample[base_features]
y_down = downsample['rejected']
for fold, (tt_index, ho_index) in enumerate(kf.split(downsample)):
    clf.fit(X_down.iloc[tt_index], y_down.iloc[tt_index])
    predictions = clf.predict(X_down.iloc[ho_index])
    y_holdout = y_down.iloc[ho_index]
    metrics_down[fold] = (
        recall_score(y_holdout, predictions),
        precision_score(y_holdout, predictions),
        accuracy_score(y_holdout, predictions),
    )

In [33]:
# Average each metric column over the folds and report it.
avg_recall, avg_precision, avg_accuracy = metrics_down.mean(axis=0)
print('average recall is', avg_recall)
print('average precision is', avg_precision)
print('average accuracy is', avg_accuracy)

average recall is 0.9776516850340533
average precision is 0.7965152197724688
average accuracy is 0.8639451897418422


Added the pharmacy and day of week as features in this model

In [36]:
# Cross-validate clf on the downsampled frame with day_features.
# metrics_down holds one row per fold; columns are recall, precision, accuracy.
metrics_down = np.zeros((5, 3))
X_down = downsample[day_features]
y_down = downsample['rejected']
for fold, (tt_index, ho_index) in enumerate(kf.split(downsample)):
    clf.fit(X_down.iloc[tt_index], y_down.iloc[tt_index])
    predictions = clf.predict(X_down.iloc[ho_index])
    y_holdout = y_down.iloc[ho_index]
    metrics_down[fold] = (
        recall_score(y_holdout, predictions),
        precision_score(y_holdout, predictions),
        accuracy_score(y_holdout, predictions),
    )

In [37]:
# Average each metric column over the folds and report it.
avg_recall, avg_precision, avg_accuracy = metrics_down.mean(axis=0)
print('average recall is', avg_recall)
print('average precision is', avg_precision)
print('average accuracy is', avg_accuracy)

average recall is 0.9782950564757502
average precision is 0.7963182106144838
average accuracy is 0.8640345080449029


full feature set

In [38]:
# Cross-validate clf on the downsampled frame with full_features.
# metrics_down holds one row per fold; columns are recall, precision, accuracy.
metrics_down = np.zeros((5, 3))
X_down = downsample[full_features]
y_down = downsample['rejected']
for fold, (tt_index, ho_index) in enumerate(kf.split(downsample)):
    clf.fit(X_down.iloc[tt_index], y_down.iloc[tt_index])
    predictions = clf.predict(X_down.iloc[ho_index])
    y_holdout = y_down.iloc[ho_index]
    metrics_down[fold] = (
        recall_score(y_holdout, predictions),
        precision_score(y_holdout, predictions),
        accuracy_score(y_holdout, predictions),
    )

In [39]:
# Average each metric column over the folds and report it.
avg_recall, avg_precision, avg_accuracy = metrics_down.mean(axis=0)
print('average recall is', avg_recall)
print('average precision is', avg_precision)
print('average accuracy is', avg_accuracy)

average recall is 0.9768743879037652
average precision is 0.7968719689142606
average accuracy is 0.8639308984166607


This is somewhat promising. 

In [42]:
from sklearn.metrics import confusion_matrix

base features

In [46]:
# Fit on the balanced frame, then score on the whole training frame.
# NOTE(review): downsample is drawn from cmm_train, so the rows scored here
# include the rows the model was fitted on.
pred_down = clf.fit(downsample[base_features], downsample['rejected']).predict(cmm_train[base_features])

actual = cmm_train['rejected']
print('recall is', recall_score(actual, pred_down))
print('precision is', precision_score(actual, pred_down))
print('accuracy is', accuracy_score(actual, pred_down))

recall is 0.979187582098653
precision is 0.26172704868823726
accuracy is 0.7674745429512941


In [47]:
# Confusion matrix of actual vs. predicted labels, shown as a labelled table.
cf_mat = confusion_matrix(cmm_train['rejected'], pred_down)
row_labels = ['Actual 0', 'Actual 1']
col_labels = ['Predicted 0', 'Predicted 1']
pd.DataFrame(cf_mat, index=row_labels, columns=col_labels)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6890502,2319283
Actual 1,17476,822215


day features (`day_features`: the base features plus pharmacy and weekday)

In [40]:
# Fit on the balanced frame with day_features, then predict for every training row.
pred_down = clf.fit(downsample[day_features], downsample['rejected']).predict(cmm_train[day_features])

In [41]:
# Score the latest predictions against the training labels.
actual = cmm_train['rejected']
print('recall is', recall_score(actual, pred_down))
print('precision is', precision_score(actual, pred_down))
print('accuracy is', accuracy_score(actual, pred_down))

recall is 0.9748478904739958
precision is 0.26284328417396824
accuracy is 0.7694564373306628


In [45]:
# Confusion matrix of actual vs. predicted labels, shown as a labelled table.
cf_mat = confusion_matrix(cmm_train['rejected'], pred_down)
row_labels = ['Actual 0', 'Actual 1']
col_labels = ['Predicted 0', 'Predicted 1']
pd.DataFrame(cf_mat, index=row_labels, columns=col_labels)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6914063,2295722
Actual 1,21120,818571


Downsampling looks to be an efficient way to train the classifier to predict rejected claims but reduces the training data for non-rejected claims. Perhaps upsampling will fix that problem.

# Let's Compare to some other Algorithms

I will try AdaBoost and random forest.

In [40]:
from sklearn.ensemble import AdaBoostClassifier

In [41]:
# AdaBoost classifier with 100 estimators and a fixed seed.
ada_class = AdaBoostClassifier(n_estimators=100, random_state=0)

In [42]:
# Cross-validate ada_class on the downsampled frame with base_features.
# metrics_down holds one row per fold; columns are recall, precision, accuracy.
metrics_down = np.zeros((5, 3))
X_down = downsample[base_features]
y_down = downsample['rejected']
for fold, (tt_index, ho_index) in enumerate(kf.split(downsample)):
    ada_class.fit(X_down.iloc[tt_index], y_down.iloc[tt_index])
    predictions = ada_class.predict(X_down.iloc[ho_index])
    y_holdout = y_down.iloc[ho_index]
    metrics_down[fold] = (
        recall_score(y_holdout, predictions),
        precision_score(y_holdout, predictions),
        accuracy_score(y_holdout, predictions),
    )

In [45]:
# Average each metric column over the folds and report it.
avg_recall, avg_precision, avg_accuracy = metrics_down.mean(axis=0)
print('average recall is', avg_recall)
print('average precision is', avg_precision)
print('average accuracy is', avg_accuracy)

average recall is 0.8758974919919822
average precision is 0.7827331190677065
average accuracy is 0.8163854313882792


Train adaboost classifier on the downsampled data but predict on the train set

In [47]:
# Fit AdaBoost on the balanced frame, then predict for every training row.
pred_down = ada_class.fit(downsample[base_features], downsample['rejected']).predict(cmm_train[base_features])

In [48]:
# Score the latest predictions against the training labels.
actual = cmm_train['rejected']
print('recall is', recall_score(actual, pred_down))
print('precision is', precision_score(actual, pred_down))
print('accuracy is', accuracy_score(actual, pred_down))

recall is 0.8768499364647233
precision is 0.24978271150261866
accuracy is 0.7696577413588529


In [49]:
# Confusion matrix of the training labels against the latest predictions.
cf_mat = confusion_matrix(cmm_train['rejected'], pred_down)

In [50]:
# Show the confusion matrix as a table with labelled rows and columns.
pd.DataFrame(cf_mat, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6998374,2211411
Actual 1,103408,736283
