# Quick Model for Classifying Rejection.
I am going give to XGBoost a quick go to classify if a drug has been rejected

### Import Libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

### Import data and split into train/test groups

In [2]:
cmm_data = pd.read_csv('/home/ruggiec/Downloads/pharmacy_tx.csv')
cmm_data_no_dups = cmm_data.drop_duplicates()
cmm_train, cmm_test = train_test_split(cmm_data_no_dups, shuffle=True, 
                             random_state= 614, test_size=0.2)

### Feature Engineering

In [3]:
def is_it_generic(text):
    if text.split()[0] == 'generic':
        return 1
    else:
        return 0
    
def drug_name(text):
    return text.split()[1]

In [4]:
cmm_train['generic'] = cmm_train.drug.apply(is_it_generic)
cmm_train['name'] = cmm_train['drug'].apply(drug_name)
cmm_train['private'] = cmm_train.group.isna().astype(int)
cmm_train['popularity'] = cmm_train.groupby('name')[['tx_date']].transform('count')
cmm_train['rejected'] = cmm_train['rejected'].astype(int)
cmm_train['rejected_count'] = cmm_train.groupby('name')['rejected'].transform('sum')

#### Encoding Categorical Variables with Label Encoder

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
le_name = LabelEncoder()
le_name.fit(cmm_train['name'])
cmm_train['name_encoded'] = le_name.transform(cmm_train['name'])

names are transformed to numerical data above. below we can transform numerical data back to a name.

In [7]:
le_name.inverse_transform([19])

array(['foxivelule'], dtype=object)

In [8]:
le_diagnosis = LabelEncoder()
le_diagnosis.fit(cmm_train['diagnosis'])
cmm_train['diagnosis_encoded'] = le_diagnosis.transform(cmm_train['diagnosis'])

In [9]:
le_diagnosis.inverse_transform([66])

array(['M31.63'], dtype=object)

In [10]:
le_bin = LabelEncoder()
le_bin.fit(cmm_train['bin'])
cmm_train['bin_encoded'] = le_bin.transform(cmm_train['bin'])

In [11]:
le_bin.inverse_transform([2])

array([322463])

In [12]:
le_pcn = LabelEncoder()
le_pcn.fit(cmm_train['pcn'])
cmm_train['pcn_encoded'] = le_pcn.transform(cmm_train['pcn'])

In [13]:
le_pcn.inverse_transform([4])

array(['3O71UTS'], dtype=object)

### What do we have?

In [14]:
cmm_train.columns

Index(['tx_date', 'pharmacy', 'diagnosis', 'drug', 'bin', 'pcn', 'group',
       'rejected', 'patient_pay', 'generic', 'name', 'private', 'popularity',
       'rejected_count', 'name_encoded', 'diagnosis_encoded', 'bin_encoded',
       'pcn_encoded'],
      dtype='object')

In [15]:
cmm_train.head()

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,generic,name,private,popularity,rejected_count,name_encoded,diagnosis_encoded,bin_encoded,pcn_encoded
3962160,2022-04-23,Pharmacy #29,M31.63,branded mamate,322463,3O71UTS,,0,25.75,0,mamate,1,23539,4480,38,66,2,4
1161541,2022-02-06,Pharmacy #7,Z20.23,branded vivafastat,725700,327CKV,IOEAN1DWVV3Y,0,180.52,0,vivafastat,0,43263,14072,75,127,8,2
10643305,2022-10-12,Pharmacy #4,U41.19,generic foxivelule,322463,3O71UTS,,0,12.22,1,foxivelule,1,367822,0,19,101,2,4
10629491,2022-10-11,Pharmacy #31,K87.68,generic ribosatharin,664344,YFVIA,AJK5MZ25T9IA,0,12.37,1,ribosatharin,0,121908,8960,56,59,5,44
10928718,2022-10-18,Pharmacy #6,G99.93,generic simarol,96934,S76J7V6,,0,13.93,1,simarol,1,144501,7380,61,37,0,34


In [16]:
print('The number of rejected claims is', cmm_train['rejected'].sum())
print('The total number of claims is', len(cmm_train))
print('The percentage of rejected claims is', cmm_train['rejected'].sum()/len(cmm_train)*100, '%')

The number of rejected claims is 839691
The total number of claims is 10049476
The percentage of rejected claims is 8.355569981957268 %


The rejected category is imbalanced so some consideration should be given to the metrics used to score the classifier.

In [17]:
base_features = ['diagnosis_encoded', 'bin_encoded', 'pcn_encoded', 'private', 'generic', 'name_encoded']

### Importing the HistGradientBoostingClassifier
This should be faster than XGBoost for large datasets.

Using cross validation with recall, precision, and accuracy for metrics

In [18]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, precision_score, accuracy_score

In [19]:
kf = KFold(n_splits=5, shuffle=True, random_state=614)
clf = HistGradientBoostingClassifier()

In [20]:
metrics = np.zeros((5, 3))
i=0
for tt_index, ho_index in kf.split(cmm_train):
    clf.fit(cmm_train[base_features].iloc[tt_index], cmm_train['rejected'].iloc[tt_index])
    predictions = clf.predict(cmm_train[base_features].iloc[ho_index])
    metrics[i, 0] = recall_score(cmm_train['rejected'].iloc[ho_index], predictions)
    metrics[i, 1] = precision_score(cmm_train['rejected'].iloc[ho_index], predictions)
    metrics[i, 2] = accuracy_score(cmm_train['rejected'].iloc[ho_index], predictions)
    i+=1

  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
print('average recall is', metrics.mean(axis=0)[0])
print('average precision is', metrics.mean(axis=0)[1])
print('average accuracy is', metrics.mean(axis=0)[2])

average recall is 3.5716764802593775e-05
average precision is 0.07574154240820907
average accuracy is 0.9164340508617526


The first go at classification of rejected claims failed to give any useful information

High accuracy with low precision and recall implies most things are being classified as not rejected

### Let's try Down Sampling

In [22]:
from sklearn.utils import resample

In [39]:
temp = resample(cmm_train[cmm_train['rejected']==0], 
                replace=False, 
                n_samples=len(cmm_train[cmm_train['rejected']==1]))

In [46]:
downsample = pd.concat([cmm_train[cmm_train['rejected']==1], temp])

In [47]:
downsample

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,generic,name,private,popularity,rejected_count,name_encoded,diagnosis_encoded,bin_encoded,pcn_encoded
8270129,2022-08-14,Pharmacy #15,P07.55,branded semufolic,664344,YFVIA,AJK5MZ25T9IA,1,0.00,0,semufolic,0,237813,41930,60,79,5,44
7009332,2022-07-14,Pharmacy #14,Y51.55,generic sorine,160389,RB7UU,RS5RB3YA,1,0.00,1,sorine,0,115708,13891,63,124,1,31
1316687,2022-02-10,Pharmacy #37,Z20.23,branded vivafastat,160389,RB7UU,RS5RB3YA,1,0.00,0,vivafastat,0,43263,14072,75,127,1,31
9652363,2022-09-17,Pharmacy #7,I38.43,branded colifunene,725700,,DYGBI610ZY,1,0.00,0,colifunene,0,179278,14364,7,49,8,48
7415161,2022-07-23,Pharmacy #43,H36.57,branded semufolic,725700,9C5MOR3,S2QKZ0OFNWS6X,1,0.00,0,semufolic,0,237813,41930,60,41,8,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11386342,2022-10-30,Pharmacy #20,K32.86,branded lalol,160389,RB7UU,RS5RB3YA,0,12.54,0,lalol,0,202160,0,35,57,1,31
6860054,2022-07-10,Pharmacy #0,W50.87,generic pucomalol,664344,YFVIA,AJK5MZ25T9IA,0,17.11,1,pucomalol,0,417336,10707,54,115,5,44
64754,2022-01-05,Pharmacy #38,C98.15,generic ratin,322463,3Y5ZW0,,0,18.78,1,ratin,1,262785,12316,55,21,2,5
7256746,2022-07-19,Pharmacy #28,K32.86,branded pucomalol,725700,327CKV,IOEAN1DWVV3Y,0,14.14,0,pucomalol,0,417336,10707,54,57,8,2


In [48]:
len(downsample)

1679382

In [50]:
metrics_down = np.zeros((5, 3))
i=0
for tt_index, ho_index in kf.split(downsample):
    clf.fit(downsample[base_features].iloc[tt_index], downsample['rejected'].iloc[tt_index])
    predictions = clf.predict(downsample[base_features].iloc[ho_index])
    metrics_down[i, 0] = recall_score(downsample['rejected'].iloc[ho_index], predictions)
    metrics_down[i, 1] = precision_score(downsample['rejected'].iloc[ho_index], predictions)
    metrics_down[i, 2] = accuracy_score(downsample['rejected'].iloc[ho_index], predictions)
    i+=1

In [51]:
print('average recall is', metrics_down.mean(axis=0)[0])
print('average precision is', metrics_down.mean(axis=0)[1])
print('average accuracy is', metrics_down.mean(axis=0)[2])

average recall is 0.977365031776008
average precision is 0.796125455764795
average accuracy is 0.8635390874756379


This is somewhat promising. 

In [52]:
clf.fit(downsample[base_features], downsample['rejected'])
pred_down = clf.predict(cmm_train[base_features])

In [63]:
print('recall is',recall_score(cmm_train['rejected'], pred_down))
print('precision is',precision_score(cmm_train['rejected'], pred_down))
print('accuracy is',accuracy_score(cmm_train['rejected'], pred_down))

recall is 0.9781645867348823
precision is 0.2629000446192226
accuracy is 0.7690234794331565


In [56]:
from sklearn.metrics import confusion_matrix

In [58]:
cf_mat = confusion_matrix(cmm_train['rejected'], pred_down)

In [60]:
pd.DataFrame(cf_mat, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,6906927,2302858
Actual 1,18335,821356


Downsampling looks to be an efficient way to train the classifier to predict rejected claims but reduces the training data for non-rejected claims. Perhaps upsampling will fix that problem.