# Quick Model for Classifying Rejection.
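I am going to give XGBoost a quick go at classifying whether a drug claim has been rejected. (The model actually fitted further down is scikit-learn's HistGradientBoostingClassifier, chosen for speed on a dataset of this size.)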

### Import Libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

### Import data and split into train/test groups

In [2]:
# Read the raw pharmacy transactions, then drop rows that are exact duplicates.
# NOTE(review): the CSV path is an absolute, machine-specific location.
cmm_data = pd.read_csv('/home/ruggiec/Downloads/pharmacy_tx.csv')
cmm_data_no_dups = cmm_data.drop_duplicates()

# Shuffled 80/20 train/test split; the fixed seed keeps the partition reproducible.
cmm_train, cmm_test = train_test_split(
    cmm_data_no_dups,
    test_size=0.2,
    random_state=614,
    shuffle=True,
)

### Feature Engineering

In [3]:
def is_it_generic(text):
    """Return 1 if the first whitespace-separated token of `text` is 'generic', else 0.

    Empty or whitespace-only strings return 0 instead of raising IndexError.
    """
    # Slicing (rather than indexing) yields [] when there are no tokens at all.
    first_token = text.split(maxsplit=1)[:1]
    return int(first_token == ['generic'])
    
def drug_name(text):
    """Return the second whitespace-separated token of `text`.

    For labels shaped like 'branded mamate' this is the drug name.
    Raises IndexError when `text` has fewer than two tokens.
    """
    # Only the first two tokens matter, so stop splitting once they are isolated.
    tokens = text.split(maxsplit=2)
    return tokens[1]

In [4]:
# Build all engineered columns in one pass and rebind `cmm_train` to the result.
# `assign` works on a copy, so the frame returned by train_test_split is never
# mutated column by column (no chained-assignment warnings), and re-running this
# cell gives the same result. Callables are evaluated in order, so later
# columns can use `name` created just above them.
cmm_train = cmm_train.assign(
    # 1 when the drug label starts with the token 'generic', else 0.
    generic=lambda df: df['drug'].apply(is_it_generic),
    # Second token of the drug label, e.g. 'branded mamate' -> 'mamate'.
    name=lambda df: df['drug'].apply(drug_name),
    # 1 when the `group` field is missing, else 0.
    private=lambda df: df['group'].isna().astype(int),
    # Number of non-null tx_date entries per drug name (a Series, not a
    # one-column DataFrame).
    popularity=lambda df: df.groupby('name')['tx_date'].transform('count'),
    # Target as 0/1 integers.
    rejected=lambda df: df['rejected'].astype(int),
    # Rejections per drug name.
    # NOTE(review): this is derived from the target itself; it is not listed in
    # base_features, and should stay out of model inputs unless recomputed
    # without the rows being predicted.
    rejected_count=lambda df: df.groupby('name')['rejected'].transform('sum'),
)

#### Encoding Categorical Variables with Label Encoder

In [8]:
from sklearn.preprocessing import LabelEncoder

In [14]:
# Map each drug name to an integer code; the fitted encoder is kept so codes
# can be decoded back to names later.
le_name = LabelEncoder()
cmm_train['name_encoded'] = le_name.fit_transform(cmm_train['name'])

Above, drug names are transformed into integer codes. Below, we can transform an integer code back into its name.

In [15]:
# Round-trip check: decode one integer code back to its drug name.
name_code = 19
le_name.inverse_transform([name_code])

array(['foxivelule'], dtype=object)

In [19]:
# Map each diagnosis code string to an integer code; keep the encoder for decoding.
le_diagnosis = LabelEncoder()
cmm_train['diagnosis_encoded'] = le_diagnosis.fit_transform(cmm_train['diagnosis'])

In [27]:
# Round-trip check: decode one integer code back to its diagnosis string.
diagnosis_code = 66
le_diagnosis.inverse_transform([diagnosis_code])

array(['M31.63'], dtype=object)

In [21]:
# Map each `bin` value to an integer code; keep the encoder for decoding.
le_bin = LabelEncoder()
cmm_train['bin_encoded'] = le_bin.fit_transform(cmm_train['bin'])

In [26]:
# Round-trip check: decode one integer code back to its original `bin` value.
bin_code = 2
le_bin.inverse_transform([bin_code])

array([322463])

In [22]:
# Map each `pcn` value to an integer code; keep the encoder for decoding.
le_pcn = LabelEncoder()
cmm_train['pcn_encoded'] = le_pcn.fit_transform(cmm_train['pcn'])

In [29]:
# Round-trip check: decode one integer code back to its original `pcn` value.
pcn_code = 4
le_pcn.inverse_transform([pcn_code])

array(['3O71UTS'], dtype=object)

### What do we have?

In [32]:
# List every column now on the training frame, including the engineered and
# label-encoded ones added above.
cmm_train.columns

Index(['tx_date', 'pharmacy', 'diagnosis', 'drug', 'bin', 'pcn', 'group',
       'rejected', 'patient_pay', 'generic', 'name', 'private', 'popularity',
       'rejected_count', 'name_encoded', 'diagnosis_encoded', 'bin_encoded',
       'pcn_encoded'],
      dtype='object')

In [23]:
# Peek at the first five training rows to sanity-check the new columns.
cmm_train.head(n=5)

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,generic,name,private,popularity,rejected_count,name_encoded,diagnosis_encoded,bin_encoded,pcn_encoded
3962160,2022-04-23,Pharmacy #29,M31.63,branded mamate,322463,3O71UTS,,0,25.75,0,mamate,1,23539,4480,38,66,2,4
1161541,2022-02-06,Pharmacy #7,Z20.23,branded vivafastat,725700,327CKV,IOEAN1DWVV3Y,0,180.52,0,vivafastat,0,43263,14072,75,127,8,2
10643305,2022-10-12,Pharmacy #4,U41.19,generic foxivelule,322463,3O71UTS,,0,12.22,1,foxivelule,1,367822,0,19,101,2,4
10629491,2022-10-11,Pharmacy #31,K87.68,generic ribosatharin,664344,YFVIA,AJK5MZ25T9IA,0,12.37,1,ribosatharin,0,121908,8960,56,59,5,44
10928718,2022-10-18,Pharmacy #6,G99.93,generic simarol,96934,S76J7V6,,0,13.93,1,simarol,1,144501,7380,61,37,0,34


In [38]:
# `rejected` is 0/1, so its sum is the number of rejected claims.
n_rejected = cmm_train['rejected'].sum()
n_claims = len(cmm_train)

print('The number of rejected claims is', n_rejected)
print('The total number of claims is', n_claims)

The number of rejected claims is 839691
The total number of claims is 10049476


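The rejected category is not balanced (only about 8.4% of the training claims are rejected), so some consideration should be given to the metrics used to score the classifier: plain accuracy will look high even for a model that never predicts a rejection.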

In [33]:
# Model inputs: the four label-encoded columns plus the two binary flags.
base_features = [
    'diagnosis_encoded',
    'bin_encoded',
    'pcn_encoded',
    'private',
    'generic',
    'name_encoded',
]

### Importing the HistGradientBoostingClassifier
This should be faster than XGBoost for large datasets.

In [34]:
from sklearn.ensemble import HistGradientBoostingClassifier

In [35]:
# Seed the estimator: with more than 10,000 samples its default
# early_stopping='auto' holds out a random validation split, so an unseeded fit
# (and every metric computed from it) changes from run to run. The seed matches
# the one used for the train/test split.
# NOTE(review): the *_encoded columns are nominal label codes but are treated as
# ordered numbers here; consider passing `categorical_features` -- TODO confirm
# each encoded column has at most 255 distinct values first.
clf = HistGradientBoostingClassifier(random_state=614)
clf.fit(cmm_train[base_features], cmm_train['rejected'])

In [36]:
# Mean accuracy on the training data itself (not on the held-out test split).
# NOTE(review): given the class counts printed earlier, always predicting
# "not rejected" would already score about 0.9164, so read this number against
# that baseline.
clf.score(X=cmm_train[base_features], y=cmm_train['rejected'])

0.9164184281847133

In [41]:
# Among the training claims that really were rejected, count how many the model
# also labels as rejected (predictions are 0/1, so the sum is a count).
rejected_mask = cmm_train['rejected'] == 1
clf.predict(cmm_train.loc[rejected_mask, base_features]).sum()

76

In [44]:
# Total number of training claims the model labels as rejected.
train_predictions = clf.predict(cmm_train[base_features])
train_predictions.sum()

412

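### The first go at classification of rejected claims failed to give any useful information

The model labels only 412 of the 10,049,476 training claims as rejected, and only 76 of the 839,691 truly rejected claims are among them. Its accuracy of about 0.916 is therefore essentially what always predicting "not rejected" would achieve.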