In [36]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [2]:
cmm_data = pd.read_csv('/home/ruggiec/Downloads/pharmacy_tx.csv')
cmm_data_no_dups = cmm_data.drop_duplicates()
cmm_train, cmm_test = train_test_split(cmm_data_no_dups, shuffle=True, 
                             random_state= 614, test_size=0.2)

In [3]:
def is_it_generic(text):
    if text.split()[0] == 'generic':
        return 1
    else:
        return 0
    
def drug_name(text):
    return text.split()[1]

def pharm_split(text):
    return int(text.split('#')[1])

In [4]:
cmm_train['generic'] = cmm_train.drug.apply(is_it_generic)
cmm_train['name'] = cmm_train['drug'].apply(drug_name)
cmm_train['private'] = cmm_train.group.isna().astype(int)
cmm_train['popularity'] = cmm_train.groupby('name')[['tx_date']].transform('count')
cmm_train['rejected'] = cmm_train['rejected'].astype(int)
cmm_train['rejected_count'] = cmm_train.groupby('name')['rejected'].transform('sum')
cmm_train['pharm_num'] = cmm_train.pharmacy.apply(pharm_split)
cmm_train['tx_date'] = pd.to_datetime(cmm_train.tx_date)
cmm_train['weekday'] = cmm_train.tx_date.dt.day_of_week
cmm_train['day_num'] = cmm_train.tx_date.dt.day_of_year

In [37]:
le_name = LabelEncoder()
le_name.fit(cmm_train['name'])
cmm_train['name_encoded'] = le_name.transform(cmm_train['name'])

le_diagnosis = LabelEncoder()
le_diagnosis.fit(cmm_train['diagnosis'])
cmm_train['diagnosis_encoded'] = le_diagnosis.transform(cmm_train['diagnosis'])

le_bin = LabelEncoder()
le_bin.fit(cmm_train['bin'])
cmm_train['bin_encoded'] = le_bin.transform(cmm_train['bin'])

le_pcn = LabelEncoder()
le_pcn.fit(cmm_train['pcn'])
cmm_train['pcn_encoded'] = le_pcn.transform(cmm_train['pcn'])

In [89]:
minority = resample(cmm_train[cmm_train['rejected']==0], 
                replace=False, 
                n_samples=len(cmm_train[cmm_train['rejected']==1]))

majority = cmm_train[cmm_train['rejected']==1]

undersample = pd.concat([majority, minority])

In [84]:
undersample.head()

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,generic,...,private,popularity,rejected_count,pharm_num,weekday,day_num,name_encoded,diagnosis_encoded,bin_encoded,pcn_encoded
8270129,2022-08-14,Pharmacy #15,P07.55,branded semufolic,664344,YFVIA,AJK5MZ25T9IA,1,0.0,0,...,0,237813,41930,15,6,226,60,79,5,44
7009332,2022-07-14,Pharmacy #14,Y51.55,generic sorine,160389,RB7UU,RS5RB3YA,1,0.0,1,...,0,115708,13891,14,3,195,63,124,1,31
1316687,2022-02-10,Pharmacy #37,Z20.23,branded vivafastat,160389,RB7UU,RS5RB3YA,1,0.0,0,...,0,43263,14072,37,3,41,75,127,1,31
9652363,2022-09-17,Pharmacy #7,I38.43,branded colifunene,725700,,DYGBI610ZY,1,0.0,0,...,0,179278,14364,7,5,260,7,49,8,48
7415161,2022-07-23,Pharmacy #43,H36.57,branded semufolic,725700,9C5MOR3,S2QKZ0OFNWS6X,1,0.0,0,...,0,237813,41930,43,5,204,60,41,8,8


In [52]:
undersample.columns

Index(['tx_date', 'pharmacy', 'diagnosis', 'drug', 'bin', 'pcn', 'group',
       'rejected', 'patient_pay', 'generic', 'name', 'private', 'popularity',
       'rejected_count', 'pharm_num', 'weekday', 'day_num', 'name_encoded',
       'diagnosis_encoded', 'bin_encoded', 'pcn_encoded'],
      dtype='object')

In [42]:
features = ['generic', 'private', 'pharm_num', 'weekday', 'day_num', 
            'name_encoded', 'diagnosis_encoded', 'bin_encoded', 'pcn_encoded']

In [70]:
gbdt = HistGradientBoostingClassifier(categorical_features=[1, 1, 1, 1, 0, 1, 1, 1, 1])

In [71]:
kf = KFold(n_splits=5, shuffle=True, random_state=614)

In [85]:
metrics = np.zeros((5, 3))
i=0
for tt_index, ho_index in kf.split(undersample):
    gbdt.fit(undersample[features].iloc[tt_index], undersample['rejected'].iloc[tt_index])
    predictions = gbdt.predict(undersample[features].iloc[ho_index])
    metrics[i, 0] = recall_score(undersample['rejected'].iloc[ho_index], predictions)
    metrics[i, 1] = precision_score(undersample['rejected'].iloc[ho_index], predictions)
    metrics[i, 2] = accuracy_score(undersample['rejected'].iloc[ho_index], predictions)
    i+=1

In [86]:
print('average recall is', metrics.mean(axis=0)[0])
print('average precision is', metrics.mean(axis=0)[1])
print('average accuracy is', metrics.mean(axis=0)[2])

average recall is 0.9872509280442928
average precision is 0.8266632460266828
average accuracy is 0.8779120361177553


In [87]:
gbdt.fit(undersample[features], undersample['rejected'])
preds = gbdt.predict(cmm_train[features])

print('recall is',recall_score(cmm_train['rejected'], preds))
print('precision is',precision_score(cmm_train['rejected'], preds))
print('accuracy is',accuracy_score(cmm_train['rejected'], preds))

recall is 0.9884767134576886
precision is 0.2583905362285003
accuracy is 0.7619864956142987
