In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [3]:
cmm_data = pd.read_csv('/home/ruggiec/Downloads/pharmacy_tx.csv')
cmm_data_no_dups = cmm_data.drop_duplicates()
cmm_train, cmm_test = train_test_split(cmm_data_no_dups, shuffle=True, 
                             random_state= 614, test_size=0.2)

In [4]:
def is_it_generic(text):
    if text.split()[0] == 'generic':
        return 1
    else:
        return 0
    
def drug_name(text):
    return text.split()[1]

def pharm_split(text):
    return int(text.split('#')[1])

In [5]:
cmm_train['generic'] = cmm_train.drug.apply(is_it_generic)
cmm_train['name'] = cmm_train['drug'].apply(drug_name)
cmm_train['private'] = cmm_train.group.isna().astype(int)
cmm_train['popularity'] = cmm_train.groupby('name')[['tx_date']].transform('count')
cmm_train['rejected'] = cmm_train['rejected'].astype(int)
cmm_train['rejected_count'] = cmm_train.groupby('name')['rejected'].transform('sum')
cmm_train['pharm_num'] = cmm_train.pharmacy.apply(pharm_split)
cmm_train['tx_date'] = pd.to_datetime(cmm_train.tx_date)
cmm_train['weekday'] = cmm_train.tx_date.dt.day_of_week
cmm_train['day_num'] = cmm_train.tx_date.dt.day_of_year

In [6]:
le_name = LabelEncoder()
le_name.fit(cmm_train['name'])
cmm_train['name_encoded'] = le_name.transform(cmm_train['name'])

le_diagnosis = LabelEncoder()
le_diagnosis.fit(cmm_train['diagnosis'])
cmm_train['diagnosis_encoded'] = le_diagnosis.transform(cmm_train['diagnosis'])

le_bin = LabelEncoder()
le_bin.fit(cmm_train['bin'])
cmm_train['bin_encoded'] = le_bin.transform(cmm_train['bin'])

le_pcn = LabelEncoder()
le_pcn.fit(cmm_train['pcn'])
cmm_train['pcn_encoded'] = le_pcn.transform(cmm_train['pcn'])

In [7]:
minority = resample(cmm_train[cmm_train['rejected']==0], 
                replace=False, 
                n_samples=len(cmm_train[cmm_train['rejected']==1]))

majority = cmm_train[cmm_train['rejected']==1]

undersample = pd.concat([majority, minority])

In [8]:
undersample.head()

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,generic,...,private,popularity,rejected_count,pharm_num,weekday,day_num,name_encoded,diagnosis_encoded,bin_encoded,pcn_encoded
8270129,2022-08-14,Pharmacy #15,P07.55,branded semufolic,664344,YFVIA,AJK5MZ25T9IA,1,0.0,0,...,0,237813,41930,15,6,226,60,79,5,44
7009332,2022-07-14,Pharmacy #14,Y51.55,generic sorine,160389,RB7UU,RS5RB3YA,1,0.0,1,...,0,115708,13891,14,3,195,63,124,1,31
1316687,2022-02-10,Pharmacy #37,Z20.23,branded vivafastat,160389,RB7UU,RS5RB3YA,1,0.0,0,...,0,43263,14072,37,3,41,75,127,1,31
9652363,2022-09-17,Pharmacy #7,I38.43,branded colifunene,725700,,DYGBI610ZY,1,0.0,0,...,0,179278,14364,7,5,260,7,49,8,48
7415161,2022-07-23,Pharmacy #43,H36.57,branded semufolic,725700,9C5MOR3,S2QKZ0OFNWS6X,1,0.0,0,...,0,237813,41930,43,5,204,60,41,8,8


In [9]:
undersample.columns

Index(['tx_date', 'pharmacy', 'diagnosis', 'drug', 'bin', 'pcn', 'group',
       'rejected', 'patient_pay', 'generic', 'name', 'private', 'popularity',
       'rejected_count', 'pharm_num', 'weekday', 'day_num', 'name_encoded',
       'diagnosis_encoded', 'bin_encoded', 'pcn_encoded'],
      dtype='object')

In [10]:
features = ['generic', 'private', 'pharm_num', 'weekday', 'day_num', 
            'name_encoded', 'diagnosis_encoded', 'bin_encoded', 'pcn_encoded']

In [11]:
gbdt = HistGradientBoostingClassifier(categorical_features=[1, 1, 1, 1, 0, 1, 1, 1, 1])

In [12]:
kf = KFold(n_splits=5, shuffle=True, random_state=614)

In [13]:
metrics = np.zeros((5, 3))
i=0
for tt_index, ho_index in kf.split(undersample):
    gbdt.fit(undersample[features].iloc[tt_index], undersample['rejected'].iloc[tt_index])
    predictions = gbdt.predict(undersample[features].iloc[ho_index])
    metrics[i, 0] = recall_score(undersample['rejected'].iloc[ho_index], predictions)
    metrics[i, 1] = precision_score(undersample['rejected'].iloc[ho_index], predictions)
    metrics[i, 2] = accuracy_score(undersample['rejected'].iloc[ho_index], predictions)
    i+=1

In [14]:
print('average recall is', metrics.mean(axis=0)[0])
print('average precision is', metrics.mean(axis=0)[1])
print('average accuracy is', metrics.mean(axis=0)[2])

average recall is 0.9775330373990917
average precision is 0.7965719051365185
average accuracy is 0.86394518881819


In [15]:
gbdt.fit(undersample[features], undersample['rejected'])
preds = gbdt.predict(cmm_train[features])

print('recall is',recall_score(cmm_train['rejected'], preds))
print('precision is',precision_score(cmm_train['rejected'], preds))
print('accuracy is',accuracy_score(cmm_train['rejected'], preds))

recall is 0.9770248817719852
precision is 0.2636690082157229
accuracy is 0.7701008490392932


In [16]:
xgbc = GradientBoostingClassifier()

In [17]:
metrics = np.zeros((5, 3))
i=0
for tt_index, ho_index in kf.split(undersample):
    xgbc.fit(undersample[features].iloc[tt_index], undersample['rejected'].iloc[tt_index])
    predictions = xgbc.predict(undersample[features].iloc[ho_index])
    metrics[i, 0] = recall_score(undersample['rejected'].iloc[ho_index], predictions)
    metrics[i, 1] = precision_score(undersample['rejected'].iloc[ho_index], predictions)
    metrics[i, 2] = accuracy_score(undersample['rejected'].iloc[ho_index], predictions)
    i+=1

In [18]:
print('average recall is', metrics.mean(axis=0)[0])
print('average precision is', metrics.mean(axis=0)[1])
print('average accuracy is', metrics.mean(axis=0)[2])

average recall is 0.913245430915603
average precision is 0.7867405318883959
average accuracy is 0.8328462481651853


In [None]:
clf.fit(oversample[features], oversample['rejected'])
pred_over = clf.predict(cmm_train[features])

print('recall is',recall_score(cmm_train['rejected'], pred_over))
print('precision is',precision_score(cmm_train['rejected'], pred_over))
print('accuracy is',accuracy_score(cmm_train['rejected'], pred_over))

In [None]:
cmm_train.groupby('name')['patient_pay'].mean().sort_values(ascending=False)

In [None]:
plt.figure(figsize=(8,6))

plt.scatter(cmm_train[cmm_train.name=='monemodiase'].day_num, 
            cmm_train[cmm_train.name=='monemodiase'].patient_pay)

plt.show()

In [None]:
plt.figure(figsize=(8,6))

plt.scatter(cmm_train[cmm_train.name=='pheromycin'].day_num, 
            cmm_train[cmm_train.name=='pheromycin'].patient_pay)

plt.show()

In [None]:
cmm_train.groupby('bin')['patient_pay'].mean().sort_values(ascending=False)

In [None]:
plt.figure(figsize=(8,6))

plt.scatter(cmm_train[cmm_train.bin==756120].day_num, 
            cmm_train[cmm_train.bin==756120].patient_pay)

plt.show()

In [None]:
plt.figure(figsize=(8,6))

plt.scatter(cmm_train[cmm_train.bin==322463].day_num, 
            cmm_train[cmm_train.bin==322463].patient_pay)

plt.show()

In [None]:
cmm_train[(cmm_train.name=='monemodiase') & (cmm_train.patient_pay<500)].sort_values(by='patient_pay', ascending=False)

In [None]:
cmm_train[(cmm_train.name=='monemodiase')]['patient_pay'].value_counts()

In [21]:
cmm_train[(cmm_train.name=='monemodiase') & (cmm_train.bin==322463) & (cmm_train.pcn=='3O71UTS')]

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,generic,...,private,popularity,rejected_count,pharm_num,weekday,day_num,name_encoded,diagnosis_encoded,bin_encoded,pcn_encoded
2355248,2022-03-11,Pharmacy #3,U75.95,branded monemodiase,322463,3O71UTS,,0,306.94,0,...,1,28617,9104,3,4,70,41,104,2,4
7648274,2022-07-29,Pharmacy #14,U75.95,branded monemodiase,322463,3O71UTS,,0,306.35,0,...,1,28617,9104,14,4,210,41,104,2,4
9563108,2022-09-15,Pharmacy #16,B45.03,branded monemodiase,322463,3O71UTS,,0,308.16,0,...,1,28617,9104,16,3,258,41,8,2,4
8325796,2022-08-16,Pharmacy #52,B45.03,branded monemodiase,322463,3O71UTS,,0,306.51,0,...,1,28617,9104,52,1,228,41,8,2,4
4886039,2022-05-19,Pharmacy #10,U75.95,branded monemodiase,322463,3O71UTS,,0,306.24,0,...,1,28617,9104,10,3,139,41,104,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559277,2022-01-18,Pharmacy #35,B45.03,branded monemodiase,322463,3O71UTS,,1,0.00,0,...,1,28617,9104,35,1,18,41,8,2,4
6124533,2022-06-20,Pharmacy #25,U75.95,branded monemodiase,322463,3O71UTS,,1,0.00,0,...,1,28617,9104,25,0,171,41,104,2,4
269162,2022-01-10,Pharmacy #57,U75.95,branded monemodiase,322463,3O71UTS,,0,305.95,0,...,1,28617,9104,57,0,10,41,104,2,4
9267459,2022-09-09,Pharmacy #3,U75.95,branded monemodiase,322463,3O71UTS,,0,306.94,0,...,1,28617,9104,3,4,252,41,104,2,4
