In [2]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [3]:
# Location of the raw transactions CSV. The default is the path this notebook
# was originally run with; set the PHARMACY_TX_CSV environment variable to
# point at the file on another machine instead of editing the notebook.
PHARMACY_TX_CSV = os.environ.get(
    'PHARMACY_TX_CSV', '/home/ruggiec/Downloads/pharmacy_tx.csv'
)

# Load the raw data and drop exact duplicate rows before splitting.
cmm_data = pd.read_csv(PHARMACY_TX_CSV)
cmm_data_no_dups = cmm_data.drop_duplicates()

# Shuffled 80/20 train/test split with a fixed seed so it is repeatable.
cmm_train, cmm_test = train_test_split(
    cmm_data_no_dups,
    test_size=0.2,
    shuffle=True,
    random_state=614,
)

In [4]:
def is_it_generic(text):
    """Return 1 if the first whitespace-separated token of ``text`` is
    exactly ``'generic'``, otherwise 0.

    Raises ``IndexError`` when ``text`` contains no tokens.
    """
    leading_token = text.split(maxsplit=1)[0]
    return int(leading_token == 'generic')
    
def drug_name(text):
    """Return the second whitespace-separated token of ``text``.

    Raises ``IndexError`` when ``text`` has fewer than two tokens.
    """
    tokens = text.split()
    return tokens[1]

def pharm_split(text):
    """Return, as an ``int``, the piece of ``text`` that follows its first
    ``'#'`` (up to the next ``'#'``, if any).

    Raises ``IndexError`` when ``text`` has no ``'#'`` and ``ValueError``
    when that piece is not a valid integer literal.
    """
    pieces = text.split('#')
    number_part = pieces[1]
    return int(number_part)

In [5]:
# Feature engineering on the training frame. Columns are added in place, in
# the same order as before, so downstream column listings are unchanged.

# Parsed from the `drug` text: 1/0 flag for a leading 'generic' token, and the
# second token as the drug name.
cmm_train['generic'] = cmm_train['drug'].apply(is_it_generic)
cmm_train['name'] = cmm_train['drug'].apply(drug_name)

# 1 where `group` is missing, 0 otherwise.
cmm_train['private'] = cmm_train['group'].isna().astype(int)

# Number of rows (with a non-null tx_date) per drug name. The column is
# selected with single brackets so `transform` returns a Series; the previous
# double-bracket form produced a one-column DataFrame that was then assigned
# to a single column.
cmm_train['popularity'] = cmm_train.groupby('name')['tx_date'].transform('count')

# Cast `rejected` to int first so the per-drug sum below counts rejections.
cmm_train['rejected'] = cmm_train['rejected'].astype(int)
cmm_train['rejected_count'] = cmm_train.groupby('name')['rejected'].transform('sum')

# Integer that follows '#' in the pharmacy label.
cmm_train['pharm_num'] = cmm_train['pharmacy'].apply(pharm_split)

# Calendar features; tx_date is parsed to datetime before using the .dt accessor.
cmm_train['tx_date'] = pd.to_datetime(cmm_train['tx_date'])
cmm_train['weekday'] = cmm_train['tx_date'].dt.day_of_week
cmm_train['day_num'] = cmm_train['tx_date'].dt.day_of_year

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
# Fit one LabelEncoder per categorical column on the training frame and store
# the resulting integer codes in a matching `<column>_encoded` column.
label_encoders = {}
for source_col in ('name', 'diagnosis', 'bin', 'pcn'):
    fitted_encoder = LabelEncoder()
    fitted_encoder.fit(cmm_train[source_col])
    cmm_train[source_col + '_encoded'] = fitted_encoder.transform(cmm_train[source_col])
    label_encoders[source_col] = fitted_encoder

# Expose each fitted encoder under its own name as well.
le_name = label_encoders['name']
le_diagnosis = label_encoders['diagnosis']
le_bin = label_encoders['bin']
le_pcn = label_encoders['pcn']

In [8]:
# Display the column labels currently on the training frame.
cmm_train.columns

Index(['tx_date', 'pharmacy', 'diagnosis', 'drug', 'bin', 'pcn', 'group',
       'rejected', 'patient_pay', 'generic', 'name', 'private', 'popularity',
       'rejected_count', 'pharm_num', 'weekday', 'day_num', 'name_encoded',
       'diagnosis_encoded', 'bin_encoded', 'pcn_encoded'],
      dtype='object')

In [9]:
# Preview the first rows of the training frame.
cmm_train.head()

Unnamed: 0,tx_date,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,generic,...,private,popularity,rejected_count,pharm_num,weekday,day_num,name_encoded,diagnosis_encoded,bin_encoded,pcn_encoded
3962160,2022-04-23,Pharmacy #29,M31.63,branded mamate,322463,3O71UTS,,0,25.75,0,...,1,23539,4480,29,5,113,38,66,2,4
1161541,2022-02-06,Pharmacy #7,Z20.23,branded vivafastat,725700,327CKV,IOEAN1DWVV3Y,0,180.52,0,...,0,43263,14072,7,6,37,75,127,8,2
10643305,2022-10-12,Pharmacy #4,U41.19,generic foxivelule,322463,3O71UTS,,0,12.22,1,...,1,367822,0,4,2,285,19,101,2,4
10629491,2022-10-11,Pharmacy #31,K87.68,generic ribosatharin,664344,YFVIA,AJK5MZ25T9IA,0,12.37,1,...,0,121908,8960,31,1,284,56,59,5,44
10928718,2022-10-18,Pharmacy #6,G99.93,generic simarol,96934,S76J7V6,,0,13.93,1,...,1,144501,7380,6,1,291,61,37,0,34


In [10]:
# Nested feature sets: each one extends the previous by prepending columns.
base_features = [
    'diagnosis_encoded',
    'bin_encoded',
    'pcn_encoded',
    'private',
    'generic',
    'name_encoded',
]
# base + pharmacy number
pharm_features = ['pharm_num'] + base_features
# base + pharmacy number + day of week
day_features = ['weekday'] + pharm_features
# base + pharmacy number + day of week + day of year
full_features = ['weekday', 'day_num'] + pharm_features

In [12]:
# Keep only the training rows whose `rejected` flag is 0.
cmm_paid = cmm_train.loc[cmm_train['rejected'].eq(0)]

In [14]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error

In [15]:
# Shared cross-validation splitter and regressor used by the cells below.
kf = KFold(n_splits=5, shuffle=True, random_state=614)

# Seed the regressor too. Per the scikit-learn documentation,
# HistGradientBoostingRegressor uses `random_state` for the binning subsample
# and for the internal train/validation split when early stopping is enabled;
# left unseeded, repeated runs can report different metrics.
hgbr = HistGradientBoostingRegressor(random_state=614)

In [19]:
# K-fold cross-validation of `hgbr` on the base feature set.
# `metrics` has one row per fold; columns are 0 = MSE, 1 = MAE, 2 = MAPE.

# Slice the design matrix and target once instead of on every fold.
X_base = cmm_paid[base_features]
y_paid = cmm_paid['patient_pay']

# Size the table from the splitter so it cannot drift out of sync with `kf`.
metrics = np.zeros((kf.get_n_splits(), 3))
for i, (tt_index, ho_index) in enumerate(kf.split(cmm_paid)):
    y_holdout = y_paid.iloc[ho_index]
    hgbr.fit(X_base.iloc[tt_index], y_paid.iloc[tt_index])
    predictions = hgbr.predict(X_base.iloc[ho_index])
    metrics[i, 0] = mean_squared_error(y_holdout, predictions)
    metrics[i, 1] = mean_absolute_error(y_holdout, predictions)
    metrics[i, 2] = mean_absolute_percentage_error(y_holdout, predictions)

In [20]:
# Average each metric column over the folds once, then report the three means.
fold_means = metrics.mean(axis=0)
print('mean squared error is', fold_means[0])
print('mean absolute error is', fold_means[1])
print('mean absolute percentage error is', fold_means[2])

mean squared error is 184.26791417378195
mean absolute error is 6.731561024777966
mean absolute percentage error is 0.3573011265665149


In [21]:
# K-fold cross-validation of `hgbr` on the full feature set.
# `metrics` has one row per fold; columns are 0 = MSE, 1 = MAE, 2 = MAPE.

# Slice the design matrix and target once instead of on every fold.
X_full = cmm_paid[full_features]
y_paid = cmm_paid['patient_pay']

# Size the table from the splitter so it cannot drift out of sync with `kf`.
metrics = np.zeros((kf.get_n_splits(), 3))
for i, (tt_index, ho_index) in enumerate(kf.split(cmm_paid)):
    y_holdout = y_paid.iloc[ho_index]
    hgbr.fit(X_full.iloc[tt_index], y_paid.iloc[tt_index])
    predictions = hgbr.predict(X_full.iloc[ho_index])
    metrics[i, 0] = mean_squared_error(y_holdout, predictions)
    metrics[i, 1] = mean_absolute_error(y_holdout, predictions)
    metrics[i, 2] = mean_absolute_percentage_error(y_holdout, predictions)

In [22]:
# Average each metric column over the folds once, then report the three means.
fold_means = metrics.mean(axis=0)
print('mean squared error is', fold_means[0])
print('mean absolute error is', fold_means[1])
print('mean absolute percentage error is', fold_means[2])

mean squared error is 185.98686547393487
mean absolute error is 6.770601235529616
mean absolute percentage error is 0.3590110791248608


In [25]:
# Refit on all rows of `cmm_paid` with the base features, then predict on
# those same rows (in-sample predictions).
paid_base_X = cmm_paid[base_features]
hgbr.fit(paid_base_X, cmm_paid['patient_pay'])
predictions = hgbr.predict(paid_base_X)