In [1]:
import numpy as np
import pandas as pd

from catboost import Pool, CatBoostRegressor

In [2]:
# Load the pharmacy transaction table from a CSV located in the working directory.
df = pd.read_csv(filepath_or_buffer="pharmacy_new.csv")

In [3]:
# Remove the columns that are not carried forward into modelling.
# 'Unnamed: 0' is presumably an index column written out by an earlier to_csv -- TODO confirm.
df = df.drop(
    labels=[
        'tx_date',
        'drug',
        'bin',
        'diagnosis_letter',
        'diagnosis_number',
        'Unnamed: 0',
    ],
    axis='columns',
)

In [4]:
# Keep only the rows whose `rejected` flag is False, then discard the flag
# itself since it is constant on the surviving rows.
df = df.loc[df['rejected'].eq(False)].drop(columns=['rejected'])

In [5]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [6]:
# Per-row count (0, 1 or 2) of how many of the two identifiers `pcn` and
# `group` are missing. Must run before the NaNs in those columns are filled.
df['insurance_null'] = sum(
    df[column].isna().astype(int) for column in ('pcn', 'group')
)

In [7]:
# Replace missing `pcn` / `group` values with the literal string 'None' so the
# columns hold a sentinel category instead of NaN.
#
# The result is assigned back to the column on purpose: calling
# `df['col'].fillna(..., inplace=True)` mutates a temporary Series obtained by
# chained indexing. pandas 2.x warns about that pattern, and with Copy-on-Write
# enabled the parent DataFrame is left unchanged, so the NaNs would survive.
df['pcn'] = df['pcn'].fillna('None')
df['group'] = df['group'].fillna('None')

In [8]:
# Shuffle the rows and hold out 20% of them as the test split; the fixed
# random_state makes the partition repeatable across runs.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [9]:
# Split the training rows into the regression target (`patient_pay`) and the
# remaining feature columns.
train_label = train['patient_pay']
train_data = train.drop(labels='patient_pay', axis='columns')

In [10]:
# Wrap the training features and target in a CatBoost Pool, declaring every
# feature column as categorical.
#
# The categorical features are taken from the frame's own column labels rather
# than a hard-coded `range(8)`: the positional form only means "all columns"
# while the frame has exactly eight features, and would silently mislabel
# features if a column were added or removed upstream.
train_pool = Pool(
    data=train_data,
    label=train_label,
    cat_features=list(train_data.columns),
)

In [11]:
# First model: 50 boosting iterations of depth-5 trees optimised for RMSE.
# NOTE(review): learning_rate=1 looks very aggressive for gradient boosting --
# confirm it is intentional.
model = CatBoostRegressor(
    loss_function='RMSE',
    iterations=50,
    depth=5,
    learning_rate=1,
)
# Last expression of the cell: fit() logs per-iteration loss and its return
# value is what the notebook displays.
model.fit(train_pool)

0:	learn: 28.3557484	total: 1.93s	remaining: 1m 34s
1:	learn: 24.8213446	total: 3.55s	remaining: 1m 25s
2:	learn: 19.7207030	total: 4.79s	remaining: 1m 15s
3:	learn: 19.2051711	total: 6.03s	remaining: 1m 9s
4:	learn: 18.7695738	total: 7.29s	remaining: 1m 5s
5:	learn: 18.0097166	total: 8.24s	remaining: 1m
6:	learn: 17.7094336	total: 9.18s	remaining: 56.4s
7:	learn: 17.5527976	total: 10.2s	remaining: 53.5s
8:	learn: 17.5052886	total: 11.4s	remaining: 51.8s
9:	learn: 17.3949080	total: 12.3s	remaining: 49.4s
10:	learn: 17.3374044	total: 13.3s	remaining: 47.1s
11:	learn: 17.2972392	total: 14.3s	remaining: 45.2s
12:	learn: 17.2601984	total: 15.3s	remaining: 43.4s
13:	learn: 17.1758640	total: 16.2s	remaining: 41.7s
14:	learn: 17.1226486	total: 17.2s	remaining: 40.1s
15:	learn: 17.0870835	total: 18.2s	remaining: 38.6s
16:	learn: 17.0420205	total: 19.2s	remaining: 37.2s
17:	learn: 17.0235768	total: 20.2s	remaining: 35.9s
18:	learn: 16.9943474	total: 21.2s	remaining: 34.6s
19:	learn: 16.9736034	

<catboost.core.CatBoostRegressor at 0x1e9361c07f0>

In [12]:
# Evaluate the first model on the held-out split.
test_label = test['patient_pay']
test_data = test.drop(labels='patient_pay', axis='columns')

preds = model.predict(test_data)

# Signed error per row: positive means the model predicted more than the
# actual `patient_pay`, negative means it predicted less.
diff = preds - test_label
diff.describe()

count    2.302402e+06
mean     6.634881e-02
std      1.652097e+01
min     -4.637239e+02
25%     -1.851980e+00
50%      5.965040e-02
75%      2.052162e+00
max      2.382640e+02
Name: patient_pay, dtype: float64

In [13]:
# Upper-tail percentiles (90th, 95th, 98th, 99th, 100th) of the signed error,
# returned as a tuple in that order.
tuple(np.percentile(diff, q) for q in (90, 95, 98, 99, 100))

(6.629801705816988,
 11.881694138538265,
 25.281441215999894,
 48.59040692036817,
 238.26396877938052)

In [14]:
# Select the test rows whose absolute prediction error is greater than 6
# (same units as `patient_pay`).
test_error = test.loc[diff.abs() > 6]

In [15]:
# Display the test rows whose absolute prediction error exceeded the threshold.
test_error

Unnamed: 0,pharmacy,diagnosis,pcn,group,patient_pay,brand,drug_name,month,insurance_null
2232638,52,G99.93,TPJD,,6.56,branded,gorol,3,1
10606872,20,B05.36,,1CAHL,81.14,branded,plazamiglutic,11,1
2557631,53,B45.03,,STGRDKR1J5RD,213.71,branded,pranic,3,1
9758427,15,I68.27,,YY6B1J4E8KJ3,51.19,branded,mule,10,1
1376290,10,Q72.66,KB38N,6BYJBW,73.26,branded,momudobatin,2,0
...,...,...,...,...,...,...,...,...,...
9948197,48,I68.27,BIZF,QK6BI1N61,92.46,branded,hidizuzunib,10,0
6297008,40,Z20.23,BIZF,QK6BI1N61,156.54,branded,vivafastat,7,0
7660501,13,Z66.42,J5DT8,IX6P0,158.14,branded,nusudaric,8,0
4707516,25,Z66.42,BIZF,QK6BI1N61,153.72,branded,nusudaric,5,0


In [17]:
# Second model: identical settings to the first one except for tree depth,
# which is raised from 5 to 16. It is trained on the same training pool.
# NOTE(review): learning_rate=1 looks very aggressive for gradient boosting --
# confirm it is intentional.
model2 = CatBoostRegressor(
    loss_function='RMSE',
    iterations=50,
    depth=16,
    learning_rate=1,
)
# Last expression of the cell: fit() logs per-iteration loss and its return
# value is what the notebook displays.
model2.fit(train_pool)

0:	learn: 28.3103460	total: 2.6s	remaining: 2m 7s
1:	learn: 24.6724079	total: 6.3s	remaining: 2m 31s
2:	learn: 18.5102672	total: 10.4s	remaining: 2m 43s
3:	learn: 16.7447289	total: 13.9s	remaining: 2m 40s
4:	learn: 16.4136290	total: 17.8s	remaining: 2m 40s
5:	learn: 16.2510645	total: 21.3s	remaining: 2m 36s
6:	learn: 16.2078345	total: 24.6s	remaining: 2m 31s
7:	learn: 16.1534400	total: 27.9s	remaining: 2m 26s
8:	learn: 16.1045997	total: 31.4s	remaining: 2m 23s
9:	learn: 16.0784090	total: 34.8s	remaining: 2m 19s
10:	learn: 16.0447556	total: 38.2s	remaining: 2m 15s
11:	learn: 16.0261532	total: 41.5s	remaining: 2m 11s
12:	learn: 16.0127106	total: 44.9s	remaining: 2m 7s
13:	learn: 16.0079955	total: 48.3s	remaining: 2m 4s
14:	learn: 15.9941625	total: 52s	remaining: 2m 1s
15:	learn: 15.9849515	total: 55.8s	remaining: 1m 58s
16:	learn: 15.9764438	total: 59.9s	remaining: 1m 56s
17:	learn: 15.9727461	total: 1m 3s	remaining: 1m 52s
18:	learn: 15.9638870	total: 1m 7s	remaining: 1m 49s
19:	learn: 

<catboost.core.CatBoostRegressor at 0x1e9361c0e20>

In [18]:
# Score the depth-16 model on the same held-out features and summarise its
# signed errors (prediction minus actual `patient_pay`).
preds2 = model2.predict(test_data)
diff2 = preds2 - test_label
diff2.describe()

count    2.302402e+06
mean     6.267065e-02
std      1.587425e+01
min     -4.770060e+02
25%     -4.893450e-01
50%      1.432920e-02
75%      7.311997e-01
max      3.026251e+02
Name: patient_pay, dtype: float64