In [1]:
import numpy as np
import pandas as pd

from catboost import Pool, CatBoostRegressor

In [2]:
# Load the input table; the relative path is resolved against the kernel's
# current working directory.
df = pd.read_csv("pharmacy_new.csv")

In [3]:
# Columns removed from the frame before any modelling is done.
unused_columns = [
    'tx_date',
    'drug',
    'diagnosis_letter',
    'diagnosis_number',
    'Unnamed: 0',
]
df = df.drop(columns=unused_columns)

In [4]:
# Keep only the rows whose 'rejected' value equals False; after filtering the
# column carries no information, so it is dropped in the same step.
not_rejected = df['rejected'].eq(False)
df = df.loc[not_rejected].drop(columns='rejected')

In [6]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [11]:
# Count how many of the two columns ('pcn', 'group') are missing per row: 0, 1 or 2.
pcn_missing = df['pcn'].isna().astype(int)
group_missing = df['group'].isna().astype(int)
df['insurance_null'] = pcn_missing + group_missing

In [15]:
# Replace missing 'pcn' / 'group' values with the literal string 'None'.
# The fill is done on the frame and assigned back instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on a
# temporary Series, triggers a FutureWarning in pandas 2.x and has no effect on
# `df` once Copy-on-Write is enabled.
df = df.fillna({'pcn': 'None', 'group': 'None'})

In [16]:
# Shuffled 80/20 split of `df` with a fixed seed.
train, test = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [17]:
# Separate the regression target ('patient_pay') from the training features.
target_column = 'patient_pay'
train_label = train[target_column]
train_data = train.drop(columns=target_column)

In [26]:
# Declare the categorical features by name instead of by position. In the
# current column layout these are exactly the first eight columns of
# `train_data`; naming them keeps the selection correct if columns are ever
# reordered or added. 'insurance_null' is intentionally left numeric.
categorical_columns = [
    'pharmacy',
    'diagnosis',
    'bin',
    'pcn',
    'group',
    'brand',
    'drug_name',
    'month',
]
train_pool = Pool(train_data, train_label, cat_features=categorical_columns)

In [27]:
# First fit: 50 boosting iterations, tree depth 5, learning rate 1, RMSE loss.
catboost_params = {
    'iterations': 50,
    'depth': 5,
    'learning_rate': 1,
    'loss_function': 'RMSE',
}
model = CatBoostRegressor(**catboost_params)
# Kept as the cell's last expression so the fitted estimator is displayed.
model.fit(train_pool)

0:	learn: 28.3254015	total: 5.42s	remaining: 4m 25s
1:	learn: 24.8624181	total: 10.1s	remaining: 4m 2s
2:	learn: 21.9789052	total: 13.9s	remaining: 3m 37s
3:	learn: 19.7142396	total: 17s	remaining: 3m 14s
4:	learn: 19.2617856	total: 20.7s	remaining: 3m 6s
5:	learn: 18.7074957	total: 24.3s	remaining: 2m 58s
6:	learn: 18.1145719	total: 27s	remaining: 2m 45s
7:	learn: 17.8769542	total: 29.7s	remaining: 2m 35s
8:	learn: 17.7831757	total: 32.6s	remaining: 2m 28s
9:	learn: 17.6727572	total: 35.5s	remaining: 2m 22s
10:	learn: 17.6041205	total: 38.4s	remaining: 2m 16s
11:	learn: 17.5507830	total: 41.3s	remaining: 2m 10s
12:	learn: 17.5136056	total: 44.5s	remaining: 2m 6s
13:	learn: 17.4283426	total: 47.5s	remaining: 2m 2s
14:	learn: 17.3727303	total: 50.3s	remaining: 1m 57s
15:	learn: 17.3221869	total: 53.2s	remaining: 1m 53s
16:	learn: 17.2867027	total: 56.1s	remaining: 1m 48s
17:	learn: 17.1713219	total: 59s	remaining: 1m 44s
18:	learn: 17.1581919	total: 1m 1s	remaining: 1m 41s
19:	learn: 17

<catboost.core.CatBoostRegressor at 0x2422ca8fb50>

In [29]:
# Split the held-out frame into target and features, then score it.
test_label = test['patient_pay']
test_data = test.drop(columns='patient_pay')
preds = model.predict(test_data)

# Signed error (prediction minus actual); positive values are over-predictions.
diff = preds - test_label
diff.describe()

count    2.302402e+06
mean    -1.007007e-02
std      1.629074e+01
min     -4.710023e+02
25%     -1.777327e+00
50%      9.165819e-02
75%      2.141701e+00
max      2.407480e+02
Name: patient_pay, dtype: float64

In [30]:
# Upper-tail percentiles of the signed error, computed in a single vectorised
# call instead of one pass over `diff` per percentile. Wrapped in `tuple` so the
# cell still displays a tuple of five values.
tuple(np.percentile(diff, [90, 95, 98, 99, 100]))

(6.382890846051689,
 11.895698427445444,
 23.738702446970226,
 45.29840209794493,
 240.74801786584453)

In [31]:
# Held-out rows whose absolute prediction error exceeds 6.
large_error_mask = diff.abs() > 6
test_error = test.loc[large_error_mask]

In [32]:
# Display the `test_error` frame (pandas truncates the rendering for long frames).
test_error

Unnamed: 0,pharmacy,diagnosis,bin,pcn,group,patient_pay,brand,drug_name,month,insurance_null
2232638,52,G99.93,664344,TPJD,,6.56,branded,gorol,3,1
2557631,53,B45.03,664344,,STGRDKR1J5RD,213.71,branded,pranic,3,1
9758427,15,I68.27,664344,,YY6B1J4E8KJ3,51.19,branded,mule,10,1
1376290,10,Q72.66,571569,KB38N,6BYJBW,73.26,branded,momudobatin,2,0
2608649,5,G51.87,322463,,HO8HUGL,16.47,branded,choxestamenium,3,1
...,...,...,...,...,...,...,...,...,...,...
9948197,48,I68.27,664344,BIZF,QK6BI1N61,92.46,branded,hidizuzunib,10,0
6297008,40,Z20.23,664344,BIZF,QK6BI1N61,156.54,branded,vivafastat,7,0
4707516,25,Z66.42,664344,BIZF,QK6BI1N61,153.72,branded,nusudaric,5,0
4814457,19,H36.57,664344,KBOSN,,58.69,branded,semufolic,5,1


In [33]:
# Share of high-error rows per 'brand' value.
test_error.loc[:, 'brand'].value_counts(normalize=True)

branded    0.867532
generic    0.132468
Name: brand, dtype: float64

In [34]:
# Share of high-error rows per 'group' value.
test_error.loc[:, 'group'].value_counts(normalize=True)

None             0.270456
IOEAN1DWVV3Y     0.066437
DGLGRYP          0.045907
6SP1DG           0.041580
6BYJBW           0.037717
YY6B1J4E8KJ3     0.037081
STGRDKR1J5RD     0.036302
AJK5MZ25T9IA     0.032298
1CAHL            0.028026
HO8HUGL          0.025149
L9QZA            0.024379
Z01MLD4I         0.023660
52H8KH0F83K      0.020972
T51T6V2E8L       0.020179
FZPLF4O6FD       0.018522
IX6P0            0.017661
MP3IQ            0.016712
RS5RB3YA         0.015703
BH2Q8B3GY2GAV    0.015409
SJVO3GXUURRGO    0.013575
EVD4X5           0.013501
I4UYEP84W3       0.011970
DYGBI610ZY       0.011790
KZWQDIHCLLHD1    0.010982
GOM8K0           0.010895
ZX2QUWR          0.010830
9R3Z3QKDF3       0.010459
S2QKZ0OFNWS6X    0.010263
0OGKQ            0.009332
RGVK1            0.007705
7DUPMODV0        0.007572
U19J4RVCA        0.007513
QK6BI1N61        0.007299
IGN6JL34H37D     0.006817
O19XSLDEFB       0.005884
TFZOR5R49        0.005229
0TZ9XYJZJH       0.004952
V96T9QL5         0.004873
XK8RM5E75ZW 

In [38]:
# Share of high-error rows per 'pcn' value.
test_error.loc[:, 'pcn'].value_counts(normalize=True)

None       0.218010
327CKV     0.066437
MSCXSG     0.045907
S76J7V6    0.043105
1UQC       0.041986
N098KI     0.041580
3O71UTS    0.037925
KB38N      0.037717
YFVIA      0.032298
DY4B       0.028328
TPJD       0.026076
WM6A       0.024379
T17LNK     0.023660
NC7EN      0.023597
TAZ5W      0.020179
IF448      0.018522
CS8580     0.017784
J5DT8      0.017661
YICC41     0.016712
RB7UU      0.015703
REGLCC     0.015409
RM0HB      0.013575
T52GV      0.013501
ZQPX       0.010982
XH4T3      0.010895
KBOSN      0.010711
7THOQ5     0.010502
9C5MOR3    0.010263
6ZGS97C    0.010212
W1LW9Y     0.009332
3Y5ZW0     0.008882
RAM3J      0.007572
9FU70      0.007513
BIZF       0.007299
AZUO5U     0.006817
P4LC       0.005884
W7L3       0.004873
2TIC       0.004452
BZ22Z2     0.004281
9D24       0.004177
CG3ZWQ     0.003976
MQWH09H    0.003820
NG4CS      0.003754
OO0E       0.003202
K5KDJ7G    0.002680
YL5CMT     0.002367
393U       0.002026
ULM7G      0.002007
FX2Z       0.001468
Name: pcn, dtype: fl

In [37]:
# Share of high-error rows per 'pharmacy' value.
test_error.loc[:, 'pharmacy'].value_counts(normalize=True)

1     0.019180
24    0.018732
52    0.018643
39    0.018499
9     0.018362
44    0.018207
33    0.018097
10    0.018040
55    0.018008
36    0.018002
8     0.017998
17    0.017949
31    0.017869
18    0.017816
0     0.017793
29    0.017774
54    0.017668
43    0.017642
4     0.017602
27    0.017556
19    0.017552
12    0.017484
11    0.017374
28    0.017374
40    0.017367
2     0.017291
26    0.017285
14    0.017281
16    0.017272
6     0.017251
48    0.017198
35    0.017126
5     0.017046
46    0.017016
41    0.017016
49    0.016999
32    0.016970
20    0.016847
3     0.016837
51    0.016792
37    0.016788
30    0.016750
22    0.016727
15    0.016714
50    0.016712
57    0.016684
25    0.016612
56    0.016570
53    0.016496
45    0.016490
42    0.016268
34    0.016268
23    0.016177
13    0.016122
7     0.016067
47    0.016043
21    0.015910
38    0.015787
Name: pharmacy, dtype: float64

In [46]:
# Thirty most frequent ('pcn', 'group') combinations among the high-error rows,
# expressed as shares of all high-error rows.
pcn_group_share = test_error.value_counts(
    subset=['pcn', 'group'],
    normalize=True,
    ascending=False,
)
pcn_group_share.head(30)

pcn      group        
327CKV   IOEAN1DWVV3Y     0.066437
MSCXSG   DGLGRYP          0.045907
S76J7V6  None             0.043105
1UQC     None             0.041986
N098KI   6SP1DG           0.041580
3O71UTS  None             0.037925
KB38N    6BYJBW           0.037717
None     YY6B1J4E8KJ3     0.037081
         STGRDKR1J5RD     0.036302
YFVIA    AJK5MZ25T9IA     0.032298
DY4B     None             0.028328
None     1CAHL            0.028026
TPJD     None             0.026076
None     HO8HUGL          0.025149
WM6A     L9QZA            0.024379
T17LNK   Z01MLD4I         0.023660
NC7EN    None             0.023597
None     52H8KH0F83K      0.020972
TAZ5W    T51T6V2E8L       0.020179
IF448    FZPLF4O6FD       0.018522
CS8580   None             0.017784
J5DT8    IX6P0            0.017661
YICC41   MP3IQ            0.016712
RB7UU    RS5RB3YA         0.015703
REGLCC   BH2Q8B3GY2GAV    0.015409
RM0HB    SJVO3GXUURRGO    0.013575
T52GV    EVD4X5           0.013501
None     I4UYEP84W3       0.0119

In [47]:
# Share of high-error rows per 'bin' value, most frequent first.
test_error.value_counts(subset=['bin'], normalize=True, ascending=False)

bin   
664344    0.344484
725700    0.137958
691847    0.121261
322463    0.118666
757349    0.069942
571569    0.052587
96934     0.047282
539437    0.039574
956971    0.024379
718350    0.017661
160389    0.015703
756120    0.010502
dtype: float64

In [48]:
# Share of high-error rows per ('bin', 'brand') combination, most frequent first.
test_error.value_counts(subset=['bin', 'brand'], normalize=True, ascending=False)

bin     brand  
664344  branded    0.286459
725700  branded    0.123683
322463  branded    0.106001
691847  branded    0.101887
757349  branded    0.063524
664344  generic    0.058025
571569  branded    0.046660
96934   branded    0.044613
539437  branded    0.034287
691847  generic    0.019375
956971  branded    0.019070
718350  branded    0.017046
160389  branded    0.014982
725700  generic    0.014275
322463  generic    0.012666
756120  branded    0.009321
757349  generic    0.006417
571569  generic    0.005927
956971  generic    0.005309
539437  generic    0.005288
96934   generic    0.002669
756120  generic    0.001180
160389  generic    0.000721
718350  generic    0.000616
dtype: float64

In [49]:
# Second fit: same settings as the earlier run except tree depth 8.
# This rebinds `model`, so the previously fitted estimator is no longer
# reachable under that name.
catboost_params = {
    'iterations': 50,
    'depth': 8,
    'learning_rate': 1,
    'loss_function': 'RMSE',
}
model = CatBoostRegressor(**catboost_params)
# Kept as the cell's last expression so the fitted estimator is displayed.
model.fit(train_pool)

0:	learn: 28.3254015	total: 5.8s	remaining: 4m 44s
1:	learn: 24.7182261	total: 10.9s	remaining: 4m 22s
2:	learn: 19.0288474	total: 16.8s	remaining: 4m 23s
3:	learn: 18.3681132	total: 22.8s	remaining: 4m 21s
4:	learn: 18.1804847	total: 27.3s	remaining: 4m 5s
5:	learn: 17.1953394	total: 31.9s	remaining: 3m 53s
6:	learn: 17.0301692	total: 36.8s	remaining: 3m 45s
7:	learn: 16.8408782	total: 41.2s	remaining: 3m 36s
8:	learn: 16.7206070	total: 45.6s	remaining: 3m 27s
9:	learn: 16.6498523	total: 50s	remaining: 3m 20s
10:	learn: 16.5833662	total: 54.6s	remaining: 3m 13s
11:	learn: 16.5496443	total: 59.1s	remaining: 3m 7s
12:	learn: 16.4262758	total: 1m 4s	remaining: 3m 2s
13:	learn: 16.3997792	total: 1m 8s	remaining: 2m 56s
14:	learn: 16.3664287	total: 1m 13s	remaining: 2m 51s
15:	learn: 16.3275274	total: 1m 17s	remaining: 2m 45s
16:	learn: 16.2932239	total: 1m 22s	remaining: 2m 40s
17:	learn: 16.2601955	total: 1m 27s	remaining: 2m 35s
18:	learn: 16.2385509	total: 1m 31s	remaining: 2m 29s
19:	

<catboost.core.CatBoostRegressor at 0x2422e12fd30>

In [50]:
# Score the held-out features with whatever estimator `model` currently refers to.
preds2=model.predict(test_data)
# Signed error (prediction minus actual) for this second set of predictions.
diff2 = preds2 - test_label
# Summary statistics of the signed error.
diff2.describe()

count    2.302402e+06
mean    -2.145987e-02
std      1.561128e+01
min     -5.000846e+02
25%     -1.074372e+00
50%      2.394291e-02
75%      1.253501e+00
max      2.577495e+02
Name: patient_pay, dtype: float64

In [51]:
# Rows where the prediction exceeds the actual value by more than 50.
diff2.loc[diff2.gt(50)]

6730287      95.846122
8656006      61.352868
5825179      79.588860
12299855     55.114329
11320073     56.837509
               ...    
8326118     103.742111
7448794      58.770973
9619601      77.435419
12300749     83.863762
6297008     104.589450
Name: patient_pay, Length: 16738, dtype: float64