In [1]:
import numpy as np
import pandas as pd

from catboost import Pool, CatBoostRegressor

In [2]:
df = pd.read_csv("pharmacy_new.csv")

In [3]:
df =df.drop(columns=['tx_date', 'drug', 'bin', 'diagnosis_letter', 'diagnosis_number','Unnamed: 0'])

In [4]:
df = df[df['rejected'] == False]
df = df.drop(columns='rejected')

In [5]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [9]:
df['pcn'].fillna(value = 'pcnNone', inplace = True)
df['group'].fillna(value = 'gpNone', inplace = True)

In [12]:
df['insurance'] = df['pcn'].astype('str')+df['group'].astype('str')
df

Unnamed: 0,pharmacy,diagnosis,pcn,group,patient_pay,brand,drug_name,month,insurance
0,6,G99.93,1UQC,gpNone,13.39,branded,tanoclolol,1,1UQCgpNone
1,42,U60.52,pcnNone,52H8KH0F83K,7.02,branded,oxasoted,1,pcnNone52H8KH0F83K
2,37,Q85.91,1UQC,gpNone,13.39,branded,cupitelol,1,1UQCgpNone
3,30,U60.52,KB38N,6BYJBW,10.84,generic,oxasoted,1,KB38N6BYJBW
4,18,N55.01,pcnNone,ZX2QUWR,47.00,branded,mamate,1,pcnNoneZX2QUWR
...,...,...,...,...,...,...,...,...,...
12561839,39,Q72.66,KB38N,6BYJBW,66.47,branded,momudobatin,12,KB38N6BYJBW
12561841,45,N59.44,pcnNone,TFZOR5R49,6.28,generic,tafistitrisin,12,pcnNoneTFZOR5R49
12561842,54,W50.87,N098KI,6SP1DG,6.94,generic,tanoclolol,12,N098KI6SP1DG
12561843,0,I68.27,S76J7V6,gpNone,13.93,branded,prazinib,12,S76J7V6gpNone


In [13]:
df = df.drop(columns=['pcn','group'])

In [14]:
train, test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)

In [15]:
train_data = train.drop(columns=['patient_pay'])
train_label = train['patient_pay']

In [18]:
train_data.columns

Index(['pharmacy', 'diagnosis', 'brand', 'drug_name', 'month', 'insurance'], dtype='object')

In [19]:
train_pool = Pool(train_data, train_label, cat_features = list(range(6)))

In [20]:
model = CatBoostRegressor(iterations =50, 
                         depth = 5,
                         learning_rate=1,
                         loss_function = 'RMSE')
model.fit(train_pool)

0:	learn: 28.4786916	total: 1.61s	remaining: 1m 18s
1:	learn: 24.6475207	total: 2.71s	remaining: 1m 5s
2:	learn: 19.4400104	total: 4.02s	remaining: 1m 3s
3:	learn: 18.8122828	total: 4.83s	remaining: 55.5s
4:	learn: 18.4224844	total: 5.67s	remaining: 51s
5:	learn: 17.7289409	total: 6.43s	remaining: 47.2s
6:	learn: 17.5264383	total: 7.19s	remaining: 44.2s
7:	learn: 17.4093821	total: 8.19s	remaining: 43s
8:	learn: 17.3013444	total: 9.03s	remaining: 41.1s
9:	learn: 17.2554299	total: 10.1s	remaining: 40.3s
10:	learn: 17.2097714	total: 10.9s	remaining: 38.7s
11:	learn: 17.1371291	total: 11.7s	remaining: 37s
12:	learn: 17.1058350	total: 12.5s	remaining: 35.5s
13:	learn: 17.0242659	total: 13.3s	remaining: 34.1s
14:	learn: 17.0039290	total: 14s	remaining: 32.8s
15:	learn: 16.9917064	total: 14.8s	remaining: 31.5s
16:	learn: 16.9779937	total: 15.6s	remaining: 30.3s
17:	learn: 16.9561710	total: 16.4s	remaining: 29.2s
18:	learn: 16.8911439	total: 17.2s	remaining: 28.1s
19:	learn: 16.8841216	total: 

<catboost.core.CatBoostRegressor at 0x1dfdff9f250>

In [21]:
test_data = test.drop(columns=['patient_pay'])
test_label = test['patient_pay']
preds=model.predict(test_data)
diff = preds - test_label
diff.describe()

count    2.302402e+06
mean     5.745417e-02
std      1.634484e+01
min     -4.648299e+02
25%     -1.785161e+00
50%      6.519838e-02
75%      2.029625e+00
max      2.443762e+02
Name: patient_pay, dtype: float64

In [22]:
np.percentile(diff, 90), np.percentile(diff, 95), np.percentile(diff, 98), np.percentile(diff, 99), np.percentile(diff, 100)

(6.283334213524515,
 11.780230071148452,
 25.0540464452112,
 46.730503940406855,
 244.37623167465807)

In [25]:
test_error = test[abs(diff)>6]
test_error

Unnamed: 0,pharmacy,diagnosis,patient_pay,brand,drug_name,month,insurance
2232638,52,G99.93,6.56,branded,gorol,3,TPJDgpNone
10606872,20,B05.36,81.14,branded,plazamiglutic,11,pcnNone1CAHL
2557631,53,B45.03,213.71,branded,pranic,3,pcnNoneSTGRDKR1J5RD
9758427,15,I68.27,51.19,branded,mule,10,pcnNoneYY6B1J4E8KJ3
1376290,10,Q72.66,73.26,branded,momudobatin,2,KB38N6BYJBW
...,...,...,...,...,...,...,...
9948197,48,I68.27,92.46,branded,hidizuzunib,10,BIZFQK6BI1N61
6297008,40,Z20.23,156.54,branded,vivafastat,7,BIZFQK6BI1N61
4707516,25,Z66.42,153.72,branded,nusudaric,5,BIZFQK6BI1N61
4814457,19,H36.57,58.69,branded,semufolic,5,KBOSNgpNone


In [28]:
model2 = CatBoostRegressor(iterations =30, 
                         depth = 10,
                         learning_rate=.5,
                         loss_function = 'MultiRMSE')
model2.fit(train_pool)

0:	learn: 32.2290497	total: 2.46s	remaining: 1m 11s
1:	learn: 26.6070276	total: 48.4s	remaining: 11m 17s
2:	learn: 22.6437548	total: 1m 37s	remaining: 14m 39s
3:	learn: 19.5556399	total: 2m 34s	remaining: 16m 41s
4:	learn: 18.6178000	total: 3m 24s	remaining: 17m 1s
5:	learn: 18.3216393	total: 4m 16s	remaining: 17m 4s
6:	learn: 17.3464898	total: 5m 5s	remaining: 16m 45s
7:	learn: 17.0128306	total: 5m 53s	remaining: 16m 10s
8:	learn: 16.8317588	total: 6m 42s	remaining: 15m 38s
9:	learn: 16.7726800	total: 7m 25s	remaining: 14m 50s
10:	learn: 16.6175766	total: 8m 20s	remaining: 14m 23s
11:	learn: 16.5676289	total: 9m 7s	remaining: 13m 41s
12:	learn: 16.5267654	total: 9m 55s	remaining: 12m 59s
13:	learn: 16.4811335	total: 10m 40s	remaining: 12m 12s
14:	learn: 16.4465900	total: 11m 27s	remaining: 11m 27s
15:	learn: 16.4228067	total: 12m 19s	remaining: 10m 46s
16:	learn: 16.3756233	total: 13m 13s	remaining: 10m 6s
17:	learn: 16.3584979	total: 14m 3s	remaining: 9m 22s
18:	learn: 16.3471958	tot

<catboost.core.CatBoostRegressor at 0x1dfe38b0280>

In [29]:
preds2=model2.predict(test_data)
diff2 =abs( preds2 - test_label)
diff2.describe()

count    2.302402e+06
mean     5.010719e+00
std      1.521262e+01
min      3.005931e-05
25%      5.686987e-01
50%      1.443162e+00
75%      3.870043e+00
max      4.798195e+02
Name: patient_pay, dtype: float64

In [30]:
np.percentile(diff, 90), np.percentile(diff, 95), np.percentile(diff, 98), np.percentile(diff, 99), np.percentile(diff, 100)

(6.283334213524515,
 11.780230071148452,
 25.0540464452112,
 46.730503940406855,
 244.37623167465807)