In [None]:
## 

In [1]:
import numpy as np
import pandas as pd

from catboost import Pool, CatBoostRegressor

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [2]:
df = pd.read_csv("pharmacy_new.csv")

## Use the following if you use the original dataframe
## df = pd.read_csv("pharmacy_tx.csv")

In [3]:
df =df.drop(columns=['tx_date', 'diagnosis_letter', 'diagnosis_number','Unnamed: 0'])

In [4]:
df = df[df['rejected'] == False]
df = df.drop(columns='rejected')

In [5]:
df['insurance'] = df['pcn'].astype('str')+df['group'].astype('str')
df = df.drop(columns=['pcn','group'])

In [6]:
def drugtrain(drug_name, num_iter, dep, lrate, lfun):
    # drug_name : name of the drug
    # num_iter : number of iterations
    # dep: depth
    # lrate : learning rate
    # lfun : lost function -- https://catboost.ai/en/docs/concepts/loss-functions-regression
    
    df_drug = df[df['drug_name']==drug_name]
    df_drug = df.drop(columns='drug_name')
    
    train_drug, test_drug = train_test_split(df_drug, test_size=0.2, shuffle=True, random_state=42)
    print('Just train_set_split')
    
    col = ['pharmacy', 'diagnosis', 'brand', 'month', 'insurance']
    
    train_data_drug = train_drug[col]
    train_pool_drug = Pool(train_data_drug, train_drug['patient_pay'], cat_features = list(range(5)))
    
    print('Just finish pool')
    model_drug = CatBoostRegressor(iterations =num_iter, 
                         depth = dep,
                         learning_rate=lrate,
                         loss_function = lfun)
    print('Begin training')
    model_drug.fit(train_pool_drug)
    
    print('Finish training')
    test_data_drug = test_drug[col]
    test_label_drug = test_drug['patient_pay']
    
    preds_drug=model_drug.predict(test_data_drug)
    test_drug['pred '] = preds_drug
    
    r2 = r2_score(preds_drug,test_label_drug)
    mean_abs = mean_absolute_error(preds_drug,test_label_drug)
    mean_sqr = mean_squared_error(preds_drug,test_label_drug)
    mean_perc = mean_absolute_percentage_error(preds_drug,test_label_drug)
    
    print(r2, mean_abs, np.sqrt(mean_sqr), mean_perc*100)
    
    return test_drug, r2, mean_abs, np.sqrt(mean_sqr), mean_perc*100

In [7]:
def train(dframe, cols, num_iter, dep, lrate, lfun):

    # drug_name : name of the drug
    # cols : columns to train, e.g. cols = ['pharmacy', 'diagnosis', 'brand', 'month', 'insurance']
    # num_iter : number of iterations
    # dep: depth
    # lrate : learning rate
    # lfun : lost function -- https://catboost.ai/en/docs/concepts/loss-functions-regression
    
    c = len(cols) 
    train, test = train_test_split(dframe, test_size=0.2, shuffle=True, random_state=42)
    print('Just train_set_split')
    
    train_data = train[cols]
    train_pool = Pool(train_data, train['patient_pay'], cat_features = list(range(c)))
    
    print('Just finish pool')
    model = CatBoostRegressor(iterations =num_iter, 
                         depth = dep,
                         learning_rate=lrate,
                         loss_function = lfun)
    print('Begin training')
    model.fit(train_pool)
    
    print('Finish training')
    test_data = test[cols]
    test_label = test['patient_pay']
    
    preds=model.predict(test_data)
    test['pred '] = preds
    
    r2 = r2_score(preds,test_label)
    mean_abs = mean_absolute_error(preds,test_label)
    mean_sqr = mean_squared_error(preds,test_label)
    mean_perc = mean_absolute_percentage_error(preds,test_label)
    
    print(r2, mean_abs, np.sqrt(mean_sqr), mean_perc*100)
    
    return test, r2, mean_abs, np.sqrt(mean_sqr), mean_perc*100

In [None]:
### Using the drugtrain and the train functions

In [8]:
test_tan, r2_tan , mean_abs_tan, Msq_tan, mean_perc_tan = drugtrain('tanoclolol', 50, 8, .5, 'RMSE')

Just train_set_split
Just finish pool
Begin training
0:	learn: 33.9768314	total: 1.81s	remaining: 1m 28s
1:	learn: 29.5373851	total: 3.45s	remaining: 1m 22s
2:	learn: 26.7744212	total: 4.66s	remaining: 1m 13s
3:	learn: 26.0203618	total: 5.87s	remaining: 1m 7s
4:	learn: 25.7561385	total: 7.03s	remaining: 1m 3s
5:	learn: 25.6159072	total: 8.16s	remaining: 59.8s
6:	learn: 25.5353511	total: 9.29s	remaining: 57.1s
7:	learn: 25.4911685	total: 10.5s	remaining: 55.3s
8:	learn: 25.4322386	total: 11.7s	remaining: 53.5s
9:	learn: 25.3961622	total: 13.1s	remaining: 52.5s
10:	learn: 25.3650837	total: 14.3s	remaining: 50.7s
11:	learn: 25.3556736	total: 15.4s	remaining: 48.8s
12:	learn: 25.3317219	total: 16.7s	remaining: 47.6s
13:	learn: 25.3131840	total: 17.8s	remaining: 45.8s
14:	learn: 25.2984047	total: 19s	remaining: 44.3s
15:	learn: 25.2894040	total: 20.2s	remaining: 42.9s
16:	learn: 25.2763084	total: 21.7s	remaining: 42.2s
17:	learn: 25.2711757	total: 23s	remaining: 41s
18:	learn: 25.2656167	to

In [9]:
r2_tan , mean_abs_tan, Msq_tan, mean_perc_tan

(0.4375700605694538, 11.571766049404886, 25.06383756658607, 35.36040503088395)

In [10]:
df_mone_exp = df[(df['drug_name']=='monemodiase') &(df['patient_pay']>250)]
cols = ['pharmacy', 'diagnosis', 'brand', 'month', 'insurance']

test_mone_exp, r2_mone_exp , mean_abs_mone_exp, Msq_mone_exp, mean_mone_exp_mone_exp = train(df_mone_exp, cols, 100, 8, .05, 'RMSE')

Just train_set_split
Just finish pool
Begin training
0:	learn: 74.2099681	total: 33.4ms	remaining: 3.31s
1:	learn: 72.4223820	total: 68.8ms	remaining: 3.37s
2:	learn: 70.7691279	total: 107ms	remaining: 3.46s
3:	learn: 69.2492779	total: 146ms	remaining: 3.51s
4:	learn: 67.8651652	total: 171ms	remaining: 3.24s
5:	learn: 66.5818957	total: 194ms	remaining: 3.04s
6:	learn: 65.3847255	total: 247ms	remaining: 3.28s
7:	learn: 64.3039732	total: 262ms	remaining: 3.02s
8:	learn: 63.3085763	total: 276ms	remaining: 2.79s
9:	learn: 62.3874965	total: 308ms	remaining: 2.77s
10:	learn: 61.5441544	total: 337ms	remaining: 2.73s
11:	learn: 60.7752273	total: 367ms	remaining: 2.69s
12:	learn: 60.0807196	total: 395ms	remaining: 2.65s
13:	learn: 59.4438212	total: 429ms	remaining: 2.64s
14:	learn: 58.8529169	total: 474ms	remaining: 2.69s
15:	learn: 58.3189316	total: 501ms	remaining: 2.63s
16:	learn: 57.8398802	total: 528ms	remaining: 2.58s
17:	learn: 57.4148353	total: 547ms	remaining: 2.49s
18:	learn: 57.01741