In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import _pickle as pickle
import random
import scipy.stats as ss

In [2]:
startTime = datetime.now()

input_age_seq = pickle.load(open("../data/baseline/input_age_seq","rb"))
input_sex_seq = pickle.load(open("../data/baseline/input_sex_seq","rb"))
input_zip_seq = pickle.load(open("../data/baseline/input_zip_seq","rb"))

input_cost_seq = pickle.load(open("../data/baseline/input_cost_seq","rb"))
target_cost_seq = pickle.load(open("../data/baseline/target_cost_seq","rb"))

input_medical_cost_seq = pickle.load(open("../data/baseline/input_medical_cost_seq","rb"))
input_pharmacy_cost_seq = pickle.load(open("../data/baseline/input_pharmacy_cost_seq","rb"))

print(datetime.now() - startTime)

0:00:00.087626


In [3]:
startTime = datetime.now()

input_diag_seq = pickle.load(open("../data/advance/input_diag_seq","rb"))
input_drugclass = pickle.load(open("../data/baseline/input_drugclass","rb"))

print(datetime.now() - startTime)

0:00:01.865746


In [4]:
icd2ccs = pickle.load(open("/Users/xxz005/Desktop/RAW_DATA/icd2ccs","rb"))

In [5]:
input_ccs = []
ccs2int = {}

for p in input_diag_seq:
    p_ccs = []
    for v in p:
        for c in v:
            c = c.replace(".","")
            if c in icd2ccs:
                ccs = icd2ccs[c]
                if ccs not in ccs2int:
                    ccs2int[ccs] = len(ccs2int)
                p_ccs.append(ccs2int[ccs])
    input_ccs.append(p_ccs)
    
len(ccs2int)

251

In [6]:
drugclass2int = {}

for p in input_drugclass:
    for drug in p:
        if drug not in drugclass2int:
            drugclass2int[drug] = len(drugclass2int)
input_int_drugclass = [[drugclass2int[i] for i in p] for p in input_drugclass]      

len(drugclass2int)

306

In [7]:
zip2int = {}

for p in input_zip_seq:
    if p not in zip2int: 
        zip2int[p] = len(zip2int)

input_int_zip = [zip2int[i] for i in input_zip_seq]      
len(zip2int)

437

In [8]:
def build_feature(seq, size):    
    X = np.zeros((len(seq), size))
    for i in range(len(seq)):
        value = seq[i]
        X[i][value] = 1
    return X

In [9]:
age_feature = build_feature([i//5 for i in input_age_seq], 4)
sex_feature = build_feature([1 if i=="M" else 0 for i in input_sex_seq], 2)
zip_feature = build_feature(input_int_zip, len(zip2int))

cost_feature = np.array([np.log(i+1) for i in input_cost_seq]).reshape(-1,1)
medical_cost_feature = np.array([np.log(i+1) for i in input_medical_cost_seq]).reshape(-1,1)
pharmacy_cost_feature = np.array([np.log(i+1) for i in input_pharmacy_cost_seq]).reshape(-1,1)

In [10]:
ccs_feature = build_feature(input_ccs, len(ccs2int))
drugclass_feature = build_feature(input_int_drugclass, len(drugclass2int))

In [11]:
features = [age_feature, sex_feature, zip_feature, cost_feature, medical_cost_feature, pharmacy_cost_feature,\
            ccs_feature, drugclass_feature]

X = np.concatenate(features, axis =1)

In [12]:
X.shape, zip_feature.shape

((143102, 1003), (143102, 437))

In [18]:
# y = np.array([max(0,x)/1000 for x in target_cost_seq])

y = np.array([np.log(i+1) for i in target_cost_seq])
# y = np.array([x/len(target_cost_seq) for x in ss.rankdata(target_cost_seq)])

In [14]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
import scipy

In [15]:
def result(y_true, y_pred):
    return metrics.mean_absolute_error(y_true, y_pred), \
           metrics.r2_score(y_true, y_pred),\
           np.sqrt(metrics.mean_squared_error(y_true, y_pred)),\
           scipy.stats.pearsonr(y_true, y_pred)[0]

def generate_result(model, seed):
    idx_train, idx_val = train_test_split(range(len(y)), train_size=0.85, random_state=seed)
    idx_train, idx_test = train_test_split(range(len(idx_train)), train_size=0.82, random_state=seed)
    
    model.fit(X[idx_train], y[idx_train])
    
    y_pred = model.predict(X[idx_test])
    mae, r2, rmse, pcc = result(y[idx_test], y_pred)
    return mae, r2, rmse, pcc

def display(list_eva):
    for list_ in list_eva:
        print(np.mean(list_), np.std(list_))
        print()

# Lasso

In [16]:
from sklearn import linear_model

In [19]:
mae_list, r2_list, rmse_list, pcc_list = [], [], [], []

for i in range(5):
    regr = linear_model.Lasso(alpha=0.0001)
    mae, r2, rmse, pcc = generate_result(regr, seed=i)
    mae_list.append(mae)
    r2_list.append(r2)
    rmse_list.append(rmse)
    pcc_list.append(pcc)
    
display([mae_list, r2_list, rmse_list, pcc_list])

1.1105001061210653 0.010025314585897272

0.23071742324385144 0.0058909226544947785

1.6754710340913916 0.01790564622178337

0.480435868852463 0.006187625377678258



In [24]:
mae_list, r2_list, rmse_list, pcc_list = [], [], [], []

for i in range(5):
    regr = linear_model.Ridge()
    mae, r2, rmse, pcc = generate_result(regr, seed=i)
    mae_list.append(mae)
    r2_list.append(r2)
    rmse_list.append(rmse)
    pcc_list.append(pcc)
    
display([mae_list, r2_list, rmse_list, pcc_list])

1.1144221258018299 0.009672559328535

0.22836446556454612 0.00539365593891125

1.6780354464201153 0.01793046387518555

0.4780530394244226 0.0055930411519516466



# DT

In [17]:
from sklearn.tree import DecisionTreeRegressor

In [18]:
mae_list, r2_list, rmse_list, pcc_list = [], [], [], []

for i in range(5):
    regr = DecisionTreeRegressor()

    mae, r2, rmse, pcc = generate_result(regr, seed=i)
    mae_list.append(mae)
    r2_list.append(r2)
    rmse_list.append(rmse)
    pcc_list.append(pcc)
    
display([mae_list, r2_list, rmse_list, pcc_list])

0.2386839441091869 0.00124350684704552

-0.13387190065374371 0.010525747840330404

0.3090639317216862 0.0017004594121298692

0.4228699685864864 0.005563967329324237



# RF

In [19]:
from sklearn.ensemble import RandomForestRegressor

In [20]:
mae_list, r2_list, rmse_list, pcc_list = [], [], [], []

for i in range(5):
    regr = RandomForestRegressor(n_estimators=10)

    mae, r2, rmse, pcc = generate_result(regr, seed=i)
    mae_list.append(mae)
    r2_list.append(r2)
    rmse_list.append(rmse)
    pcc_list.append(pcc)
    
display([mae_list, r2_list, rmse_list, pcc_list])

0.18892604995051904 0.0007533077862002474

0.3259550128825864 0.0034643211185408688

0.23829346232571016 0.000610604011100442

0.5824057357152246 0.002517882628424136



# XGboost

In [72]:
from xgboost import XGBRegressor as xgbr

In [22]:
mae_list, r2_list, rmse_list, pcc_list = [], [], [], []

for i in range(5):
    regr = xgbr()

    mae, r2, rmse, pcc = generate_result(regr, seed=i)
    mae_list.append(mae)
    r2_list.append(r2)
    rmse_list.append(rmse)
    pcc_list.append(pcc)
    
display([mae_list, r2_list, rmse_list, pcc_list])

0.18541524512248886 0.001485148962431849

0.3756330605005225 0.005635216942190714

0.22934384043448502 0.0012889376611867785

0.6131753311595484 0.004701173462523285



# FNN

In [32]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow.keras import regularizers


In [29]:
def build_model(feature_input=1003,
                hidden_dim = 128,
                ):
    
    X_input = layers.Input(shape=(feature_input,))
    inputs_list = [X_input]
    
    out = layers.Dense(hidden_dim, activation="relu", kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4))(X_input)
    out = layers.Dropout(0.5)(out)

    out = layers.Dense(hidden_dim, activation="relu", kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4))(out)
    out = layers.Dropout(0.5)(out)

    out = layers.Dense(hidden_dim, activation="relu", kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4))(out)
    out = layers.Dropout(0.5)(out)
    
    out = layers.Dense(1, activation=None, name='main_output')(out)
    model = keras.models.Model(inputs=inputs_list, outputs=[out])

    model.compile(optimizer='adam', loss="mse")
    #print(model.summary())
    return model

In [33]:
def result(y_true, y_pred):
    return metrics.mean_absolute_error(y_true, y_pred), \
           metrics.r2_score(y_true, y_pred),\
           np.sqrt(metrics.mean_squared_error(y_true, y_pred)),\
           scipy.stats.pearsonr(y_true, y_pred)[0]

def generate_result(seed):
    model = build_model()
    idx_train, idx_val = train_test_split(range(len(y)), train_size=0.85, random_state=seed)
    idx_train, idx_test = train_test_split(range(len(idx_train)), train_size=0.82, random_state=seed)

    earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, verbose=0, mode='min', restore_best_weights=True)
    history = model.fit([X[idx_train]], y[idx_train], epochs=50, batch_size=100, \
                        validation_data=([X[idx_val]], y[idx_val]), verbose=0, callbacks=[earlyStopping])


    y_pred = model.predict([X[idx_test]], verbose=0).reshape(-1)
    mae, r2, rmse, pcc = result(y[idx_test], y_pred)
    return mae, r2, rmse, pcc, y_pred

def display(list_eva):
    for list_ in list_eva:
        print(np.mean(list_), np.std(list_))
        print()

In [34]:
mae_list, r2_list, rmse_list, pcc_list = [], [], [], []

for i in range(5):
    print(i)
    mae, r2, rmse, pcc, y_pred = generate_result(seed=i)
    mae_list.append(mae)
    r2_list.append(r2)
    rmse_list.append(rmse)
    pcc_list.append(pcc)
    
display([mae_list, r2_list, rmse_list, pcc_list])

0
1
2
3
4
1.2258464510481384 0.05209184092972659

0.2095321348088564 0.022237805583200636

1.6983581011100646 0.036137663886514675

0.536885639413436 0.006288476098318938



# High utilizer selection

In [89]:
seed=4

In [90]:
regr = linear_model.Lasso(alpha=0.0001)

In [91]:
idx_train, idx_val = train_test_split(range(len(y)), train_size=0.85, random_state=seed)
idx_train, idx_test = train_test_split(range(len(idx_train)), train_size=0.82, random_state=seed)

regr.fit(X[idx_train], y[idx_train])    
y_pred = regr.predict(X[idx_test])

In [92]:
k = 0.05
top_idx = np.argsort(y_pred)[::-1][:int(len(y_pred)*k)]

print(len(top_idx))
print(np.mean(np.array(target_cost_seq)[idx_test][top_idx]))
print(np.sum(np.array(target_cost_seq)[idx_test][top_idx]))

1094
6362.451782449725
6960522.249999999


In [93]:
k = 0.03
top_idx = np.argsort(y_pred)[::-1][:int(len(y_pred)*k)]

print(len(top_idx))
print(np.mean(np.array(target_cost_seq)[idx_test][top_idx]))
print(np.sum(np.array(target_cost_seq)[idx_test][top_idx]))

656
7706.1428201219505
5055229.6899999995


In [94]:
k = 0.01
top_idx = np.argsort(y_pred)[::-1][:int(len(y_pred)*k)]

print(len(top_idx))
print(np.mean(np.array(target_cost_seq)[idx_test][top_idx]))
print(np.sum(np.array(target_cost_seq)[idx_test][top_idx]))

218
11210.994495412842
2443996.8


In [82]:
seed=4

In [83]:
regr = xgbr()

In [84]:
idx_train, idx_val = train_test_split(range(len(y)), train_size=0.85, random_state=seed)
idx_train, idx_test = train_test_split(range(len(idx_train)), train_size=0.82, random_state=seed)

regr.fit(X[idx_train], y[idx_train])    
y_pred = regr.predict(X[idx_test])



In [85]:
k = 0.05
top_idx = np.argsort(y_pred)[::-1][:int(len(y_pred)*k)]

print(len(top_idx))
print(np.mean(np.array(target_cost_seq)[idx_test][top_idx]))
print(np.sum(np.array(target_cost_seq)[idx_test][top_idx]))

1094
6738.442934186472
7371856.57


In [86]:
k = 0.03
top_idx = np.argsort(y_pred)[::-1][:int(len(y_pred)*k)]

print(len(top_idx))
print(np.mean(np.array(target_cost_seq)[idx_test][top_idx]))
print(np.sum(np.array(target_cost_seq)[idx_test][top_idx]))

656
8444.139070121952
5539355.23


In [87]:
k = 0.01
top_idx = np.argsort(y_pred)[::-1][:int(len(y_pred)*k)]

print(len(top_idx))
print(np.mean(np.array(target_cost_seq)[idx_test][top_idx]))
print(np.sum(np.array(target_cost_seq)[idx_test][top_idx]))

218
14342.184220183484
3126596.1599999997


In [77]:
k = 0.05
top_idx = np.argsort(np.array(input_cost_seq)[idx_test])[::-1][:int(len(np.array(input_cost_seq)[idx_test])*k)]

print(len(top_idx))
print(np.mean(np.array(target_cost_seq)[idx_test][top_idx]))
print(np.sum(np.array(target_cost_seq)[idx_test][top_idx]))

1094
6429.123263254114
7033460.850000001


In [76]:
k = 0.03
top_idx = np.argsort(np.array(input_cost_seq)[idx_test])[::-1][:int(len(np.array(input_cost_seq)[idx_test])*k)]

print(len(top_idx))
print(np.mean(np.array(target_cost_seq)[idx_test][top_idx]))
print(np.sum(np.array(target_cost_seq)[idx_test][top_idx]))

656
8232.909893292683
5400788.89


In [75]:
k = 0.01
top_idx = np.argsort(np.array(input_cost_seq)[idx_test])[::-1][:int(len(np.array(input_cost_seq)[idx_test])*k)]

print(len(top_idx))
print(np.mean(np.array(target_cost_seq)[idx_test][top_idx]))
print(np.sum(np.array(target_cost_seq)[idx_test][top_idx]))

218
12901.323532110093
2812488.5300000003
