In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import _pickle as pickle
import random
import scipy.stats as ss

# cost features

In [2]:
startTime = datetime.now()

input_age_seq = pickle.load(open("../data/baseline/input_age_seq","rb"))
input_sex_seq = pickle.load(open("../data/baseline/input_sex_seq","rb"))

input_cost_seq = pickle.load(open("../data/baseline/input_cost_seq","rb"))
target_cost_seq = pickle.load(open("../data/baseline/target_cost_seq","rb"))

input_medical_cost_seq = pickle.load(open("../data/baseline/input_medical_cost_seq","rb"))
input_monthly_medical_cost_seq = pickle.load(open("../data/baseline/input_monthly_medical_cost_seq","rb"))

input_pharmacy_cost_seq = pickle.load(open("../data/baseline/input_pharmacy_cost_seq","rb"))
input_monthly_pharmacy_cost_seq = pickle.load(open("../data/baseline/input_monthly_pharmacy_cost_seq","rb"))

print(datetime.now() - startTime)

0:00:00.563696


In [3]:
def build_feature(seq, size):    
    X = np.zeros((len(seq), size))
    for i in range(len(seq)):
        value = seq[i]
        X[i][value] = 1
    return X

In [4]:
age_feature = build_feature([i//5 for i in input_age_seq], 4)
sex_feature = build_feature([1 if i=="M" else 0 for i in input_sex_seq], 2)

cost_feature = np.array([np.log(i+1) for i in input_cost_seq]).reshape(-1,1)
medical_cost_feature = np.array([np.log(i+1) for i in input_medical_cost_seq]).reshape(-1,1)
pharmacy_cost_feature = np.array([np.log(i+1) for i in input_pharmacy_cost_seq]).reshape(-1,1)

monthly_medical_cost_feature = np.array([[np.log(i+1) for i in y] for y in input_monthly_medical_cost_seq]).reshape(-1,12)
monthly_pharmacy_cost_feature = np.array([[np.log(i+1) for i in y] for y in input_monthly_pharmacy_cost_seq]).reshape(-1,12)

In [5]:
features = [age_feature, sex_feature, cost_feature, medical_cost_feature, pharmacy_cost_feature,\
            ]

X_cost = np.concatenate(features, axis =1)
X_cost.shape

(143102, 9)

In [6]:
# y = np.array([np.log(i+1) for i in target_cost_seq])
y = np.array([x/len(target_cost_seq) for x in ss.rankdata(target_cost_seq)])

# util sequence

In [7]:
startTime = datetime.now()

input_util_seq = pickle.load(open("../data/advance/input_util_seq","rb"))

print(datetime.now() - startTime)

0:00:01.625799


In [8]:
vocab = {}
for p in input_util_seq:
    for v in p:
        for c in v:
            if c not in vocab: vocab[c] = len(vocab)

In [9]:
def build_seq_feature(seq, vocab):
    X = np.zeros((len(seq),12, len(vocab) ))
    for i in range(len(seq)):
        for j in range(12):
            for value in seq[i][j]:
                X[i][j][vocab[value]] +=1
    return X

In [10]:
X_util = build_seq_feature(input_util_seq, vocab)

In [11]:
X_util = np.concatenate((monthly_medical_cost_feature.reshape(-1,12,1),\
                         monthly_pharmacy_cost_feature.reshape(-1,12,1),\
                         X_util), axis=-1)

In [12]:
X_util.shape

(143102, 12, 38)

# code sequence

In [13]:
startTime = datetime.now()

input_diag_seq = pickle.load(open("../data/advance/input_diag_seq","rb"))
input_proc_seq = pickle.load(open("../data/advance/input_proc_seq","rb"))
input_drug_seq = pickle.load(open("../data/advance/input_drug_seq","rb"))

print(datetime.now() - startTime)

0:00:04.356740


In [14]:
code2int = {"PAD":0}

code_seq = []
for p_diag, p_proc, p_drug in zip(input_diag_seq, input_proc_seq, input_drug_seq):
    new_p = []
    for diag, proc, drug in zip(p_diag, p_proc, p_drug):
        new_v = []
        for d in diag:
            if d not in code2int: code2int[d] = len(code2int)
            new_v.append(code2int[d])
        for p in proc:
            if p not in code2int: code2int[p] = len(code2int)
            new_v.append(code2int[p])
        for dr in drug:
            if dr not in code2int: code2int[dr] = len(code2int)
            new_v.append(code2int[dr])
        new_p.append(new_v)
    code_seq.append(new_p)

In [15]:
def build_seq(seq, max_codes = 50, max_length=12):
    X = np.zeros((len(seq), max_length, max_codes))
    for i, p in enumerate(seq):
        assert len(p) == max_length
        for j, claim in enumerate(p):
            claim = claim[:max_codes]
            X[i][j][:len(claim)] = claim
    return X

In [16]:
X_code = build_seq(code_seq)
X_code.shape

(143102, 12, 50)

# Model

In [17]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
import scipy

In [18]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import backend as K

In [19]:
from attention import Attention

In [20]:
def build_model(max_len =12,
                max_code=50,
                max_util=38,
                max_demo=9,
                feature_code=len(code2int),
                embed_dim = 100,
                lstm_units=32,
                input_ = "code"
               ):
    
    input_code = layers.Input(shape=(max_len, max_code))
    input_util = layers.Input(shape=(max_len, max_util))
    input_demo = layers.Input(shape=(max_demo,))
    inputs_list = [input_code, input_util, input_demo]
    
    # code
    input_code = layers.Embedding(input_dim=feature_code, output_dim=embed_dim, 
                                  mask_zero=True, name='code_embedding')(input_code)
    
    mask_layer = layers.Lambda(lambda x: tf.cast(tf.tile(tf.expand_dims(x._keras_mask, axis=-1), [1, 1, 1,embed_dim]), tf.float32))
    mask_code = mask_layer(input_code)
    input_code = layers.Multiply()([mask_code, input_code])
    input_code = layers.Lambda(lambda x: keras.backend.sum(x, axis=2))(input_code)

    input_code = layers.Dropout(0.5)(input_code)
    input_code = layers.Bidirectional(layers.LSTM(lstm_units, return_sequences=True))(input_code)
    input_code = Attention(max_len)(input_code)
    input_code = layers.Dropout(0.5)(input_code)
    
    # util
    input_util = layers.Dense(lstm_units, activation="relu")(input_util)
    
    input_util = layers.Dropout(0.5)(input_util)
    input_util = layers.Bidirectional(layers.LSTM(lstm_units, return_sequences=True))(input_util)
    input_util = layers.Bidirectional(layers.LSTM(lstm_units, return_sequences=False))(input_util)

    input_util = layers.Dropout(0.5)(input_util)    
    
    if input_=="code":
        out = input_code
    elif input_=="util":
        out = input_util
    elif input_=="demo":
        out = input_demo
        
    
    out = layers.Dense(lstm_units, activation="relu")(out)
    out = layers.Dropout(0.5)(out)

    out = layers.Dense(lstm_units, activation="relu")(out)
    out = layers.Dropout(0.5)(out)
    
    out = layers.Dense(1, activation=None, name='main_output')(out)
    model = keras.models.Model(inputs=inputs_list, outputs=[out])

    model.compile(optimizer='adam', loss="mse")
    #print(model.summary())
    return model

In [21]:
def result(y_true, y_pred):
    return metrics.mean_absolute_error(y_true, y_pred), \
           metrics.r2_score(y_true, y_pred),\
           np.sqrt(metrics.mean_squared_error(y_true, y_pred)),\
           scipy.stats.pearsonr(y_true, y_pred)[0]

def generate_result(seed, input_):
    model = build_model(input_=input_)
    idx_train, idx_val = train_test_split(range(len(y)), train_size=0.85, random_state=seed)
    idx_train, idx_test = train_test_split(range(len(idx_train)), train_size=0.82, random_state=seed)

    earlyStopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, verbose=0, mode='min', restore_best_weights=True)
    history = model.fit([X_code[idx_train], X_util[idx_train], X_cost[idx_train]], y[idx_train], epochs=50, batch_size=100, \
                        validation_data=([X_code[idx_val], X_util[idx_val], X_cost[idx_val]], y[idx_val]), verbose=0, callbacks=[earlyStopping])


    y_pred = model.predict([X_code[idx_test], X_util[idx_test], X_cost[idx_test]], verbose=0).reshape(-1)
    mae, r2, rmse, pcc = result(y[idx_test], y_pred)
    return mae, r2, rmse, pcc, y_pred

def display(list_eva):
    for list_ in list_eva:
        print(np.mean(list_), np.std(list_))
        print()

In [22]:

for input_ in ["demo", 'util', 'code']:
    mae_list, r2_list, rmse_list, pcc_list = [], [], [], []
    print(input_)
    for i in range(5):
        print(i)
        mae, r2, rmse, pcc, y_pred = generate_result(seed=i, input_=input_)
        mae_list.append(mae)
        r2_list.append(r2)
        rmse_list.append(rmse)
        pcc_list.append(pcc)
    
    display([mae_list, r2_list, rmse_list, pcc_list])

demo
0
1
2
3
4
0.21062463903033452 0.006415642797297634

0.27862015576834037 0.031785748022896104

0.2464674141952849 0.005678082215724863

0.5802320319553866 0.007843306330628248

util
0
1
2
3
4
0.19665477319279256 0.005493785836529525

0.34285623881116484 0.02226691333289523

0.23525994023359073 0.004258593745363088

0.6104812318056807 0.00557618758263286

code
0
1
2
3
4
0.19392129964332255 0.0015618339992350511

0.329560621643997 0.008947920459512768

0.23765220284299965 0.0017861234480300568

0.5745075485027786 0.007762600414664745



In [24]:

for input_ in ["demo", 'util', 'code']:
    mae_list, r2_list, rmse_list, pcc_list = [], [], [], []
    print(input_)
    for i in range(5):
        print(i)
        mae, r2, rmse, pcc, y_pred = generate_result(seed=i, input_=input_)
        mae_list.append(mae)
        r2_list.append(r2)
        rmse_list.append(rmse)
        pcc_list.append(pcc)
    
    display([mae_list, r2_list, rmse_list, pcc_list])

demo
0
1
2
3
4
1.122124893224926 0.020032401343759297

0.2286404044567507 0.005932198261082242

1.6777421088863818 0.018935518449822754

0.4865442193632644 0.005188035708022915

util
0
1
2
3
4
1.1742072297502555 0.04041330466479561

0.2355653504305165 0.011530702719676302

1.6702111701972147 0.025503623836266923

0.5182765482947769 0.005197460353786274

code
0
1
2
3
4
1.1076813285657032 0.009048797210910358

0.236998053298806 0.004942671611880203

1.6686571173478615 0.020681352068964258

0.4894094488452283 0.004898838027227226

