In [2]:
# supress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# load in data
import torch
tensor = torch.load('multivariate_time_matrix.pt')
patients_visits_tensor = torch.load('patients_visits_tensor.pt')

# cuda for gpu training
cuda_available = torch.cuda.is_available()
device = torch.device('cuda:0' if cuda_available else 'cpu')
torch.backends.cudnn.benchmark = True

from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from torch.utils.tensorboard import SummaryWriter
# Create a summary writer for logging with Tensorboard
writer = SummaryWriter()

In [65]:
# useful global dimension values
tensor_shape = tensor.size()
num_patients = tensor_shape[0]
max_num_visits = tensor_shape[1]
vocab_size = tensor_shape[2]
print(tensor.size())
print(patients_visits_tensor.size())

torch.Size([7537, 42, 856])
torch.Size([7537])


In [68]:
# create features and labels by shifting data, specifically,
# exclude last row vector=visits for features, and exclude first row vector=0 for labels

def construct_input_and_label(tensor, patients_visits_tensor):
    # 7537 (number of patients) x 42 (max number of visits) x 856 (number of unique icd code)
    # how 'useful' visits there are or that are not padded
    dims = patients_visits_tensor - 1
    assert torch.max(dims) == 3
    # maintain same dims as above
    
    X = torch.zeros(num_patients, max_num_visits-1, vocab_size)
    y = torch.zeros(num_patients, max_num_visits-1, vocab_size)
    # which visits are useful and not padding
    mask = torch.zeros(num_patients, max_num_visits-1)
    
    for i in range(num_patients):
        patient_tensor = tensor[i,:,:]
        X[i,:,:] = patient_tensor[:-1,:] # include visits 1-41
        y[i,:,:] = patient_tensor[1:,:] # include visits 2-42
        mask[i,:dims[i]] = 1
        
    return X, y, mask, dims

def PCA_analysis(X):
    from sklearn.decomposition import PCA
    import matplotlib.pyplot as plt
    import numpy as np
    X_arr, y_arr = X.numpy(), y.numpy()
    # num_patients*max_num_visits=num of examples
    X_arr = X_arr.reshape(-1, vocab_size) 

    # fit PCA on training set only
    pca = PCA().fit(X_arr)
    feature_list = list(range(vocab_size))
    percent_variance = np.round(pca.explained_variance_ratio_*100, decimals=2)
    
    # graph scree plot
    plt.figure(figsize=(25,10))
    plt.plot(feature_list, percent_variance, marker='x', markersize=10)
    plt.ylabel('Percentage of Variance Explained')
    plt.xlabel('Principal Component')
    plt.title('PCA Scree Plot')
    plt.show()
    
    
#     pca = PCA(n_components=250).fit(X_train)
#     X_train = pca.transform(X_train)
#     X_test = pca.transform(X_test)

In [69]:
X, y, mask, dims = construct_input_and_label(tensor, patients_visits_tensor)
# PCA_analysis(X)

tensor(41, dtype=torch.int32)


In [17]:
print(X.shape)
print(y.shape)
print('There are {} actual visits that are not padding'.format(mask.sum()))

torch.Size([7537, 41, 856])
torch.Size([7537, 41, 856])
There are 12456.0 actual visits that are not padding


In [59]:
from torch import nn
model = nn.Linear(20, 5) # predict logits for 5 classes
x = torch.randn(1, 20)
y = torch.tensor([[1., 0., 1., 0., 0.]]) # get classA and classC as active

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)

for epoch in range(20):
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
    print('Loss: {:.3f}'.format(loss.item()))

Loss: 0.819
Loss: 0.679
Loss: 0.569
Loss: 0.484
Loss: 0.417
Loss: 0.364
Loss: 0.322
Loss: 0.287
Loss: 0.258
Loss: 0.235
Loss: 0.214
Loss: 0.197
Loss: 0.182
Loss: 0.169
Loss: 0.158
Loss: 0.148
Loss: 0.139
Loss: 0.131
Loss: 0.124
Loss: 0.118


In [6]:
# Override Dataset Loader
class icdDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        # the last sample is not used
        return len(self.X)
    def __getitem(self, index):
        # Generate one sample of data from a patient
        return self.X[index], self.y[index]
    
dataset = icdDataset(X, y)
num_samples = X.size()[0]
import math
# 80/20 split
training_set, validation_set = torch.utils.data.random_split(dataset,
    [math.ceil(num_samples*0.8),math.floor(num_samples*0.2)])
    
# Parameters
params = {'batch_size': 5, 
          'shuffle': True,
          'num_workers': 2}
epochs = 20

training_loader = DataLoader(training_set, **params)
validation_loader = DataLoader(validation_set, **params)

In [9]:
training_set

<torch.utils.data.dataset.Subset at 0x7fcde0cc0dd0>

In [11]:
class MLP(nn.Module):
    def __init__(self):
        super(MLP,self).__init__():
            self.embedding

SyntaxError: unexpected EOF while parsing (<ipython-input-11-191bbdc6c80d>, line 1)

In [None]:
import torch.nn
# CNN architecture
class CNNModel(nn.Module):
    def __init__(self):
        super(CNNModel,self).__init__()
        norm_layer = torch.nn.BatchNorm2d
        
        
    def forward(self,x):
        
        
# LSTM architecture
class LSTMModel(nn.Module):
    def __init__(self):
        super(LSTMModel,self).__init__()
    def forward(self,x)

In [None]:
# training loop


In [None]:
# TODO:
# alternative: PCA if embedding layer does not work for good representation:

# time series sequence prediction

# time series sequence classifcation

# time series sequence meta-learning, determine disease labels

# include argparse later

# k-fold validation because of small dataset