**Idea inicial, arquitectura inicial de la CNN y usa carry-forward imputation, mas cantidad de mediciones en cada intervalo**

Necesary libraries:

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os, time, random
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn import metrics
import itertools
from sklearn import preprocessing

Loading the data:

**Just run the next box to download the data**

In [6]:
# Run this just the first time
#!pip install -U wget
#!rm -rf data.zip data lib
#!rm -rf preprocessed
!mkdir -p preprocessed
!mkdir -p lib

import wget
wget.download('https://github.com/shengpu1126/BDSI2019-ML/raw/master/lib/config.yaml', 'lib/config.yaml')
wget.download('https://github.com/shengpu1126/BDSI2019-ML/raw/master/lib/helper.py', 'lib/helper.py')
wget.download('https://github.com/shengpu1126/BDSI2019-ML/raw/master/data.zip', 'data.zip')
wget.download('https://github.com/shengpu1126/BDSI2019-ML/raw/master/preprocessed/data_seq.npz', 'preprocessed/data_seq.npz')
wget.download('https://github.com/shengpu1126/BDSI2019-ML/raw/master/lib/prepare_data.py', 'lib/prepare_data.py')

import zipfile
with zipfile.ZipFile("data.zip","r") as zip_ref:
    zip_ref.extractall(".")

In [2]:
#Run this to load the data if you have downloaded the data before
from lib.helper import load_data
raw_data, df_labels = load_data(120000)

Loading files from disk: 100%|██████████| 12000/12000 [00:38<00:00, 309.90it/s]


In [3]:
# GPU support
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('using device:', device)

using device: cpu


Preparing data:

In [48]:
raw_data

{132539:       Time   Variable   Value
 0    00:00        Age   54.00
 1    00:00     Gender    0.00
 2    00:00     Height   -1.00
 3    00:00    ICUType    4.00
 4    00:00     Weight   -1.00
 5    00:07        GCS   15.00
 6    00:07         HR   73.00
 7    00:07  NIDiasABP   65.00
 8    00:07      NIMAP   92.33
 9    00:07   NISysABP  147.00
 10   00:07   RespRate   19.00
 11   00:07       Temp   35.10
 12   00:07      Urine  900.00
 13   00:37         HR   77.00
 14   00:37  NIDiasABP   58.00
 15   00:37      NIMAP   91.00
 16   00:37   NISysABP  157.00
 17   00:37   RespRate   19.00
 18   00:37       Temp   35.60
 19   00:37      Urine   60.00
 20   01:37         HR   60.00
 21   01:37  NIDiasABP   62.00
 22   01:37      NIMAP   87.00
 23   01:37   NISysABP  137.00
 24   01:37   RespRate   18.00
 25   01:37      Urine   30.00
 26   02:37         HR   62.00
 27   02:37  NIDiasABP   52.00
 28   02:37      NIMAP   75.67
 29   02:37   NISysABP  123.00
 ..     ...        ...     ...


In [52]:
%run lib/prepare_data_CF_invariant.py

  config = yaml.load(open('lib/config.yaml'))






Loading files from disk:   0%|          | 0/80 [00:00<?, ?it/s][A[A[A[A[A[A





Loading files from disk:  52%|█████▎    | 42/80 [00:00<00:00, 412.33it/s][A[A[A[A[A[A





Loading files from disk: 100%|██████████| 80/80 [00:00<00:00, 445.48it/s][A[A[A[A[A[A





Generating feature vectors:   0%|          | 0/80 [00:00<?, ?it/s][A[A[A[A[A[A





Generating feature vectors:   1%|▏         | 1/80 [00:00<00:35,  2.24it/s][A[A[A[A[A[A





Generating feature vectors:  40%|████      | 32/80 [00:02<00:15,  3.04it/s][A[A[A[A[A[A





Generating feature vectors:  45%|████▌     | 36/80 [00:02<00:10,  4.17it/s][A[A[A[A[A[A





Generating feature vectors:  48%|████▊     | 38/80 [00:02<00:07,  5.28it/s][A[A[A[A[A[A

IndexError: single positional indexer is out-of-bounds







Generating feature vectors:  48%|████▊     | 38/80 [00:16<00:07,  5.28it/s][A[A[A[A[A[A

In [45]:
class SimpleDataset(Dataset):
    def __init__(self, X, y):
        self.X, self.y = X, y
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx]).float(), torch.tensor([self.y[idx]]).float()
    def __len__(self):
        return len(self.X)

def get_train_val_test(batch_size=64):
    #f = np.load('preprocessed/data_seq.npz')
    f = np.load('data/data_nmiss_invariant.npz')
    f2 = np.load('data/data_miss_invariant.npz')
    #f3 = np.load('data/data_dist_backw.npz')
    #f4 = np.load('data/data_times_backw_neg.npz')
    X, y = f['X'], f['y']
    X = np.concatenate((X,f2['X']), axis=2)
    X = X.transpose((0,2,1))
    print(X.shape, y.shape)
    
    print('Creating splits')
    Xtr, X__, ytr, y__ = train_test_split(X,   y, train_size=0.8, stratify=y, random_state=13)
    Xva, Xte, yva, yte = train_test_split(X__, y__, test_size=0.5, stratify=y__, random_state=13)
    
    tr = SimpleDataset(Xtr, ytr)
    va = SimpleDataset(Xva, yva)
    te = SimpleDataset(Xte, yte)
    
    tr_loader = DataLoader(tr, batch_size=batch_size, shuffle=True)
    va_loader = DataLoader(va, batch_size=batch_size)
    te_loader = DataLoader(te, batch_size=batch_size)
    
    print('Feature shape, Label shape, Class balance:')
    print('\t', tr_loader.dataset.X.shape, tr_loader.dataset.y.shape, tr_loader.dataset.y.mean())
    print('\t', va_loader.dataset.X.shape, va_loader.dataset.y.shape, va_loader.dataset.y.mean())
    print('\t', te_loader.dataset.X.shape, te_loader.dataset.y.shape, te_loader.dataset.y.mean())
    return tr_loader, va_loader, te_loader

In [40]:
tr_loader, va_loader, te_loader = get_train_val_test(batch_size=64)

(80, 105, 48) (80,)
Creating splits
Feature shape, Label shape, Class balance:
	 (64, 105, 48) (64,) 0.15625
	 (8, 105, 48) (8,) 0.125
	 (8, 105, 48) (8,) 0.125




In [34]:
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, in_channels, n_filters, output_size, sequence_len):
        super().__init__()
        self.n_filters = n_filters
        self.conv1 = nn.Conv1d(in_channels, n_filters, 3, padding=1)
        self.pool1 = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(n_filters, n_filters, 3, padding=1)
        self.pool2 = nn.MaxPool1d(2)
        self.fc = nn.Linear(int(sequence_len/2/2)*n_filters, output_size)

    def forward(self, x):
        xm = 1 - x[:,40:,:]
        x = x[:,:40,:]
        N, d, L = x.shape
        x = torch.Tensor(np.concatenate((x,xd,xt), axis=1))
        ones = np.array([np.array([np.ones(L) for i in range(d)]) for j in range(N)])
        xm = torch.Tensor(np.concatenate((xm,ones,ones), axis=1))
        
        # Getting the weights for the trained model
        z = self.conv1(x)
        w = self.conv1.weight
        wm = torch.abs(w) / torch.sum(torch.abs(w), dim=(1,2), keepdim=True)
        
        # Apply the weights to the mask xm
        zm = F.conv1d(xm, wm, padding=1)
        
        # Apply the first set of conv-elu-pool
        z = self.pool1(zm*F.elu(z))
        
        # Apply the second set of conv-elu-pool
        z = self.pool2(F.elu(self.conv2(z)))
        
        # Flatten the output from the convolutional/pooling layers
        z = z.view(N, -1)
        z = self.fc(z)
        #z = torch.stack(self.fc(z), 1)
        
        # Pass through the output layer and apply sigmoid activation
        z = torch.sigmoid(z)

        return z

In [13]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

model = CNN(35*3, 64, 1, 48)
print('Number of float-valued parameters:', count_parameters(model))

Number of float-valued parameters: 33345


In [14]:
x = torch.zeros((1, 35*4, 48))
model(x)

tensor([[0.5007]], grad_fn=<SigmoidBackward>)

Train the LSTM:

In [15]:
def _train_epoch(data_loader, model, criterion, optimizer):
    """
    Train the `model` for one epoch of data from `data_loader`
    Use `optimizer` to optimize the specified `criterion`
    """
    model.train()
    for i, (X, y) in enumerate(data_loader):
        X, y = X.to(device), y.to(device)
        
        # clear parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        output = model(X)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

def _evaluate_epoch(tr_loader, va_loader, model, criterion):
    model.eval()
    with torch.no_grad():
        # Evaluate on train
        y_true, y_score = [], []
        running_loss = []
        for X, y in tr_loader:
            X, y = X.to(device), y.to(device)
            output = model(X)
            y_true.append(y.cpu().numpy())
            y_score.append(output.cpu().numpy())
            running_loss.append(criterion(output, y).item())

        y_true, y_score = np.concatenate(y_true), np.concatenate(y_score)
        train_loss = np.mean(running_loss)
        train_score = metrics.roc_auc_score(y_true, y_score)
        print('tr loss', train_loss, 'tr AUROC', train_score)

        # Evaluate on validation
        y_true, y_score = [], []
        running_loss = []
        for X, y in va_loader:
            X, y = X.to(device), y.to(device)
            with torch.no_grad():
                output = model(X)
                y_true.append(y.cpu().numpy())
                y_score.append(output.cpu().numpy())
                running_loss.append(criterion(output, y).item())

        y_true, y_score = np.concatenate(y_true), np.concatenate(y_score)
        val_loss = np.mean(running_loss)
        val_score = metrics.roc_auc_score(y_true, y_score)
        print('va loss', val_loss, 'va AUROC', val_score)
    return train_loss, val_loss, train_score, val_score

def save_checkpoint(model, epoch, checkpoint_dir):
    state = {
        'epoch': epoch,
        'state_dict': model.state_dict(),
    }

    filename = os.path.join(checkpoint_dir, 'epoch={}.checkpoint.pth.tar'.format(epoch))
    torch.save(state, filename)

In [16]:
#!mkdir -p checkpoint

tr_loader, va_loader, te_loader = get_train_val_test(batch_size=64)
    
torch.random.manual_seed(0)
np.random.seed(0)
random.seed(0)

n_epochs = 30
learning_rate = 1e-3

model = CNN(35*3, 64, 1, 48)
print('Number of float-valued parameters:', count_parameters(model))

model = model.to(device)
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

outputs = []

print('Epoch', 0)
out = _evaluate_epoch(tr_loader, va_loader, model, criterion)
outputs.append(out)

for epoch in range(0, n_epochs):
    print('Epoch', epoch+1)
    # Train model
    _train_epoch(tr_loader, model, criterion, optimizer)

    # Evaluate model
    out = _evaluate_epoch(tr_loader, va_loader, model, criterion)
    outputs.append(out)

    # Save model parameters
    save_checkpoint(model, epoch+1, 'checkpoint/')

train_losses, val_losses, train_scores, val_scores = zip(*outputs)

fig, ax = plt.subplots(figsize=(5,5))
plt.plot(range(n_epochs + 1), train_scores, '--o', label='Train')
plt.plot(range(n_epochs + 1), val_scores, '--o', label='Validation')
plt.xlabel('epoch')
plt.ylabel('AUROC')
plt.legend()
plt.savefig('auroc_dist_times_backw.png', dpi=300)

fig, ax = plt.subplots(figsize=(5,5))
plt.plot(range(n_epochs + 1), train_losses, '--o', label='Train')
plt.plot(range(n_epochs + 1), val_losses, '--o', label='Validation')
plt.xlabel('epoch')
plt.ylabel('Loss (binary cross entropy)')
plt.legend()
plt.savefig('loss_dist_times_backw.png', dpi=300)


(10000, 140, 48) (10000,)
Creating splits




MemoryError: 

Evaluation on test set

In [19]:
def restore_checkpoint(model, checkpoint_dir, cuda=False):
    """
    If a checkpoint exists, restores the PyTorch model from the checkpoint.
    Returns the model and the current epoch.
    """
    cp_files = [file_ for file_ in os.listdir(checkpoint_dir)
        if file_.startswith('epoch=') and file_.endswith('.checkpoint.pth.tar')]

    if not cp_files:
        print('No saved model parameters found')
        if force:
            raise Exception("Checkpoint not found")
        else:
            return model, 0, []
    
    # Find latest epoch
    for i in itertools.count(1):
        if 'epoch={}.checkpoint.pth.tar'.format(i) in cp_files:
            epoch = i
        else:
            break

    print("Which epoch to load from? Choose in range [1, {}].".format(epoch))
    inp_epoch = int(input())
    if inp_epoch not in range(1, epoch+1):
        raise Exception("Invalid epoch number")

    filename = os.path.join(checkpoint_dir,
        'epoch={}.checkpoint.pth.tar'.format(inp_epoch))

    print("Loading from checkpoint {}".format(filename))
    
    if cuda:
        checkpoint = torch.load(filename)
    else:
        # Load GPU model on CPU
        checkpoint = torch.load(filename,
            map_location=lambda storage, loc: storage)

    try:
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        print("=> Successfully restored checkpoint (trained for {} epochs)"
            .format(checkpoint['epoch']))
    except:
        print("=> Checkpoint not successfully restored")
        raise

    return model, inp_epoch

def _evaluate_epoch(data_loader, model, criterion):
    model.eval()
    with torch.no_grad():
        y_true, y_score = [], []
        running_loss = []
        for X, y in data_loader:
            output = model(X)
            y_true.append(y.numpy())
            y_score.append(output)
            running_loss.append(criterion(output, y).item())
        y_true, y_score = np.concatenate(y_true), np.concatenate(y_score)
    
    loss = np.mean(running_loss)
    score = metrics.roc_auc_score(y_true, y_score)
    return loss, score

_, _, te_loader = get_train_val_test(batch_size=64)
model = CNN(35*3, 64, 1, 48)
model, _ = restore_checkpoint(model, 'checkpoint/')
criterion = torch.nn.BCELoss()
loss, score = _evaluate_epoch(te_loader, model, criterion)
print('Test loss :', loss)
print('Test AUROC:', score)

(10000, 140, 48) (10000,)
Creating splits




Feature shape, Label shape, Class balance:
	 (8000, 140, 48) (8000,) 0.142875
	 (1000, 140, 48) (1000,) 0.143
	 (1000, 140, 48) (1000,) 0.143
Which epoch to load from? Choose in range [1, 30].
6
Loading from checkpoint checkpoint/epoch=6.checkpoint.pth.tar
=> Successfully restored checkpoint (trained for 6 epochs)
Test loss : 0.3656015517190099
Test AUROC: 0.7674763975814151
