In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import trange, tqdm
import matplotlib.pyplot as plt
import re
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import SubsetRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
# Run-mode switches for the notebook.
# DEMO picks which data tree the path-configuration cell points at:
# the 'data_demo/' directories when True, the 'data/' directories when False.
DEMO = False
# Not read by any cell visible in this notebook section — presumably gates
# writing failures to STEP_FAIL_FILE; TODO confirm against the other steps.
WRITE_ERRORS_TO_FILE = True

In [3]:

# Root of the per-ICU-stay data tree: the demo subset or the full data set.
DATA_ROOT = 'data_demo/' if DEMO else 'data/'

# Per-stay directories. Each one holds one parquet entry per stay, named
# 'ICUSTAY_ID=<id>'. All of them are defined in BOTH modes: the demo branch
# used to omit DEMO_BY_ICUSTAY_ID and ICD9_BY_ICUSTAY_ID (NameError as soon as
# the dataset cell ran with DEMO = True), and the full-data branch omitted
# STATB_CSV_BY_ICUSTAY_ID.
# NOTE(review): the demo tree is assumed to mirror the full tree's layout for
# the demographics / icd9 directories — confirm those folders exist.
CHARTEVENTS_BY_ICUSTAY_ID = DATA_ROOT + 'parquet/'
REDUCED_CE_BY_ICUSTAY_ID = DATA_ROOT + 'samples/'
NUMERICAL_BY_ICUSTAY_ID = DATA_ROOT + 'samples_numerical/'
STATB_BY_ICUSTAY_ID = DATA_ROOT + 'samples_statB/'
STATB_CSV_BY_ICUSTAY_ID = DATA_ROOT + 'samples_statB_csv/'
DEMO_BY_ICUSTAY_ID = DATA_ROOT + 'samples_demographics/'
ICD9_BY_ICUSTAY_ID = DATA_ROOT + 'samples_icd9/'


# Mode-independent input / output files.
CHARTEVENTS_FILENAME = 'mimic-iii/CHARTEVENTS.csv'
DIAGNOSES_FILENAME = 'mimic-iii/DIAGNOSES_ICD.csv'
READMISSION_FILENAME = 'data/readmission.csv'
EMBEDDED_FILENAME = 'resources/embedded.parquet'
# CSV read by ReadmissionDataset: first column = ICU stay id, second = label.
ANNOTATION_FILENAME = 'data/anotation.csv'
STEP_FAIL_FILE = 'Step9_failed.txt'



In [4]:
# Experiment hyper-parameters.
# Number of cross-validation folds (passed to KFold as n_splits).
K_FOLDS = 5
# Fraction of the dataset held out as the test split by train_test_split.
TEST_SIZE = 0.1
# Seed handed to train_test_split.
RANDOM_STATE = 1234
# Number of passes over the training fold, per fold.
EPOCH = 50
# Not used by the cells visible here: the DataLoaders are built with
# batch_size=None, so each sample is fed to the model on its own.
BATCH_SIZE = 1
# Step size for the Adam optimizer.
LEARNING_RATE = 0.001
# Not used by the cells visible here — presumably an optimizer/momentum
# coefficient for a later experiment; TODO confirm.
BETA = 0.9

In [5]:
class linearRegression(torch.nn.Module):
    """Single affine layer mapping ``inputSize`` features to ``outputSize`` outputs.

    Parameters are created with ``dtype=float`` (Python ``float``, i.e. double
    precision in torch) so the layer accepts the float64 arrays produced by
    NumPy without an explicit cast.
    """

    def __init__(self, inputSize, outputSize):
        super().__init__()
        # The only trainable component: y = x @ W.T + b
        self.linear = torch.nn.Linear(
            in_features=inputSize,
            out_features=outputSize,
            dtype=float,
        )

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)

In [6]:
def reset_weights(m):
    """Re-initialise the parameters of ``m`` if it is a Conv2d or Linear layer.

    Intended for ``model.apply(reset_weights)``; any other module type is left
    untouched. Returns ``None``.
    """
    resettable_layers = (torch.nn.Conv2d, torch.nn.Linear)
    if not isinstance(m, resettable_layers):
        return
    m.reset_parameters()

In [7]:
class ReadmissionDataset(Dataset):
    """Per-ICU-stay feature matrices paired with a label from an annotation CSV.

    The annotation file's first column is the ICU stay id and its second column
    is the label. Each configured directory is expected to hold one parquet
    entry per stay, named ``ICUSTAY_ID=<id>``.

    Args:
        annotations_file: CSV path read with ``pd.read_csv``.
        numerical_dir: directory of per-stay numerical features, or None.
        statB_dir: directory of per-stay statB features (29 values, flattened
            to a single row), or None.
        icd9_dir: directory of per-stay ICD-9 features (300 columns), or None.
        demo_dir: directory of per-stay demographic features (4 columns), or None.

    At least one directory must be given for items to be loadable.
    """

    # Column counts enforced by the shape checks in __getitem__.
    STATB_WIDTH = 29
    ICD9_WIDTH = 300
    DEMO_WIDTH = 4

    def __init__(self, annotations_file, numerical_dir=None, statB_dir=None, icd9_dir=None, demo_dir=None):
        self.labels = pd.read_csv(annotations_file)
        self.numerical_dir = numerical_dir
        self.statB_dir = statB_dir
        self.icd9_dir = icd9_dir
        self.demo_dir = demo_dir

    def __len__(self):
        """Number of annotated stays."""
        return len(self.labels)

    @staticmethod
    def _read_stay(directory, stay_id):
        """Load the parquet entry for ``stay_id`` from ``directory`` as a DataFrame."""
        return pd.read_parquet(os.path.join(directory, 'ICUSTAY_ID=' + str(stay_id)))

    @staticmethod
    def _append_columns(data, block):
        """Append ``block`` to ``data`` as extra columns.

        ``block`` is repeated once per row of ``data`` so that static per-stay
        features line up with every row. When there is no ``data`` yet, the
        block itself becomes the feature matrix.
        """
        if data is None:
            return block
        return np.hstack([data, np.vstack([block] * data.shape[0])])

    def __getitem__(self, idx):
        """Return ``(data, label)`` for the stay at row ``idx`` of the annotations.

        Returns:
            data: 2-D float ndarray of the configured feature sources, column-wise
                concatenated in the order base (numerical or statB), icd9, demo.
            label: float ndarray of shape (1, 1) holding the annotation label.

        Raises:
            AssertionError: a source does not have its expected column count;
                the message names the stay, the source and the actual shape.
            ValueError: no feature directory was configured.
        """
        stay_id = self.labels.iloc[idx, 0]
        data = None

        if self.numerical_dir:
            data = self._read_stay(self.numerical_dir, stay_id).to_numpy()
        if self.statB_dir:
            # NOTE(review): when both numerical_dir and statB_dir are set, statB
            # REPLACES the numerical matrix (unchanged from the original
            # behaviour) — confirm whether the two were meant to be combined.
            data = self._read_stay(self.statB_dir, stay_id).to_numpy().reshape(1, -1)
            assert data.shape[1] == self.STATB_WIDTH, (
                f'Stay = {stay_id}: statB has shape {data.shape}, '
                f'expected {self.STATB_WIDTH} columns'
            )
        if self.icd9_dir:
            icd9 = self._read_stay(self.icd9_dir, stay_id).reset_index(drop=True).to_numpy()
            # Validate before stacking so a malformed file is reported as such.
            assert icd9.shape[1] == self.ICD9_WIDTH, (
                f'Stay = {stay_id}: icd9 has shape {icd9.shape}, '
                f'expected {self.ICD9_WIDTH} columns'
            )
            data = self._append_columns(data, icd9)
        if self.demo_dir:
            demo = self._read_stay(self.demo_dir, stay_id).reset_index(drop=True).to_numpy()
            assert demo.shape[1] == self.DEMO_WIDTH, (
                f'Stay = {stay_id}: demographics has shape {demo.shape}, '
                f'expected {self.DEMO_WIDTH} columns'
            )
            data = self._append_columns(data, demo)

        if data is None:
            raise ValueError(
                'ReadmissionDataset needs at least one of numerical_dir, '
                'statB_dir, icd9_dir or demo_dir to load features'
            )

        data = data.astype(float)
        # (1, 1) float array, built directly instead of filling uninitialised memory.
        label = np.array([[self.labels.iloc[idx, 1]]], dtype=float)

        return data, label

In [8]:
# data_np = pd.read_parquet(STATB_BY_ICUSTAY_ID+'ICUSTAY_ID=200001').to_numpy()
# icd9_np = pd.read_parquet(ICD9_BY_ICUSTAY_ID+'ICUSTAY_ID=200001').to_numpy()
# demo_df = pd.read_parquet(DEMO_BY_ICUSTAY_ID+'ICUSTAY_ID=200001').to_numpy()
# data_np = np.hstack([data_np,icd9_np])
# data_np.shape

# demo_df = np.vstack([demo_df]*1)
# demo_df = np.hstack([demo_df,demo_df])
# demo_df

In [9]:
# Feature set for this run: statB summary + ICD-9 + demographics.
# To include the numerical features as well, pass
# numerical_dir=NUMERICAL_BY_ICUSTAY_ID instead of None.
dataset = ReadmissionDataset(
    ANNOTATION_FILENAME,
    numerical_dir=None,
    statB_dir=STATB_BY_ICUSTAY_ID,
    icd9_dir=ICD9_BY_ICUSTAY_ID,
    demo_dir=DEMO_BY_ICUSTAY_ID,
)

In [10]:
# dataset.labels

In [11]:
# Hold out the trailing TEST_SIZE fraction of the stays as the test split.
# shuffle=False keeps annotation order; per the scikit-learn docs random_state
# only controls shuffling, so it has no effect on this call.
# The error recorded below was raised from this cell, i.e. the samples are
# loaded through ReadmissionDataset.__getitem__ while splitting.
train, test = train_test_split(
    dataset,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=False,
)

AssertionError: Stay = 235557

In [None]:
# Sanity check: shape of the feature matrix of the first training sample
# (element 0 of each sample is the data, element 1 the label).
train[0][0].shape

In [None]:
# Sanity check: label array of the first training sample.
train[0][1]

In [None]:
# Shuffled K-fold splitter over the training split. Seeded with RANDOM_STATE so
# fold membership is identical on every Restart & Run All; without a seed,
# shuffle=True produced different folds on each run.
kfold = KFold(n_splits=K_FOLDS, shuffle=True, random_state=RANDOM_STATE)

In [None]:

# Linear model with one output; the input width is taken from the number of
# columns in the first sample's feature matrix (this loads that sample).
model = linearRegression(dataset[0][0].shape[1], 1)
# Mean-squared-error objective between prediction and label.
criterion = torch.nn.MSELoss() 
# Adam over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# K-fold training of the linear model.
#   loss_list[fold][epoch] : training loss summed over all samples of the epoch
#   valid_loss_list[fold]  : mean loss on the held-out fold after the last epoch
# Seed torch so weight re-initialisation and sampler order are reproducible.
torch.manual_seed(RANDOM_STATE)

loss_list = [[] for _ in range(K_FOLDS)]
valid_loss_list = []
for fold, (train_index, valid_index) in enumerate(kfold.split(train)):
    print('-'*50)
    print('FOLD = ', fold+1)

    # kfold yields numpy index arrays; the samplers draw from them in random order.
    train_sampler = SubsetRandomSampler(train_index)
    valid_sampler = SubsetRandomSampler(valid_index)

    # batch_size=None disables automatic batching: one sample per step.
    train_dataloader = DataLoader(train, batch_size=None, shuffle=False, sampler=train_sampler)
    valid_dataloader = DataLoader(train, batch_size=None, shuffle=False, sampler=valid_sampler)

    # Start every fold from scratch. The optimizer is re-created as well:
    # re-using one Adam instance would carry its moment estimates from the
    # previous fold into freshly reset weights.
    model.apply(reset_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    for epoch in range(EPOCH):
        current_loss = 0.0
        for i, (x_input, y_actual) in enumerate(train_dataloader):
            optimizer.zero_grad()
            # Missing feature values arrive as NaN; replace them with 0 (once).
            y_predict = model(torch.nan_to_num(x_input))
            loss = criterion(y_predict, y_actual)
            loss.backward()
            optimizer.step()
            current_loss += loss.item()
            if i % 1000 == 0:
                print('current loss = ',current_loss)

        loss_list[fold].append(current_loss)

    # Score the held-out fold, which was previously built but never used.
    with torch.no_grad():
        valid_loss = sum(
            criterion(model(torch.nan_to_num(x_valid)), y_valid).item()
            for x_valid, y_valid in valid_dataloader
        )
    valid_loss_list.append(valid_loss / max(len(valid_index), 1))
    print('validation loss = ', valid_loss_list[-1])
    

In [None]:
# One training-loss curve per fold. Iterating over loss_list keeps the figure
# in step with K_FOLDS; five hard-coded curves raised IndexError for fewer
# folds and silently dropped curves for more.
for fold_number, fold_losses in enumerate(loss_list, start=1):
    plt.plot(fold_losses, label=f'epoch loss {fold_number}')
plt.xlabel('epoch')
plt.ylabel('summed training loss')
plt.title('LR Loss')
plt.legend()
plt.show()

In [None]:
# Print the index of every training sample whose feature matrix is not the
# expected single row of 333 columns (29 statB + 300 icd9 + 4 demographics).
for position, (features, _label) in enumerate(train):
    if features.shape != (1,333):
        print(position)

In [None]:
# Feature-matrix shapes of two specific training samples — presumably indices
# reported by the shape check in the previous cell; TODO confirm.
print(train[172][0].shape)
print(train[1018][0].shape)