# Inputs

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

splitPartCount = 5
splitSeed = 27

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        # set initial hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        # forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))
        
        # decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        
        return out
    
def createModel(input_size):
    model = LSTMModel(input_size, 50, 2, 2)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    return model

# Preproccess

## read data

In [2]:
from utils.class_patient import Patients


patients = Patients()
len(patients)

1213

## remove missing

In [3]:
# fill measures whose null represent false value

from constants import NULLABLE_MEASURES


nullableMeasures = NULLABLE_MEASURES

for measureName in nullableMeasures:
    patients.fillMissingMeasureValue(measureName, 0)

In [4]:
# remove measures with less than 80% of data

measures = patients.getMeasures()

for measure, count in measures.items():
    if count < len(patients) * 80 / 100:
        patients.removeMeasures([measure])
        print(measure, count)

pco2 917
ph 954
po2 917
albumin 406
hba1c 326
lymphocyte 446
height 415
urine-ketone 294
crp 19


In [5]:
# remove patients with less than 80% of data

patients.removePatientByMissingFeatures()
len(patients)

1209

## split patients

In [6]:
splitedPatients = patients.split(splitPartCount, splitSeed)

len(splitedPatients[0])

242

In [7]:
from pandas import Timedelta


splitedPatients[0].getMeasuresBetween(Timedelta(-6), Timedelta(1))

Unnamed: 0,subject_id,hadm_id,stay_id,akd,ag,age,bg,bicarbonate,bun,calcium,...,race,rr,saps2,sbp,scr,sofa,use_NaHCO3,uti,wbc,weight
0,19054290,20046699,30643955,False,,65,,,,,...,WHITE,,34,,,3,0.0,0,,74.50
0,14866589,20066152,33060916,False,,37,,,,,...,WHITE,,22,,,2,0.0,0,,
0,14849360,20152551,32817197,False,,59,,,,,...,UNABLE TO OBTAIN,,22,,,4,0.0,0,,
0,16171124,20167052,38239449,False,,37,,,,,...,HISPANIC/LATINO - DOMINICAN,,15,,,2,0.0,0,,
0,16815664,20240670,37751533,False,,42,,,,,...,BLACK/AFRICAN AMERICAN,,27,,,2,,0,,43.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,15517352,29801212,32814591,False,,20,,,,,...,WHITE,,30,,,1,,0,,
0,10383045,29899941,39755932,False,,58,,,,,...,WHITE,,23,,,1,0.0,0,,59.40
0,15679298,29905973,39834726,False,,50,,,,,...,WHITE,,32,,,3,0.0,0,,47.40
0,10912213,29911812,34040470,False,,47,,,,,...,WHITE,,23,,,2,0.0,0,,89.05


In [8]:
def trainTest():
    for i in range(splitedPatients.__len__()):
        testPatients = splitedPatients[i]

        trainPatientsList = splitedPatients[:i] + splitedPatients[i + 1 :]
        trainPatients = Patients(patients=[])
        for trainPatientsElem in trainPatientsList:
            trainPatients += trainPatientsElem

        yield trainPatients, testPatients

In [9]:
for trainPatients, testPatients in trainTest():
    print(len(trainPatients.patientList), len(testPatients.patientList))

967 242
967 242
967 242
967 242
968 241


In [10]:
import pickle
from constants import CATEGORICAL_MEASURES, TEMP_PATH
from utils.dl_prepare_data import patientsToNumpy


dataset = []
for i, (trainPatients, testPatients) in enumerate(trainTest()):
    npTrainX, categoryEncoder, numericEncoder, columns = patientsToNumpy(
        trainPatients, 1, CATEGORICAL_MEASURES
    )
    npTestX, _, _, _ = patientsToNumpy(
        testPatients, 1, CATEGORICAL_MEASURES, columns, categoryEncoder, numericEncoder
    )
    trainY = [p.akdPositive for p in trainPatients]
    testY = [p.akdPositive for p in testPatients]

    dataset.append((npTrainX, trainY, npTestX, testY))

    print(f"Dataset {i} created")
    pass


(TEMP_PATH / f"dataset-{splitPartCount}-{splitSeed}.pkl").write_bytes(
    pickle.dumps(dataset)
)

Dataset 0 created
Dataset 1 created
Dataset 2 created


ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.


In [12]:
from utils.dl_prepare_data import patientsToNumpy
from constants import CATEGORICAL_MEASURES


for i, (trainPatients, testPatients) in enumerate(trainTest()):
    if i != 3:
        continue
    
    npTrainX, categoryEncoder, numericEncoder, columns = patientsToNumpy(trainPatients, 1, CATEGORICAL_MEASURES)
    npTestX, *_ = patientsToNumpy(testPatients, 1, CATEGORICAL_MEASURES, columns, categoryEncoder, numericEncoder)

In [11]:
columns

Index(['ag', 'age', 'bg', 'bicarbonate', 'bun', 'calcium',
       'chronic_pulmonary_disease', 'congestive_heart_failure', 'dbp', 'egfr',
       'gcs', 'gcs_unable', 'hb', 'history_aci', 'history_ami', 'hr',
       'hypertension', 'macroangiopathy', 'malignant_cancer',
       'mechanical_ventilation', 'microangiopathy', 'oasis', 'phosphate',
       'plt', 'preiculos', 'rr', 'saps2', 'sbp', 'scr', 'sofa', 'use_NaHCO3',
       'uti', 'wbc', 'weight', 'dka_type_0', 'dka_type_1', 'dka_type_2',
       'gender_F', 'gender_M', 'race_AMERICAN INDIAN/ALASKA NATIVE',
       'race_ASIAN', 'race_ASIAN - CHINESE', 'race_ASIAN - SOUTH EAST ASIAN',
       'race_BLACK/AFRICAN', 'race_BLACK/AFRICAN AMERICAN',
       'race_BLACK/CAPE VERDEAN', 'race_BLACK/CARIBBEAN ISLAND',
       'race_HISPANIC OR LATINO', 'race_HISPANIC/LATINO - CUBAN',
       'race_HISPANIC/LATINO - DOMINICAN', 'race_HISPANIC/LATINO - GUATEMALAN',
       'race_HISPANIC/LATINO - MEXICAN', 'race_HISPANIC/LATINO - PUERTO RICAN',
       

In [15]:
numericEncoder.var_

array([2.49201816e+01, 3.15868926e+02, 1.61161992e+04, 2.41337372e+01,
       3.83818604e+02, 6.56404635e-01, 1.38773956e-01, 1.23817091e-01,
       2.30565620e+02, 1.43210947e+03, 1.34447502e+00, 9.76057827e-02,
       4.24323646e+00, 6.00049835e-02, 1.28907516e-01, 2.67287497e+02,
       2.37177424e-01, 1.79090974e-01, 3.48843800e-02, 4.98370891e-02,
       2.27360176e-01, 7.15451470e+01, 1.74720321e+00, 1.20740434e+04,
       9.57369806e+06, 2.91573886e+01, 1.73463677e+02, 4.71046843e+02,
       1.02451655e+00, 6.86179390e+00, 5.64869171e-02, 1.04781470e-01,
       4.24510404e+01, 4.45400987e+02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [16]:
_, _, _, columnsTest = patientsToNumpy(testPatients, 1, CATEGORICAL_MEASURES, columns, categoryEncoder, numericEncoder)
columnsTest

ValueError: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.
