In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, random_split, DataLoader
import torch.nn as nn
from collections import Counter
import os
import zipfile
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix,f1_score, accuracy_score
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import KFold

In [2]:
# Colab-only setup: mount Google Drive at /content/drive. Later cells read the
# dataset archive and the patient metadata CSV from /content/drive/MyDrive/DeepHeart
# and write the model checkpoint there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Unpack the 6-lead ECG recordings from Drive onto the local Colab disk.
# `zipfile` is already imported in the notebook's import cell.
zip_path = "/content/drive/MyDrive/DeepHeart/outputs_6leads.zip"
output_dir = "/content"

with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=output_dir)

In [4]:
class ECG_Data(Dataset):
    """Dataset that yields one windowed ECG recording plus patient metadata.

    Each row of ``df`` is read positionally: column 0 is the recording's file
    stem, column 1 the age, column 2 the sex string and column 3 the label.
    The recording itself is loaded lazily from ``<data_dir>/<stem>.csv``.

    Args:
        df: Metadata frame with the column layout described above.
        window_size: Number of consecutive samples summed into one time step.
        data_dir: Directory that holds the per-recording CSV files.

    Raises:
        ValueError: If ``window_size`` is not a positive integer.
    """

    def __init__(self, df, window_size, data_dir="/content/outputs_6leads/"):
        if window_size <= 0:
            raise ValueError("window_size must be a positive integer")
        self.df = df
        self.window_size = window_size
        self.data_dir = data_dir

    def __len__(self):
        """Return the number of recordings (rows of the metadata frame)."""
        return len(self.df)

    def __getitem__(self, index):
        """Load recording ``index``.

        Returns:
            Tuple ``(windowed_lead_data, label, age, sex)`` where
            ``windowed_lead_data`` is a float32 tensor of shape
            ``(num_windows, num_leads)`` and the other three are 1-element
            integer tensors. ``sex`` is 0 for ``'Male'`` and 1 otherwise.
        """
        age = torch.tensor([int(self.df.iloc[index, 1])])
        sex = torch.tensor([0]) if self.df.iloc[index, 2] == 'Male' else torch.tensor([1])
        label = torch.tensor([int(self.df.iloc[index, 3])])

        file_name = os.path.join(self.data_dir, self.df.iloc[index, 0] + ".csv")
        data = pd.read_csv(file_name)
        # Column 0 of the recording file is skipped; columns 1..6 hold the leads.
        lead_data = data.iloc[:, 1:7].values.astype(float)

        # Use the instance's window size (not a notebook global) and drop the
        # trailing samples that do not fill a complete window.
        num_windows = len(lead_data) // self.window_size
        usable = lead_data[: num_windows * self.window_size]

        # Convert the whole array in one go instead of building a tensor from
        # a Python list of ndarrays, then sum the samples inside each window.
        windows = torch.as_tensor(usable, dtype=torch.float32).reshape(
            num_windows, self.window_size, lead_data.shape[1]
        )
        windowed_lead_data = windows.sum(dim=1)

        return windowed_lead_data, label, age, sex

In [5]:
# Load the per-patient metadata and drop rows with any missing value.
df = pd.read_csv("/content/drive/MyDrive/DeepHeart/patients_outputs_6leads.csv").dropna()

# Disabled experiment: undersample the majority class to 10,000 rows.
# df_ones = df[df['Label'] == 1].copy()
# df_zeros = df[df['Label'] == 0].sample(n=10000, random_state=42).copy()
# df_final = pd.concat([df_ones, df_zeros])
# df_final.reset_index(drop=True, inplace=True)
df_final = df

num_folds = 5
# Fix the shuffle seed so the folds are identical across kernel restarts;
# without it every run evaluates on a different split.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

In [6]:
# Build one (train, test) dataset pair per cross-validation fold.
window_size = 100
fold_datasets = [
    (
        ECG_Data(df_final.iloc[train_idx], window_size),
        ECG_Data(df_final.iloc[test_idx], window_size),
    )
    for train_idx, test_idx in kfold.split(df_final)
]

In [7]:
batch_size = 64

# One (train_loader, test_loader) pair per fold; only the training split is shuffled.
data_loaders = [
    (
        DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
        DataLoader(test_dataset, batch_size=batch_size, shuffle=False),
    )
    for train_dataset, test_dataset in fold_datasets
]

# The batch-inspection cell further down reads `train_loader`; bind the last
# fold's loaders explicitly instead of relying on a leaked loop variable.
train_loader, test_loader = data_loaders[-1]

# Always a torch.device, on GPU and on CPU alike.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.is_available()

True

In [8]:
# Sanity check: print the tensor shapes of the first training batch only.
for data, label, age, sex in train_loader:
  for batch_tensor in (data, age, label, sex):
    print(batch_tensor.size())
  break

# Per-class row counts of the metadata frame.
df_final.groupby('Label').count()

  windowed_lead_data = torch.Tensor(windowed_lead_data)


torch.Size([64, 50, 6])
torch.Size([64, 1])
torch.Size([64, 1])
torch.Size([64, 1])


Unnamed: 0_level_0,File_Name,Age,Sex
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,36972,36972,36972
1,8123,8123,8123


In [9]:
class Model(nn.Module):
    """Bidirectional LSTM over windowed ECG leads, fused with age/sex encodings.

    The LSTM output at the last time step is concatenated with sinusoidal
    encodings of ``age`` and ``sex`` and passed through two linear layers
    followed by a sigmoid.

    Args:
        input_size: Number of features per time step (number of leads).
        hidden_size: Hidden units per LSTM direction.
        num_layers: Number of stacked LSTM layers.
        device: Kept for backward compatibility; tensors created inside the
            model now follow the device of the inputs.
        embed_size: Width of each sinusoidal encoding (must be even).

    Raises:
        ValueError: If ``embed_size`` is odd.
    """

    def __init__(self, input_size, hidden_size, num_layers, device, embed_size=64):
        super(Model, self).__init__()
        if embed_size % 2 != 0:
            raise ValueError("embed_size must be even")
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device
        self.embed_size = embed_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        #self.self_attention = nn.MultiheadAttention(input_size, num_heads = <place the number of heads here>)

        # 2 * hidden_size from the two LSTM directions plus one encoding each
        # for age and sex. The previous `hidden_size * 3` only matched this
        # when hidden_size == 128; for that value the width is unchanged (384).
        fused_size = 2 * hidden_size + 2 * embed_size
        self.fc = nn.Sequential(nn.Linear(fused_size, 500),
                                nn.Linear(500, 2))

        self.sigmoid = nn.Sigmoid()

    def positional_encoding(self, var, size):
        """Encode a ``(batch, 1)`` scalar column as ``size`` sin/cos features.

        The first ``size // 2`` outputs are sines and the rest cosines of
        ``var`` scaled by geometrically spaced inverse frequencies.
        """
        # Build the frequencies on the input's device so the model does not
        # depend on `self.device` matching where the batch actually lives.
        inv_freq = 1.0 / (
            10000
            ** (torch.arange(0, size, 2, device=var.device).float() / size)
        )
        scaled = var.repeat(1, size // 2) * inv_freq
        return torch.cat([torch.sin(scaled), torch.cos(scaled)], dim=-1)

    def forward(self, lead_data, age, sex):
        """Return per-class scores in ``[0, 1]`` with shape ``(batch, 2)``.

        Args:
            lead_data: Float tensor ``(batch, seq_len, input_size)``.
            age: Tensor ``(batch, 1)``.
            sex: Tensor ``(batch, 1)``.
        """
        # nn.LSTM initialises h0/c0 to zeros on the input's device when they
        # are omitted, which is what the explicit zero tensors did before.
        lstm_out, _ = self.lstm(lead_data)

        #lead_data = lead_data.permute(1,0,2)
        #attention_out,_ = self.self_attention(lead_data,lead_data,lead_data)
        #attention_out = attention_out.permute(1,0,2)
        #last_output = attention_out[:,-1,:]

        # NOTE(review): for a bidirectional LSTM the backward half of the last
        # time step has only seen the final input. Left unchanged so existing
        # checkpoints keep producing the same outputs.
        last_output = lstm_out[:, -1, :]

        age = self.positional_encoding(age, self.embed_size)
        sex = self.positional_encoding(sex, self.embed_size)

        combined = torch.cat((last_output, age, sex), dim=1)

        output = self.fc(combined)   #applying final layers
        # NOTE(review): the sigmoid suits nn.BCELoss, but nn.CrossEntropyLoss
        # expects raw logits; confirm which loss this model is meant for.
        output = self.sigmoid(output)

        return output

In [10]:
# Disabled experiment: the string literal below holds random-forest and XGBoost
# baseline code. It is never executed; because it is the cell's only expression,
# the notebook simply echoes the string as the cell output.
# NOTE(review): the snippet reads from `ds`, which is not defined in any visible
# cell, and passes 'n_estimator' as the string '100' — both need attention
# before this code is re-enabled.
'''
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier

#data from torch tensor to numpy arrays, ds is the class instance of ECGDataset

lead_data,label,age,sex = ds[0]
lead_data = lead_data.numpy()
label = label.numpy()
age = age.numpy()
sex = sex.numpy()

lead_data_flattened = lead_data.reshape(lead_data.shape[0], -1)
features = np.hstack((lead_data_flattened, age.reshape(-1,1), sex.reshape(-1,1)))
#another approach is that we do not flatten the lead data as shown below
#features = np.concatenate((lead_data, np.repeat(age, lead_data.shape[0], axis=0)[:, np.newaxis], np.repeat(sex, lead_data.shape[0], axis=0)[:, np.newaxis]), axis=2)

X_train,X_test,y_train,y_test = train_test_split(features, label, test_size=0.2, random_state=42)

#this is the RF implementation
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


#this is xgb implementation
params = {
    'booster': 'gbtree',
    'n_estimator': '100',
    'eta': 0.1,
    'max_depth': 5,
    'lambda': 1.0,
    'objective': 'binary:logistic',
    'eval_metric': 'auc'
}

xgb_cl = xgb.XGBClassifier(**params)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
xgb_cl.fit(X_train, y_train)
y_pred = xgb_cl.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

'''

'\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import train_test_split\nimport xgboost as xgb\nfrom xgboost import XGBClassifier\n\n#data from torch tensor to numpy arrays, ds is the class instance of ECGDataset\n\nlead_data,label,age,sex = ds[0]\nlead_data = lead_data.numpy()\nlabel = label.numpy()\nage = age.numpy()\nsex = sex.numpy()\n\nlead_data_flattened = lead_data.reshape(lead_data.shape[0], -1)\nfeatures = np.hstack((lead_data_flattened, age.reshape(-1,1), sex.reshape(-1,1)))\n#another approach is that we do not flatten the lead data as shown below\n#features = np.concatenate((lead_data, np.repeat(age, lead_data.shape[0], axis=0)[:, np.newaxis], np.repeat(sex, lead_data.shape[0], axis=0)[:, np.newaxis]), axis=2)\n\nX_train,X_test,y_train,y_test = train_test_split(features, label, test_size=0.2, random_state=42)\n\n#this is the RF implementation\nrf = RandomForestClassifier(n_estimators=100, random_state=42)\nrf.fit(X_train, y_train)\ny_pred

In [11]:
# Label column as a NumPy array and the number of rows per class.
labels_lst = df_final['Label'].to_numpy()
class_freq = Counter(labels_lst)
num_classes = len(class_freq)

# Inverse-frequency weight for every class index 0..num_classes-1.
total_samples = labels_lst.shape[0]
inverse_frequency = []
for class_idx in range(num_classes):
    inverse_frequency.append(total_samples / (class_freq[class_idx] + 1e-8))
class_weights = torch.tensor(inverse_frequency, dtype=torch.float).to(device)
# The weight of class 1 is then overridden with a fixed value.
class_weights[1] = 4.5

# Weighted cross-entropy used as the training criterion.
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)
# criterion = torch.nn.CrossEntropyLoss()
print(class_freq, class_weights, np.unique(labels_lst, return_counts = True))

Counter({0: 36972, 1: 8123}) tensor([1.2197, 4.5000], device='cuda:0') (array([0, 1]), array([36972,  8123]))


In [18]:
def train(train_loader, test_loader, model, optimizer, loss_criteria = nn.BCELoss(), batchsize = batch_size, epochs = 50,
          save_path = '/content/drive/MyDrive/DeepHeart/deepheart_bilstm.pt'):
    """Train ``model`` and evaluate it on ``test_loader`` after every epoch.

    After each epoch the function prints a classification report, shows the
    confusion matrix and writes the model's ``state_dict`` to ``save_path``.

    Args:
        train_loader: Loader yielding ``(lead_data, labels, age, sex)`` batches.
        test_loader: Loader with the same batch layout, used for evaluation.
        model: Module called as ``model(lead_data, age, sex)`` that returns
            two scores per sample.
        optimizer: Optimizer over ``model``'s parameters.
        loss_criteria: Loss called with ``(outputs, one_hot_float_targets)``.
        batchsize: Unused; kept so existing calls keep working.
        epochs: Number of passes over ``train_loader``.
        save_path: Where the checkpoint is written after every epoch.

    Returns:
        Tuple ``(train_losses, val_losses)`` with one sample-averaged loss per
        epoch each.
    """
    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        train_loss = 0.0
        val_loss = 0.0
        min_loss = float('inf')  # smallest batch loss seen in this epoch

        model.train()
        for i, (lead_data, labels, age, sex) in enumerate(tqdm(train_loader)):
            lead_data = lead_data.to(device)
            age = age.to(device)
            sex = sex.to(device)

            # Flatten (batch, 1) -> (batch,) before one-hot encoding so the
            # target is always (batch, 2). An unqualified squeeze() would also
            # drop the batch dimension when a batch holds a single sample.
            targets = F.one_hot(labels.reshape(-1), num_classes=2).float().to(device)

            optimizer.zero_grad()
            outputs = model(lead_data, age, sex)
            loss = loss_criteria(outputs, targets)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            train_loss += batch_loss * lead_data.size(0)
            min_loss = min(min_loss, batch_loss)
            if (i + 1) % 50 == 0:
                print("EPOCH : {}/{}, MIN_LOSS : {}, LOSS : {}".format(epoch + 1, epochs, min_loss, batch_loss))

        train_loss /= max(len(train_loader.dataset), 1)
        train_losses.append(train_loss)

        # Validation
        model.eval()
        predictions = []
        actual = []
        with torch.no_grad():
            for lead_data, labels, age, sex in tqdm(test_loader):
                lead_data = lead_data.to(device)
                age = age.to(device)
                sex = sex.to(device)
                flat_labels = labels.reshape(-1)
                targets = F.one_hot(flat_labels, num_classes=2).float().to(device)

                outputs = model(lead_data, age, sex)
                val_loss += loss_criteria(outputs, targets).item() * lead_data.size(0)

                # Collect whole batches at once instead of one .item() per sample.
                predictions.extend(outputs.argmax(dim=1).tolist())
                actual.extend(flat_labels.tolist())

        val_loss /= max(len(test_loader.dataset), 1)
        val_losses.append(val_loss)

        # Pin the label set so the matrix stays 2x2 even if one class is
        # missing from both the targets and the predictions.
        cm = confusion_matrix(actual, predictions, labels=[0, 1])
        print(classification_report(actual, predictions))
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
        disp.plot()
        plt.show()
        torch.save(model.state_dict(), save_path)

    return train_losses, val_losses

In [20]:
# Hyperparameters are identical for every fold, so they are set once.
input_size = 6
num_layers = 2
# hidden_size = 256
hidden_size = 128
checkpoint_path = '/content/drive/MyDrive/DeepHeart/deepheart_bilstm.pt'

# train() overwrites this single checkpoint after every epoch. Loading it at the
# start of a fold would initialise the model with weights fitted on a previous
# fold's training split, which contains this fold's test samples, and so would
# inflate the reported metrics. Each fold therefore starts from fresh weights
# unless resuming is explicitly requested.
resume_from_checkpoint = False

for fold_idx, (train_loader, test_loader) in enumerate(data_loaders):
    model = Model(input_size, hidden_size, num_layers, device).to(device)
    if resume_from_checkpoint:
        try:
            model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        except FileNotFoundError:
            # Only a missing file is expected here; shape mismatches and other
            # load errors should surface instead of being silently ignored.
            print("no saved model found")
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0003)

    train(train_loader, test_loader, model, optimizer, loss_criteria = criterion, epochs = 15)
    print("=====================================================STEP : {}/{} COMPLETE=====================================================".format(fold_idx + 1, num_folds))

  2%|▏         | 11/564 [00:16<13:37,  1.48s/it]


KeyboardInterrupt: ignored