In [1]:
import os
import glob

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange
from sklearn.metrics import confusion_matrix
import random
from matplotlib.ticker import MaxNLocator

# Seed
seed = 2022
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

FOLDERNAME = 'Colab\ Notebooks/3_Split'
%cd drive/MyDrive/$FOLDERNAME

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/3_Split


In [3]:
label_dict = {
    "Non-stroke": 0,
    "ICH": 1,
    "Ischemic": 2,
}

In [4]:
class EGGLSTMDataset(Dataset):
    
    def __init__(self, data_path, label_dict):
        
        self.files = sorted(glob.glob(os.path.join(data_path, "*.txt")))
        self.n = len(self.files)
        self.label_dict = label_dict
        
    def __len__(self):
        return self.n
    
    def __getitem__(self, idx):
        with open(self.files[idx]) as f:
            data = f.readlines()
        x = np.array([np.array([float(v.strip()) for v in d.split(',')[:-1]])[None, :] for d in data[0:4]])
        x = np.concatenate(x)
        # x = x / np.std(x, axis=1, keepdims=True)
        y = label_dict[data[4]]
        x = torch.Tensor(x)
        y = torch.Tensor([y])
        return x, y


In [5]:
ds_train = EGGLSTMDataset("Train_oversample_rnn_new", label_dict)
print("ds_train", len(ds_train))
ds_val = EGGLSTMDataset("Val_rnn", label_dict)
labels = [int(y) for _, y in ds_val]
print("ds_val", len(ds_val))

x, y = ds_train[2851]
print(x.shape)
print(y)

ds_train 3327
ds_val 241
torch.Size([4, 1000])
tensor([1.])


In [6]:
# fig = plt.figure(figsize=(20, 10))

# for i in range(4):
#     plt.subplot(4, 1, i + 1)
#     plt.plot(x[i, :])

## Netwok

In [7]:
model = nn.LSTM(input_size=4, hidden_size=1024, num_layers=2, bidirectional=False, proj_size=2)
model

LSTM(4, 1024, proj_size=2, num_layers=2)

In [8]:
## 如果要load特定model 請改back up之model名
# model.load_state_dict(torch.load("backup_rnn_base_h1024_L2_1e-3epoch000400.pth"))

In [9]:
outputs, _ = model(x.transpose(0, 1).unsqueeze(1))
outputs.shape

torch.Size([1000, 1, 2])

In [10]:
device = torch.device("cuda:0")
model = model.to(device)

## Data Loader

In [11]:
dl_train = DataLoader(ds_train, shuffle=True, batch_size=20, drop_last=True)
dl_val = DataLoader(ds_val, shuffle=False, batch_size=20, drop_last=False)

## Train

In [12]:
def val(model, dl_val, device, labels):
  model.eval()
  predictions = []
  with torch.no_grad():
      for (x, y) in dl_val:
          x = x.to(device)
          y = y.to(device)
          outputs, _ = model(x.transpose(0, 2).transpose(1, 2))
          predictions.append(outputs[-1, :, :])
  
  predictions = np.concatenate([p.cpu().numpy() for p in predictions], axis=0)
  preds = np.argmax(predictions, axis=1)
  cm_norm = confusion_matrix(labels, preds, normalize="true")
  cm_pred = confusion_matrix(labels, preds)
  acc = np.sum(labels == preds) / len(labels)
  return cm_norm, cm_pred, preds, labels, acc
  

In [13]:
print_every = 10
backup_every = 100
model_name = 'backup_rnn_base_3E-4_500'
n_epoch = 500
lr = 0.0003
optimizer = optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()

loss_log = []
c0_log = []
c1_log = []
c2_log = []
seen = 0

for i in tqdm(range(n_epoch)):
    j = 0
    for (x, y) in dl_train:
        model.train()
        # print("x", x.size(), "y", y.size())
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        outputs, _ = model(x.transpose(0, 2).transpose(1, 2))
        # print("outputs", outputs.size())
        loss = loss_fn(outputs[-1, :, :], y[:, 0].long())
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            j += loss
            seen = seen + x.size(0)
    j /= len(dl_train)
    
    if (i+1) % print_every == 0:
        print('epoch: ', i, 'cost: ', j.item())
        cm, acc = val(model, dl_val, device, labels)
        # print(cm[0][0], cm[1][1], cm[2][2], acc)
        c0_log.append(float(cm[0][0]))
        c1_log.append(float(cm[1][1]))
        c2_log.append(float(cm[2][2]))
    if (i+1) % backup_every == 0:
      torch.save(model.state_dict(), "{}_{:06d}.pth".format(model_name, n_epoch))   
    loss_log.append(j)


  0%|          | 0/500 [00:00<?, ?it/s]

epoch:  9 cost:  0.6050198078155518


ValueError: ignored

In [None]:
# plot the cost history
plt.plot([loss_log[i].item() for i in range(len(loss_log))])
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
plt.xlabel('iteration')
plt.ylabel('training cost')
plt.title('Training Cost history')
plt.show()

In [None]:
# plot the cm
plt.plot(c0_log, label='class 0')
plt.plot(c1_log, label='class 1')
plt.plot(c2_log, label='class 2')
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
plt.xlabel('every ' + str(print_every) +' epoch')
plt.ylabel('ACC')
plt.title('ACC history')
plt.legend()
plt.show()

In [None]:
# model.eval()

# predictions = []
# with torch.no_grad():
#     for (x, y) in tqdm(dl_val):
#         x = x.to(device)
#         y = y.to(device)
#         outputs, _ = model(x.transpose(0, 2).transpose(1, 2))
#         predictions.append(outputs[-1, :, :])

In [None]:
# predictions = np.concatenate([p.cpu().numpy() for p in predictions], axis=0)
# print("predictions", predictions.shape)

In [None]:
# labels = [int(y) for _, y in ds_val]
# preds = np.argmax(predictions, axis=1)
# cm = confusion_matrix(labels, preds, normalize="true")
# cm

In [None]:
# acc = np.sum(labels == preds) / len(labels)
# print('Acc: ', acc)

In [None]:
cm_norm, cm_pred, preds, labels, acc = val(model, dl_val, device, labels)

In [None]:
print(cm_norm)
print(cm_pred)

In [None]:
print('Acc: ', acc)


In [None]:
import pandas as pd
pred_df = pd.DataFrame(
        preds,
        labels   
)
pred_df

In [None]:
pred_df.to_csv('{}.csv'.format(model_name))