In [1]:
# %matplotlib ipympl
# Load the autoreload extension
%load_ext autoreload
# Set autoreload to reload all modules
%autoreload 2

import cv2
import matplotlib.pyplot as plt
import numpy as np
import json
import pandas as pd
import torch
import torch.nn.functional as F
import os
import psutil
from utils.preprocess import *

# Report how many float64 values would fit in the RAM that psutil currently
# reports as available (a rough upper bound for in-memory array sizes).
available_memory = psutil.virtual_memory().available

# Bytes per element, assuming np.float64 storage
element_size = np.dtype(np.float64).itemsize

# An element count is a whole number, so use floor division; true division
# would yield a float and print a trailing ".0".
max_array_size = available_memory // element_size

print(f"Maximum array size (np.float64): {max_array_size} elements")
# from mpl_interactions import ipyplot as iplt
# from matplotlib.cm import ScalarMappable

Maximum array size (np.float64): 36735434752.0 elements


In [9]:
# Data loading Version 2 - toy data previously
# Loads (or builds and caches) the raw pressure matrices and timestamps for a
# single patient directory, then displays the array shape, its maximum value
# and the index of that maximum.
# data_root = '/scratch/bbsg/hangy6/RLS/data' # delta data root
data_root = 'data' # HAL data root
# Indexed by patient number; '' marks a patient number with no directory listed.
patient_dirs = [
    '',
    f'{data_root}/patient01-08-27-2023',
    '',
    '',
    '',
    f'{data_root}/patient05-02-15-2024',
    f'{data_root}/patient06-02-17-2024',
    '',
    '',
    f'{data_root}/patient09-03-01-2024',
    '',
    f'{data_root}/patient11-03-15-2024',
    '',
    f'{data_root}/patient13-03-31-2024',
    f'{data_root}/patient14-04-03-2024',
    f'{data_root}/patient15-04-12-2024-relabeled',
    f'{data_root}/patient16-04-13-2024',
    f'{data_root}/patient17-04-14-2024',
    f'{data_root}/patient18-04-15-2024',
    f'{data_root}/patient19-04-16-2024',
    f'{data_root}/patient20-04-18-2024',
    f'{data_root}/patient21-04-26-2024',
    f'{data_root}/patient22-04-27-2024',
    f'{data_root}/patient23-04-28-2024',
    f'{data_root}/patient24-04-29-2024',
    f'{data_root}/patient25-05-10-2024',
    f'{data_root}/patient26-05-11-2024',
    f'{data_root}/patient27-05-13-2024',
    f'{data_root}/patient28-05-13-2024',
    f'{data_root}/patient29-05-14-2024-relabeled',
    f'{data_root}/patient30-05-25-2024',
    f'{data_root}/patient31-05-27-2024',
    f'{data_root}/patient32-05-28-2024',
    '',
    f'{data_root}/patient34-05-30-2024',
    f'{data_root}/patient35-06-06-2024',
    f'{data_root}/patient36-06-07-2024',
    f'{data_root}/patient37-06-08-2024',
    f'{data_root}/patient38-06-09-2024',
    f'{data_root}/patient39-06-10-2024',
    f'{data_root}/patient40-06-11-2024',
    f'{data_root}/patient41-06-12-2024',
    f'{data_root}/patient42-06-21-2024',
]

patient_dir = patient_dirs[34]
if not patient_dir:
    # An empty entry would make every path below resolve against the current
    # working directory, so fail loudly instead.
    raise ValueError('Selected patient index has no data directory configured')

print(f'Memory available before loading data: {psutil.virtual_memory().available}')
# Drop arrays left over from a previous run of this cell so their memory can be
# reclaimed before loading. On a fresh kernel the names do not exist yet, which
# is the only error worth ignoring here.
try:
    del matrices, timestamps, data
except NameError:
    pass

data_file = os.path.join(patient_dir, 'raw_pressure_data.npy')
timestamps_file = os.path.join(patient_dir, 'raw_timestamps.npy')
if os.path.exists(data_file) and os.path.exists(timestamps_file):
    print('load existing pressure data...')
    matrices = np.load(data_file).astype(np.float32)
    timestamps = np.load(timestamps_file)
    print(f'successfully loaded pressure data from {data_file} and timestamps from {timestamps_file}')
else:
    print('Generating pressure data from raw json file...')
    pressure_data = load_json(os.path.join(patient_dir, 'raw_data.json'))
    matrices, timestamps = get_pressure_matrices(pressure_data)
    # The return types of get_pressure_matrices are not visible here; they may
    # be plain sequences, so convert before saving -- TODO confirm.
    matrices = np.asarray(matrices)
    timestamps = np.asarray(timestamps)
    np.save(data_file, matrices)
    np.save(timestamps_file, timestamps)
    print('Successfully saved pressure data and timestamps')
    # Match the cached branch so later cells see float32 on the first run too.
    matrices = matrices.astype(np.float32)
# matrices = 5 * (matrices / 500)**2
matrices.shape, matrices.max(), np.unravel_index(matrices.argmax(), shape=matrices.shape)

Memory available before loading data: 271541338112
load existing pressure data...
successfully loaded pressure data from data/patient34-05-30-2024/raw_pressure_data.npy and timestamps from data/patient34-05-30-2024/raw_timestamps.npy


((161372, 48, 22), 7340.0, (126346, 28, 14))

In [10]:
# Build windowed TAL clips and labels for the currently loaded patient and,
# when save_data is set, write three variants next to the raw data:
#   * a 70/30 train/val split,
#   * everything as "train" (split_ratio=1.0), saved with a _full suffix,
#   * everything as "val"   (split_ratio=0),   saved with a _full suffix.
save_data = True
clip_len = 40
prune_negative = False
overlap_train = False

EMG_label_path = f'{patient_dir}/positive_timestamps.csv'
wake_mask = f'{patient_dir}/wake_mask.csv'

# Keyword arguments shared by every make_tal_labels call below; only
# split_ratio differs between the calls.
tal_kwargs = dict(
    filter_timestamp=wake_mask,
    clip_len=clip_len,
    overlap_train=overlap_train,
    plot_statistics=False,
    calibrate_value=4.65,
    prune_negative=prune_negative,
)

train_data, train_label, val_data, val_label, _ = make_tal_labels(
    matrices, timestamps, EMG_label_path, split_ratio=0.7, **tal_kwargs)

# positive_idx = train_label > 0
# negative_idx = train_label == 0
# positive_data = train_data[positive_idx]
# negative_data = train_data[negative_idx]
# batch1_positive = positive_data[:, ...]
# # batch2_positive = positive_data[50:100, ...]
# batch1_negative = negative_data[:, ...]
# # batch2_negative = negative_data[25:100, ...]
# batch1 = np.concatenate([batch1_positive, batch1_negative], axis=0)
# # batch2 = np.concatenate([batch2_positive, batch2_negative], axis=0)
# cosine_matrix = cosine_similarity(batch1, batch1)
# plt.imshow(cosine_matrix)
# plt.colorbar()
# len_pos, N = len(positive_data), len(train_data)
# # plt.plot([len_pos] * N, np.arange(N), color='red')
# # plt.plot(np.arange(N), [len_pos] * N, color='red')
# plt.savefig(f'{patient_dir}/win{clip_len}_data_cosine_similarity.png')
if save_data:
    # 70/30 split
    np.save(f'{patient_dir}/win{clip_len}_tal_train_data.npy', train_data)
    np.save(f'{patient_dir}/win{clip_len}_tal_val_data.npy', val_data)
    np.save(f'{patient_dir}/win{clip_len}_tal_train_label.npy', train_label)
    np.save(f'{patient_dir}/win{clip_len}_tal_val_label.npy', val_label)
    # Flatten only after saving so the files keep the per-clip label layout.
    train_label = train_label.reshape(-1)
    val_label = val_label.reshape(-1)
    print(f'positive train: {train_label.sum()}, negative train: {len(train_label) - train_label.sum()}')
    print(f'positive val: {val_label.sum()}, negative val: {len(val_label) - val_label.sum()}')

    # Whole recording exported as the training set
    train_data, train_label, val_data, val_label, _ = make_tal_labels(
        matrices, timestamps, EMG_label_path, split_ratio=1.0, **tal_kwargs)
    np.save(f'{patient_dir}/win{clip_len}_tal_train_data_full.npy', train_data)
    np.save(f'{patient_dir}/win{clip_len}_tal_train_label_full.npy', train_label)

    # Whole recording exported as the validation set
    train_data, train_label, val_data, val_label, _ = make_tal_labels(
        matrices, timestamps, EMG_label_path, split_ratio=0, **tal_kwargs)
    np.save(f'{patient_dir}/win{clip_len}_tal_val_data_full.npy', val_data)
    np.save(f'{patient_dir}/win{clip_len}_tal_val_label_full.npy', val_label)
    val_label = val_label.reshape(-1)
    print(f'positive val: {val_label.sum()}, negative val: {len(val_label) - val_label.sum()}, total: {len(val_label)}')

(3011, 40, 48, 22)
timestamps at break: 2024-05-31T04:53:33.7890066-05:00
(3011, 40) 0.0 1.0
(2107, 40, 48, 22) (2107, 40) (904, 40, 48, 22) (904, 40) 2817.0 580.0
positive train: 2817.0, negative train: 81463.0
positive train: 580.0, negative train: 35580.0
(3011, 40, 48, 22)
timestamps at break: 2024-05-31T04:53:33.7890066-05:00
(3011, 40) 0.0 1.0
(3011, 40, 48, 22) (3011, 40) (0, 40, 48, 22) (0, 40) 3397.0 0.0
(3011, 40, 48, 22)
timestamps at break: 2024-05-31T04:53:33.7890066-05:00
(3011, 40) 0.0 1.0
(0, 40, 48, 22) (0, 40) (3011, 40, 48, 22) (3011, 40) 0.0 3397.0
positive val: 3397.0, negative val: 117043.0, total: 120440


# Merged Cell

In [None]:
# Batch version of the single-patient cells: for every configured patient from
# index 15 onwards, load (or build and cache) the raw pressure data, then export
# TAL windows of length clip_len as a 70/30 split plus "all train" and
# "all val" variants.
# Data loading Version 2 - toy data previously
# data_root = '/scratch/bbsg/hangy6/RLS/data' # delta data root
data_root = 'data' # HAL data root
# Indexed by patient number; '' marks a patient number with no directory listed.
patient_dirs = [
    '',
    f'{data_root}/patient01-08-27-2023',
    '',
    '',
    '',
    f'{data_root}/patient05-02-15-2024',
    f'{data_root}/patient06-02-17-2024',
    '',
    '',
    f'{data_root}/patient09-03-01-2024',
    '',
    f'{data_root}/patient11-03-15-2024',
    '',
    f'{data_root}/patient13-03-31-2024',
    f'{data_root}/patient14-04-03-2024',
    f'{data_root}/patient15-04-12-2024-relabeled',
    f'{data_root}/patient16-04-13-2024',
    f'{data_root}/patient17-04-14-2024',
    f'{data_root}/patient18-04-15-2024',
    f'{data_root}/patient19-04-16-2024',
    f'{data_root}/patient20-04-18-2024',
    f'{data_root}/patient21-04-26-2024',
    f'{data_root}/patient22-04-27-2024',
    f'{data_root}/patient23-04-28-2024',
    f'{data_root}/patient24-04-29-2024',
    f'{data_root}/patient25-05-10-2024',
    f'{data_root}/patient26-05-11-2024',
    f'{data_root}/patient27-05-13-2024',
    f'{data_root}/patient28-05-13-2024',
    f'{data_root}/patient29-05-14-2024-relabeled',
    f'{data_root}/patient30-05-25-2024',
    f'{data_root}/patient31-05-27-2024',
    f'{data_root}/patient32-05-28-2024',
    '',
    f'{data_root}/patient34-05-30-2024',
    f'{data_root}/patient35-06-06-2024',
    f'{data_root}/patient36-06-07-2024',
    f'{data_root}/patient37-06-08-2024',
    f'{data_root}/patient38-06-09-2024',
    f'{data_root}/patient39-06-10-2024',
    f'{data_root}/patient40-06-11-2024',
    f'{data_root}/patient41-06-12-2024',
    f'{data_root}/patient42-06-21-2024',
]

# Export settings (identical for every patient, so defined once up front)
save_data = True
clip_len = 100
prune_negative = False
overlap_train = False

for i in range(15, len(patient_dirs)):
    patient_dir = patient_dirs[i]
    if not patient_dir:
        # No directory configured for this patient number
        continue

    print(f'Memory available before loading data: {psutil.virtual_memory().available}')
    # Release the previous patient's arrays before loading the next one. The
    # names may not exist yet (first iteration on a fresh kernel); that is the
    # only error worth ignoring here.
    try:
        del matrices, timestamps, data
    except NameError:
        pass

    data_file = os.path.join(patient_dir, 'raw_pressure_data.npy')
    timestamps_file = os.path.join(patient_dir, 'raw_timestamps.npy')
    if os.path.exists(data_file) and os.path.exists(timestamps_file):
        print('load existing pressure data...')
        matrices = np.load(data_file).astype(np.float32)
        timestamps = np.load(timestamps_file)
        print(f'successfully loaded pressure data from {data_file} and timestamps from {timestamps_file}')
    else:
        print('Generating pressure data from raw json file...')
        pressure_data = load_json(os.path.join(patient_dir, 'raw_data.json'))
        matrices, timestamps = get_pressure_matrices(pressure_data)
        # The return types of get_pressure_matrices are not visible here; they
        # may be plain sequences, so convert before saving -- TODO confirm.
        matrices = np.asarray(matrices)
        timestamps = np.asarray(timestamps)
        np.save(data_file, matrices)
        np.save(timestamps_file, timestamps)
        print('Successfully saved pressure data and timestamps')
        # Match the cached branch so labelling sees float32 on the first run too.
        matrices = matrices.astype(np.float32)

    EMG_label_path = f'{patient_dir}/positive_timestamps.csv'
    wake_mask = f'{patient_dir}/wake_mask.csv'

    # Keyword arguments shared by every make_tal_labels call below; only
    # split_ratio differs between the calls.
    tal_kwargs = dict(
        filter_timestamp=wake_mask,
        clip_len=clip_len,
        overlap_train=overlap_train,
        plot_statistics=False,
        calibrate_value=4.65,
        prune_negative=prune_negative,
    )

    train_data, train_label, val_data, val_label, _ = make_tal_labels(
        matrices, timestamps, EMG_label_path, split_ratio=0.7, **tal_kwargs)

    if save_data:
        # 70/30 split
        np.save(f'{patient_dir}/win{clip_len}_tal_train_data.npy', train_data)
        np.save(f'{patient_dir}/win{clip_len}_tal_val_data.npy', val_data)
        np.save(f'{patient_dir}/win{clip_len}_tal_train_label.npy', train_label)
        np.save(f'{patient_dir}/win{clip_len}_tal_val_label.npy', val_label)
        # Flatten only after saving so the files keep the per-clip label layout.
        train_label = train_label.reshape(-1)
        val_label = val_label.reshape(-1)
        print(f'positive train: {train_label.sum()}, negative train: {len(train_label) - train_label.sum()}')
        print(f'positive val: {val_label.sum()}, negative val: {len(val_label) - val_label.sum()}')

        # Whole recording exported as the training set
        train_data, train_label, val_data, val_label, _ = make_tal_labels(
            matrices, timestamps, EMG_label_path, split_ratio=1.0, **tal_kwargs)
        np.save(f'{patient_dir}/win{clip_len}_tal_train_data_full.npy', train_data)
        np.save(f'{patient_dir}/win{clip_len}_tal_train_label_full.npy', train_label)

        # Whole recording exported as the validation set
        train_data, train_label, val_data, val_label, _ = make_tal_labels(
            matrices, timestamps, EMG_label_path, split_ratio=0, **tal_kwargs)
        np.save(f'{patient_dir}/win{clip_len}_tal_val_data_full.npy', val_data)
        np.save(f'{patient_dir}/win{clip_len}_tal_val_label_full.npy', val_label)
        val_label = val_label.reshape(-1)
        print(f'positive val: {val_label.sum()}, negative val: {len(val_label) - val_label.sum()}, total: {len(val_label)}')

In [5]:
# For every configured patient from index 15 onwards, load (or build and cache)
# the raw pressure data, then export context windows of length clip_len as a
# 70/30 split plus "all train" and "all val" variants.
# Data loading Version 2 - toy data previously
# data_root = '/scratch/bbsg/hangy6/RLS/data' # delta data root
data_root = 'data' # HAL data root
# Indexed by patient number; '' marks a patient number with no directory listed.
patient_dirs = [
    '',
    f'{data_root}/patient01-08-27-2023',
    '',
    '',
    '',
    f'{data_root}/patient05-02-15-2024',
    f'{data_root}/patient06-02-17-2024',
    '',
    '',
    f'{data_root}/patient09-03-01-2024',
    '',
    f'{data_root}/patient11-03-15-2024',
    '',
    f'{data_root}/patient13-03-31-2024',
    f'{data_root}/patient14-04-03-2024',
    f'{data_root}/patient15-04-12-2024-relabeled',
    f'{data_root}/patient16-04-13-2024',
    f'{data_root}/patient17-04-14-2024',
    f'{data_root}/patient18-04-15-2024',
    f'{data_root}/patient19-04-16-2024',
    f'{data_root}/patient20-04-18-2024',
    f'{data_root}/patient21-04-26-2024',
    f'{data_root}/patient22-04-27-2024',
    f'{data_root}/patient23-04-28-2024',
    f'{data_root}/patient24-04-29-2024',
    f'{data_root}/patient25-05-10-2024',
    f'{data_root}/patient26-05-11-2024',
    f'{data_root}/patient27-05-13-2024',
    f'{data_root}/patient28-05-13-2024',
    f'{data_root}/patient29-05-14-2024-relabeled',
    f'{data_root}/patient30-05-25-2024',
    f'{data_root}/patient31-05-27-2024',
    f'{data_root}/patient32-05-28-2024',
    '',
    f'{data_root}/patient34-05-30-2024',
    f'{data_root}/patient35-06-06-2024',
    f'{data_root}/patient36-06-07-2024',
    f'{data_root}/patient37-06-08-2024',
    f'{data_root}/patient38-06-09-2024',
    f'{data_root}/patient39-06-10-2024',
    f'{data_root}/patient40-06-11-2024',
    f'{data_root}/patient41-06-12-2024',
    f'{data_root}/patient42-06-21-2024',
]

# Export settings (identical for every patient, so defined once up front)
save_data = True
clip_len = 6

for i in range(15, len(patient_dirs)):
    patient_dir = patient_dirs[i]
    if not patient_dir:
        # No directory configured for this patient number
        continue

    print(f'Memory available before loading data: {psutil.virtual_memory().available}')
    # Release the previous patient's arrays before loading the next one. The
    # names may not exist yet (first iteration on a fresh kernel); that is the
    # only error worth ignoring here.
    try:
        del matrices, timestamps, data
    except NameError:
        pass

    data_file = os.path.join(patient_dir, 'raw_pressure_data.npy')
    timestamps_file = os.path.join(patient_dir, 'raw_timestamps.npy')
    if os.path.exists(data_file) and os.path.exists(timestamps_file):
        print('load existing pressure data...')
        matrices = np.load(data_file).astype(np.float32)
        timestamps = np.load(timestamps_file)
        print(f'successfully loaded pressure data from {data_file} and timestamps from {timestamps_file}')
    else:
        print('Generating pressure data from raw json file...')
        pressure_data = load_json(os.path.join(patient_dir, 'raw_data.json'))
        matrices, timestamps = get_pressure_matrices(pressure_data)
        # The return types of get_pressure_matrices are not visible here; they
        # may be plain sequences, so convert before saving -- TODO confirm.
        matrices = np.asarray(matrices)
        timestamps = np.asarray(timestamps)
        np.save(data_file, matrices)
        np.save(timestamps_file, timestamps)
        print('Successfully saved pressure data and timestamps')
        # Match the cached branch so labelling sees float32 on the first run too.
        matrices = matrices.astype(np.float32)

    EMG_label_path = f'{patient_dir}/positive_timestamps.csv'
    train_data, train_label, val_data, val_label = make_context_labels(
        matrices, timestamps, EMG_label_path,
        clip_len=clip_len, split_ratio=0.7, calibrate_value=4.65)

    if save_data:
        # 70/30 split
        np.save(f'{patient_dir}/win{clip_len}_context_train_data.npy', train_data)
        np.save(f'{patient_dir}/win{clip_len}_context_val_data.npy', val_data)
        np.save(f'{patient_dir}/win{clip_len}_context_train_label.npy', train_label)
        np.save(f'{patient_dir}/win{clip_len}_context_val_label.npy', val_label)
        # Flatten only after saving so the files keep the original label layout.
        train_label = train_label.reshape(-1)
        val_label = val_label.reshape(-1)
        print(f'positive train: {train_label.sum()}, negative train: {len(train_label) - train_label.sum()}')
        print(f'positive val: {val_label.sum()}, negative val: {len(val_label) - val_label.sum()}')

        # Whole recording exported as the training set
        train_data, train_label, val_data, val_label = make_context_labels(
            matrices, timestamps, EMG_label_path,
            clip_len=clip_len, split_ratio=1.0, calibrate_value=4.65)
        np.save(f'{patient_dir}/win{clip_len}_context_train_data_full.npy', train_data)
        np.save(f'{patient_dir}/win{clip_len}_context_train_label_full.npy', train_label)

        # Whole recording exported as the validation set
        train_data, train_label, val_data, val_label = make_context_labels(
            matrices, timestamps, EMG_label_path,
            clip_len=clip_len, split_ratio=0, calibrate_value=4.65)
        np.save(f'{patient_dir}/win{clip_len}_context_val_data_full.npy', val_data)
        np.save(f'{patient_dir}/win{clip_len}_context_val_label_full.npy', val_label)
        val_label = val_label.reshape(-1)
        print(f'positive val: {val_label.sum()}, negative val: {len(val_label) - val_label.sum()}, total: {len(val_label)}')

Memory available before loading data: 284837347328
load existing pressure data...
successfully loaded pressure data from data/patient15-04-12-2024-relabeled/raw_pressure_data.npy and timestamps from data/patient15-04-12-2024-relabeled/raw_timestamps.npy
timestamps at break: 2024-04-13T05:27:30.2963677-05:00
positive train: 16038.0, negative train: 16038.0
positive train: 2573.0, negative train: 53054.0
timestamps at break: 2024-04-13T05:27:30.2963677-05:00
timestamps at break: 2024-04-13T05:27:30.2963677-05:00
positive val: 18611.0, negative val: 166810.0, total: 185421
Memory available before loading data: 278322479104
load existing pressure data...
successfully loaded pressure data from data/patient16-04-13-2024/raw_pressure_data.npy and timestamps from data/patient16-04-13-2024/raw_timestamps.npy
timestamps at break: 2024-04-14T05:24:09.4178089-05:00
positive train: 1827.0, negative train: 1827.0
positive train: 1094.0, negative train: 53044.0
timestamps at break: 2024-04-14T05:24:0