<a href="https://colab.research.google.com/github/Enterprise-D/sc_multimodal/blob/main/multimodal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Multiome Data Preprocessing

Install and load the HDF5 modules:

In [None]:
!pip install hdf5plugin
import h5py
import hdf5plugin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hdf5plugin
  Downloading hdf5plugin-3.3.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.7 MB)
[K     |████████████████████████████████| 9.7 MB 5.2 MB/s 
Installing collected packages: hdf5plugin
Successfully installed hdf5plugin-3.3.1


Open the input files:

In [None]:
# Locations of the dense train/test input tables.
path_train_multi_inputs = '/content/drive/MyDrive/multimodal/train_multi_inputs.h5'
path_test_multi_inputs = '/content/drive/MyDrive/multimodal/test_multi_inputs.h5'

# Open both files read-only. The handles are kept open on purpose: later
# cells read dataset shapes through them.
file_train_multi_inputs = h5py.File(path_train_multi_inputs, mode="r")
file_test_multi_inputs = h5py.File(path_test_multi_inputs, mode="r")

# Display the top-level keys of each file.
file_train_multi_inputs.keys(), file_test_multi_inputs.keys()

(<KeysViewHDF5 ['train_multi_inputs']>, <KeysViewHDF5 ['test_multi_inputs']>)

In [None]:
# Each file holds a single top-level group; grab it so its datasets can be
# inspected directly.
group_train_multi_inputs = file_train_multi_inputs['train_multi_inputs']

group_test_multi_inputs = file_test_multi_inputs['test_multi_inputs']

# Display the names of the datasets stored inside each group.
group_train_multi_inputs.keys(), group_test_multi_inputs.keys()

(<KeysViewHDF5 ['axis0', 'axis1', 'block0_items', 'block0_values']>,
 <KeysViewHDF5 ['axis0', 'axis1', 'block0_items', 'block0_values']>)

In [None]:
# Display the shapes of the train and test value matrices.
group_train_multi_inputs['block0_values'].shape, \
group_test_multi_inputs['block0_values'].shape
# Layout of the group:
# axis0 & block0_items: features (genomic coordinates, GRCh38)
# axis1: cell_ids
# block0_values: the matrix entries (one row per cell, one column per feature)

((105942, 228942), (55935, 228942))

In [None]:
# Peek at a small corner of the training inputs (rows 1-9, columns 1-5).
group_train_multi_inputs['block0_values'][1:10,1:6]

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]], dtype=float32)

Open the target file:

In [None]:
# Location of the dense training target table.
path_train_multi_targets = '/content/drive/MyDrive/multimodal/train_multi_targets.h5'

# Open read-only; the handle stays open because later cells read the dataset
# shape through it.
file_train_multi_targets = h5py.File(path_train_multi_targets, mode="r")

# Display the top-level keys of the file.
file_train_multi_targets.keys()

<KeysViewHDF5 ['train_multi_targets']>

In [None]:
# The file holds a single top-level group; grab it and list its datasets.
group_train_multi_targets = file_train_multi_targets['train_multi_targets']
group_train_multi_targets.keys()

<KeysViewHDF5 ['axis0', 'axis1', 'block0_items', 'block0_values']>

In [None]:
# Display the shape of the training target matrix.
group_train_multi_targets['block0_values'].shape
# Layout of the group:
# axis0 & block0_items: features (expression reads, GRCh38)
# axis1: cell_ids
# block0_values: the matrix entries (one row per cell, one column per feature)

(105942, 23418)

In [None]:
# Peek at a small corner of the training targets (rows 1-9, columns 1-5).
group_train_multi_targets['block0_values'][1:10,1:6]

array([[0.       , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       ],
       [4.5079365, 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       ]],
      dtype=float32)

In [None]:
import numpy as np
import pandas as pd
import os
import scipy.sparse as sps
from tqdm import tqdm as tqdm
import gc

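The datasets are very sparse, so the dense HDF5 tables are converted chunk by chunk into CSR sparse matrices. A small Cython helper builds the row pointers: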

In [None]:
%load_ext Cython

In [None]:
%%cython

import cython
cimport cython
cimport numpy as np
import numpy as np
from tqdm import tqdm, trange

ctypedef np.int64_t INT64_t

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[INT64_t, ndim=1] create_indptr(INT64_t[:] row_indices, INT64_t start_pos, int nrows):
    """Compress sorted COO row indices into CSR row-start offsets.

    Parameters
    ----------
    row_indices : int64 memoryview
        Row index of every non-zero entry, in non-decreasing order (the order
        ``ndarray.nonzero()`` returns them in).
    start_pos : int
        Offset added to every row start, i.e. the number of non-zero entries
        that precede this chunk in the overall matrix.
    nrows : int
        Number of rows in the chunk.

    Returns
    -------
    ndarray of int64, shape (nrows,)
        ``result[r]`` is ``start_pos`` plus the position of the first entry
        whose row index is >= ``r``. The terminal CSR entry (total number of
        non-zeros) is NOT included; the caller appends it.

    Raises
    ------
    ValueError
        If a row index is negative or not smaller than ``nrows``.

    Notes
    -----
    Rows without any non-zero entry are handled: an empty row gets the same
    start as the next populated row, and trailing empty rows start at the end
    of the chunk. An empty ``row_indices`` is valid and yields ``nrows``
    copies of ``start_pos``.
    """
    cdef Py_ssize_t nnz = row_indices.shape[0]
    res = np.zeros(nrows, dtype=np.int64)
    cdef INT64_t[:] res_view = res

    cdef Py_ssize_t i
    cdef Py_ssize_t curr_row = 0
    cdef INT64_t row

    for i in range(nnz):
        row = row_indices[i]
        # Bounds checking is disabled above, so validate before writing.
        if row < 0 or row >= nrows:
            raise ValueError("row index out of range for the given nrows")
        # Every row between the previous populated row and this one is empty
        # and therefore starts at the current position as well.
        while curr_row < row:
            curr_row += 1
            res_view[curr_row] = i

    # Rows after the last populated one are empty: they start at the end.
    for i in range(curr_row + 1, nrows):
        res_view[i] = nnz

    return res + start_pos

In [None]:
def create_csr_arrays(h5_file_path):
    """Stream a dense HDF5 table into CSR component arrays saved as ``.npy``.

    The table is read ``chunksize`` rows at a time with ``pd.read_hdf``; the
    non-zero entries of each chunk are extracted and buffered. The buffers
    are written to the current working directory as
    ``<name>_indptr_<k>.npy``, ``<name>_indices_<k>.npy`` and
    ``<name>_data_<k>.npy``, where ``<name>`` is the file name without
    ``.h5`` and ``<k>`` is a part counter starting at 0.

    Parameters
    ----------
    h5_file_path : str
        Path to an HDF5 file readable by ``pd.read_hdf``.

    Returns
    -------
    None
        Results are only written to disk.

    Notes
    -----
    * The saved ``indptr`` holds one start offset per row and does NOT
      contain the terminal entry (total number of non-zeros); the consumer
      must append it.
    * When the buffers exceed ``max_part_gb`` they are flushed and a new part
      is started. NOTE(review): row offsets keep counting across parts while
      ``indices``/``data`` restart per part, so multi-part output needs the
      parts concatenated before use -- confirm against the consumer.
    """
    chunksize = 1000   # rows per read; keep it low to bound peak memory
    max_part_gb = 7.0  # flush the buffers once they grow beyond this size
    prefix_filename = h5_file_path.split('/')[-1].replace('.h5', '')

    def _flush(part, indptr_parts, indices_parts, data_parts):
        """Concatenate the buffered chunks and write one set of .npy files."""
        buffers = (
            ("indptr", indptr_parts, np.int64),
            ("indices", indices_parts, np.int32),
            ("data", data_parts, np.float32),
        )
        for label, parts, dtype in buffers:
            merged = np.concatenate(parts) if parts else np.array([], dtype=dtype)
            np.save(f"{prefix_filename}_{label}_{part}.npy", merged)
            del merged

    print(f"\n\nProcessing File {h5_file_path}")

    # Chunks are collected in lists and concatenated once per flush, instead
    # of re-copying the whole accumulated array on every iteration.
    indptr_parts, indices_parts, data_parts = [], [], []
    buffered_bytes = 0
    loaded_rows = chunksize
    start = 0       # first row of the next chunk to read
    start_pos = 0   # number of non-zero entries seen so far
    file_pointer = 0

    # The context manager closes the bar, so it is not re-rendered later.
    with tqdm() as pbar:
        # A chunk shorter than `chunksize` means the end of the table.
        while loaded_rows == chunksize:
            size_gb = buffered_bytes * 1e-9
            if size_gb > max_part_gb:
                pbar.set_description(f"Total size is {size_gb}. Saving ..")
                _flush(file_pointer, indptr_parts, indices_parts, data_parts)
                indptr_parts, indices_parts, data_parts = [], [], []
                buffered_bytes = 0
                file_pointer += 1

            pbar.set_description("Reading .h5 chunk")
            df = pd.read_hdf(h5_file_path, start=start, stop=start + chunksize)
            loaded_rows = df.shape[0]
            start += chunksize
            if loaded_rows == 0:
                # The row count was an exact multiple of `chunksize`.
                break

            pbar.set_description("Extracting non-zero values")
            values = df.values
            x_coords, y_coords = values.nonzero()
            # Index with the coordinates already computed rather than
            # building a second boolean mask over the dense chunk.
            tmp_data = values[x_coords, y_coords].astype(np.float32, copy=False)
            y_coords = y_coords.astype(np.int32, copy=False)
            del df, values

            pbar.set_description("Compressing rows values")
            if x_coords.shape[0] == 0:
                # No non-zero entry in this chunk: every row starts (and
                # ends) at the current global offset.
                chunk_indptr = np.full(loaded_rows, start_pos, dtype=np.int64)
            else:
                chunk_indptr = create_indptr(x_coords, start_pos=start_pos, nrows=loaded_rows)
            del x_coords

            gc.collect()

            pbar.set_description("Update variables")
            start_pos += y_coords.shape[0]
            indptr_parts.append(chunk_indptr)
            indices_parts.append(y_coords)
            data_parts.append(tmp_data)
            buffered_bytes += chunk_indptr.nbytes + y_coords.nbytes + tmp_data.nbytes

            pbar.update(loaded_rows)

    print('Done. Save last files')
    _flush(file_pointer, indptr_parts, indices_parts, data_parts)

    del indptr_parts, indices_parts, data_parts

In [None]:
# Convert the dense training inputs into CSR component arrays
# (.npy files written to the current working directory).
create_csr_arrays(path_train_multi_inputs)



Processing File /content/drive/MyDrive/multimodal/train_multi_inputs.h5


Update variables: : 105942it [25:20, 59.47it/s]

Done. Save last files


Update variables: : 105942it [25:38, 68.87it/s]


In [None]:
# Reassemble the training inputs as a SciPy CSR matrix from the arrays saved
# by create_csr_arrays. Only part 0 is read, so this assumes the conversion
# never split its output into several parts.
indptr = np.load('train_multi_inputs_indptr_0.npy')
indices = np.load('train_multi_inputs_indices_0.npy')
data = np.load('train_multi_inputs_data_0.npy')
# The saved indptr has one start offset per row; CSR also needs the terminal
# entry, which is the total number of stored values.
indptr = np.append(indptr, indices.shape[0])
N_ROWS, N_COLS = group_train_multi_inputs['block0_values'].shape
csr_matrix = sps.csr_matrix((data, indices, indptr), shape=(N_ROWS, N_COLS))
sps.save_npz('train_multi_inputs_sparse.npz', csr_matrix)
# Free the large buffers before the next conversion.
del csr_matrix, indices, indptr, data

In [None]:
# Convert the dense training targets into CSR component arrays
# (.npy files written to the current working directory).
create_csr_arrays(path_train_multi_targets)



Processing File /content/drive/MyDrive/multimodal/train_multi_targets.h5


Update variables: : 105942it [03:37, 444.17it/s]

Done. Save last files


Update variables: : 105942it [03:49, 461.64it/s]


In [None]:
# Reassemble the training targets as a SciPy CSR matrix from the arrays saved
# by create_csr_arrays. Only part 0 is read, so this assumes the conversion
# never split its output into several parts.
indptr = np.load('train_multi_targets_indptr_0.npy')
indices = np.load('train_multi_targets_indices_0.npy')
data = np.load('train_multi_targets_data_0.npy')
# The saved indptr has one start offset per row; CSR also needs the terminal
# entry, which is the total number of stored values.
indptr = np.append(indptr, indices.shape[0])
N_ROWS, N_COLS = group_train_multi_targets['block0_values'].shape
csr_matrix = sps.csr_matrix((data, indices, indptr), shape=(N_ROWS, N_COLS))
sps.save_npz('train_multi_targets_sparse.npz', csr_matrix)
# Free the large buffers before the next conversion.
del csr_matrix, indices, indptr, data

In [None]:
# Convert the dense test inputs into CSR component arrays
# (.npy files written to the current working directory).
create_csr_arrays(path_test_multi_inputs)



Processing File /content/drive/MyDrive/multimodal/test_multi_inputs.h5


Update variables: : 55935it [15:07, 62.40it/s]

Done. Save last files


Update variables: : 55935it [15:17, 60.99it/s]


In [None]:
# Reassemble the test inputs as a SciPy CSR matrix from the arrays saved by
# create_csr_arrays. Only part 0 is read, so this assumes the conversion
# never split its output into several parts.
indptr = np.load('test_multi_inputs_indptr_0.npy')
indices = np.load('test_multi_inputs_indices_0.npy')
data = np.load('test_multi_inputs_data_0.npy')
# The saved indptr has one start offset per row; CSR also needs the terminal
# entry, which is the total number of stored values.
indptr = np.append(indptr, indices.shape[0])
N_ROWS, N_COLS = group_test_multi_inputs['block0_values'].shape
csr_matrix = sps.csr_matrix((data, indices, indptr), shape=(N_ROWS, N_COLS))
sps.save_npz('test_multi_inputs_sparse.npz', csr_matrix)
# Free the large buffers.
del csr_matrix, indices, indptr, data

## Load Compiled Data

In [73]:
import torch
import torch.nn as nn
import torch.utils.data as Data
from torch.utils.data.sampler import SubsetRandomSampler

import numpy as np
import pandas as pd
from sklearn import decomposition

import os
import scipy.sparse as sps
from tqdm import tqdm as tqdm
import gc

import matplotlib.pyplot as plt

Copy data to local disk and load into python:

In [None]:
# Copy every .npz file from the Drive folder to /content/.
# NOTE(review): the preprocessing cells write their .npz files to the working
# directory; presumably they were moved to multimodal_sparse by hand - confirm.
!cp /content/drive/MyDrive/multimodal_sparse/*.npz /content/

In [None]:
# Load the sparse training matrices copied to /content/ (inputs and targets).
train_input = sps.load_npz('/content/train_multi_inputs_sparse.npz')
train_target = sps.load_npz('/content/train_multi_targets_sparse.npz')

## Denoising and Normalization

In [None]:
!pip install fbpca
import fbpca

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Number of components kept for the inputs; also the default input width of
# DenseModel.
k_svd_input = 128
# Rank-k decomposition of the sparse training inputs. U_input has one row per
# cell and is what TrainDataset feeds to the network.
# NOTE(review): no seed is set before this call - confirm whether fbpca.pca is
# deterministic enough for reproducible results.
U_input, s_input, Va_input = fbpca.pca(train_input,k=k_svd_input)
# Display the factor shapes next to the shape of the original matrix.
U_input.shape, s_input.shape, Va_input.shape, train_input.shape

((105942, 128), (128,), (128, 228942), (105942, 228942))

In [None]:
# Number of components kept for the targets; also the default output width of
# DenseModel.
k_svd_target = 128
# Rank-k decomposition of the sparse training targets. U_target has one row
# per cell and is normalised before being used as the regression target.
U_target, s_target, Va_target = fbpca.pca(train_target,k=k_svd_target)
# Display the factor shapes next to the shape of the original matrix.
U_target.shape, s_target.shape, Va_target.shape, train_target.shape

((105942, 128), (128,), (128, 23418), (105942, 23418))

Normalize targets:

In [None]:
# Standard deviation of each row, kept as a column vector so the division
# below broadcasts across that row's components.
# NOTE(review): axis=1 scales every cell by the spread of its own components,
# not every component across cells - confirm this is the intended scaling.
U_target_std = U_target.std(axis=1, keepdims=True)
U_target_norm = U_target / U_target_std

## PyTorch Workflow

In [None]:
class TrainDataset(Data.Dataset):
    """Map-style dataset pairing input rows with normalised target rows.

    On construction it takes float32 copies of the module-level arrays
    ``U_input`` (model inputs) and ``U_target_norm`` (regression targets).
    """

    def __init__(self):
        # Cast once here so that every item is already float32.
        self.inputs_data = U_input.astype('float32')
        self.targets_data = U_target_norm.astype('float32')

    def __len__(self):
        """Return the number of samples (rows of the input array)."""
        n_samples = self.inputs_data.shape[0]
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(inputs, targets)`` pair stored at row ``idx``."""
        sample = (self.inputs_data[idx, ...], self.targets_data[idx, ...])
        return sample

Train data:  1656


In [None]:
# Hold-out configuration.
validation_split = .2
shuffle_dataset = True
random_seed = 42

# Enumerate every sample index, optionally shuffle them reproducibly, then
# use the first `split` indices for validation and the rest for training.
dataset_size = len(TrainDataset())
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
val_indices = indices[:split]
train_indices = indices[split:]

# Each sampler draws, in random order, only from its own index subset.
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

In [None]:
batch_size = 128

# Build the dataset once and share it between both loaders: every
# TrainDataset() call makes its own float32 copies of the input and target
# arrays, so two instances would hold the same data twice.
train_dataset = TrainDataset()

# The samplers restrict each loader to its own subset of the shared dataset.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                               sampler=train_sampler)
valid_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                               sampler=valid_sampler)

# Number of mini-batches per epoch for training and validation.
len(train_dataloader), len(valid_dataloader)

(663, 166)

Construct models:

In [None]:
class DenseModel(nn.Module):
    """Fully connected regressor: two ReLU hidden layers and a linear head.

    Parameters
    ----------
    input_size : int
        Width of the input vectors (defaults to ``k_svd_input``).
    hidden_size : int
        Width of both hidden layers.
    output_size : int
        Width of the predicted vectors (defaults to ``k_svd_target``).
    """

    def __init__(self, input_size=k_svd_input, hidden_size=32, output_size=k_svd_target):
        super().__init__()
        # Attribute names and the Sequential wrappers are kept as they are so
        # that parameter names (e.g. "dense1.0.weight") stay stable.
        self.dense1 = nn.Sequential(nn.Linear(input_size, hidden_size), nn.ReLU())
        self.dense2 = nn.Sequential(nn.Linear(hidden_size, hidden_size), nn.ReLU())
        # No activation on the head: the output is an unconstrained vector.
        self.dense3 = nn.Sequential(nn.Linear(hidden_size, output_size))

    def forward(self, input):
        """Run ``input`` through the three stages and return the prediction."""
        hidden = self.dense2(self.dense1(input))
        return self.dense3(hidden)

In [None]:
# Instantiate the network with its default sizes and move its parameters to
# the GPU (a CUDA device is required).
dense_model = DenseModel().cuda()

In [69]:
# Optimisation hyper-parameters.
num_epochs = 100
learning_rate = 0.01

# Mean-squared-error objective, minimised with Adam over all model parameters.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(dense_model.parameters(), lr=learning_rate)

# Number of mini-batches in one training epoch.
total_step = len(train_dataloader)

In [70]:
# Mean training / validation loss of every epoch.
metric_train = np.zeros(shape=num_epochs)
metric_valid = np.zeros(shape=num_epochs)

for epoch in range(num_epochs):
    # ---- training pass ----
    dense_model.train()
    loss_train = np.zeros(shape=len(train_dataloader))

    for i, (inputs, targets) in enumerate(train_dataloader):

        inputs = inputs.cuda()
        targets = targets.cuda()

        outputs = dense_model(inputs)
        loss = criterion(outputs, targets)
        # .item() copies the scalar to the host as a plain Python float;
        # storing the tensor itself relies on an implicit conversion of a
        # CUDA tensor that is still attached to the autograd graph.
        loss_train[i] = loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # ---- validation pass (no gradients, no parameter updates) ----
    dense_model.eval()
    loss_valid = np.zeros(shape=len(valid_dataloader))

    with torch.no_grad():
        for i, (inputs, targets) in enumerate(valid_dataloader):

            inputs = inputs.cuda()
            targets = targets.cuda()

            outputs = dense_model(inputs)
            loss = criterion(outputs, targets)
            loss_valid[i] = loss.item()

    # Average the per-batch losses into one number per epoch.
    metric_train[epoch] = np.mean(loss_train)
    metric_valid[epoch] = np.mean(loss_valid)

    print('Epoch', epoch+1,'| train:',round(metric_train[epoch],4) ,'| valid:',round(metric_valid[epoch],4))

Epoch 1 | train: 0.9232 | valid: 0.9221
Epoch 2 | train: 0.9214 | valid: 0.9212
Epoch 3 | train: 0.9201 | valid: 0.9196
Epoch 4 | train: 0.9192 | valid: 0.9183
Epoch 5 | train: 0.9185 | valid: 0.9189
Epoch 6 | train: 0.9181 | valid: 0.9179
Epoch 7 | train: 0.9177 | valid: 0.9178
Epoch 8 | train: 0.9174 | valid: 0.9185
Epoch 9 | train: 0.9171 | valid: 0.9181
Epoch 10 | train: 0.9169 | valid: 0.9175


In [72]:
# Learning curves: mean loss per epoch for the training and validation sets.
# Epochs that were never run keep their initial value of 0 in both arrays.
fig, ax = plt.subplots()
ax.plot(metric_train, color='r', label='train')
ax.plot(metric_valid, color='g', label='valid')
ax.set_xlabel('epoch')
ax.set_ylabel('mean MSE loss')
ax.set_title('DenseModel training history')
ax.legend()
plt.show()

NameError: ignored