<a href="https://colab.research.google.com/github/Enterprise-D/sc_multimodal/blob/main/multimodal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load Multiome Data

Install and load the HDF5 modules:

In [None]:
!pip install hdf5plugin
import h5py
import hdf5plugin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Handle inputs:

In [None]:
# Locations of the train and test multiome input matrices.
path_train_multi_inputs, path_test_multi_inputs = (
    '/content/drive/MyDrive/multimodal/train_multi_inputs.h5',
    '/content/drive/MyDrive/multimodal/test_multi_inputs.h5',
)

# Open both files read-only; the handles stay open for the rest of the notebook.
file_train_multi_inputs = h5py.File(path_train_multi_inputs, mode="r")
file_test_multi_inputs = h5py.File(path_test_multi_inputs, mode="r")

# Display the top-level keys of each file.
(file_train_multi_inputs.keys(), file_test_multi_inputs.keys())

(<KeysViewHDF5 ['train_multi_inputs']>, <KeysViewHDF5 ['test_multi_inputs']>)

In [None]:
# Each file holds a single top-level group; grab both in one step.
group_train_multi_inputs, group_test_multi_inputs = (
    file_train_multi_inputs['train_multi_inputs'],
    file_test_multi_inputs['test_multi_inputs'],
)

# Display the members of each group.
(group_train_multi_inputs.keys(), group_test_multi_inputs.keys())

(<KeysViewHDF5 ['axis0', 'axis1', 'block0_items', 'block0_values']>,
 <KeysViewHDF5 ['axis0', 'axis1', 'block0_items', 'block0_values']>)

In [None]:
# Layout of each group:
#   axis0 & block0_items: features (genomic coordinates, GRCh38)
#   axis1: cell_ids
#   block0_values: entries
# Display the (train, test) shapes of the value matrices.
tuple(
    group['block0_values'].shape
    for group in (group_train_multi_inputs, group_test_multi_inputs)
)

((105942, 228942), (55935, 228942))

In [None]:
# Preview a small corner of the train inputs: rows 1-9, columns 1-5
# (index 0 is skipped on both axes).
group_train_multi_inputs['block0_values'][1:10,1:6]

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]], dtype=float32)

Handle targets:

In [None]:
# Location of the train multiome targets matrix.
path_train_multi_targets = '/content/drive/MyDrive/multimodal/train_multi_targets.h5'

# Open read-only; the handle stays open for the rest of the notebook.
file_train_multi_targets = h5py.File(path_train_multi_targets, mode="r")

# Display the top-level keys of the file.
file_train_multi_targets.keys()

<KeysViewHDF5 ['train_multi_targets']>

In [None]:
# Select the top-level group of the targets file and display its members.
group_train_multi_targets = file_train_multi_targets['train_multi_targets']
group_train_multi_targets.keys()

<KeysViewHDF5 ['axis0', 'axis1', 'block0_items', 'block0_values']>

In [None]:
# Display the shape of the train targets value matrix.
group_train_multi_targets['block0_values'].shape
# Layout of the group:
#   axis0 & block0_items: features (expression reads, GRCh38)
#   axis1: cell_ids
#   block0_values: entries

(105942, 23418)

In [None]:
# Preview a small corner of the train targets: rows 1-9, columns 1-5
# (index 0 is skipped on both axes).
group_train_multi_targets['block0_values'][1:10,1:6]

array([[0.       , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       ],
       [4.5079365, 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       ]],
      dtype=float32)

Construct a training dataloader:

In [None]:
import torch
import torch.nn as nn
import torch.utils.data as Data
import numpy as np
from sklearn import decomposition

In [None]:
class TrainDataset(Data.Dataset):
    """Row-aligned (inputs, targets) pairs for training.

    Args:
        inputs_data: 2-D array-like supporting ``len()`` and ``data[idx, ...]``
            indexing. Defaults to ``group_train_multi_inputs['block0_values']``.
        targets_data: 2-D array-like with the same number of rows as
            ``inputs_data``. Defaults to
            ``group_train_multi_targets['block0_values']``.

    Raises:
        ValueError: if inputs and targets have a different number of rows.
    """

    def __init__(self, inputs_data=None, targets_data=None):
        # Fall back to the notebook-level HDF5 datasets so ``TrainDataset()``
        # keeps working exactly as before.
        if inputs_data is None:
            inputs_data = group_train_multi_inputs['block0_values']
        if targets_data is None:
            targets_data = group_train_multi_targets['block0_values']

        # Samples are paired by row index, so a length mismatch would silently
        # pair the wrong rows or fail late inside the DataLoader.
        if len(inputs_data) != len(targets_data):
            raise ValueError(
                'inputs and targets must have the same number of rows, got '
                '{} and {}'.format(len(inputs_data), len(targets_data)))

        self.inputs_data = inputs_data
        self.targets_data = targets_data

    def __len__(self):
        """Return the number of rows (samples)."""
        return len(self.inputs_data)

    def __getitem__(self, idx):
        """Return the ``(inputs_row, targets_row)`` pair stored at row ``idx``."""
        return self.inputs_data[idx, ...], self.targets_data[idx, ...]

In [None]:
# Shuffled mini-batches of 64 samples over the training pairs.
train_dataloader = Data.DataLoader(
    dataset=TrainDataset(),
    shuffle=True,
    batch_size=64,
)
# len() of a DataLoader is the number of batches per epoch.
print('Train data: ', len(train_dataloader))

Train data:  1656


Datasets are too sparse (and too large), so reduce their dimensionality with incremental PCA:

In [None]:
pca = decomposition.IncrementalPCA(n_components=50, batch_size=64)

# Calling pca.fit() on the h5py dataset makes scikit-learn convert the whole
# matrix to an in-memory array first, which does not fit in RAM for a matrix
# of this size. Stream row batches from disk through partial_fit() instead.
inputs_values = group_train_multi_inputs['block0_values']
n_rows = inputs_values.shape[0]

batch_starts = list(range(0, n_rows, pca.batch_size))
# partial_fit() needs at least n_components rows per call, so fold an
# undersized final batch into the one before it.
if len(batch_starts) > 1 and n_rows - batch_starts[-1] < pca.n_components:
    batch_starts.pop()

for position, start in enumerate(batch_starts):
    is_last = position + 1 == len(batch_starts)
    stop = n_rows if is_last else batch_starts[position + 1]
    # Slicing the h5py dataset reads only these rows into memory.
    pca.partial_fit(inputs_values[start:stop])

# fit() returned the estimator itself; keep `reduction` bound to it.
reduction = pca

Construct a test dataloader:

In [None]:
class TestDataset(Data.Dataset):
    """Input rows of the test split (no targets are loaded for it).

    The previous version read the *train* groups and called ``.to_sparse()``,
    which h5py datasets do not provide, so construction always failed.

    Args:
        inputs_data: 2-D array-like supporting ``len()`` and ``data[idx, ...]``
            indexing. Defaults to ``group_test_multi_inputs['block0_values']``.
    """

    def __init__(self, inputs_data=None):
        # Fall back to the notebook-level test dataset so ``TestDataset()``
        # can still be called without arguments.
        if inputs_data is None:
            inputs_data = group_test_multi_inputs['block0_values']
        self.inputs_data = inputs_data

    def __len__(self):
        """Return the number of rows (samples)."""
        return len(self.inputs_data)

    def __getitem__(self, idx):
        """Return the inputs row stored at index ``idx``."""
        return self.inputs_data[idx, ...]

Construct models:

In [None]:
class DenseModel(nn.Module):
    """Three-stage fully connected network.

    Stage 1: Linear(input_size -> hidden_size) + ReLU.
    Stage 2: BatchNorm1d + Linear(hidden_size -> hidden_size) + ReLU.
    Stage 3: BatchNorm1d + Linear(hidden_size -> output_size), no activation.
    """

    def __init__(self, input_size=228942, hidden_size=1024, output_size=23418):
        super().__init__()

        # Stage 1: project the raw features into the hidden space.
        input_layers = [nn.Linear(input_size, hidden_size), nn.ReLU()]
        self.dense1 = nn.Sequential(*input_layers)

        # Stage 2: normalise, then a hidden-to-hidden transform.
        hidden_layers = [
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
        ]
        self.dense2 = nn.Sequential(*hidden_layers)

        # Stage 3: normalise, then map to the output size (raw outputs).
        output_layers = [
            nn.BatchNorm1d(hidden_size),
            nn.Linear(hidden_size, output_size),
        ]
        self.dense3 = nn.Sequential(*output_layers)

    def forward(self, input):
        """Run ``input`` through the three stages in order and return the result."""
        activations = input
        for stage in (self.dense1, self.dense2, self.dense3):
            activations = stage(activations)
        return activations

In [None]:
# Build the model with its default layer sizes and move its parameters to the
# CUDA device (this requires a GPU runtime).
dense_model = DenseModel().cuda()

In [None]:
# The targets are continuous values that are not confined to [0, 1] (the
# targets preview contains 4.5079365), so this is a regression problem.
# BCEWithLogitsLoss expects targets in [0, 1]; use mean squared error instead.
criterion = nn.MSELoss()

num_epochs = 10
learning_rate = 0.001
optimizer = torch.optim.Adam(params=dense_model.parameters(), lr=learning_rate)

# Number of mini-batches per epoch, used for progress reporting.
total_step = len(train_dataloader)

#err_train = np.zeros(shape=num_epochs)
#err_valid = np.zeros(shape=num_epochs)

In [None]:
for epoch in range(num_epochs):
    # Put the model in training mode at the start of every epoch.
    dense_model.train()

    # Count batches from 1 so the counter can be reported directly.
    for step, (inputs, targets) in enumerate(train_dataloader, start=1):
        # Move the batch to the GPU.
        inputs, targets = inputs.cuda(), targets.cuda()

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass and loss.
        loss = criterion(dense_model(inputs), targets)

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Report progress every 100 batches.
        if step % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, step, total_step, loss.item()))

    # Per-epoch evaluation is currently disabled:
    #err_train[epoch] = accuracy_wrap(recu_model,train_loader)
    #err_valid[epoch] = accuracy_wrap(recu_model,valid_loader)

    #print('Epoch', epoch+1,'| train:',round(err_train[epoch],4) ,'| valid:',round(err_valid[epoch],4))

KeyboardInterrupt: ignored