In [1]:
import scanpy as sc
import torch
import anndata as ad
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from utils import * 
from models import MLP, MLPWithSinkhorn
import torch.nn as nn
import torch.optim as optim
import argparse
import math
import matplotlib.pyplot as plt
import muon 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Load the 10x PBMC CITE-seq matrix (gene expression + antibody capture) ---
data = "data/pbmc_10k_protein_v3_raw_feature_bc_matrix.h5"
# gex_only=False keeps the "Antibody Capture" features alongside "Gene Expression".
adata = sc.read_10x_h5(data, genome=None, gex_only=False, backup_url=None)

adata.var_names_make_unique()
adata.layers["counts"] = adata.X.copy()  # snapshot of X before any normalization
sc.pp.filter_genes(adata, min_counts=100)  # drop features (genes and antibodies) with < 100 total counts
sc.pp.filter_cells(adata, min_counts=500)  # drop cells with < 500 total counts

# --- Separate the two modalities ---
protein = adata[:, adata.var["feature_types"] == "Antibody Capture"].copy()
rna = adata[:, adata.var["feature_types"] == "Gene Expression"].copy()

# Keep only cells that have at least one count in BOTH modalities.
sc.pp.filter_cells(rna, min_counts=1)
sc.pp.filter_cells(protein, min_counts=1)
common_cells = protein.obs_names.intersection(rna.obs_names)
# .copy() materializes the subsets. Without it both objects are AnnData views, and the
# in-place normalizations below force an implicit copy and emit a warning.
protein = protein[common_cells, :].copy()
rna = rna[common_cells, :].copy()
# Row i of the RNA matrix must describe the same cell as row i of the protein matrix.
assert (rna.obs_names == protein.obs_names).all(), "RNA and protein cells are not aligned"

# --- Normalization and dimensionality reduction ---
sc.pp.log1p(rna)
# NOTE(review): the helper (from utils) receives every cell, including those later used for
# test/validation. If it fits its statistics on its input, held-out cells influence the
# embedding -- confirm this is acceptable for the evaluation.
rna_norm = zscore_normalization_and_svd(rna.X.toarray(), n_components=300)  # Same as ScLinear authors
muon.prot.pp.clr(protein)
protein_norm = protein.X.toarray()
assert rna_norm.shape[0] == protein_norm.shape[0], "modalities have different numbers of cells"

# --- Sequential 80/15/5 split: first 80% train, next 15% test, last 5% validation ---
n_cells = rna_norm.shape[0]
split = math.ceil(n_cells * 0.8)
validation_split = math.ceil(n_cells * 0.95)
gex_train = rna_norm[:split, :]
gex_test = rna_norm[split:validation_split, :]
gex_valid = rna_norm[validation_split:, :]

adx_train = protein_norm[:split, :]
adx_test = protein_norm[split:validation_split, :]
adx_valid = protein_norm[validation_split:, :]
# The three slices are contiguous, so together they must cover every cell exactly once.
assert len(gex_train) + len(gex_test) + len(gex_valid) == n_cells

print(f'Normalized RNA array shape: {rna_norm.shape}')
print(f'Normalized Protein array shape: {protein_norm.shape}')
print(f'Original RNA shape: {rna.X.shape}')
print(f'Original Protein shape: {protein.X.shape}')

  utils.warn_names_duplicates("var")
  view_to_actual(adata)


Normalized RNA array shape: (10377, 300)
Normalized Protein array shape: (10377, 17)
Original RNA shape: (10377, 11254)
Original Protein shape: (10377, 17)


  warn("adata.X is sparse but not in CSC format. Converting to CSC.")


In [40]:
# Dense (n_cells, 1) column holding the values of the protein feature at position 3.
protein[:, 3:4].X.toarray()

array([[0.28033075],
       [2.1909363 ],
       [2.155962  ],
       ...,
       [2.7864826 ],
       [0.30448335],
       [0.4172013 ]], dtype=float32)

In [24]:
# Names of the features (antibodies) kept in the protein modality.
protein.var.index

Index(['CD3_TotalSeqB', 'CD4_TotalSeqB', 'CD8a_TotalSeqB', 'CD14_TotalSeqB',
       'CD15_TotalSeqB', 'CD16_TotalSeqB', 'CD56_TotalSeqB', 'CD19_TotalSeqB',
       'CD25_TotalSeqB', 'CD45RA_TotalSeqB', 'CD45RO_TotalSeqB',
       'PD-1_TotalSeqB', 'TIGIT_TotalSeqB', 'CD127_TotalSeqB',
       'IgG2a_control_TotalSeqB', 'IgG1_control_TotalSeqB',
       'IgG2b_control_TotalSeqB'],
      dtype='object')

In [39]:
# Same kind of column lookup, this time selecting the feature by name instead of by position.
protein[:, ['CD14_TotalSeqB']].X.toarray()

array([[0.28033075],
       [2.1909363 ],
       [2.155962  ],
       ...,
       [2.7864826 ],
       [0.30448335],
       [0.4172013 ]], dtype=float32)

In [42]:
# Short marker names: keep only the text before the first underscore of each feature name.
result = [name.partition('_')[0] for name in protein.var_names]

result


list

In [None]:
# Trying some different models --> see main.py
# 1) Generic Models
# 2) Specialized methods via the Dance package (currently not working, see note below)
# https://github.com/OmicsML/dance-tutorials/blob/main/dance_tutorial.ipynb

In [None]:
# NOTE: Babel (via the Dance package) is currently unusable: the maintainers are still fixing the bugs that break it. Try again in about a week. See: https://discuss.dgl.ai/t/cannot-find-dgl-c-graphbolt-library/4429/12
# import os
# os.environ["DGLBACKEND"] = "pytorch"
# from pprint import pprint
# from dance.modules.multi_modality.predict_modality.babel import BabelWrapper