In [1]:
import scanpy as sc
import torch
import anndata as ad
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

In [2]:
# data_full = 'data/GSE194122_openproblems_neurips2021_cite_BMMC_processed.h5ad'
# # Load the full dataset from the local .h5ad file with Scanpy (no backup_url is passed here)
# adata_full = sc.read_h5ad(data_full)
# adata_full

In [3]:

# Location of the 10x Genomics HDF5 feature-barcode matrix used as the small dataset.
data_small = "data/pbmc_10k_protein_v3_raw_feature_bc_matrix.h5"

# gex_only=False asks scanpy to keep every feature type in the file rather than
# only gene expression, so the antibody features stay in the same AnnData.
adata_small = sc.read_10x_h5(
    data_small,
    genome=None,
    gex_only=False,
    backup_url=None,
)

  utils.warn_names_duplicates("var")


In [4]:
# QC thresholds, named so they are easy to find and tune.
min_feature_counts = 10   # minimum total counts a feature (column) needs to be kept
min_cell_counts = 100     # minimum total counts a cell (row) needs to be kept

# Feature names must be unique before anything is indexed by name.
adata_small.var_names_make_unique()

# Stash a copy of the matrix in a layer before filtering.
adata_small.layers["counts"] = adata_small.X.copy()

# Both filters act on the combined matrix, i.e. on RNA and antibody features alike.
sc.pp.filter_genes(adata_small, min_counts=min_feature_counts)
sc.pp.filter_cells(adata_small, min_counts=min_cell_counts)

adata_small

AnnData object with n_obs × n_vars = 80773 × 15989
    obs: 'n_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'pattern', 'read', 'sequence', 'n_counts'
    layers: 'counts'

How this data is structured:
- Everything is in the counts matrix, where there are 15972 columns for the genes + 17 columns for the proteins
- Entries are the number of biomolecules detected, where the row is the cell and the column tells you which gene/protein

In [5]:
# Tally how many features (columns) belong to each feature type,
# i.e. the number of unique genes vs. antibodies in the dataset.
feature_type_counts = adata_small.var["feature_types"].value_counts()
feature_type_counts

feature_types
Gene Expression     15972
Antibody Capture       17
Name: count, dtype: int64

In [6]:
# Split the combined object into one AnnData per modality, selecting columns
# by their "feature_types" annotation. Each subset is copied so it is an
# independent object rather than a view of adata_small.
feature_types = adata_small.var["feature_types"]
is_protein = feature_types == "Antibody Capture"
is_rna = feature_types == "Gene Expression"

protein = adata_small[:, is_protein].copy()
rna = adata_small[:, is_rna].copy()

In [7]:
# Number of cells (rows) in which every feature, RNA or protein, is below 0.01,
# i.e. cells with effectively nothing measured.
# The count is computed directly on the sparse matrix: calling .toarray() here
# would allocate the full dense cells x features array just to count rows.
# `>= 0.01` is False for implicit zeros, so the comparison result stays sparse.
hits_per_cell = np.asarray((adata_small.X >= 0.01).sum(axis=1)).ravel()
np.sum(hits_per_cell == 0)


0

In [8]:
# Count the cells (rows) whose protein measurements are all below 0.01,
# i.e. cells with no proteins measured.
protein_dense = protein.X.toarray()
no_protein_measured = (protein_dense < 0.01).all(axis=1)
no_protein_measured.sum()

598

In [9]:
# Number of cells with no RNA measured: rows of the RNA matrix in which every
# gene is below 0.01.
# Computed on the sparse matrix instead of .toarray(), which would allocate a
# dense cells x genes array. `>= 0.01` is False for implicit zeros, so the
# comparison result stays sparse.
rna_hits_per_cell = np.asarray((rna.X >= 0.01).sum(axis=1)).ravel()
np.sum(rna_hits_per_cell == 0)

281

In [10]:
# Drop, in place, the cells that have zero counts within a modality. A cell
# missing from either object afterwards does not express both types of
# biomolecules.
for modality in (rna, protein):
    sc.pp.filter_cells(modality, min_counts=1)

In [11]:
# Keep only the cells (barcodes) present in BOTH modalities after filtering,
# and index both objects with the same label list so their rows line up.
common_cells = rna.obs_names.intersection(protein.obs_names)

# .copy() materializes each subset as an independent AnnData, matching the
# .copy() used where the modalities were first split. Plain indexing would
# leave `protein` and `rna` as views of the pre-subset objects.
protein = protein[common_cells, :].copy()
rna = rna[common_cells, :].copy()

In [12]:
# Display the RNA count matrix to check its storage format, dtype and shape
# after the two modalities were restricted to their shared cells.
rna.X

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 13173648 stored elements and shape (79894, 15972)>

In [13]:
# Trying some different models --> see main.py
# 1) Generic Models
# 2) Specialized methods via the Dance package (currently not working, see note below)
# https://github.com/OmicsML/dance-tutorials/blob/main/dance_tutorial.ipynb

In [14]:
# NOTE: Babel is no good, the team is fixing the bugs making it unusable right now, try again in a week: https://discuss.dgl.ai/t/cannot-find-dgl-c-graphbolt-library/4429/12
# import os
# os.environ["DGLBACKEND"] = "pytorch"
# from pprint import pprint
# from dance.modules.multi_modality.predict_modality.babel import BabelWrapper