In [1]:
import pandas
import os
from gene_panel_selection import ExpressionDataset, PROPOSEMethod, SeuratMethod

In [2]:
import torch.cuda

# Show which CUDA devices torch can see, one "index: name" line per device.
print("cuda devices:")
n_devices = torch.cuda.device_count()
for device_index in range(n_devices):
    device_name = torch.cuda.get_device_name(device_index)
    print(f"  {device_index}: {device_name}")

cuda devices:
  0: Quadro P1000


In [3]:
# load data manually:
# annotations = pandas.read_feather('anno.feather').set_index('sample_id')
# genes = pandas.read_feather('data/genes.feather')
# expression = pandas.read_feather('data/expression_matrix_not_normalized.feather')
# expression = expression.set_index(genes['gene']).T

# exp_data = ExpressionDataset(
#     expression=expression,  # dataframe; as built above (gene index, then .T) it is one sample per row with gene names as columns -- NOTE(review): confirm the orientation ExpressionDataset expects
#     annotations=annotations,  # dataframe, index is sample names, has columns 'cluster', 'subclass', 'class'
# )

In [13]:
# Alternative to manual loading: for common formats, ExpressionDataset can load the files itself.
# All three feather files are read from the same directory.
data_path = '../../../data/macaque_mtg'
genes_feather = os.path.join(data_path, 'genes.feather')
anno_feather = os.path.join(data_path, 'anno.feather')
expression_feather = os.path.join(data_path, 'expression_matrix_not_normalized.feather')

exp_data = ExpressionDataset.load_arrow(
    gene_file=genes_feather,
    annotation_file=anno_feather,
    expression_file=expression_feather,
    expression_type='raw',
)

# Display the dimensions of the loaded expression matrix.
exp_data.expression_data.shape

(10772, 40280)

In [14]:
# Shorten the gene list to speed up PROPOSE: keep only the 10000 genes
# picked by SeuratMethod and build a reduced dataset from them.
seurat = SeuratMethod()
hv_selection = seurat.select_gene_panel(
    data=exp_data,
    size=10000,
    flavor='seurat_v3',
)
hv_data = exp_data.select_genes(hv_selection.gene_panel)

# Display the dimensions of the reduced expression matrix.
hv_data.expression_data.shape

(10772, 10000)

In [15]:
# Select the final 140-gene panel with PROPOSE from the reduced dataset.
propose = PROPOSEMethod()
selection = propose.select_gene_panel(
    data=hv_data,
    size=140,
    use_classes=True,
)

10772 total examples, 8617 training examples, 1078 validation examples, 1077 test examples
using CrossEntropyLoss, starting with lam = 0.0001


Training epochs:   0%|          | 0/500 [00:00<?, ?it/s]

lam = 0.000100 yielded 573 genes
done, lam = 0.000100 yielded 573 genes


Training epochs:   0%|          | 0/500 [00:00<?, ?it/s]

done, selected 140 genes


In [16]:
# Display the selected gene names as a plain list.
[*selection.gene_panel]

['RCSD1',
 'LOC100425469',
 'NTNG1',
 'COL11A1',
 'BCAR3',
 'CACHD1',
 'AJAP1',
 'CMTM8',
 'EPHB1',
 'P2RY14',
 'PLCH1',
 'LOC106997013',
 'CCK',
 'LOC106996877',
 'FRMD4B',
 'FSTL1',
 'CRYBG3',
 'LOC114676463',
 'LOC106995898',
 'PCP4',
 'ERG',
 'LOC106997315',
 'COBL',
 'CREB5',
 'LOC694004',
 'THSD7A',
 'TAC1',
 'HGF',
 'CHRM2',
 'SMOC2',
 'MAN1A1',
 'TPD52L1',
 'MOXD1',
 'NMBR',
 'SAMD5',
 'VIP',
 'OPRM1',
 'COL12A1',
 'LOC106997985',
 'LOC114677421',
 'C4H6orf141',
 'SPON2',
 'LOC106998299',
 'LOC114678288',
 'ADAMTS3',
 'LOC100499503',
 'HOPX',
 'NPNT',
 'COL25A1',
 'LOC106998467',
 'ARSJ',
 'LOC703224',
 'QRFPR',
 'GAB1',
 'NPY1R',
 'HPGD',
 'LOC106998686',
 'LOC106998719',
 'PLCXD3',
 'ARHGEF28',
 'LOC106995639',
 'FSTL4',
 'SH3RF2',
 'HTR4',
 'SGCD',
 'LOC114678966',
 'INSYN2B',
 'KCNIP1',
 'RYR3',
 'NR2F2',
 'PRKD1',
 'RGS6',
 'DPF3',
 'CCDC88C',
 'NKAIN3',
 'EYA1',
 'KCNB2',
 'STMN2',
 'ANGPT1',
 'PLXDC2',
 'LOC106996696',
 'SLC16A9',
 'LOC114669989',
 'CRTAC1',
 'CTBP2',
 '

In [None]:
# Report on what happened in the final selection process.
# `selection` is the object returned by propose.select_gene_panel(...) in an earlier cell.
selection.report()

In [None]:
# Evaluation of final gene panel
#   number of DE genes per cluster
#   overall expression levels too high/low?
#   includes genes used for sanity checking
#   excludes genes known not to work (per-ST-method)
#   classification accuracy, confusion matrix
#   manifold preservation
# NOTE(review): `evaluate_gene_panel` is neither imported nor defined in the cells
# visible here (the import cell brings in only ExpressionDataset, PROPOSEMethod and
# SeuratMethod), so this call raises NameError unless the name is supplied elsewhere.
evaluate_gene_panel(selection)