In [7]:
import pandas
import os
from gene_panel_selection import ExpressionDataset, PROPOSEMethod

In [8]:
# load data manually:
# annotations = pandas.read_feather('anno.feather').set_index('sample_id')
# genes = pandas.read_feather('data/genes.feather')
# expression = pandas.read_feather('data/expression_matrix_not_normalized.feather')
# expression = expression.set_index(genes['gene']).T

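# NOTE(review): the `.T` above leaves samples as rows and genes as columns, which
# disagrees with the "one gene per row" description of `expression` below —
# confirm which orientation ExpressionDataset expects.
# exp_data = ExpressionDataset(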
#     expression=expression,  # dataframe, one gene per row, index is gene names, columns are samples
#     annotations=annotations,  # dataframe, index is sample names, has columns 'cluster', 'subclass', 'class'
# )

In [None]:
# Alternative to loading manually: for data stored in feather files, hand the
# four file paths to ExpressionDataset.load_arrow and let it do the loading.
data_path = '../../../data/macaque'
arrow_file_names = {
    'gene_file': 'genes.feather',
    'annotation_file': 'anno.feather',
    'expression_file': 'expression_matrix_not_normalized.feather',
    'cpm_file': 'expression_matrix_cpm_normalized.feather',
}
# Resolve every file name against data_path and pass the results as keyword arguments.
exp_data = ExpressionDataset.load_arrow(
    **{
        arg_name: os.path.join(data_path, file_name)
        for arg_name, file_name in arrow_file_names.items()
    }
)

In [None]:
# Pre-selection step: pick the genes with highly variable expression
# (method 'seurat_hv') from the full dataset.
# NOTE(review): select_gene_panel is not imported in the cells visible here —
# confirm it is in scope (e.g. exported by gene_panel_selection) before running.
hv_gene_count = 5000
hv_selection = select_gene_panel(
    method='seurat_hv',
    size=hv_gene_count,
    data=exp_data,
)

In [None]:
# Report on what happened in the pre-selection process
# The call is the last expression in the cell, so whatever report() returns or
# prints is what the notebook shows here.
# NOTE(review): the form of the report (text, table, figures) is not visible from this notebook.
hv_selection.report()

In [None]:
# Final selection step: run the 'PROPOSE' method on the dataset derived from the
# pre-selection (presumably restricted to the pre-selected genes — confirm).
# NOTE(review): select_gene_panel is not imported in the cells visible here —
# confirm it is in scope (e.g. exported by gene_panel_selection) before running.
final_panel_size = 140
hv_exp_data = hv_selection.expression_dataset()
final_selection = select_gene_panel(
    method='PROPOSE',
    size=final_panel_size,
    data=hv_exp_data,
)

In [None]:
# Report on what happened in the final selection process
# The call is the last expression in the cell, so whatever report() returns or
# prints is what the notebook shows here.
# NOTE(review): the form of the report (text, table, figures) is not visible from this notebook.
final_selection.report()

In [None]:
# Evaluation of final gene panel
# NOTE(review): the list below reads as the intended evaluation criteria; which of
# them evaluate_gene_panel actually covers is not visible from this notebook — confirm.
#   number of DE genes per cluster
#   overall expression levels too high/low?
#   includes genes used for sanity checking
#   excludes genes known not to work (per-ST-method)
#   classification accuracy, confusion matrix
#   manifold preservation
# NOTE(review): evaluate_gene_panel is not imported in the cells visible here —
# confirm it is in scope (e.g. exported by gene_panel_selection) before running.
evaluate_gene_panel(final_selection)