In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import nsforest as ns
from nsforest import utils
from nsforest import preprocessing as pp
from nsforest import nsforesting
from nsforest import evaluating as ev
from nsforest import plotting as pl

In [2]:
# Annotated mouse (Mmus) whole-brain atlas; kept in a named variable so the
# input file is easy to spot and swap.
h5ad_path = "/mnt/data01/yuanzhen/01.Vertebrate_cell_evo/01.data/02.atlas_final/2.samap/4.final/Mmus.wb.iter_cluster_annotated.h5ad"
adata = sc.read_h5ad(h5ad_path)

In [3]:
# Rebuild the expression matrix from the stored raw layer, then library-size
# normalize (10,000 counts per cell) and log1p-transform.
#
# The raw matrix is copied on purpose: scanpy's normalize_total/log1p may
# operate on X in place, and without the copy X and adata.raw would share one
# matrix object, so the stored raw counts could be overwritten and a re-run of
# this cell would start from already-transformed data.
# NOTE(review): assumes adata.raw holds untransformed counts with the same
# genes as adata.var — confirm for this file.
adata.X = adata.raw.X.copy()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [4]:
# --- Run configuration -------------------------------------------------------

# Prefix used when naming NS-Forest result files.
label = "Mmus.TFs"

# Column name passed to the NS-Forest functions as the cluster grouping.
cluster_header = "Refined family"

# Path to the predicted transcription-factor list for this species.
TF = "/mnt/data01/yuanzhen/01.Vertebrate_cell_evo/01.data/07.species_signals/6.TF_vs_species_signals/TFs/Mmus.predicted_TFs.txt"

# Destination directory for NS-Forest results (note the trailing slash).
output_folder = "/mnt/data01/yuanzhen/01.Vertebrate_cell_evo/01.data/03.markers/3.nsforest/Mmus/"

In [5]:
# Compute per-cluster median expression for every gene, grouped by
# `cluster_header`. Per the logged output, the result is saved in adata.varm
# under "medians_<cluster_header>" and only positive genes are selected
# (4274 of 18787 in the recorded run, which took ~137 s for 22 clusters).
# NOTE(review): the returned adata presumably contains only those selected
# genes, so this cell is not idempotent on re-run — confirm.
adata = ns.pp.prep_medians(adata, cluster_header)

Calculating medians...


Calculating medians (means) per cluster: 100%|██████████| 22/22 [02:16<00:00,  6.22s/it]


Saving calculated medians as adata.varm.medians_Refined family
--- 136.76024985313416 seconds ---
median: 0.0
mean: 0.060371134
std: 0.32646406
Only positive genes selected. 4274 positive genes out of 18787 total genes


In [6]:
# Compute per-cluster binary scores for every gene, grouped by
# `cluster_header`. Per the logged output, the result is saved in adata.varm
# under "binary_scores_<cluster_header>" (~82 s for 22 clusters in the
# recorded run).
adata = ns.pp.prep_binary_scores(adata, cluster_header)

Calculating binary scores...


Calculating binary scores per cluster: 100%|██████████| 22/22 [01:21<00:00,  3.70s/it]


Saving calculated binary scores as adata.varm.binary_scores_Refined family
--- 81.53088879585266 seconds ---
median: 0.0
mean: 0.12005337250115405
std: 0.2815771612594577


In [7]:
# Inspect the per-cluster medians table (genes x clusters). The varm key is
# built from `cluster_header` rather than hard-coded so this cell keeps
# working if the grouping column in the config cell is changed.
adata.varm[f"medians_{cluster_header}"]

Unnamed: 0_level_0,Astrocytes,Choroid plexus epithelial cells,Diencephalon cholinergic neurons,Diencephalon glutamatergic neurons,Ependymal cells,Immune cells,Mesencephalon GABAergic neurons,Mesencephalon cholinergic neurons,Mesencephalon glutamatergic neurons,Microglia,...,Olfactory ensheathing cells,Oligodendrocyte precursor cells,Oligodendrocytes,Peptidergic neurons,Rhombencephalon GABAergic neurons,Rhombencephalon cholinergic neurons,Rhombencephalon glutamatergic neurons,Telencephalon GABAergic neurons,Telencephalon glutamatergic neurons,Vascular cells
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Sox4,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
Rbms3,0.0,0.0,1.043302,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.206273,0.0,0.0,0.0,0.0
Tcf7l2,0.0,0.0,0.000000,2.574118,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
Kcnb2,0.0,0.0,0.607236,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.563470,0.0,0.0,0.0,0.0
Grin3a,0.0,0.0,0.917071,0.810723,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Actr10,0.0,0.0,0.649978,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.560476,0.0,0.0,0.0,0.0
Micu1,0.0,0.0,0.399424,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.468952,0.0,0.0,0.0,0.0
Usp20,0.0,0.0,0.594259,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.668984,0.0,0.0,0.0,0.0
Sh3glb2,0.0,0.0,0.800416,0.830624,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.770463,0.0,0.0,0.0,0.0


In [None]:
# Run NS-Forest marker selection for every cluster in `cluster_header`.
#
# Results go to the `output_folder` set in the config cell. The previous
# value "." was joined directly to the filename prefix (the log showed
# ".Mmus.TFs.Refined family_gene_selection.csv"), which produced hidden
# dotfiles in the working directory instead of files in the results folder.
# os.path.join(..., "") guarantees the trailing separator that this plain
# concatenation needs, and the directory is created up front.
# NOTE(review): `label` says "TFs", but no filtering of adata to the TF list
# is visible before this call — confirm the run is meant to use all genes.
nsforest_output_dir = os.path.join(output_folder, "")
os.makedirs(nsforest_output_dir, exist_ok=True)

results = nsforesting.NSForest(
    adata,
    cluster_header,
    save_supplementary=True,
    output_folder=nsforest_output_dir,
    outputfilename_prefix=f"{label}.{cluster_header}",
    gene_selection="BinaryFirst_high",
    n_top_genes=20,
    n_binary_genes=20,
)

Preparing adata...
--- 14.065593242645264 seconds ---
Pre-selecting genes based on binary scores...
	BinaryFirst_high Threshold (mean + 2 * std): 0.6832076950200694
	Average number of genes after gene_selection in each cluster: 422.27272727272725
Saving number of genes selected per cluster as...
.Mmus.TFs.Refined family_gene_selection.csv
Number of clusters to evaluate: 22
1 out of 22:
	Astrocytes
	Pre-selected 62 genes to feed into Random Forest.
	['Ntsr2']
	fbeta: 0.9142731466070664
	PPV: 0.946173720472441
	recall: 0.8056253928346951
2 out of 22:
	Choroid plexus epithelial cells
	Pre-selected 292 genes to feed into Random Forest.
	['Ttr']
	fbeta: 0.9847401049117788
	PPV: 0.9809976247030879
	recall: 1.0
3 out of 22:
	Diencephalon cholinergic neurons
	Pre-selected 2712 genes to feed into Random Forest.
	['Ngfr', 'Ntrk1', 'Slc18a3', 'Tmem176a']
	fbeta: 0.7529411764705882
	PPV: 0.8311688311688312
	recall: 0.5470085470085471
4 out of 22:
	Diencephalon glutamatergic neurons
	Pre-selected 8