In [1]:
import scanpy as sc
import anndata
import pandas as pd
from pathlib import Path
from ikarus import classifier, utils, data

In [2]:
# Read the integrated, annotated dataset from disk into an AnnData object.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook anywhere else.
adata = sc.read_h5ad(
    filename='/mnt/c/Users/ouyangkang/Downloads/integrate_tmp_3data_withAnno.h5ad',
)

In [3]:
# Display the AnnData summary: its dimensions and the names of the entries
# stored under obs / var / uns / obsm / varm / layers / obsp.
adata

AnnData object with n_obs × n_vars = 5406 × 22857
    obs: 'orig.ident', 'x', 'y', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'pct_counts_rb', 'total_counts_hb', 'pct_counts_hb', 'total_counts_hsp', 'pct_counts_hsp', 'chip', 'n_genes', 'leiden', 'anno'
    var: 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'anno_colors', 'chip_colors', 'hvg', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'rank_genes_groups', 'umap'
    obsm: 'X_pca', 'X_umap', 'spatial'
    varm: 'PCs'
    layers: 'raw'
    obsp: 'connectivities', 'distances'

In [4]:
# Location of the gene-signature file (GMT format) that is passed to the
# classifier and to the signature parser in later cells.
# NOTE(review): absolute, machine-specific path.
signatures_path = Path("/mnt/c/Users/ouyangkang/Downloads/signatures.gmt")

In [5]:
# Peek at the signature file as a headerless, tab-delimited table: one row per
# signature, with the gene names spread across the columns.
pd.read_table(signatures_path, header=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1305,1306,1307,1308,1309,1310,1311,1312,1313,1314
0,Normal,ikarus,RP11-128M1.1,TRAV8-2,PTPRD-AS1,MEOX2,CXCL12,KLRC4-KLRK1,BCAS1,SCNN1A,...,C22ORF15,CYP4F11,AK8,LRRC18,LMO2,COL12A1,ITGA11,EGFL6,RGS11,PCDHB15
1,Tumor,ikarus,RP11-277P12.10,RP13-895J2.6,BNC1,MAGEA6,ISX,MAGEA3,RP13-614K11.2,CDH7,...,,,,,,,,,,


In [14]:
# Build the classifier around the signature file; "./" is handed over as its
# output directory.
model = classifier.Ikarus(out_dir="./", signatures_gmt=signatures_path)

# Attach the pre-trained core model stored on disk.
# NOTE(review): a .joblib file is presumably unpickled when loaded, which can
# execute arbitrary code — only load model files from a trusted source.
model_path = Path("/mnt/c/Users/ouyangkang/Downloads/core_model.joblib")
model.load_core_model(model_path)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [117]:
# Reset X from the untouched counts kept in the 'raw' layer, then preprocess.
# The .copy() matters: assigning the layer directly would make X and
# layers['raw'] the very same matrix object, so any in-place transformation
# applied to X during preprocessing would also overwrite the raw counts, and
# re-running this cell would then preprocess already-transformed data.
# (Presumably preprocess_adata works in place, as scanpy-style preprocessing
# usually does — the copy is safe either way.)
adata.X = adata.layers['raw'].copy()
adata = data.preprocess_adata(adata)



In [90]:
# Show the repr of the expression matrix to check its storage format, dtype,
# number of stored elements and shape.
adata.X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3582309 stored elements and shape (5406, 22857)>

In [37]:
# Run the loaded classifier on the dataset under the name "prediction" with
# save=True; the return value is deliberately discarded by binding it to `_`.
# NOTE(review): the recorded run of this cell ended with
# "ValueError: Input X contains NaN." raised by LogisticRegression. The cause
# is not visible from this cell — possibly the signature scores fed to the
# model are degenerate (the AUCell scores inspected further down are all 0.0
# in the rows shown); confirm before relying on this step.
_ = model.predict(adata, "prediction", save=True)

Less than 80% of the genes in Normal are present in the expression matrix.
Less than 80% of the genes in Tumor are present in the expression matrix.


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [40]:
from pyscenic.aucell import aucell, derive_auc_threshold
from ctxcore.genesig import GeneSignature

In [45]:
# Parse the GMT file into gene-signature objects. Tabs delimit both the
# leading fields of each line and the gene names that follow them.
gs = GeneSignature.from_gmt(
    str(signatures_path),
    gene_separator="\t",
    field_separator="\t",
)

In [118]:
# Export the expression matrix as a pandas DataFrame, the input format used by
# the pySCENIC calls below.
# NOTE(review): X is sparse with shape (5406, 22857); the DataFrame is
# presumably dense, so expect a large memory footprint — confirm.
df = adata.to_df()

In [92]:
# Derive candidate AUC thresholds from the expression table. The result is a
# float Series indexed by quantile (0.01, 0.05, 0.10, 0.50, 1.00), as the
# printed output shows.
percentiles = derive_auc_threshold(df)
print(percentiles)

0.01    0.008925
0.05    0.009669
0.10    0.010588
0.50    0.021875
1.00    0.150195
dtype: float64


In [120]:
# Score every signature in each cell with AUCell, using the threshold stored
# under the 0.01 quantile. Scores are left un-normalised and the seed is fixed
# so repeated runs give the same result.
scores = aucell(
    exp_mtx=df,
    signatures=gs,
    auc_threshold=percentiles.loc[0.01],
    normalize=False,
    seed=2,
)

Less than 80% of the genes in Tumor are present in the expression matrix.
Less than 80% of the genes in Normal are present in the expression matrix.


In [122]:
# Inspect the per-cell signature scores (one column per signature).
# NOTE(review): every row shown in the recorded output is 0.0 for both Normal
# and Tumor, which suggests the scoring step is not picking up any signal.
scores

Regulon,Normal,Tumor
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1
751.0-C03427D4,0.0,0.0
438.0-C03427D4,0.0,0.0
816.0-C03427D4,0.0,0.0
491.0-C03427D4,0.0,0.0
790.0-C03427D4,0.0,0.0
...,...,...
890.0-A04230E2,0.0,0.0
754.0-A04230E2,0.0,0.0
654.0-A04230E2,0.0,0.0
624.0-A04230E2,0.0,0.0


In [108]:
# List the gene names of the first parsed signature (gene2weight is keyed by
# gene name); the recorded output matches the "Normal" row of the GMT file.
gs[0].gene2weight.keys()

dict_keys(['RP11-128M1.1', 'TRAV8-2', 'PTPRD-AS1', 'MEOX2', 'CXCL12', 'KLRC4-KLRK1', 'BCAS1', 'SCNN1A', 'HCST', 'OR51E1', 'PCLO', 'CD7', 'COL1A2', 'CFAP100', 'CCDC60', 'WNT4', 'CFAP45', 'RP11-522B15.3', 'RP11-462G2.1', 'ANKRD66', 'CYTIP', 'HSPB2', 'USH1C', 'NCR3', 'SMIM24', 'MYL6B', 'CD79A', 'TRBV5-1', 'IGKV2D-28', 'CDH6', 'XDH', 'AK7', 'COL5A1', 'PTPRC', 'EBF2', 'IL17F', 'CD48', 'GATA6-AS1', 'RP11-109M17.2', 'DPT', 'COL15A1', 'C5ORF49', 'CD8B', 'PLS1', 'ODAM', 'CCDC114', 'TM6SF2', 'RASL12', 'ECT2L', 'TAGLN', 'ANKUB1', 'RSPH4A', 'SIRPG', 'LINC01133', 'CD52', 'TRAV8-6', 'GIP', 'RP11-428G5.5', 'PTPN22', 'AC092580.4', 'GJC1', 'LINC01207', 'MSLN', 'ABLIM3', 'FAM180A', 'LINC01266', 'RP11-247C2.2', 'TM4SF4', 'FBXO15', 'TRPC4', 'FMO1', 'GNGT1', 'AC104820.2', 'FHL5', 'SFRP4', 'CREM', 'SOX2', 'RSPH1', 'SLC44A4', 'NTF3', 'EMILIN1', 'OGN', 'XCL2', 'PCGF2', 'ADAMTSL1', 'CORIN', 'STMND1', 'KLRC3', 'RSPH9', 'XIRP1', 'CD36', 'RBBP8NL', 'TOX3', 'DUOXA2', 'PRDM6', 'KCNA3', 'SERPINB4', 'CLDN16', 'RP11-3

In [53]:
# Spot-check that a single signature gene is present among the dataset's
# variable (gene) names.
"PCLO" in adata.var_names

True

In [109]:
# Keep only those of the first 17 "Normal" signature genes that also occur in
# the dataset's gene names, preserving their original order.
[
    gene
    for gene in [
        'RP11-128M1.1', 'TRAV8-2', 'PTPRD-AS1', 'MEOX2', 'CXCL12',
        'KLRC4-KLRK1', 'BCAS1', 'SCNN1A', 'HCST', 'OR51E1', 'PCLO',
        'CD7', 'COL1A2', 'CFAP100', 'CCDC60', 'WNT4', 'CFAP45',
    ]
    if gene in adata.var_names
]

['TRAV8-2',
 'MEOX2',
 'CXCL12',
 'KLRC4-KLRK1',
 'BCAS1',
 'SCNN1A',
 'HCST',
 'PCLO',
 'CD7',
 'COL1A2',
 'CFAP100',
 'CCDC60',
 'WNT4',
 'CFAP45']

In [None]:
# NOTE(review): this cell was never executed (In [None]) and holds only a list
# literal identical to the output of the preceding cell — it looks like a
# pasted copy of that output and is a candidate for removal.
['TRAV8-2',
 'MEOX2',
 'CXCL12',
 'KLRC4-KLRK1',
 'BCAS1',
 'SCNN1A',
 'HCST',
 'PCLO',
 'CD7',
 'COL1A2',
 'CFAP100',
 'CCDC60',
 'WNT4',
 'CFAP45']