In [None]:
import os
import sys
import pandas as pd
import scanpy as sc

sys.path.append("../..")

from src.utils.utils import get_gene_list_real_data
from src.data.preprocess_data import preprocess
from src.scoring_methods.gene_signature_scoring import score_signature
from data.constants import BASE_PATH_DATA, BASE_PATH_EXPERIMENTS

sc.settings.verbosity = 1


## First attempt for construction
Densify the entire matrix and compute the rank and the odds-ratio/likelihood measures on it.
NOTE: this does not scale if we want to score multiple samples at the same time.

In [None]:
# Load the P23T sample and drop the cells whose 'healthy' label is 'undecided'.
adata = sc.read_h5ad('../data/real_data/P23T_adata.h5ad')
# Boolean indexing returns a *view* of the parent AnnData. Materialise it so
# that later processing with copy=False works on an independent object
# instead of triggering an implicit copy of the view.
adata = adata[adata.obs['healthy'] != 'undecided', :].copy()


In [None]:
# Run the project's preprocessing directly on `adata` (copy=False).
preprocess(
    adata,
    min_genes=500,
    min_cells=10,
    target_sum=1e4,
    copy=False,
    verbose=1,
    log=None,
)


In [None]:
# Settings handed to the project's signature-selection helper.
dge_settings = dict(
    dge_method="wilcoxon",
    dge_key="wilcoxon",
    dge_pval_cutoff=0.01,
    dge_log2fc_min=0.0025,
    nr_de_genes=100,
    mode="random",
    log='get_gene_list',
    copy=False,
    verbose=1,
)
gene_list = get_gene_list_real_data(adata, **dge_settings)


In [None]:
# Dense cells x genes DataFrame of the expression matrix.
# `toarray()` yields a plain ndarray (`todense()` returns the legacy
# `np.matrix` type); an already dense `X` is used as is instead of raising
# AttributeError.
X_dense = adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X
data_df = pd.DataFrame(X_dense, index=adata.obs_names, columns=adata.var_names)
del X_dense

In [None]:
def rank_calculation(cell_data, genes):
    """Mean rank of the signature genes among the expressed genes of one cell.

    Parameters
    ----------
    cell_data : pandas.Series
        Expression values of a single cell, indexed by gene name.
    genes : list-like
        Names of the signature genes.

    Returns
    -------
    float
        Mean rank of the expressed signature genes divided by the number of
        expressed (non-zero) genes. ``0.0`` if no signature gene is expressed
        or if the cell has no expressed gene at all.
    """
    expressed = cell_data[cell_data != 0]
    n_expressed = len(expressed)
    if n_expressed == 0:
        # Without this guard the function ended in ``0 / 0`` and raised
        # ZeroDivisionError for cells without any non-zero value.
        return 0.0

    ranks = expressed.rank(na_option='bottom')
    sig_ranks = ranks[ranks.index.isin(genes)]
    if len(sig_ranks) == 0:
        return 0.0
    return sig_ranks.mean(skipna=True) / n_expressed

In [None]:
def or_calculation(data, genes):
    """Odds-ratio style enrichment of the signature genes for every cell.

    Parameters
    ----------
    data : pandas.DataFrame
        Dense expression matrix, cells in rows and genes in columns.
    genes : list-like
        Names of the signature genes.

    Returns
    -------
    pandas.Series
        One value per row of ``data``:
        ``(SigGenesExp * NSigGenesNE) / (SigGenesNE * NSigGenesExp)``.
    """
    is_sig = data.columns.isin(genes)
    ge = data.loc[:, is_sig]
    nge = data.loc[:, ~is_sig]

    # Number of expressed (non-zero) genes per cell, counted vectorised
    # instead of with a Python lambda per row.
    SigGenesExp = ge.ne(0).sum(axis=1)
    NSigGenesExp = nge.ne(0).sum(axis=1)

    # Both quantities end up in the denominator, so zeros are replaced by one.
    SigGenesNE = ge.shape[1] - SigGenesExp
    SigGenesNE = SigGenesNE.replace(0, 1)

    NSigGenesExp = NSigGenesExp.replace(0, 1)

    # NOTE(review): the count starts from the number of non-signature genes
    # (`nge.shape[1]`); the reference JASMINE implementation appears to start
    # from the total number of genes -- confirm which one is intended.
    NSigGenesNE = nge.shape[1] - (SigGenesExp + NSigGenesExp)
    NSigGenesNE = NSigGenesNE - SigGenesNE

    OR_score = (SigGenesExp * NSigGenesNE) / (SigGenesNE * NSigGenesExp)

    return OR_score



In [None]:
def likelihood_calculation(data, genes):
    """Likelihood-ratio style enrichment of the signature genes for every cell.

    Parameters
    ----------
    data : pandas.DataFrame
        Dense expression matrix, cells in rows and genes in columns.
    genes : list-like
        Names of the signature genes.

    Returns
    -------
    pandas.Series
        One value per row of ``data``:
        ``SigGenesExp * (NSigGenesExp + NSigGenesNE)`` divided by
        ``NSigGenesExp * (SigGenesExp + SigGenesNE)``.
    """
    is_sig = data.columns.isin(genes)
    ge = data.loc[:, is_sig]
    nge = data.loc[:, ~is_sig]

    # Number of expressed (non-zero) genes per cell, counted vectorised
    # instead of with a Python lambda per row.
    SigGenesExp = ge.ne(0).sum(axis=1)
    NSigGenesExp = nge.ne(0).sum(axis=1)

    SigGenesNE = ge.shape[1] - SigGenesExp
    SigGenesNE = SigGenesNE.replace(0, 1)

    # Appears in the denominator below, so zeros are replaced by one.
    NSigGenesExp = NSigGenesExp.replace(0, 1)

    # NOTE(review): starts from the number of non-signature genes rather than
    # the total number of genes -- confirm against the reference method.
    NSigGenesNE = nge.shape[1] - (SigGenesExp + NSigGenesExp)
    NSigGenesNE = NSigGenesNE - SigGenesNE

    LR_one = SigGenesExp * (NSigGenesExp + NSigGenesNE)
    LR_two = NSigGenesExp * (SigGenesExp + SigGenesNE)
    LR_score = LR_one / LR_two

    return LR_score



In [None]:
# Rank component: one value per cell (row of `data_df`).
ranked_data_df = data_df.apply(rank_calculation, axis=1, genes=gene_list)

In [None]:
# Min-max scale the rank component.
rank_lo, rank_hi = ranked_data_df.min(), ranked_data_df.max()
ranked_data_df = (ranked_data_df - rank_lo) / (rank_hi - rank_lo)

In [None]:
# Odds-ratio component, min-max scaled.
or_scores = or_calculation(data_df, gene_list)
or_lo, or_hi = or_scores.min(), or_scores.max()
or_scores = (or_scores - or_lo) / (or_hi - or_lo)

In [None]:
# Likelihood component, min-max scaled.
lr_scores = likelihood_calculation(data_df, gene_list)
lr_lo, lr_hi = lr_scores.min(), lr_scores.max()
lr_scores = (lr_scores - lr_lo) / (lr_hi - lr_lo)

In [None]:
# Final score, variant 1: mean of the odds-ratio and the rank component.
JAS_Scores_1 = or_scores.add(ranked_data_df).div(2)

In [None]:
# Final score, variant 2: mean of the likelihood and the rank component.
JAS_Scores_2 = lr_scores.add(ranked_data_df).div(2)

In [None]:
# Show both score variants.
for jas_scores in (JAS_Scores_1, JAS_Scores_2):
    print(jas_scores)

In [None]:
# Attach both score variants to the cell annotations.
for obs_key, obs_values in (('JAS_Scores_1', JAS_Scores_1), ('JAS_Scores_2', JAS_Scores_2)):
    adata.obs[obs_key] = obs_values

In [None]:
from src.utils.utils import get_test_statistics
from src.scoring_methods.gene_signature_scoring import score_signature

In [None]:
# Score the same signature with the library implementation, once per variant.
for variant, obs_key in (('likelihood', 'jasmine_lh'), ('oddsratio', 'jasmine_or')):
    score_signature(
        method='jasmine_scoring',
        adata=adata,
        gene_list=gene_list,
        score_method=variant,
        score_name=obs_key,
    )

In [None]:
# Notebook scores next to their library counterparts.
score_cols = ['JAS_Scores_1', 'jasmine_or', 'JAS_Scores_2', 'jasmine_lh']
test_stat = get_test_statistics(
    adata,
    score_cols,
    test_method="auc",
    label_col='healthy',
    label_whsc='unhealthy',
    save=False,
    store_data_path=None,
)
test_stat

## Second attempt for construction
Goal: we want to be able to run JASMINE scoring on all of the data.

How: we work on a large dataset.

In [None]:
# Load the multi-sample dataset and continue with the content of its `.raw` slot.
multi_h5ad = os.path.join(BASE_PATH_DATA, 'real_data/multi.h5ad')
adata = sc.read_h5ad(multi_h5ad).raw.to_adata()
# Use the '_index' column as gene names and remove the index label.
adata.var_names = adata.var['_index']
adata.var_names.name = None

In [None]:
# TODO define path
#adata = sc.read_h5ad('../data/synthetic_data/8B_diff_groups_ratio_w_BE/dataset.hdf5')

In [None]:
# One independent AnnData per 'orig.ident' group.
# (alternative grouping column: adata.obs.groupby('Batch'))
adatas = {
    sample_id: adata[sample_obs.index,].copy()
    for sample_id, sample_obs in adata.obs.groupby('orig.ident')
}
del adata

# Preprocess every group on its own.
for adata in adatas.values():
    preprocess(adata, min_genes=500, min_cells=5, target_sum=1e4)

In [None]:
# Merge the per-sample objects again; join='inner' keeps the shared genes only.
adata = sc.concat([*adatas.values()], join='inner', merge='same')
adata

In [None]:
#adata.obs.Batch.astype('category')

In [None]:
# Keep one sample for the row-wise experiment and release the rest.
curr_adata = adatas.pop('P1_0')
del adatas

In [None]:
#gene_list = adata.var.DEFacGroup1.nlargest(20).index.tolist()

In [None]:
# Signature: the 20 genes with the largest average log fold change for
# the 'CD4 Proliferating' cell type.
DE_of_celltypes = pd.read_csv(os.path.join(BASE_PATH_DATA, 'real_data/DE_by_celltype.csv'))
is_cd4_prolif = DE_of_celltypes['Cell Type'] == 'CD4 Proliferating'
gene_list = (
    DE_of_celltypes[is_cd4_prolif]
    .nlargest(20, columns=['Average Log Fold Change'])['Gene']
    .values
    .tolist()
)
gene_list

Rank sparse matrix

In [None]:
from scipy.stats import rankdata
import numpy as np
from scipy.sparse import issparse

In [None]:
# Ad-hoc inspection: display the (row, column) index arrays of the non-zero entries.
curr_adata.X.nonzero()

In [None]:
# Column positions of the signature genes. They are compared with the column
# indices of `curr_adata.X`, so they must be looked up in
# `curr_adata.var_names`; the concatenated `adata` only holds the genes shared
# by all samples (join='inner') and therefore has a different column layout.
gene_list_idx = np.where(curr_adata.var_names.isin(gene_list))[0]

In [None]:
%%time
avg_sig_ranks = np.zeros(curr_adata.X.shape[0])
for i in range(curr_adata.X.shape[0]):
    curr_row = curr_adata.X[i,:]
    print(curr_row.getnnz())
    ranked_data = rankdata(curr_row.data)
    curr_row_sig_idx = [x in gene_list_idx for x in curr_row.indices]
    
    sig_genes_ranks = ranked_data[curr_row_sig_idx]
    
    if len(sig_genes_ranks) > 0:
        cumsum = np.nanmean(sig_genes_ranks)
    else:
        cumsum = 0
    
    avg_sig_ranks[i] = cumsum / len(curr_row.nonzero()[1])
    
#this is quite slow 

In [None]:
def rank_calculation(cell_data, genes):
    """Mean rank of the signature genes among the expressed genes of one cell.

    Parameters
    ----------
    cell_data : pandas.Series
        Expression values of a single cell, indexed by gene name.
    genes : list-like
        Names of the signature genes.

    Returns
    -------
    float
        Mean rank of the expressed signature genes divided by the number of
        expressed (non-zero) genes. ``0.0`` if no signature gene is expressed
        or if the cell has no expressed gene at all.
    """
    expressed = cell_data[cell_data != 0]
    n_expressed = len(expressed)
    if n_expressed == 0:
        # Without this guard the function ended in ``0 / 0`` and raised
        # ZeroDivisionError for cells without any non-zero value.
        return 0.0

    ranks = expressed.rank(na_option='bottom')
    sig_ranks = ranks[ranks.index.isin(genes)]
    if len(sig_ranks) == 0:
        return 0.0
    return np.nanmean(sig_ranks) / n_expressed

In [None]:
%%time
sparse_X = issparse(adata.X)

# create groups of managable sizes
bss = pd.cut(np.arange(adata.obs.shape[0]), (adata.obs.shape[0] // 1000 + 1), labels=False)

# for each group compute for each cell the ranks of the genes and select the ranks that belong to the signature
# genes
avg_sig_ranks = []
for group in adata.obs.groupby(bss):
    if sparse_X:
        data_df = pd.DataFrame(
            adata[group[1].index,].X.todense(), index=group[1].index, columns=adata.var_names
        )
    else:
        data_df = pd.DataFrame(
            adata[group[1].index,].X, index=group[1].index, columns=adata.var_names
        )
    res = data_df.apply(func=(lambda x: rank_calculation(x, gene_list)), axis=1)
    avg_sig_ranks.append(res)
del data_df
del res
del bss

In [None]:
# Stitch the per-batch results together and min-max scale them.
avg_sig_ranks = pd.concat(avg_sig_ranks, axis=0)
rank_lo, rank_hi = avg_sig_ranks.min(), avg_sig_ranks.max()
avg_sig_ranks = (avg_sig_ranks - rank_lo) / (rank_hi - rank_lo)

In [None]:
def preparation(adata, genes):
    """Per-cell counts of (non-)expressed signature and non-signature genes.

    The function now uses its ``genes`` argument and inspects ``adata.X``
    itself; it previously read the notebook globals ``gene_list`` and
    ``sparse_X`` and silently ignored ``genes``.

    Parameters
    ----------
    adata : AnnData
        Expression data, cells in rows and genes in columns.
    genes : list of str
        Signature genes; used to index ``adata`` by name.

    Returns
    -------
    tuple of pandas.Series
        ``(SigGenesExp, SigGenesNE, NSigGenesExp, NSigGenesNE)``, each indexed
        by ``adata.obs_names``.
    """
    is_sig = adata.var_names.isin(genes)

    SG_X = adata[:, genes].X
    NSG_X = adata[:, ~is_sig].X

    # Column labels in the same order as the columns of NSG_X (a set
    # difference would return them in arbitrary order).
    NSG = adata.var_names[~is_sig]

    if issparse(adata.X):
        ge = pd.DataFrame.sparse.from_spmatrix(SG_X, index=adata.obs_names, columns=genes)
        nge = pd.DataFrame.sparse.from_spmatrix(NSG_X, index=adata.obs_names, columns=NSG)
    else:
        ge = pd.DataFrame(SG_X, index=adata.obs_names, columns=genes)
        nge = pd.DataFrame(NSG_X, index=adata.obs_names, columns=NSG)

    # Non-zero entries per cell.
    SigGenesExp = ge.astype(bool).sum(axis=1)
    NSigGenesExp = nge.astype(bool).sum(axis=1)

    # Both are used as denominators by the callers: replace zeros by one.
    SigGenesNE = ge.shape[1] - SigGenesExp
    SigGenesNE = SigGenesNE.replace(0, 1)

    NSigGenesExp = NSigGenesExp.replace(0, 1)

    # NOTE(review): starts from the number of non-signature genes rather than
    # the total number of genes -- confirm against the reference method.
    NSigGenesNE = nge.shape[1] - (SigGenesExp + NSigGenesExp)
    NSigGenesNE = NSigGenesNE - SigGenesNE

    return SigGenesExp, SigGenesNE, NSigGenesExp, NSigGenesNE


def or_calculation(adata, genes):
    """Per-cell odds-ratio component built from the counts of ``preparation``."""
    sig_exp, sig_not_exp, bg_exp, bg_not_exp = preparation(adata, genes)
    numerator = sig_exp * bg_not_exp
    denominator = sig_not_exp * bg_exp
    return numerator / denominator


def likelihood_calculation(adata, genes):
    """Per-cell likelihood component built from the counts of ``preparation``."""
    sig_exp, sig_not_exp, bg_exp, bg_not_exp = preparation(adata, genes)
    numerator = sig_exp * (bg_exp + bg_not_exp)
    denominator = bg_exp * (sig_exp + sig_not_exp)
    return numerator / denominator

In [None]:
%%time
scores = or_calculation(adata, gene_list)

In [None]:
# Min-max scale the enrichment component and average it with the rank component.
score_lo, score_hi = scores.min(), scores.max()
scores = (scores - score_lo) / (score_hi - score_lo)
score = (scores + avg_sig_ranks) / 2

In [None]:
# Score distribution of the target cell type versus all other cells
# (odds-ratio based score).
is_cd4_prolif = adata.obs['celltype.l3'] == 'CD4 Proliferating'
ax = score[is_cd4_prolif].hist(density=True, alpha=0.5, label='CD4 Proliferating')
score[~is_cd4_prolif].hist(ax=ax, density=True, alpha=0.5, label=' not CD4 Proliferating')
# The `label=` values are only shown once a legend is requested.
ax.set(title='Odds-ratio based score', xlabel='score', ylabel='density')
ax.legend();

In [None]:
%%time
scores = likelihood_calculation(adata, gene_list)

In [None]:
# Min-max scale the enrichment component and average it with the rank component.
score_lo, score_hi = scores.min(), scores.max()
scores = (scores - score_lo) / (score_hi - score_lo)
score = (scores + avg_sig_ranks) / 2

In [None]:
# Score distribution of the target cell type versus all other cells
# (likelihood based score).
is_cd4_prolif = adata.obs['celltype.l3'] == 'CD4 Proliferating'
ax = score[is_cd4_prolif].hist(density=True, alpha=0.5, label='CD4 Proliferating')
score[~is_cd4_prolif].hist(ax=ax, density=True, alpha=0.5, label=' not CD4 Proliferating')
# The `label=` values are only shown once a legend is requested.
ax.set(title='Likelihood based score', xlabel='score', ylabel='density')
ax.legend();

### Parallelizing the scoring method

In [None]:
# `display` and `HTML` belong to the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Widen the notebook's `.container` element to 90% of the window.
display(HTML("<style>.container { width:90% !important; }</style>"))

In [None]:
# 'esophag' dataset and the 'Mes' signature column used for the timings below.
esophag_h5ad = os.path.join(BASE_PATH_DATA, 'real_data/esophag/preoprocessed_data.h5ad')
orig_adata = sc.read_h5ad(esophag_h5ad)

mes_sig_csv = os.path.join(BASE_PATH_DATA, 'annotations/esophag/genesig_Mes.csv')
mes_sig = pd.read_csv(mes_sig_csv)['Mes'].tolist()

In [None]:
%%time
score_signature(method="jasmine_scoring",
                adata=orig_adata,
                gene_list=mes_sig,
                score_method = 'likelihood',
                score_name='mes_sig_scores_lh'
                )

In [None]:
%%time
score_signature(method="jasmine_scoring",
                adata=orig_adata,
                gene_list=mes_sig,
                score_method = 'oddsratio',
                score_name='mes_sig_scores_or'
                )

In [None]:
import sys
from typing import Optional, Sequence

import numpy as np
import pandas as pd
import scanpy as sc
from anndata import AnnData
from scanpy._utils import AnyRandom, _check_use_raw
from scipy.sparse import issparse, isspmatrix_csr,isspmatrix_csc, csr_matrix
import multiprocessing
from joblib import Parallel, delayed
import warnings

sys.path.append("../..")

from src.utils.utils import check_signature_genes


def rank_calculation(cell_data, genes):
    """Mean rank of the signature genes among the expressed genes of one cell.

    Parameters
    ----------
    cell_data : pandas.Series
        Expression values of a single cell, indexed by gene name.
    genes : list-like
        Names of the signature genes.

    Returns
    -------
    float
        Mean rank of the expressed signature genes divided by the number of
        expressed (non-zero) genes. ``0.0`` if no signature gene is expressed
        or if the cell has no expressed gene at all.
    """
    expressed = cell_data[cell_data != 0]
    n_expressed = len(expressed)
    if n_expressed == 0:
        # Without this guard the function ended in ``0 / 0`` and raised
        # ZeroDivisionError for cells without any non-zero value.
        return 0.0

    ranks = expressed.rank(na_option='bottom')
    sig_ranks = ranks[ranks.index.isin(genes)]
    if len(sig_ranks) == 0:
        return 0.0
    return sig_ranks.mean(skipna=True) / n_expressed

def compute_avg_ranks_sig_subset(X_data, index, columns, gene_list, X_indices=None, X_indptr=None, X_shape=None):
    """Rank component for one batch of cells.

    With ``X_indices``, ``X_indptr`` and ``X_shape`` all given, ``X_data`` is
    taken as the value array of a CSR matrix, which is rebuilt and densified.
    If any of them is ``None``, ``X_data`` is used directly as the batch
    matrix. Returns the result of applying ``rank_calculation`` to every row.
    """
    csr_parts = (X_indices, X_indptr, X_shape)
    if all(part is not None for part in csr_parts):
        batch_matrix = csr_matrix((X_data, X_indices, X_indptr), X_shape, copy=True).todense()
    else:
        batch_matrix = X_data

    data_df = pd.DataFrame(batch_matrix, index=index, columns=columns)
    return data_df.apply(func=(lambda cell: rank_calculation(cell, gene_list)), axis=1)

def compute_avg_ranks_signature(adata, sparse_X, gene_list, bs, joblib_kwargs):
    """Rank component for all cells, computed batch-wise with joblib.

    Parameters
    ----------
    adata : AnnData
        Expression data. When ``sparse_X`` is true, the batch matrix is sent
        to the workers as its ``data``/``indices``/``indptr`` arrays, which
        ``compute_avg_ranks_sig_subset`` reassembles as CSR.
    sparse_X : bool
        Whether ``adata.X`` is sparse.
    gene_list : list of str
        Signature genes.
    bs : int
        Approximate number of cells per batch.
    joblib_kwargs : dict
        Keyword arguments forwarded to ``joblib.Parallel``.

    Returns
    -------
    pandas.Series
        Concatenated per-batch results of ``compute_avg_ranks_sig_subset``.
    """
    # Assign every cell to one of `n_cells // bs + 1` equally wide batches.
    n_cells = adata.obs.shape[0]
    bss = pd.cut(np.arange(n_cells), (n_cells // bs + 1), labels=False)

    def _batch_kwargs(batch_index):
        # Slice the matrix once per batch; the previous version repeated the
        # slicing for every array that was handed to the worker.
        X_batch = adata[batch_index,].X
        kwargs = dict(index=batch_index, columns=adata.var_names, gene_list=gene_list)
        if sparse_X:
            kwargs.update(
                X_data=X_batch.data,
                X_indices=X_batch.indices,
                X_indptr=X_batch.indptr,
                X_shape=X_batch.shape,
            )
        else:
            kwargs.update(X_data=X_batch, X_indices=None, X_indptr=None, X_shape=None)
        return kwargs

    avg_sig_ranks = Parallel(**joblib_kwargs)(
        delayed(compute_avg_ranks_sig_subset)(**_batch_kwargs(obs_batch.index))
        for _, obs_batch in adata.obs.groupby(bss)
    )
    return pd.concat(avg_sig_ranks, axis=0)

def preparation(adata, genes):
    """Per-cell counts of (non-)expressed signature and non-signature genes.

    Parameters
    ----------
    adata : AnnData
        Expression data, cells in rows and genes in columns.
    genes : list of str
        Signature genes; used to index ``adata`` by name.

    Returns
    -------
    tuple of pandas.Series
        ``(sig_genes_exp, sig_genes_ne, n_sig_genes_exp, n_sig_genes_ne)``,
        each indexed by ``adata.obs_names``.
    """
    is_sig = adata.var_names.isin(genes)

    sg_x = adata[:, genes].X
    nsg_x = adata[:, ~is_sig].X

    # Column labels in the same order as the columns of nsg_x (a set
    # difference would return them in arbitrary order).
    nsg = adata.var_names[~is_sig]

    if issparse(adata.X):
        ge = pd.DataFrame.sparse.from_spmatrix(sg_x, index=adata.obs_names, columns=genes)
        nge = pd.DataFrame.sparse.from_spmatrix(nsg_x, index=adata.obs_names, columns=nsg)
    else:
        ge = pd.DataFrame(sg_x, index=adata.obs_names, columns=genes)
        nge = pd.DataFrame(nsg_x, index=adata.obs_names, columns=nsg)

    # Non-zero entries per cell.
    sig_genes_exp = ge.astype(bool).sum(axis=1)
    n_sig_genes_exp = nge.astype(bool).sum(axis=1)

    # Both are used as denominators by the callers: replace zeros by one.
    sig_genes_ne = ge.shape[1] - sig_genes_exp
    sig_genes_ne = sig_genes_ne.replace(0, 1)

    n_sig_genes_exp = n_sig_genes_exp.replace(0, 1)

    # NOTE(review): starts from the number of non-signature genes rather than
    # the total number of genes -- confirm against the reference method.
    n_sig_genes_ne = nge.shape[1] - (sig_genes_exp + n_sig_genes_exp)
    n_sig_genes_ne = n_sig_genes_ne - sig_genes_ne

    return sig_genes_exp, sig_genes_ne, n_sig_genes_exp, n_sig_genes_ne


def or_calculation(adata, genes):
    """Per-cell odds-ratio component built from the counts of ``preparation``."""
    sig_exp, sig_not_exp, bg_exp, bg_not_exp = preparation(adata, genes)
    numerator = sig_exp * bg_not_exp
    denominator = sig_not_exp * bg_exp
    return numerator / denominator


def likelihood_calculation(adata, genes):
    """Per-cell likelihood component built from the counts of ``preparation``."""
    sig_exp, sig_not_exp, bg_exp, bg_not_exp = preparation(adata, genes)
    numerator = sig_exp * (bg_exp + bg_not_exp)
    denominator = bg_exp * (sig_exp + sig_not_exp)
    return numerator / denominator


def score_genes(
        adata: AnnData,
        gene_list: Sequence[str],
        score_method: str = 'likelihood',
        bs: int = 500,
        score_name: str = "score",
        random_state: AnyRandom = 0,
        copy: bool = False,
        use_raw: Optional[bool] = None,
        verbose: int = 0,
        joblib_kwargs: Optional[dict] = None
) -> Optional[AnnData]:
    """Score every cell for a gene signature and store it in ``adata.obs``.

    The score is the mean of two min-max scaled components: the average rank
    of the signature genes (``compute_avg_ranks_signature``) and an
    enrichment value (``or_calculation`` or ``likelihood_calculation``).

    Parameters
    ----------
    adata
        Annotated data matrix.
    gene_list
        Signature genes; passed through ``check_signature_genes`` first.
    score_method
        ``'oddsratio'`` or ``'likelihood'``.
    bs
        Approximate number of cells per batch of the rank computation.
    score_name
        Name of the column written to ``adata.obs``.
    random_state
        If not ``None``, used to seed NumPy's global random generator.
    copy
        Work on and return a copy of ``adata``.
    use_raw
        Resolved with scanpy's ``_check_use_raw``; if true, the data in
        ``adata.raw`` is scored.
    verbose
        Print a progress message when greater than zero.
    joblib_kwargs
        Keyword arguments for ``joblib.Parallel``. ``None`` (default) means
        ``{'n_jobs': multiprocessing.cpu_count()}``.

    Returns
    -------
    The scored copy if ``copy`` is true, otherwise ``None``.

    Raises
    ------
    ValueError
        For an unknown ``score_method`` or a sparse format other than
        CSR/CSC.

    Notes
    -----
    For sparse input with ``use_raw`` false, ``adata.X`` is converted between
    CSR and CSC in place; a warning is emitted when this happens.
    """
    start = sc.logging.info(f"computing score {score_name!r}")
    if verbose > 0:
        print(f"computing score {score_name!r}")

    # Reject an invalid method before any data is copied or processed.
    if score_method not in ['oddsratio', 'likelihood']:
        raise ValueError(f"method {score_method} must be one of the obptions ['oddsratio','likelihood']")
    f_score_method = or_calculation if score_method == 'oddsratio' else likelihood_calculation

    # A fresh dict per call instead of a mutable default argument.
    if joblib_kwargs is None:
        joblib_kwargs = {'n_jobs': multiprocessing.cpu_count()}

    adata = adata.copy() if copy else adata

    use_raw = _check_use_raw(adata, use_raw)

    # The sparse branches below assign to `_adata.X`. `adata.raw` does not
    # support assigning `X`, so the raw data is materialised as a separate
    # AnnData object; this also leaves `adata.raw` untouched.
    _adata = adata.raw.to_adata() if use_raw else adata

    if random_state is not None:
        np.random.seed(random_state)

    # remove genes from gene_list not available in the data
    gene_list = check_signature_genes(_adata.var_names, gene_list)

    sparse_X = issparse(_adata.X)

    # The rank component is computed on CSR and the enrichment component on
    # CSC; whichever matches the current format runs first, then the matrix
    # is converted for the other one.
    if not sparse_X:
        avg_sig_ranks = compute_avg_ranks_signature(_adata, sparse_X, gene_list, bs, joblib_kwargs)
        scores = f_score_method(_adata, gene_list)
    elif isspmatrix_csc(_adata.X):
        scores = f_score_method(_adata, gene_list)
        _adata.X = _adata.X.tocsr()
        avg_sig_ranks = compute_avg_ranks_signature(_adata, sparse_X, gene_list, bs, joblib_kwargs)
        warnings.warn('Changed sparse format to CSR for performance reasons')
    elif isspmatrix_csr(_adata.X):
        avg_sig_ranks = compute_avg_ranks_signature(_adata, sparse_X, gene_list, bs, joblib_kwargs)
        _adata.X = _adata.X.tocsc()
        scores = f_score_method(_adata, gene_list)
        warnings.warn('Changed sparse format to CSC for performance reasons')
    else:
        raise ValueError('Unknown sparse matrix format. Allowd are CSR and CSC')

    # Min-max scale both components, then average them.
    avg_sig_ranks = (avg_sig_ranks - avg_sig_ranks.min()) / (avg_sig_ranks.max() - avg_sig_ranks.min())
    scores = (scores - scores.min()) / (scores.max() - scores.min())

    score = (scores + avg_sig_ranks) / 2

    adata.obs[score_name] = score

    sc.logging.info(
        "    finished",
        time=start,
        deep=("added\n" f"    {score_name!r}, score of gene set (adata.obs)."),
    )
    return adata if copy else None


In [None]:
%%time
scores=score_genes(orig_adata, mes_sig, score_method = 'likelihood',
                score_name='mes_sig_scores_lh_refactored')

In [None]:
%%time
scores=score_genes(orig_adata, mes_sig, score_method = 'oddsratio',
                score_name='mes_sig_scores_or_refactored')

In [None]:
# NOTE(review): undocumented back-of-the-envelope figure; the trailing /60/60
# looks like a seconds-to-hours conversion -- confirm what 100, 30 and 20 mean.
100*30*20/60/60

In [None]:
# The refactored and the library likelihood scores must agree. Compare with a
# floating-point tolerance and treat NaN as equal to NaN (element-wise `==`
# reports every NaN as a mismatch).
np.allclose(
    orig_adata.obs['mes_sig_scores_lh_refactored'],
    orig_adata.obs['mes_sig_scores_lh'],
    equal_nan=True,
)

In [None]:
# The refactored and the library odds-ratio scores must agree. Compare with a
# floating-point tolerance and treat NaN as equal to NaN (element-wise `==`
# reports every NaN as a mismatch).
np.allclose(
    orig_adata.obs['mes_sig_scores_or_refactored'],
    orig_adata.obs['mes_sig_scores_or'],
    equal_nan=True,
)