In [None]:
import os
import sys
import scanpy as sc
import random
from signaturescoring import score_signature
from signaturescoring.utils.utils import get_mean_and_variance_gene_expression
from sklearn.metrics import roc_auc_score
import pandas as pd

sys.path.append('../..')
from data.constants import BASE_PATH_DATA

# Location of the `pp_luad.h5ad` file inside the `preprocessed` folder below BASE_PATH_DATA.
fn = os.path.join(BASE_PATH_DATA, 'preprocessed/pp_luad.h5ad')

In [None]:
# Load the h5ad file at `fn` with scanpy; `adata` is the dataset scored in the cells below.
adata = sc.read_h5ad(fn)

In [None]:
# Start from every variable name in the dataset (presumably gene symbols -- confirm).
gene_list = adata.var_names

In [None]:
# Draw 200 names at random from the dataset's variable names.
# The sample is taken from `adata.var_names` rather than from `gene_list`
# itself, so re-running this cell does not break on `.tolist()` once
# `gene_list` has already been replaced by a plain Python list.
# Seeding the global `random` generator makes this draw, and the later
# `random.sample` calls in this notebook, repeatable on Restart & Run All.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)
gene_list = random.sample(adata.var_names.tolist(), 200)

In [None]:
# Show the current `gene_list` for inspection.
gene_list

In [None]:
# Longest signature evaluated by the experiment loops below.
max_sig_length = 15
# Extra keyword arguments forwarded to `score_signature`; `score_name` is also
# the `adata.obs` column the scores are read back from.
scm_params = dict(ctrl_size=100, score_name="ANS")

In [None]:
# Growing-prefix experiment: score the first k entries of `gene_list` for
# k = 1..max_sig_length and record one value per sample and signature length.
auc_records = []
for curr_sig_len in range(1, max_sig_length + 1):
    curr_gene_list = gene_list[:curr_sig_len]

    score_signature(
        method='adjusted_neighborhood_scoring',
        adata=adata,
        gene_list=curr_gene_list,
        **scm_params
    )
    # The scores are read back from the obs column named by `score_name`.
    curr_scores = adata.obs[scm_params['score_name']].copy()
    # One record per sample: (signature length, sample id, 1 - ROC AUC of the
    # scores against `malignant_key`).
    # NOTE(review): the `1 -` flip presumably compensates for the label
    # ordering of `malignant_key` -- confirm.
    auc_records.extend(
        (
            len(curr_gene_list),
            sid,
            1 - roc_auc_score(data.malignant_key, curr_scores[data.index]),
        )
        for sid, data in adata.obs.groupby(by='sample_id')
    )

# Wide table: one row per signature length, one column per sample.
results = pd.pivot(
    pd.DataFrame(auc_records, columns=['signature_length', 'sample_id', 'AUCROC']),
    columns='sample_id',
    index='signature_length',
)

In [None]:
# Show the pivoted table: rows are signature lengths, columns are samples.
results

In [None]:
# Number of random signatures drawn per signature length in the next experiment.
nr_sims = 10

In [None]:
# Random-signature experiment: for every signature length draw `nr_sims`
# random subsets of `gene_list`, score each one, and average the per-sample
# values over the draws.
results = []
for curr_sig_len in range(1, max_sig_length + 1):
    sim_results = []
    for i in range(nr_sims):
        curr_gene_list = random.sample(gene_list, curr_sig_len)

        score_signature(
            method='adjusted_neighborhood_scoring',
            adata=adata,
            gene_list=curr_gene_list,
            **scm_params
        )
        # The scores are read back from the obs column named by `score_name`.
        curr_scores = adata.obs[scm_params['score_name']].copy()
        # One record per sample: (draw number, signature length, sample id,
        # 1 - ROC AUC of the scores against `malignant_key`).
        # NOTE(review): the `1 -` flip presumably compensates for the label
        # ordering of `malignant_key` -- confirm.
        aucs = [
            (
                i,
                len(curr_gene_list),
                sid,
                1 - roc_auc_score(data.malignant_key, curr_scores[data.index]),
            )
            for sid, data in adata.obs.groupby(by='sample_id')
        ]

        # Column label spelled correctly ('simulation_nr', previously
        # 'simuation_nr'); the column is dropped by the aggregation below.
        sim_results.append(
            pd.DataFrame(aucs, columns=['simulation_nr', 'signature_length', 'sample_id', 'AUCROC'])
        )
    sim_results = pd.concat(sim_results, axis=0)
    # Collapse the draws: mean value per (signature length, sample).
    results.append(sim_results.groupby(['signature_length', 'sample_id'])['AUCROC'].mean().reset_index())
results = pd.concat(results, axis=0)
# Wide table: one row per signature length, one column per sample.
results = pd.pivot(results, columns='sample_id', index='signature_length')

In [None]:
# Show the pivoted table of simulation-averaged values: rows are signature lengths, columns are samples.
results

In [None]:
def score_genes_and_evaluate(adata, gene_list, df_mean_var, sc_method, scm_params, col_sid='sample_id'):
    """Score one signature and evaluate it separately for every sample.

    Calls ``score_signature`` with the given method, signature and
    ``df_mean_var``, forwarding ``scm_params`` as extra keyword arguments.
    The scores are then read from ``adata.obs[scm_params['score_name']]``.

    Args:
        adata: Dataset object; its ``obs`` table must contain ``col_sid`` and
            ``malignant_key``.
        gene_list: Signature to score; its length is reported in the output.
        df_mean_var: Passed through unchanged to ``score_signature``.
        sc_method: Scoring method name handed to ``score_signature``.
        scm_params: Keyword arguments for ``score_signature``; must contain
            ``'score_name'``.
        col_sid: ``obs`` column used to group cells into samples.

    Returns:
        DataFrame with one row per group and the columns
        ``signature_length``, ``sample_id`` and ``AUCROC``, where ``AUCROC``
        holds ``1 - roc_auc_score(malignant_key, score)`` for that group.
    """
    score_signature(
        method=sc_method,
        adata=adata,
        gene_list=gene_list,
        df_mean_var=df_mean_var,
        **scm_params
    )
    scores = adata.obs[scm_params['score_name']].copy()
    # NOTE(review): the `1 -` flip presumably compensates for the label
    # ordering of `malignant_key` -- confirm.
    rows = [
        (
            len(gene_list),
            sample,
            1 - roc_auc_score(cells.malignant_key, scores[cells.index]),
        )
        for sample, cells in adata.obs.groupby(by=col_sid)
    ]
    return pd.DataFrame(rows, columns=['signature_length', 'sample_id', 'AUCROC'])


def run_experiment_decreasing_dgex(adata, gene_list, df_mean_var, sc_method, scm_params):
    """Evaluate every leading prefix of ``gene_list`` as a signature.

    For k = 1..len(gene_list) the first k entries of ``gene_list`` are passed
    to ``score_genes_and_evaluate`` and the per-sample results are collected.

    Args:
        adata: Dataset object forwarded to ``score_genes_and_evaluate``.
        gene_list: Ordered, sliceable collection of genes.
        df_mean_var: Forwarded unchanged to ``score_genes_and_evaluate``.
        sc_method: Scoring method name, forwarded unchanged.
        scm_params: Scoring keyword arguments, forwarded unchanged.

    Returns:
        DataFrame pivoted so that rows are ``signature_length`` and columns
        are ``sample_id``.
    """
    per_length = [
        score_genes_and_evaluate(
            adata,
            gene_list[0:stop],
            df_mean_var,
            sc_method,
            scm_params
        )
        for stop in range(1, len(gene_list) + 1)
    ]
    stacked = pd.concat(per_length, axis=0)
    return pd.pivot(stacked, columns='sample_id', index='signature_length')


def run_experiment_random_dgex(adata, gene_list, df_mean_var, sc_method, scm_params, max_sig_length, nr_sims):
    """Evaluate randomly drawn signatures of increasing length.

    For every length k = 1..max_sig_length, ``nr_sims`` random subsets of size
    k are drawn from ``gene_list`` and passed to ``score_genes_and_evaluate``.
    The per-sample ``AUCROC`` values are averaged over the draws.

    Args:
        adata: Dataset object forwarded to ``score_genes_and_evaluate``.
        gene_list: Pool of genes to sample from; any iterable is accepted
            because it is copied into a list before sampling.
        df_mean_var: Forwarded unchanged to ``score_genes_and_evaluate``.
        sc_method: Scoring method name, forwarded unchanged.
        scm_params: Scoring keyword arguments, forwarded unchanged.
        max_sig_length: Largest signature length to evaluate (>= 1).
        nr_sims: Number of random draws per signature length (>= 1).

    Returns:
        DataFrame pivoted so that rows are ``signature_length`` and columns
        are ``sample_id``, holding the mean ``AUCROC`` over the draws.

    Raises:
        ValueError: If ``max_sig_length`` or ``nr_sims`` is smaller than 1, or
            (from ``random.sample``) if a requested length exceeds the number
            of available genes.
    """
    if max_sig_length < 1 or nr_sims < 1:
        raise ValueError('max_sig_length and nr_sims must both be at least 1')

    # random.sample needs a sequence; copying once also accepts e.g. a pandas Index.
    gene_pool = list(gene_list)

    results = []
    for curr_sig_len in range(1, max_sig_length + 1):
        curr_res = []
        for _ in range(nr_sims):
            curr_gene_list = random.sample(gene_pool, curr_sig_len)
            curr_res.append(score_genes_and_evaluate(
                adata,
                curr_gene_list,
                df_mean_var,
                sc_method,
                scm_params
            ))
        # Stack this length's draws (not the outer `results` list, which made
        # the previous version fail on an empty concat) and average them.
        curr_res_df = pd.concat(curr_res, axis=0)
        results.append(
            curr_res_df.groupby(['signature_length', 'sample_id'])['AUCROC'].mean().reset_index()
        )
    results = pd.concat(results, axis=0)
    # Same output layout as run_experiment_decreasing_dgex.
    results = pd.pivot(results, columns='sample_id', index='signature_length')
    return results