In [13]:
import os
import glob
import argparse
import scanpy as sc
import loompy as lp
import pandas as pd
from pyscenic.cli.utils import load_signatures
from scipy.stats import ttest_1samp
from pyscenic.utils import add_correlation

In [14]:
def calc_p_value(importances):
    _, p_value = ttest_1samp(importances, 0)
    return p_value

def regulon2sadj(
    regulons,
):
    net_lst = []
    for tf in regulons:
        tf_name = tf.name.split("(")[0]
        tf_targets = tf.gene2weight
        for target, weight in tf_targets.items():
            net_lst.append([tf_name, target, weight])
    net = pd.DataFrame(net_lst, columns=["TF", "target", "importance"])
    return net

In [15]:
path_loom = "/cellar/users/aklie/data/datasets/neurips2021_small/analysis/scenic/2024_05_03/rna.loom"
path_scenic = "/cellar/users/aklie/data/datasets/neurips2021_small/analysis/scenic/2024_05_03"

In [17]:
print("Starting consolidation...")
reg_csvs = sorted(glob.glob(os.path.join(path_scenic, "*reg.csv")))
reg_csvs

Starting consolidation...


['/cellar/users/aklie/data/datasets/neurips2021_small/analysis/scenic/2024_05_03/run2_reg.csv',
 '/cellar/users/aklie/data/datasets/neurips2021_small/analysis/scenic/2024_05_03/run4_reg.csv']

In [18]:
print("Reading regulons...")
all_edges = pd.DataFrame()
for reg_csv in reg_csvs:
    regulons = load_signatures(reg_csv)
    adj_df = regulon2sadj(regulons)
    all_edges = pd.concat([all_edges, adj_df])
all_edges.head()
print(f"Total edges: {len(all_edges)}")

Reading regulons...


EmptyDataError: No columns to parse from file

In [None]:
print("Grouping by source and target and filtering edges...")
grouped = all_edges.groupby(['TF', 'target'])
filtered = grouped.filter(lambda x: len(x) > 1)
print(f"{len(all_edges) - len(filtered)} edges dropped")

In [None]:
print("Calculating mean importance for each edge...")
mean_importance = filtered.groupby(['TF', 'target'])['importance'].mean()
print(f"Total unique edges: {len(mean_importance)}")

In [None]:
print("Calculating empirical p-value...")
from tqdm import tqdm
tqdm.pandas()
import numpy as np
TINY = np.finfo(np.float32).tiny
p_values_series = filtered.groupby(['TF', 'target'])['importance'].progress_apply(calc_p_value)
p_values = p_values_series.values + TINY

In [None]:
print("Transforming values...")
neg_log_p = -np.log10(p_values)
normalized_importance = (mean_importance - mean_importance.min()) / (mean_importance.max() - mean_importance.min())

In [None]:
print("Adding correlation...")
adata = sc.read_loom(path_loom, sparse=True)
filtered = add_correlation(filtered, adata.to_df())
mean_corr = filtered.groupby(['TF', 'target'])['rho'].mean()

In [None]:
consolidated = pd.DataFrame({
    'source': mean_importance.index.get_level_values('TF'),
    'target': mean_importance.index.get_level_values('target'),
    'weight_signed': np.nan,
    'weight_unsigned': mean_importance.values,
    'weight_minmax_normalized': normalized_importance.values,
    'p': p_values,
    '-logp': neg_log_p,
    'description': np.nan,
    'corr': mean_corr.values if path_loom is not None else np.nan
}).reset_index(drop=True)

In [None]:
# Remove self-loops
print("Removing self-loops...")
consolidated = consolidated[consolidated["source"] != consolidated["target"]]

In [None]:
# Save
output_path = os.path.join(args.scenic_out_dir, args.out_file)
consolidated.to_csv(output_path, sep="\t", index=False)

# DONE!

---