# TF activity for a new set of TFs

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
import scanpy as sc
import decoupler as dc
import os
from pathlib import Path
import numpy as np
from scipy.stats import pearsonr

In [2]:
# Base directory for this analysis. Later cells use paths relative to it
# (per-cell-type "scplus_outputs/" folders and "other_celltypes/" tables).
root_dir = "/tscc/projects/ps-epigen/users/biy022/biccn/data/SNAREdata/scenicplus/"
# Process-wide side effect: every relative path below resolves against root_dir.
os.chdir(root_dir)

# RC axis

In [3]:
# Cell types summarised for this axis; each contributes one "-gene" and one
# "-peak" column to the result table built in the next cell.
included_celltypes = [
    "L2_3_IT",
    "L4_IT",
    "L5_IT",
    "L6_IT",
    "L6_CT",
    "SST",
]

# TFs to summarise. The list order is used as the row order of the result table.
included_tfs = [
    "JDP2", "KLF9", "KLF12", "BACH2", "RARB",
    "IRF2", "NFE2L1", "NFIC", "LIN28B", "SATB1",
    "ZBTB20", "EGR1", "ZNF385D", "BCL11A", "TCF12",
    "ELF2", "NF1", "NFIA", "SMAD2", "LHX2",
    "NFIB", "SMAD3", "ZNF846", "RFX2", "PKNOX2",
    "TCF4", "RFX3",
]

In [4]:
# Zero-filled summary table: one row per TF, and for every cell type a
# "<celltype>-gene" column followed (after all gene columns) by a
# "<celltype>-peak" column.
result_df = pd.DataFrame(
    0.0,
    index=included_tfs,
    columns=[
        f"{celltype_name}-{target_kind}"
        for target_kind in ("gene", "peak")
        for celltype_name in included_celltypes
    ],
)

In [5]:
# Fill result_df: for each (TF, cell type) pair, store the median of the
# `corr` column over the TF's eRegulon target genes (and, separately, target
# regions) that are present in that cell type's correlation table.
# Pairs with no "+/+" eRegulon rows, or with no target present in the
# correlation table (median is NaN), end up as 0.0.
gcorr_dir = Path("other_celltypes/corr_genes_7e-1_1e-2_expr_filtered/")
rcorr_dir = Path("other_celltypes/corr_regions_5e-1_5e-2_expr_filtered/")

for celltype in included_celltypes:
    gene_col = "{}-gene".format(celltype)
    peak_col = "{}-peak".format(celltype)

    # Direct and extended eRegulon tables are pooled; duplicates are harmless
    # because targets are de-duplicated with .unique() below.
    direct_table = pd.read_csv(Path(celltype) / "scplus_outputs/eRegulon_direct.tsv", sep="\t", header=0)
    extend_table = pd.read_csv(Path(celltype) / "scplus_outputs/eRegulons_extended.tsv", sep="\t", header=0)
    regulons_table = pd.concat([direct_table, extend_table], axis=0, ignore_index=True)
    # Keep only eRegulons whose name ends with "+/+".
    regulons_table = regulons_table[regulons_table["eRegulon_name"].str.endswith("+/+")].copy()
    # Replace only the first ":" with "-" so the IDs can be matched against
    # the `region` column of the correlation table. Done once per cell type
    # instead of row-by-row for every TF.
    regulons_table["Region"] = regulons_table["Region"].str.replace(":", "-", n=1, regex=False)

    # These tables depend only on the cell type, so read them once here
    # rather than once per TF inside the inner loop.
    celltype_gcorr_table = pd.read_csv(
        gcorr_dir / "{}_RC_table.tsv".format(celltype),
        header=0, sep="\t"
    )
    celltype_rcorr_table = pd.read_csv(
        rcorr_dir / "{}_RC_table.tsv".format(celltype),
        header=0, sep="\t"
    )

    for tf in included_tfs:
        tf_table = regulons_table[regulons_table["TF"] == tf]
        if tf_table.empty:
            # Explicit reset keeps the cell idempotent when re-run.
            result_df.loc[tf, gene_col] = 0.0
            result_df.loc[tf, peak_col] = 0.0
            continue

        # Targets of this TF that also appear in the correlation tables.
        tf_target_genes = tf_table.loc[
            tf_table["Gene"].isin(celltype_gcorr_table["gene"]), "Gene"
        ].unique().tolist()
        tf_target_peaks = tf_table.loc[
            tf_table["Region"].isin(celltype_rcorr_table["region"]), "Region"
        ].unique().tolist()

        result_df.loc[tf, gene_col] = celltype_gcorr_table.loc[
            celltype_gcorr_table["gene"].isin(tf_target_genes), "corr"
        ].median()
        result_df.loc[tf, peak_col] = celltype_rcorr_table.loc[
            celltype_rcorr_table["region"].isin(tf_target_peaks), "corr"
        ].median()

# A median over zero matching targets is NaN; report those as 0.0 as well.
result_df = result_df.fillna(0.0)

In [6]:
# Write the summary table as TSV, keeping the TF index and the column header.
rc_summary_path = "/tscc/projects/ps-epigen/users/biy022/biccn/analysis/final_files/selected_tfs_20250421.tsv"
result_df.to_csv(rc_summary_path, sep="\t", index=True, header=True)

# ST axis

In [7]:
# Cell types summarised for this axis; each contributes one "-gene" and one
# "-peak" column to the result table built in the next cell.
included_celltypes = [
    "L2_3_IT",
    "L4_IT",
    "L5_IT",
    "L6_IT",
    "L6_CT",
    "SST",
    "PVALB",
]

# TFs to summarise. The list order is used as the row order of the result table.
included_tfs = [
    "JDP2", "RORA", "NFE2L1", "NFIC",
    "ZNF385D", "KLF9", "IRF2", "RARB",
    "FOSL2", "SMARCC1", "KLF12", "ETS1",
    "BACH2", "NFIA", "LIN28B", "SATB1",
    "EGR1", "ELF2", "TCF12", "NFIX",
    "KLF13", "NF1", "SMAD3", "NFIB",
    "LHX2", "ZNF846", "SMAD9", "KLF8",
    "TCF4", "RFX7", "RFX3", "RFX2",
]

In [8]:
# Zero-filled summary table: one row per TF, and for every cell type a
# "<celltype>-gene" column followed (after all gene columns) by a
# "<celltype>-peak" column.
result_df = pd.DataFrame(
    0.0,
    index=included_tfs,
    columns=[
        f"{celltype_name}-{target_kind}"
        for target_kind in ("gene", "peak")
        for celltype_name in included_celltypes
    ],
)

In [9]:
# Fill result_df: for each (TF, cell type) pair, store the median of the
# `corr` column over the TF's eRegulon target genes (and, separately, target
# regions) that are present in that cell type's correlation table.
# Pairs with no "+/+" eRegulon rows, or with no target present in the
# correlation table (median is NaN), end up as 0.0.
gcorr_dir = Path("other_celltypes/corr_genes_ST_7e-1_1e-2_expr_filtered/")
rcorr_dir = Path("other_celltypes/corr_regions_ST_5e-1_5e-2_expr_filtered/")

for celltype in included_celltypes:
    gene_col = "{}-gene".format(celltype)
    peak_col = "{}-peak".format(celltype)

    # Direct and extended eRegulon tables are pooled; duplicates are harmless
    # because targets are de-duplicated with .unique() below.
    direct_table = pd.read_csv(Path(celltype) / "scplus_outputs/eRegulon_direct.tsv", sep="\t", header=0)
    extend_table = pd.read_csv(Path(celltype) / "scplus_outputs/eRegulons_extended.tsv", sep="\t", header=0)
    regulons_table = pd.concat([direct_table, extend_table], axis=0, ignore_index=True)
    # Keep only eRegulons whose name ends with "+/+".
    regulons_table = regulons_table[regulons_table["eRegulon_name"].str.endswith("+/+")].copy()
    # Replace only the first ":" with "-" so the IDs can be matched against
    # the `region` column of the correlation table. Done once per cell type
    # instead of row-by-row for every TF.
    regulons_table["Region"] = regulons_table["Region"].str.replace(":", "-", n=1, regex=False)

    # These tables depend only on the cell type, so read them once here
    # rather than once per TF inside the inner loop.
    # NOTE(review): the gene table is read as "<celltype>_RC_table.tsv" from
    # the ST directory, while the region table uses "<celltype>_ST_table.tsv".
    # Confirm the gene file name is intentional and not a leftover from the
    # RC-axis cell; the path is kept unchanged here.
    celltype_gcorr_table = pd.read_csv(
        gcorr_dir / "{}_RC_table.tsv".format(celltype),
        header=0, sep="\t"
    )
    celltype_rcorr_table = pd.read_csv(
        rcorr_dir / "{}_ST_table.tsv".format(celltype),
        header=0, sep="\t"
    )

    for tf in included_tfs:
        tf_table = regulons_table[regulons_table["TF"] == tf]
        if tf_table.empty:
            # Explicit reset keeps the cell idempotent when re-run.
            result_df.loc[tf, gene_col] = 0.0
            result_df.loc[tf, peak_col] = 0.0
            continue

        # Targets of this TF that also appear in the correlation tables.
        tf_target_genes = tf_table.loc[
            tf_table["Gene"].isin(celltype_gcorr_table["gene"]), "Gene"
        ].unique().tolist()
        tf_target_peaks = tf_table.loc[
            tf_table["Region"].isin(celltype_rcorr_table["region"]), "Region"
        ].unique().tolist()

        result_df.loc[tf, gene_col] = celltype_gcorr_table.loc[
            celltype_gcorr_table["gene"].isin(tf_target_genes), "corr"
        ].median()
        result_df.loc[tf, peak_col] = celltype_rcorr_table.loc[
            celltype_rcorr_table["region"].isin(tf_target_peaks), "corr"
        ].median()

# A median over zero matching targets is NaN; report those as 0.0 as well.
result_df = result_df.fillna(0.0)

In [10]:
# Write the summary table as TSV, keeping the TF index and the column header.
st_summary_path = "/tscc/projects/ps-epigen/users/biy022/biccn/analysis/final_files/selected_tfs_ST_20250421.tsv"
result_df.to_csv(st_summary_path, sep="\t", index=True, header=True)