In [1]:
import os
import sys
import pandas as pd
import numpy as np
import glob
import time
import gget
import scipy
import matplotlib.patches as patches
from scipy.sparse import csr_matrix
import anndata as an
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy.stats import fisher_exact
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.patches as mpatches
import matplotlib.cm as cm
from pycirclize import Circos
from scipy.interpolate import splprep, splev
import networkx as nx
import random
from importlib import reload
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import warnings
from itertools import combinations
import ot
from scipy.spatial.distance import pdist, squareform
from matplotlib.colors import ListedColormap

import surprise as sup

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler

"""WARNING: no warnings"""
# Suppress every warning for the rest of the session: no category or module
# filter is given, so this also hides deprecation and numerical warnings
# raised by the libraries imported above.
warnings.filterwarnings("ignore")

# local imports
# NOTE(review): anndata_utils is imported before "../source/" is appended to
# sys.path below, so it must already be importable from elsewhere on the
# path -- confirm where it lives.
import anndata_utils as anntools

# "../source/" is resolved against the current working directory, so these
# imports depend on where the kernel was started.
source_path = os.path.abspath("../source/")
sys.path.append(source_path)
# Presumably provided by the directory appended above -- TODO confirm.
import centrality as central
import matrix
import utils as ut
# Aliased to plt2 so it does not shadow matplotlib.pyplot, imported as plt.
import plotting as plt2

# Load pathways

In [2]:
def load_pathway(fpath):
    """
    Loads an Enrichr-like database file into a boolean DataFrame.

    Every non-blank line is split on tabs and empty fields are discarded.
    The first remaining field is the pathway label; the rest are the genes
    that belong to that pathway. Blank (or whitespace-only) lines are skipped.

    Args:
        fpath (str): Path to the Enrichr-like database file.

    Returns:
        pandas.DataFrame: A boolean DataFrame where:
            - Index: Genes (in order of first appearance in the file)
            - Columns: Pathways (columns axis is named 'label')
            - Values: True if the gene is in the pathway, False otherwise.
        An empty DataFrame is returned when the file has no usable lines.
    """

    labels = []
    gene_rows = []
    with open(fpath, encoding='utf-8') as f:
        for line in f:
            fields = [x for x in line.strip().split('\t') if x]
            if not fields:
                # Blank line: there is no label to read, so skip it instead
                # of failing on fields[0].
                continue

            # Labels are kept apart from the gene membership so that a gene
            # literally called 'label' cannot overwrite the pathway name.
            labels.append(fields[0])
            gene_rows.append(dict.fromkeys(fields[1:], 1))

    if not labels:
        return pd.DataFrame(columns=pd.Index([], name='label'), dtype=bool)

    df = pd.DataFrame(gene_rows, index=pd.Index(labels, name='label'))

    # Genes missing from a pathway come out as NaN; turn them into False and
    # transpose so genes are rows and pathways are columns.
    return df.fillna(0.0).astype(bool).T

fpath = "../../ONT-single-cell/resources/PanglaoDB_Augmented_2021.txt"
pdf = load_pathway(fpath)

# Genes flagged as members of the 'Embryonic Stem Cells' set.
stem_genes = list(pdf[pdf['Embryonic Stem Cells']].index)

# Re-case the symbols as "first letter upper, rest lower". str.capitalize()
# is used rather than str.title(): title() upper-cases every letter that
# follows a non-letter, so 'POU5F1' would become 'Pou5F1' instead of 'Pou5f1'
# and would never match the gene_name values it is compared against later.
stem_genes = [x.capitalize() for x in stem_genes]
stem_genes[:10]

['Gjb1',
 'Amotl2',
 'Yap1',
 'Fbln1',
 'Uaca',
 'Antxr1',
 'Fermt2',
 'Serpinh1',
 'Pls3',
 'Gpx8']

In [3]:
""" LOAD the TF list """
fpath = "/nfs/turbo/umms-indikar/shared/projects/twin_cell/data/b_matrix/SCENIC/scenic/tf_lists/allTFs_mm.txt"

# One entry per line. The file is opened in a `with` block so the handle is
# closed as soon as the list is built, and blank lines are dropped so that an
# empty string never ends up in the list.
with open(fpath) as tf_handle:
    tf_list = [line.strip() for line in tf_handle if line.strip()]

# Load expression

In [4]:
# Expression table, one row per gene (see the sampled rows displayed below).
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/expression_table/rna_table.parquet"
tdf = pd.read_parquet(fpath)

# NOTE(review): both messages report the same, unmodified frame -- nothing is
# filtered between them. Confirm whether a filtering step was dropped.
print(f"(raw) {tdf.shape=}")
print(f"(filtered) {tdf.shape=}")

# gene_name -> TPM lookup. When a gene_name occurs more than once, the value
# from the last occurrence is the one that is kept.
expression_map = {
    gene: tpm
    for gene, tpm in zip(tdf['gene_name'].values, tdf['TPM'].values)
}

tdf.sample(5)

(raw) tdf.shape=(51883, 19)
(filtered) tdf.shape=(51883, 19)


Unnamed: 0,gene_id,transcript_id(s),length,effective_length,expected_count,TPM,FPKM,posterior_mean_count,posterior_standard_deviation_of_count,pme_TPM,pme_FPKM,TPM_ci_lower_bound,TPM_ci_upper_bound,TPM_coefficient_of_quartile_variation,FPKM_ci_lower_bound,FPKM_ci_upper_bound,FPKM_coefficient_of_quartile_variation,ens_gene_id,gene_name
30929,ENSMUSG00000093416.1,ENSMUST00000176980.1,1748.0,1496.24,0.0,0.0,0.0,0.0,0.0,0.05,0.04,8.85816e-07,0.141585,0.657567,6.92389e-07,0.110727,0.657528,ENSMUSG00000093416,Gm18294
5906,ENSMUSG00000027075.16,"ENSMUST00000028469.13,ENSMUST00000111624.7,ENS...",2298.15,2046.4,457.0,16.54,12.33,457.0,0.0,16.12,12.61,13.9561,18.4984,0.048884,10.9021,14.4574,0.048862,ENSMUSG00000027075,Slc43a1
24952,ENSMUSG00000083394.2,ENSMUST00000118352.2,483.0,231.37,1.01,0.32,0.24,1.02,0.16,0.61,0.48,0.012067,1.46053,0.476422,0.00885218,1.14189,0.476211,ENSMUSG00000083394,Gm11703
31109,ENSMUSG00000093711.1,ENSMUST00000176234.1,931.0,679.24,0.0,0.0,0.0,0.0,0.0,0.1,0.08,2.39322e-07,0.312362,0.653377,1.87473e-07,0.244212,0.653296,ENSMUSG00000093711,Vmn2r-ps125
12328,ENSMUSG00000041718.15,"ENSMUST00000040338.8,ENSMUST00000070801.10,ENS...",2937.8,2686.05,4468.0,123.19,91.87,4468.0,0.0,119.4,93.35,111.889,126.778,0.021625,87.6009,99.2267,0.021644,ENSMUSG00000041718,Alg13


# Load core scores

In [5]:
# Parameters for this section: bin size used in the file name, the score
# column to threshold, and the quantile that defines a "core" bin.
resolution = 1_000_000
score_column = 'global_hge_logexp_RNA_weighted'
core_threshold_quantile = 0.75

fpath = f"/scratch/indikar_root/indikar1/shared_data/higher_order/global_core_score/population_mESC_{resolution}_scores.csv"
scores = pd.read_csv(fpath)
print(f"{scores.shape=}")

# The cut-off is computed with missing scores counted as 0.0 ...
threshold = np.quantile(
    a=scores[score_column].fillna(0.0),
    q=core_threshold_quantile,
)
# ... while the flag compares the unfilled column, so a missing score is
# never marked as core (NaN compares False).
scores['is_core'] = scores[score_column].gt(threshold)

scores.head()

scores.shape=(2431, 36)


Unnamed: 0,bin_name,bin_index,bin_start,bin_end,bin,chrom,chrom_bin,degree,genes,n_genes,...,ce_pagerank,hge_singular_vector_1,hge_logexp_unweighted,hge_logexp_degree_weighted,hge_logexp_RNA_weighted,hge_logexp_ATAC_weighted,global_singular_vector_1,global_hge_logexp_unweighted,global_hge_logexp_RNA_weighted,is_core
0,chr1:3,2235,3000000,4000000,3,1,3,2953,Gm37363;Gm37381;Gm7341;Gm38148;Gm37180;Gm19938...,15,...,0.117995,0.0,0.148099,0.0,0.17437,0.22238,0.330855,0.501567,0.167764,False
1,chr1:4,964,4000000,5000000,4,1,4,3331,Gm7369;A930006A01Rik;Gm37381;Gm38076;Mrpl15;Gm...,22,...,0.372869,0.103845,0.239187,0.281148,0.46827,0.560302,0.412235,0.609261,0.369136,True
2,chr1:5,1823,5000000,6000000,5,1,5,3341,Atp6v1h;Gm17101;Gm16041;Rgs20;Gm36965;Gm38264;...,12,...,0.224249,0.057865,0.647027,0.201096,0.243125,0.405148,0.425601,0.528269,0.216383,False
3,chr1:6,1288,6000000,7000000,6,1,6,3422,St18;Gm7449;Gm19214;Gm37108;Gm2147;Gm19026;473...,10,...,0.53494,0.172229,0.760979,0.401754,0.250762,0.521777,0.448264,0.492633,0.258762,False
4,chr1:7,144,7000000,8000000,7,1,7,3289,Gm23274;Gm5694;Rps2-ps2;Gm37225;Gm37489;Gm2690...,18,...,0.0996,0.053319,0.0,0.160235,0.190846,0.204818,0.414267,0.510526,0.204855,False


# single-cell transcription factories

In [6]:
""" LOAD the genes """
fpath = f"/scratch/indikar_root/indikar1/shared_data/higher_order/lightweight/singlecell_mESC_{resolution}_gdf.parquet"
df = pd.read_parquet(fpath)

# Flag rows whose gene is in the TF list / the stem-cell marker list.
df['is_tf'] = df['gene_name'].isin(tf_list)
df['is_marker'] = df['gene_name'].isin(stem_genes)


# MERGE the core scores
merge_columns = [
    'bin_name',
    'ATACSeq_1',
    'CTCF',
    'H3K27ac',
    'H3K27me3',
    'RNA_5',
    'global_hge_logexp_RNA_weighted',
]

# A repeated bin_name in `scores` would silently multiply the matching rows
# of `df` in the left merge below, so fail loudly instead.
assert scores['bin_name'].is_unique, (
    "scores contains duplicate bin_name values; the merge would duplicate rows"
)

# The join key is given explicitly: without `on`, pandas joins on every
# column name the two frames share, which changes silently if either frame
# gains a column. Missing values already present in `scores` are zero-filled;
# rows of `df` whose bin_name has no match in `scores` still get NaN.
df = pd.merge(
    df,
    scores[merge_columns].fillna(0.0),
    how="left",
    on="bin_name",
)

# MERGE expression information
# Genes that are not keys of expression_map get NaN.
df['expression'] = df['gene_name'].map(expression_map)

print(f"{df.shape=}")
df.head()

df.shape=(59016150, 13)


Unnamed: 0,gene_name,gene_biotype,read_name,bin_name,is_tf,is_marker,ATACSeq_1,CTCF,H3K27ac,H3K27me3,RNA_5,global_hge_logexp_RNA_weighted,expression
0,Jazf1,protein_coding,07e775f2-ee00-4bd0-a2e4-601ded549c44,chr6:52,True,False,0.716487,0.916573,0.903254,2.147158,0.352027,0.222275,0.08
1,Nmur1,protein_coding,19a335b5-31d1-44a2-ae3e-73433ef3a519,chr1:86,False,False,0.803481,1.4746,2.044946,0.44264,4.862435,0.727237,0.08
2,Nmur1,protein_coding,e880790d-0a46-4fdf-b4d8-1c944a8c79bf,chr1:86,False,False,0.803481,1.4746,2.044946,0.44264,4.862435,0.727237,0.08
3,Nmur1,protein_coding,0293c95c-85d4-4db9-b18a-840971c78330,chr1:86,False,False,0.803481,1.4746,2.044946,0.44264,4.862435,0.727237,0.08
4,Nmur1,protein_coding,9610ddea-ac3e-4b49-a118-97485a0675d8,chr1:86,False,False,0.803481,1.4746,2.044946,0.44264,4.862435,0.727237,0.08
