In [2]:
import sys
import os
import pandas as pd
import numpy as np
import scipy
from scipy import stats
from importlib import reload
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import cooler
import time
from sklearn.decomposition import PCA
import glob
import pyBigWig

import warnings
warnings.simplefilter("ignore", category=RuntimeWarning)

# local imports
source_path = os.path.abspath("../../source/")
sys.path.insert(0, source_path)
print(source_path)
import utils as ut
import plotting as plt2
import hypercore as hc
import matrix as matrix
import centrality as central


# hyperlocal imports
import core_utils
import nb_utils as nb
import homocore as homoc

/home/machoi/higher-order-single-cell/source
/home/machoi/higher-order-single-cell/source


# Get chromsizes

In [3]:
# Re-import core_utils so edits made to the module since kernel start are picked up.
reload(core_utils)
# Load the chromosome-size table; `chrom_starts` is passed to the pore-C loaders below.
fpath = "/scratch/indikar_root/indikar1/shared_data/population/references/GRCm39.chrom.sizes"
chroms, chrom_starts = core_utils.load_chrom_sizes(fpath)
chroms.head()

Unnamed: 0,chrom,size,bp_start
0,1,195154279,0
1,2,181755017,195154279
2,3,159745316,376909296
3,4,156860686,536654612
4,5,151758149,693515298


# Load population pore-c

In [4]:
# Reload the helper module so local edits take effect without a kernel restart.
reload(core_utils)

# Population pore-C alignment tables.
dpath = "/scratch/indikar_root/indikar1/shared_data/population/align_table/"
file_list = glob.glob(f"{dpath}*")

# Analysis parameters reused by the rest of the notebook.
resolution = 1e6
chromosomes = ['2']

df = core_utils.load_pore_c(file_list, chrom_starts, resolution=resolution, chroms=chromosomes)
print(f"{df.shape=}")

# Constant column used as the pivot value when the incidence matrix is built.
df = df.assign(ones=1)
df.head()

batch04 (177353, 8)
batch02 (18558, 8)
batch03 (92905, 8)
batch01 (162436, 8)
df.shape=(451252, 8)


Unnamed: 0,read_name,align_id,order,chrom,local_position,global_bin,local_bin,basename,ones
264,0003227f-dfb9-4d87-85f9-db8f860f47dd,8036161,2,2,98497512.0,294,99,batch04,1
272,0003227f-dfb9-4d87-85f9-db8f860f47dd,8036169,2,2,34287050.0,230,35,batch04,1
283,000333f9-5e77-4813-912d-faf4123b1631,1539627,2,2,128099396.0,324,129,batch04,1
286,000333f9-5e77-4813-912d-faf4123b1631,1539630,2,2,134768640.0,330,135,batch04,1
323,000378e6-d6eb-498c-b717-aedcaa5c3257,817274,4,2,53060104.0,249,54,batch04,1


In [5]:
# Pivot the alignment table into an incidence matrix. As the printed shapes
# show, the result has one row per `local_bin` and one column per read.
H = nb.incidence_by_pivot(
    df, 
    index='read_name',
    columns='local_bin',
    values='ones',
)
print(f"Total: {H.shape=}")
# Transpose, drop duplicate rows, transpose back: this keeps one column per
# distinct set of bins (reads with identical incidence vectors collapse).
H = H.T.drop_duplicates().T
print(f"Unique: {H.shape=}")
H.head()

Total: H.shape=(179, 186255)
Unique: H.shape=(179, 53888)


read_name,00005e8d-30b2-4c05-b193-5f954681d44e,0000803f-34e7-43c4-bcd9-beea8925c4c0,00012f2d-c107-4d01-b41b-68d537917d7c,00023d22-f6bc-4012-b148-c983a910ddf9,0002a895-1877-4f69-8ded-c15568d64e8a,0002e105-2bde-491a-8d43-53d8d91ca99f,00031758-8c53-4ec6-8219-b9dfaa01bf67,00031c17-503f-4c17-a459-1a88f59278a8,0003227f-dfb9-4d87-85f9-db8f860f47dd,000333f9-5e77-4813-912d-faf4123b1631,...,ffef4414-ca17-41fa-a10c-ca0e9713e58d,fff1209b-7169-4d92-9b54-51370a1d2c2b,fff40cb0-1890-4f36-a1ac-63a05eb92fe8,fff8dc51-afc0-445d-bfe4-c692410915c2,fff97148-f68f-4469-a728-0abd65f751bb,fffbb892-66d2-4ee6-bdeb-f7d854d75b35,fffc0ee8-4407-4a43-b527-b3237ba2e9d8,fffc84e9-9d73-470e-85c4-679049a68baf,fffed480-36e9-42b6-b3c6-27dc516e6a34,fffef91a-27b8-466a-a913-61292d2d5efe
local_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Display the de-duplicated incidence matrix (pandas truncates the rendering).
H

read_name,00005e8d-30b2-4c05-b193-5f954681d44e,0000803f-34e7-43c4-bcd9-beea8925c4c0,00012f2d-c107-4d01-b41b-68d537917d7c,00023d22-f6bc-4012-b148-c983a910ddf9,0002a895-1877-4f69-8ded-c15568d64e8a,0002e105-2bde-491a-8d43-53d8d91ca99f,00031758-8c53-4ec6-8219-b9dfaa01bf67,00031c17-503f-4c17-a459-1a88f59278a8,0003227f-dfb9-4d87-85f9-db8f860f47dd,000333f9-5e77-4813-912d-faf4123b1631,...,ffef4414-ca17-41fa-a10c-ca0e9713e58d,fff1209b-7169-4d92-9b54-51370a1d2c2b,fff40cb0-1890-4f36-a1ac-63a05eb92fe8,fff8dc51-afc0-445d-bfe4-c692410915c2,fff97148-f68f-4469-a728-0abd65f751bb,fffbb892-66d2-4ee6-bdeb-f7d854d75b35,fffc0ee8-4407-4a43-b527-b3237ba2e9d8,fffc84e9-9d73-470e-85c4-679049a68baf,fffed480-36e9-42b6-b3c6-27dc516e6a34,fffef91a-27b8-466a-a913-61292d2d5efe
local_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Draw a fixed-seed subsample of 20 hyperedges (columns) so that the timing and
# core-finding run below is repeatable across kernel restarts.
h_sample = H.sample(20, axis=1, random_state=0)
h_sample.to_numpy()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
# Time the homocore pipeline on the sampled hyperedges.
reload(homoc)
t = time.time()
# Step 1: convert the sampled incidence array into homocore's hyperedge structure.
G = homoc.incidence_matrix_to_hyperedges(h_sample.to_numpy())
print('Done 1')
# Step 2: centrality table for the sample; the empty list means no outlier rows are dropped.
df_centrality = homoc.get_core(h_sample, [])
print('Done 2')
# Step 3: hand the per-node centralities to homocore together with the hyperedges.
# NOTE(review): the dict is keyed by the DataFrame's positional index, not by
# `local_bin` - confirm that this is what edges_by_centrality expects.
centrality_dict = df_centrality['node_centrality'].to_dict()
G2 = homoc.edges_by_centrality(G, centrality_dict)
print('Done 3')
core_nodes_homo = homoc.betterSolve(G2)
elapsed = time.time()-t
print('time it took to run program ', elapsed)

/home/machoi/higher-order-single-cell/source
Done 1
Done 2
Done 3
(41, 44, 73, 95, 155)
(91, 141)
Possible mappings to check 42
found the map: {91: 73, 141: 41}
(56, 91)
Possible mappings to check 6
weird tuple  (73, 73)
weird tuple  (73, 73)
These are the mappings that are causing an error {56: 73, 73: 73}
Possible mappings to check 5
found the map: {56: 41}
(60, 95, 156)
Possible mappings to check 42
found the map: {60: 73, 156: 41}
(73, 176)
Possible mappings to check 6
weird tuple  (73, 73)
weird tuple  (73, 73)
These are the mappings that are causing an error {73: 73, 176: 73}
Possible mappings to check 5
found the map: {176: 41}
(48, 161)
Possible mappings to check 42
found the map: {48: 161, 161: 73}
(112, 150)
Possible mappings to check 56
weird tuple  (73, 73)
weird tuple  (73, 73)
These are the mappings that are causing an error {161: 73, 73: 73}
found the map: {112: 161, 150: 73}
(16, 170)
Possible mappings to check 56
weird tuple  (73, 73)
weird tuple  (73, 73)
weird tuple 

In [None]:
# Deliberate stop for "Run All". A bare `break` outside a loop is a SyntaxError,
# so raise an explicit, self-describing error instead.
raise RuntimeError("Notebook execution intentionally stopped at this cell.")

# Centrality

In [None]:
def get_core(H, outlier_indices, function='log-exp', q=0.75, maxiter=10000):
    """
    Calculates nonlinear eigenvector centrality and related metrics.

    Args:
        H: Incidence matrix (pandas DataFrame). Rows are nodes; the row index
            is reported back as ``local_bin``. Columns are hyperedges.
        outlier_indices: Row labels to drop from H before computing centrality.
        function: Nonlinear function name forwarded to
            ``central.nonlinear_eigenvector_centrality`` (default: 'log-exp').
        q: Quantile threshold for defining 'core' nodes (default: 0.75).
        maxiter: Maximum iterations forwarded to the centrality routine.

    Returns:
        pandas.DataFrame: One row per retained node with the columns
        ``local_bin``, ``node_centrality``, ``zscores``,
        ``node_centrality_norm`` and the boolean ``core`` flag
        (centrality >= the q-th quantile).
    """

    # Drop the outlier rows first, THEN drop hyperedges that are left empty.
    # Computing the column mask on the original H would keep hyperedges whose
    # only members were outlier rows, i.e. all-zero columns.
    Hhat = H.drop(outlier_indices)
    Hhat = Hhat.loc[:, Hhat.sum(axis=0) != 0]

    # Node centralities from the local `centrality` module; the second return
    # value is not used here.
    nodes_cent, _ = central.nonlinear_eigenvector_centrality(
        Hhat,
        function=function,
        maxiter=maxiter,
    )

    # Assemble the per-node result table.
    nodes = pd.DataFrame({
        'local_bin': Hhat.index,
        'node_centrality': nodes_cent,
        'zscores': stats.zscore(nodes_cent),
        'node_centrality_norm': nb.min_max(nodes_cent),
        'core' : nodes_cent >= np.quantile(nodes_cent, q),
    })

    return nodes


# `outlier_indices` was previously only assigned in the clique-expansion cell
# further down, so this cell raised NameError on a fresh kernel. Compute it
# here the same way that cell does (sorted clique expansion, threshold=3).
A_full = matrix.clique_expand_incidence(H, zero_diag=False)
A_full = A_full.sort_index(axis=1).sort_index(axis=0)
outlier_indices = matrix.find_outlier_row_indices(A_full, threshold=3)

nodes = get_core(H, outlier_indices)
print(f"{nodes.shape=}")

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 4, 1.5

# Centrality z-score along the chromosome ...
sns.lineplot(
    data=nodes,
    x='local_bin',
    y='zscores',
    lw=1,
)

# ... with the core nodes (top quantile) highlighted in red.
sns.scatterplot(
    data=nodes[nodes['core']],
    x='local_bin',
    y='zscores',
    s=10,
    c='r',
    ec='k', 
    zorder=5,
)

plt.axhline(y=0, lw=1, c='k')

plt.ylabel('Centrality')
plt.xlabel('Loci')

core_nodes = sorted(nodes[nodes['core']]['local_bin'].values)
print(f"{len(core_nodes)=}")

# NOTE: from here on `nodes` is ordered by z-score, not by bin.
nodes = nodes.sort_values(by='zscores', ascending=False)
nodes['bin_number'] = nodes['local_bin'].astype(float)

nodes

In [None]:
# break

# Centrality and clique-expanded eigenvector

In [None]:
A = matrix.clique_expand_incidence(H, zero_diag=False) 
A = A.sort_index(axis=1)
A = A.sort_index(axis=0)

# find and report repeat regions
outlier_indices = matrix.find_outlier_row_indices(A, threshold=3)
print(f"'{outlier_indices=}'")

# drop outliers
A = A.drop(outlier_indices, axis=1)
A = A.drop(outlier_indices, axis=0)

# Remember which bins survived: the normalisation below returns a bare array,
# and dropped outliers mean the bins are no longer a contiguous range from 4.
pc_bins = A.index.to_numpy()

A = matrix.normalize_oe(matrix.normalize_kr(A).todense())
A = np.asarray(A)

print(f"{A.shape=}")

pca = PCA(n_components=10)
pca.fit(A)
X_pca = pca.transform(A)
# NOTE: the sign of a principal component is arbitrary.
pc_1 = stats.zscore(X_pca[:, 0]) 

print(f"{pc_1.shape=} {nodes.shape=}")

# Align PC1 and the higher-order centrality on the bin label. `nodes` may be
# sorted by z-score (see the centrality cell), so pairing the two vectors
# positionally would correlate values that belong to different bins.
aligned = pd.merge(
    pd.DataFrame({'local_bin': pc_bins, 'pc_1': pc_1}),
    nodes[['local_bin', 'zscores']],
    on='local_bin',
    how='inner',
)
print(f"{aligned.shape=}")

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 6, 2

sns.barplot(
    data=aligned,
    x='local_bin',
    y='pc_1',
    label='Clique-Expanded',
    # alpha=0.6,
)


sns.barplot(
    data=aligned, 
    x='local_bin',
    y='zscores',
    label='Higher-Order',
)

r, pval = scipy.stats.pearsonr(aligned['pc_1'].values, aligned['zscores'].values)
print(f"Correlation: {r=:.3f}, (pval={pval:.5f})")

plt.xticks([])
plt.xlabel('Loci')
plt.ylabel('Magnitude')

# Define the core
### nodes with high centrality

In [None]:
# The core is defined by:
#   1. Nodes with high centrality
#   2. Higher-order contacts
#   3. Non-empty hyperedges (this is handled in the get_core function)
nodes = get_core(H, outlier_indices)
node_idx = nodes.loc[nodes['core'], 'local_bin'].values

# `local_bin` values are row LABELS of H (its index does not start at 0), so
# select with .loc. Positional .iloc would pick shifted rows or run off the end.
core = H.loc[node_idx, :]
# Keep only hyperedges that touch more than two core nodes.
core = core.loc[:, (core.sum(axis=0) > 2)]

print(f"{core.shape=}")

In [None]:
# Clique-expand the core incidence matrix and order both axes by bin label.
Acore = (
    matrix.clique_expand_incidence(core, zero_diag=False)
    .sort_index(axis=1)
    .sort_index(axis=0)
)

plt.rcParams.update({
    'figure.dpi': 300,
    'figure.figsize': (7, 7),
})

# Log-scaled contact counts between core loci.
sns.heatmap(
    np.log1p(Acore),
    cmap='plasma',
    square=True,
    cbar_kws={'shrink' : 0.5, 'label' : 'Contacts (log)'},
)

# Keep the y ticks (bin labels); hide the x ticks.
plt.xticks([])

plt.title('The Core (Clique-Expanded)')
plt.ylabel("Loci (1Mb)")
plt.xlabel("")

# plot the incidence matrix

In [None]:
# Marker and line styling forwarded to the local plot_incidence helper.
node_params = dict(s=1, ec='k', lw=1, marker=".", zorder=2)
line_params = dict(lw=0.1, alpha=0.5, zorder=1)

plt.rcParams.update({
    'figure.dpi': 200,
    'figure.figsize': (8, 5),
})

# Plot a random subset of hyperedges (columns) after removing outlier bins.
sample_size = 500
H_sampled = H.drop(outlier_indices).T.sample(sample_size).T
plt2.plot_incidence(
    ut.sort_by_lowest_index(H_sampled),
    node_color='k',
    node_params=node_params,
    line_params=line_params,
)

plt.title(f"Chromosome {chromosomes[0]}")

In [None]:
# Debugging aid: reload the local plotting module and show where it was imported from.
reload(plt2)
dir(plt2)
plt2.__file__

In [None]:
node_params = {
    's' : 1,
    'ec' : 'k',
    'lw' : 1,
    'marker' : ".",
    'zorder' : 2,
}

line_params = {
    'lw' : 0.1,
    'alpha' : 0.5,
    'zorder' : 1,
}

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 8, 5

sample_size = 500
# The core can hold fewer hyperedges than `sample_size`; DataFrame.sample
# raises ValueError when asked for more rows than exist (replace=False).
n_sampled = min(sample_size, core.shape[1])
plt2.plot_incidence(ut.sort_by_lowest_index(core.T.sample(n_sampled).T), 
                    node_color='k',
                    node_params=node_params,
                    line_params=line_params)

plt.title(f"The Core of Chromosome {chromosomes[0]}")

# ATAC Seq

In [None]:
# ATAC-seq signal track (bigWig).
fpath = "/nfs/turbo/umms-indikar/shared/projects/poreC/data/4DN_Features/ATACSeq/4DNFIPVAKPXA.bw"

reload(core_utils)

# Bin the signal for chr2 at the notebook's resolution.
# NOTE(review): the chromosome is hard-coded here while other cells use
# `chromosomes[0]` - keep the two in sync if the chromosome changes.
atac = core_utils.load_chromosome_feature(fpath, chrom='chr2', resolution=resolution)

# Standardise the signal so it can be compared with the centrality z-scores.
atac['zscores'] = stats.zscore(atac['value'])

print(f"{atac.shape=}")
atac.head()

In [None]:
plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 6, 2


# Attach the ATAC signal to every centrality row. The overlapping `zscores`
# columns become `zscores_ho` (higher-order) and `zscores_atac`.
pdf = pd.merge(
    nodes, atac,
    how='left',
    left_on='local_bin',
    right_on='local_bin',
    suffixes=('_ho', '_atac')
)

# NOTE(review): this is a positional slice of the merged rows, so which bins
# it keeps depends on the current row order of `nodes`.
start = 4
stop = 150

pdf = pdf.iloc[start:stop]

print(f"{pdf.shape=}")


sns.barplot(
    data=pdf, 
    x='local_bin',
    y='zscores_atac',
    label='ATAC-Seq',
)

sns.barplot(
    data=pdf, 
    x='local_bin',
    y='zscores_ho',
    label='Higher-Order',
)

# Bins without an ATAC value come out of the left merge as NaN; correlate only
# the complete pairs so a single missing bin cannot turn the result into NaN.
paired = pdf[['zscores_atac', 'zscores_ho']].dropna()
r, pval = scipy.stats.pearsonr(paired['zscores_atac'], paired['zscores_ho'])
print(f"Correlation: {r=:.3f}, (pval={pval:.5f})")

plt.xticks([])
plt.xlabel('Loci')
plt.ylabel('Magnitude')

In [None]:

node_params = {
    's' : 1,
    'ec' : 'k',
    'lw' : 1,
    'marker' : ".",
    'zorder' : 2,
}

line_params = {
    'lw' : 0.1,
    'alpha' : 0.5,
    'zorder' : 1,
}

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 8, 5

sample_size = 500
# The core can hold fewer hyperedges than `sample_size`; DataFrame.sample
# raises ValueError when asked for more rows than exist (replace=False).
n_sampled = min(sample_size, core.shape[1])
plt2.plot_incidence(ut.sort_by_lowest_index(core.T.sample(n_sampled).T), 
                    node_color='k',
                    node_params=node_params,
                    line_params=line_params)

plt.title(f"The Core and ATAC-seq Peaks")

# Shade every bin whose ATAC z-score is above zero (NaN compares as False).
for local_bin in pdf.loc[pdf['zscores_atac'] > 0, 'local_bin']:
    plt.gca().axhspan(
        local_bin - 0.5,
        local_bin + 0.5,
        facecolor='red',
        alpha=0.5,
    )


# Load Genes

In [None]:
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/analysis/genes.parquet"

LOG_EXPRESSION_THRESHOLD = 2

# Protein-coding genes on the chromosome(s) under study.
gdf = pd.read_parquet(fpath)
gdf = gdf[gdf['is_gene']]
gdf = gdf[gdf['is_pt_gene']]

# .copy() so the column assignments below write to an independent frame
# rather than to a filtered view (avoids SettingWithCopyWarning).
gdf = gdf[gdf['Chromosome'].isin(chromosomes)].copy()

# Bin the gene midpoint the same way the pore-C table is binned: there a
# local_position of 98497512 maps to local_bin 99, i.e. ceil(position / resolution).
# The previous `np.ceil(x // resolution)` floored first and put every gene one bin low.
# NOTE(review): ceil and floor+1 differ only at exact multiples of the
# resolution - confirm against core_utils if that edge matters.
gdf['local_bin'] = np.ceil(gdf['midpoint'] / resolution)
gdf['in_core'] = gdf['local_bin'].isin(core.index)

gdf['expressed_above_threshold'] = gdf['expression_log'] > LOG_EXPRESSION_THRESHOLD

print(f"{gdf.shape=}")
gdf.head()

In [None]:
# Expression value above which a gene's bin is highlighted.
threshold = 75

node_params = {
    's' : 1,
    'ec' : 'k',
    'lw' : 1,
    'marker' : ".",
    'zorder' : 2,
}

line_params = {
    'lw' : 0.1,
    'alpha' : 0.5,
    'zorder' : 1,
}

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 8, 5

sample_size = 500
# The core can hold fewer hyperedges than `sample_size`; DataFrame.sample
# raises ValueError when asked for more rows than exist (replace=False).
n_sampled = min(sample_size, core.shape[1])
plt2.plot_incidence(ut.sort_by_lowest_index(core.T.sample(n_sampled).T), 
                    node_color='k',
                    node_params=node_params,
                    line_params=line_params)

plt.title(f"The Core and Highly Expressed Genes")

# Shade the bin of every gene expressed above `threshold`. One span is drawn
# per gene, so bins holding several such genes appear darker.
for local_bin in gdf.loc[gdf['expression'] > threshold, 'local_bin']:
    plt.gca().axhspan(
        local_bin - 0.5,
        local_bin + 0.5,
        facecolor='blue',
        alpha=0.25,
    )

In [None]:
# Genes located in core bins that are flagged as expressed.
# NOTE(review): `is_expressed` is not created in this notebook (only
# `expressed_above_threshold` is) - presumably it ships with genes.parquet; confirm.
core_genes = gdf[gdf['in_core'] & gdf['is_expressed']]
print(f"{core_genes.shape=}")

core_genes = core_genes.sort_values(by='expression', ascending=False)
# The panglaoDB marker column used to be listed twice, which duplicated it in
# the output. The second entry is now the GO marker column that the per-bin
# aggregation below also uses.
core_genes[[
    'gene_name', 
    'is_tf', 
    'mESC_panglaoDB_marker',
    'mESC_GO_marker',
]].head(15)

In [None]:
# Per-bin gene summaries: output column -> (source column, aggregation).
gene_aggregations = {
    'n_genes': ('gene_name', 'nunique'),
    'expressed_above_threshold': ('expressed_above_threshold', 'sum'),
    'n_pangloa': ('mESC_panglaoDB_marker', 'sum'),
    'n_go': ('mESC_GO_marker', 'sum'),
    'mean_expression': ('expression', 'mean'),
    'mean_log_expression': ('expression_log', 'mean'),
}
genes_by_bin = gdf.groupby('local_bin').agg(**gene_aggregations).reset_index()

# Attach centrality / core membership; the join key is whatever columns the
# two frames share.
genes_by_bin = pd.merge(genes_by_bin, nodes, how='left')

genes_by_bin = genes_by_bin.sort_values(by='mean_expression', ascending=False)
genes_by_bin.head(10)

In [None]:
plt.rcParams.update({
    'figure.dpi': 300,
    'figure.figsize': (4, 2),
})

fig, axs = plt.subplots(1, 3)
axs = axs.ravel()

# Column to plot -> y-axis label.
columns = {
    'n_genes' : 'Genes',
    'expressed_above_threshold' : 'Expressed Genes',
    'mean_expression' : 'Mean Expression',
}

# One panel per metric, comparing core and non-core bins.
for ax, (column, label) in zip(axs, columns.items()):
    sns.boxplot(
        data=genes_by_bin,
        x='core',
        y=column,
        hue='core',
        showfliers=False,
        ax=ax,
        legend=False,
    )
    ax.set_xlabel("Core")
    ax.set_ylabel(label)

plt.tight_layout()

# Load Hi-C

In [None]:
fpath = "/nfs/turbo/umms-indikar/shared/projects/poreC/data/f1219_population_hic/4DNFICF9PA9C.mcool"

chrom = "chr2"
# Open the Hi-C map at the same resolution as the pore-C analysis, so the two
# binnings cannot silently drift apart if `resolution` is changed.
clr = cooler.Cooler(f'{fpath}::resolutions/{int(resolution)}')
# Raw (unbalanced) contact matrix for the chromosome; NaNs become zeros.
Ahic = clr.matrix(balance=False).fetch(str(chrom))[:]
Ahic = np.nan_to_num(Ahic)
print(f"{Ahic.shape=}")

Ahic = pd.DataFrame(Ahic)
Ahic.head()

In [None]:
# Row i of the Hi-C matrix corresponds to pore-C `local_bin` i + 1: the pore-C
# table bins positions with a ceiling (98497512 -> bin 99), and the Hi-C PCA
# cell below pairs matrix row 3 with bin 4. Labelling rows from 0 put every
# `in_core` flag one bin off.
hic = pd.DataFrame({
    'local_bin' : list(range(1, len(Ahic) + 1)),
    'degree' : Ahic.sum(axis=1),
})

hic['in_core'] = hic['local_bin'].isin(core.index)

plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = 1, 2
sns.boxplot(
    data=hic,
    x='in_core',
    y='degree',
    hue='in_core',
    showfliers=False,
    legend=False,
)

plt.ylabel("Hi-C Degree")
plt.xlabel("Core")

hic.head()

In [None]:
# find and report repeat regions
# NOTE(review): this rebinds the global `outlier_indices`, which earlier cells
# set from the pore-C matrix; re-running those cells afterwards will see the
# Hi-C outliers instead.
outlier_indices = matrix.find_outlier_row_indices(Ahic, threshold=3)
print(f"'{outlier_indices=}'")

# drop outliers
Hic_norm = Ahic.copy()
Hic_norm = Hic_norm.drop(outlier_indices, axis=1)
Hic_norm = Hic_norm.drop(outlier_indices, axis=0)

# Label-based slice: keep Hi-C rows/columns labelled 3 and up, which this cell
# plots as pore-C bins 4 and up (see the range(4, ...) below).
Hic_norm = matrix.normalize_oe(matrix.normalize_kr(Hic_norm.loc[3:, 3:]).todense())
Hic_norm = np.asarray(Hic_norm)

print(f"{Hic_norm.shape=}")

pca = PCA(n_components=10)
pca.fit(Hic_norm)
X_pca = pca.transform(Hic_norm)
# First principal component, standardised.
hic_pc_1 = stats.zscore(X_pca[:, 0]) 

print(f"{hic_pc_1.shape=} {nodes.shape=}")

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 6, 2

# NOTE(review): the x labels assume the remaining bins are the contiguous run
# 4, 5, 6, ...; any dropped outlier row shifts every later label.
sns.barplot(
    x=range(4, len(X_pca)+4),
    y=hic_pc_1,
    label='Hi-C',
)

# NOTE(review): 177 is a magic row count and depends on the current row order of `nodes`.
sns.barplot(
    data=nodes.head(177), 
    x='local_bin',
    y='zscores',
    label='Higher-Order',
)

# NOTE(review): the two vectors are paired by position, not by bin label; this
# is only meaningful if `nodes` is in bin order and covers the same bins.
r, pval = scipy.stats.pearsonr(hic_pc_1, nodes['zscores'].values)
print(f"Correlation: {r=:.3f}, (pval={pval:.5f})")

plt.xticks([])
plt.xlabel('Loci')
plt.ylabel('Magnitude')

In [None]:
# Compare core membership from higher-order centrality with a Hi-C "core"
# defined as the top quartile of the Hi-C first principal component.

# Hi-C PC1 labelled with bins starting at 4, then joined to the centrality
# table on the columns the two frames share.
hic = pd.DataFrame({
    'local_bin' : range(4, len(hic_pc_1) + 4),
    'hic' : hic_pc_1,
}).merge(nodes, how='left')

hic_values = hic['hic'].values
hic['hic_core'] = hic_values > np.quantile(hic_values, 0.75)

print(f"{hic.shape=}")

# Contingency counts of (core, hic_core).
hic[['core', 'hic_core']].value_counts().reset_index()

# Single-cell

In [None]:
dpath = "/scratch/indikar_root/indikar1/shared_data/single_cell/align_table/"
# glob order depends on the filesystem, so sort first and shuffle with a fixed
# seed: the cells picked by `file_list[:sample_size]` below are then the same
# on every run.
file_list = sorted(glob.glob(f"{dpath}*"))
np.random.default_rng(0).shuffle(file_list)
file_list[:10]

In [None]:
# Deliberate stop for "Run All". A bare `break` outside a loop is a SyntaxError,
# so raise an explicit, self-describing error instead.
raise RuntimeError("Notebook execution intentionally stopped at this cell.")

In [None]:
reload(core_utils)

# Number of single-cell alignment files to load from the shuffled list.
sample_size = 379
sc_files = file_list[:sample_size]

df_sc = core_utils.load_pore_c(sc_files, chrom_starts, resolution=resolution, chroms=chromosomes)

# Constant column used as the pivot value for the incidence matrix.
df_sc = df_sc.assign(ones=1)
print(f"{df_sc.shape=}")
df_sc.head()

In [None]:
"""
Note: we keep the the full set because we may want to
find reads across multiple cells
"""
# Build the single-cell incidence matrix from the alignment table.
# NOTE(review): the population matrix above is built with nb.incidence_by_pivot,
# this one with ut.incidence_by_pivot - confirm the two helpers are equivalent.
H_sc_full = ut.incidence_by_pivot(
    df_sc, 
    index='read_name',
    columns='local_bin',
    values='ones',
)
print(f"Total: {H_sc_full.shape=}")
# De-duplicated copy: one column per distinct incidence vector.
H_sc = H_sc_full.T.drop_duplicates().T
print(f"Unique: {H_sc.shape=}")
H_sc.head()

In [None]:
# Clique expansion of the full (not de-duplicated) single-cell incidence
# matrix, with both axes ordered by bin label.
A_sc = (
    matrix.clique_expand_incidence(H_sc_full, zero_diag=False)
    .sort_index(axis=1)
    .sort_index(axis=0)
)

# Find and report repeat regions.
# NOTE(review): this rebinds the global `outlier_indices` with the single-cell outliers.
outlier_indices = matrix.find_outlier_row_indices(A_sc, threshold=1.5)
print(f"'{outlier_indices=}'")

# Remove the outlier bins from both axes.
A_sc = A_sc.drop(outlier_indices, axis=1).drop(outlier_indices, axis=0)

print(f"{A_sc.shape=}")

plt.rcParams.update({
    'figure.dpi': 300,
    'figure.figsize': (7, 7),
})

sns.heatmap(
    np.log1p(A_sc),
    cmap='Reds',
    square=True,
    cbar_kws={'shrink' : 0.45, 'label' : 'Contacts (log)'},
)

plt.yticks([])
plt.xticks([])

plt.title(f"Chromosome {chromosomes[0]}")
plt.ylabel(f"Loci (1Mb)")
plt.xlabel("")

In [None]:
core_of_A_sc = A_sc.copy()

# Core bins can be missing from the single-cell matrix (never observed, or
# removed as outliers); .loc with absent labels raises KeyError, so restrict
# the selection to the bins both matrices share.
shared_core_bins = core.index.intersection(core_of_A_sc.index)

sns.heatmap(
    np.log1p(core_of_A_sc.loc[shared_core_bins, shared_core_bins]), 
    cmap='plasma',
    square=True, 
    cbar_kws={'shrink' : 0.5, 'label' : 'Contacts (log)'},
)

# plt.yticks([])
plt.xticks([])

plt.title('The Core in Single-Cells (Clique-Expanded)')
plt.ylabel("Loci (1Mb)")
plt.xlabel("")

In [None]:
# scpore-c core

# Restrict the single-cell incidence matrix to the population core bins. Bins
# that never occur in the single-cell data would make .loc raise KeyError, so
# only the shared labels are selected (kept in core order).
shared_core_bins = core.index.intersection(H_sc.index)
core_sc = H_sc.loc[shared_core_bins]
# Keep reads that touch more than one core bin.
core_sc = core_sc.loc[:, (core_sc.sum(axis=0) > 1)]
print(f"{core_sc.shape=}")

node_params = {
    's' : 7,
    'ec' : 'k',
    'lw' : 1,
    'marker' : ".",
    'zorder' : 2,
}

line_params = {
    'lw' : 0.1,
    'alpha' : 0.5,
    'zorder' : 1,
}

plt.rcParams['figure.dpi'] = 200
plt.rcParams['figure.figsize'] = 8, 5

plt2.plot_incidence(
    ut.sort_by_lowest_index(core_sc), 
    node_color='k',
    node_params=node_params,
    line_params=line_params
)

plt.title("Core Nodes in Single-Cells")

# Shade every bin whose ATAC z-score is above zero (NaN compares as False).
for local_bin in pdf.loc[pdf['zscores_atac'] > 0, 'local_bin']:
    plt.gca().axhspan(
        local_bin - 0.5,
        local_bin + 0.5,
        facecolor='red',
        alpha=0.5,
    )


In [None]:
# Deliberate stop for "Run All". A bare `break` outside a loop is a SyntaxError,
# so raise an explicit, self-describing error instead.
raise RuntimeError("Notebook execution intentionally stopped at this cell.")

In [None]:
print(f"{core.shape=}")
print(f"{H_sc.shape=}")

# Convert both incidence matrices to lists of hyperedges, each stored as a sorted list.
core_hyperedges = list(map(sorted, ut.incidence_to_list_of_list(core)))
sc_hyperedges = list(map(sorted, ut.incidence_to_list_of_list(H_sc)))


def jaccard_similarity_matrix(list1, list2):
    """
    Pairwise Jaccard similarity between two collections of item lists.

    Args:
        list1: Iterable of iterables; each inner iterable is treated as a set.
        list2: Iterable of iterables; each inner iterable is treated as a set.

    Returns:
        A float64 NumPy array of shape (len(list1), len(list2)) whose entry
        (i, j) is |A_i & B_j| / |A_i | B_j|, defined as 0 when both sets are
        empty.
    """
    row_sets = [frozenset(items) for items in list1]
    col_sets = [frozenset(items) for items in list2]

    # Start from zeros so the empty-union case needs no special handling.
    similarity = np.zeros((len(row_sets), len(col_sets)), dtype='float64')

    for i, left in enumerate(row_sets):
        for j, right in enumerate(col_sets):
            union_size = len(left | right)
            if union_size:
                similarity[i, j] = len(left & right) / union_size

    return similarity


# Similarity of every single-cell hyperedge (rows) to every core hyperedge (columns).
jsm = jaccard_similarity_matrix(sc_hyperedges, core_hyperedges)
# Label the axes with the read names.
# NOTE(review): assumes incidence_to_list_of_list returns one hyperedge per
# column, in column order - confirm against ut.
jsm = pd.DataFrame(
    jsm,
    index=H_sc.columns,
    columns=core.columns,
)
print(f"{jsm.shape=}")
jsm.head()

In [None]:
# drop scpore-c reads with no similarity to the core

print(f"Total scPore-C Concatemers: {len(jsm)}")
# Keep single-cell reads that share at least one bin with some core hyperedge.
potential_core = jsm[jsm.sum(axis=1) > 0]
print(f"Potential scPore-C Core Concatemers: {len(potential_core)}")
print()

# Only the first row is inspected (note the `break`); `sc_read` and
# `core_read` stay defined afterwards and are used by the next two cells.
# If `potential_core` is empty, neither name is assigned.
for sc_read, vector in potential_core.head(10).iterrows():
    
    # Position of the most similar core hyperedge for this read.
    core_idx = np.argmax(vector)
    print(core_idx)
    
    core_read = potential_core.columns[core_idx]
    
    print(sc_read)
    print(core_read)
    
    break
    
#     print(f"\n{similarity=:.3f}")
#     print("core: ", list(core[core_read][core[core_read] > 0].index))
#     print("sc: ", list(H_sc[sc_read][H_sc[sc_read] > 0].index))
    



In [None]:
# Alignments of the single-cell read selected in the loop above.
df_sc[df_sc['read_name'] == sc_read]


In [None]:
# Alignments of the best-matching population core read.
df[df['read_name'] == core_read]

In [None]:
# Deliberate stop for "Run All". A bare `break` outside a loop is a SyntaxError,
# so raise an explicit, self-describing error instead.
raise RuntimeError("Notebook execution intentionally stopped at this cell.")

In [None]:
# Best Jaccard similarity to any core hyperedge, per single-cell read.
jsm.max(axis=1)

In [None]:
# Peek at the de-duplicated single-cell incidence matrix.
H_sc.head()

In [None]:
# Hierarchically cluster the similarity matrix on both axes.
# NOTE(review): `jsm` has one row per single-cell read, so this can be very
# large - consider clustering a filtered subset.
sns.clustermap(jsm)

In [None]:
# Deliberate stop for "Run All". A bare `break` outside a loop is a SyntaxError,
# so raise an explicit, self-describing error instead.
raise RuntimeError("Notebook execution intentionally stopped at this cell.")

In [None]:
# Marker and line styling forwarded to the local plot_incidence helper.
node_params = dict(s=1, ec='k', lw=1, marker=".", zorder=2)
line_params = dict(lw=0.1, alpha=0.5, zorder=1)

plt.rcParams.update({
    'figure.dpi': 200,
    'figure.figsize': (8, 5),
})

# Plot a random subset of single-cell hyperedges after removing outlier bins.
sample_size = 500
H_sc_sampled = H_sc.drop(outlier_indices).T.sample(sample_size).T
plt2.plot_incidence(
    ut.sort_by_lowest_index(H_sc_sampled),
    node_color='k',
    node_params=node_params,
    line_params=line_params,
)

plt.title(f"Chromosome {chromosomes[0]}")

In [None]:
# Deliberate stop for "Run All". A bare `break` outside a loop is a SyntaxError,
# so raise an explicit, self-describing error instead.
raise RuntimeError("Notebook execution intentionally stopped at this cell.")

In [None]:
# Deliberate stop for "Run All". A bare `break` outside a loop is a SyntaxError,
# so raise an explicit, self-describing error instead.
raise RuntimeError("Notebook execution intentionally stopped at this cell.")

In [None]:
# Deliberate stop for "Run All". A bare `break` outside a loop is a SyntaxError,
# so raise an explicit, self-describing error instead.
raise RuntimeError("Notebook execution intentionally stopped at this cell.")

# choose core by gene expression and fiedler value

In [None]:
# For every mean-expression threshold, take the bins at or above it as a
# candidate core and record the second-smallest eigenvalue (Fiedler number)
# of its normalized hypergraph Laplacian.
res = []

for expression_threshold in sorted(genes_by_bin['mean_expression'].unique()):
    selected = genes_by_bin[genes_by_bin['mean_expression'] >= expression_threshold]
    node_idx = selected['local_bin'].astype(int).values
    # NOTE(review): 35 is an unexplained cut-off on the bin label.
    node_idx = node_idx[node_idx > 35]
    # Gene bins need not exist in H; .loc with absent labels raises KeyError.
    node_idx = node_idx[np.isin(node_idx, H.index)]

    # Use a local name so the global `core` defined earlier is not overwritten.
    candidate_core = H.loc[node_idx]

    # remove non-existent edges
    candidate_core = candidate_core.loc[:, (candidate_core.sum(axis=0) != 0)]

    if candidate_core.empty:
        continue

    try:
        L = matrix.normalized_hypergraph_laplacian(candidate_core).todense()
        # Only the eigenvalues are needed; eigvalsh returns them in ascending order.
        fiedler_number = np.linalg.eigvalsh(L)[1]
    except Exception:
        # Best effort: record "no value" for candidates the Laplacian or the
        # eigen-solver cannot handle (e.g. a single node), but let
        # KeyboardInterrupt through.
        fiedler_number = None

    row = {
        'expression_threshold' : expression_threshold,
        'n_nodes' : len(node_idx),
        'core_nodes' : node_idx,
        'fiedler_number' : fiedler_number,
    }
    res.append(row)

res = pd.DataFrame(res)
res = res.sort_values(by='expression_threshold', ascending=False)
res.head(10)

# Maximize the Fiedler Value

In [None]:
# Choosing from node centralities only: for every observed normalized
# centrality value, take the nodes at or above it as a candidate core and
# record the second-smallest eigenvalue (Fiedler number) of its normalized
# hypergraph Laplacian.

res = []

for n_t in nodes['node_centrality_norm'].values:

    node_idx = sorted(nodes[nodes['node_centrality_norm'] >= n_t]['local_bin'].values)
    # Use a local name so the global `core` defined earlier is not overwritten.
    candidate_core = H.loc[node_idx]

    # remove non-existent edges
    candidate_core = candidate_core.loc[:, (candidate_core.sum(axis=0) != 0)]

    if candidate_core.empty:
        continue

    try:
        L = matrix.normalized_hypergraph_laplacian(candidate_core).todense()
        # Only the eigenvalues are needed; eigvalsh returns them in ascending order.
        fiedler_number = np.linalg.eigvalsh(L)[1]
    except Exception:
        # Best effort: record "no value" for candidates the Laplacian or the
        # eigen-solver cannot handle (e.g. a single node), but let
        # KeyboardInterrupt through.
        fiedler_number = None

    row = {
        'node_thresh' : n_t,
        'n_nodes' : len(node_idx),
        'core_nodes' : node_idx,
        'fiedler_number' : fiedler_number,
    }
    res.append(row)

res = pd.DataFrame(res)
res = res.sort_values(by='fiedler_number', ascending=False)
res.head(10)