# Enhancer-Enhancer Clustering

To assess whether enhancers cluster, we plot the distributions of mean enhancer-enhancer and mean promoter-enhancer distances for (gene, celltype) combinations.

We use the data from figure 2 of the main text.

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial import distance_matrix
import seaborn as sns
from matplotlib import pyplot as plt


def get_mean_inter_enhancer_distance(dfi):
    """Return the mean pairwise enhancer-enhancer distance (in um) of one group.

    Parameters
    ----------
    dfi : pandas.DataFrame
        One row per enhancer. Must provide the promoter pixel coordinates
        ``z_1, y_1, x_1``, the enhancer pixel coordinates ``z_2, y_2, x_2``
        and the promoter-enhancer distance ``distance_um``.

    Returns
    -------
    float
        Mean over all unordered enhancer pairs, or ``nan`` when the group
        holds fewer than two enhancers (there is no pair to average).
    """
    coords_promoter = dfi[[f"{d}_1" for d in "zyx"]].to_numpy(dtype=float)
    coords_enhancers = dfi[[f"{d}_2" for d in "zyx"]].to_numpy(dtype=float)

    # no enhancer pair exists -> return NaN explicitly instead of averaging an empty array
    n_enhancers = len(coords_enhancers)
    if n_enhancers < 2:
        return np.nan

    # We only have pixel coordinates plus the distance in micron, so the pixel
    # size is recovered from the equation system A x = b with
    #   A: squared pixel vectors enhancer - promoter (one row per enhancer)
    #   b: squared promoter-enhancer distance in um
    # and pixel size = sqrt(x), one value per axis (z, y, x).
    squared_pixel_vectors = (coords_enhancers - coords_promoter) ** 2
    squared_distances_um = dfi["distance_um"].to_numpy(dtype=float) ** 2
    # rcond=None selects NumPy's current default cutoff and avoids the
    # FutureWarning that NumPy < 2.0 emits when rcond is omitted
    squared_pixel_size = np.linalg.lstsq(squared_pixel_vectors, squared_distances_um, rcond=None)[0]
    pixel_size = np.sqrt(squared_pixel_size)

    # enhancer coords to um
    coords_enhancers_um = coords_enhancers * pixel_size

    # strict lower triangle of the distance matrix -> every enhancer pair once,
    # without the diagonal (distance to self = 0)
    pairwise_um = distance_matrix(coords_enhancers_um, coords_enhancers_um)
    enhancer_distances = pairwise_um[np.tril_indices(n_enhancers, k=-1)]

    return enhancer_distances.mean()


df = pd.read_csv("fig2_revision.csv")

# one group per STED FOV; gene and celltype are carried along as index levels
fov_keys = ["fov", "gene", "celltype"]
fov_groups = df.groupby(fov_keys)

# per FOV: mean promoter-enhancer distance and mean pairwise enhancer-enhancer distance
promoter_enhancer_meandist = fov_groups["distance_um"].mean()
enhancer_enhancer_meandist = fov_groups.apply(get_mean_inter_enhancer_distance, include_groups=False)
df_meandist = pd.DataFrame(
    {
        "promoter_enhancer_meandist": promoter_enhancer_meandist,
        "enhancer_enhancer_meandist": enhancer_enhancer_meandist,
    }
)

# long format (one row per FOV and distance type) so seaborn can use the distance type as hue
df_for_plot = df_meandist.reset_index().melt(
    id_vars=fov_keys, value_name="distance", var_name="distance_type"
)

# density histograms of both distance types, one facet per (celltype, gene)
g = sns.FacetGrid(df_for_plot, row="celltype", col="gene", hue="distance_type")
g.map(sns.histplot, "distance", element="step", alpha=0.1, stat="density")
g.add_legend()

In [None]:

from scipy.stats import mannwhitneyu

pvals, meds = [], []

# mean E-P vs mean E-E distance within every (celltype, gene) combination
print("EP - EE")
for key, facet in df_for_plot.groupby(["celltype", "gene"]):
    # exactly two distance types are expected per facet, in sorted label order
    (name_a, part_a), (name_b, part_b) = facet.groupby("distance_type")
    dist_a, dist_b = part_a["distance"], part_b["distance"]
    med_a, med_b = dist_a.median(), dist_b.median()

    test = mannwhitneyu(dist_a, dist_b)
    pvals.append(test.pvalue)
    print(key, (name_a, name_b), test, med_a, med_b)
    meds.append((np.round(med_a, 3), np.round(med_b, 3)))

# naive vs primed for every (gene, distance type) combination (printed only, not corrected)
print("naive - primed")
for key, facet in df_for_plot.groupby(["gene", "distance_type"]):
    (name_a, part_a), (name_b, part_b) = facet.groupby("celltype")
    dist_a, dist_b = part_a["distance"], part_b["distance"]
    print(key, (name_a, name_b), mannwhitneyu(dist_a, dist_b), dist_a.median(), dist_b.median())

# Benjamini-Hochberg correction of the E-P vs E-E p-values collected above
from statsmodels.stats.multitest import multipletests
_, pvals, _, _ = multipletests(pvals, method="fdr_bh")
pvals.round(5), meds

In [None]:
# pvals and meds were collected while iterating df_for_plot.groupby(["celltype", "gene"]),
# i.e. in sorted (celltype, gene) order. g.axes.flat walks the grid row by row, so both the
# rows (celltype) and the columns (gene) must be sorted for the annotations to land on the
# matching facet; without row_order seaborn orders string rows by first appearance in the data.
g = sns.FacetGrid(
    df_for_plot,
    row="celltype",
    col="gene",
    hue="distance_type",
    row_order=sorted(df_for_plot["celltype"].unique()),
    col_order=sorted(df_for_plot["gene"].unique()),
)
g.map(sns.histplot, "distance", element="step", alpha=0.1, stat="density")
g.add_legend()

# meds holds (E-E median, E-P median) per facet: the distance types were grouped in sorted
# label order ("enhancer_enhancer_meandist" before "promoter_enhancer_meandist")
for ax, pval, (medee, medep) in zip(g.axes.flat, pvals.round(5), meds):
    # p-values that round to 0 at 5 decimals are reported as an upper bound; the text is built
    # outside the f-string because reusing the enclosing quote type inside an f-string
    # is a SyntaxError before Python 3.12
    pval_text = pval if pval != 0 else "<1e-5"
    ax.annotate(f"Medians:\nE-P: {round(medep*1e3)} nm\nE-E: {round(medee*1e3)} nm\np-val: {pval_text}", (1.0, 1.8))

# with plt.rc_context({'svg.fonttype': 'none'}):
#     g.savefig("enhancer-clustering-fig.svg")

# plt.rcParams.keys()

In [None]:
# COMBINE new Nanog data, old data for other genes

df = pd.read_csv("/Users/david/Downloads/final_data_fig2.csv")
df2 = pd.read_csv("/Users/david/Downloads/fig2.csv")

# bring the column names of the new data in line with the old table:
# suffix _0/_1 of the new coordinates becomes _1/_2
column_renames = {
    "experiment.cell_type_0": "celltype",
    "len3d": "distance_um",
    "x_0": "x_1",
    "y_0": "y_1",
    "z_0": "z_1",
    "x_1": "x_2",
    "y_1": "y_2",
    "z_1": "z_2",
}
df = df.rename(columns=column_renames)
df["gene"] = "Nanog"

# one fov index per image; new fov indices start where the old ones left off
first_free_fov = 1 + df2["fov"].max()
df["fov"] = df["img"].factorize()[0] + first_free_fov

# combine: only columns already present in the old table,
# non-Nanog rows from the old data, Nanog rows from the new data
df2 = df2[df2["gene"] != "Nanog"]
df = pd.concat([df2, df[df2.columns]], ignore_index=True)

# df.to_csv("fig2_revision.csv", index=None)

# Other Plots

The plots below are more complicated, so we leave them out.

## Plot per enhancer pair

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial import distance_matrix
from matplotlib import pyplot as plt
from itertools import combinations
from collections import defaultdict

df = pd.read_csv("fig2.csv")

promoter_distance_dfs = []

for fov_key, fov_rows in df.groupby(["fov", "gene", "celltype"]):

    # ascending promoter distance -> the positional enhancer index doubles as neighbor rank
    fov_rows = fov_rows.sort_values("distance_um")
    promoter_dist_um = fov_rows["distance_um"]

    # enhancer coordinates scaled by the per-axis pixel size, which is the least-squares
    # solution of (squared pixel offsets) @ size^2 = squared distance_um
    promoter_px = fov_rows[[f"{axis}_1" for axis in "zyx"]].values
    enhancers_px = fov_rows[[f"{axis}_2" for axis in "zyx"]].values
    offsets_sq = (enhancers_px - promoter_px) ** 2
    pixel_size_sq = np.linalg.lstsq(offsets_sq, promoter_dist_um ** 2)[0]
    enhancers_um = enhancers_px * pixel_size_sq ** (1 / 2)

    # one record per unordered enhancer pair
    pair_records = defaultdict(list)
    for first, second in combinations(range(len(enhancers_um)), 2):
        pair_records["enhancer_idx_1"].append(first)
        pair_records["enhancer_idx_2"].append(second)
        # E-E distance
        pair_records["enhancer_dist_um"].append(np.linalg.norm(enhancers_um[first] - enhancers_um[second]))
        pair_records["promoter_dist_1_um"].append(promoter_dist_um.iloc[first])
        pair_records["promoter_dist_2_um"].append(promoter_dist_um.iloc[second])

    pair_df = pd.DataFrame(pair_records)
    pair_df[["fov", "gene", "celltype"]] = fov_key
    promoter_distance_dfs.append(pair_df)

promoter_distance_df = pd.concat(promoter_distance_dfs)

both_promoter_dists = promoter_distance_df[["promoter_dist_1_um", "promoter_dist_2_um"]]

# average distance to the promoter of the two enhancers of a pair
promoter_distance_df["promoter_dist_avg"] = both_promoter_dists.mean(axis=1)

# largest possible E-E distance: the enhancers sit on opposite sides of the promoter
promoter_distance_df["max_dist"] = both_promoter_dists.sum(axis=1)
# smallest possible E-E distance: the enhancers sit on the same side of the promoter
promoter_distance_df["min_dist"] = (
    promoter_distance_df["promoter_dist_1_um"] - promoter_distance_df["promoter_dist_2_um"]
).abs()

# E-E distance as a fraction of the largest possible distance
promoter_distance_df["enhancer_dist_norm"] = (
    promoter_distance_df["enhancer_dist_um"] / promoter_distance_df["max_dist"]
)

In [None]:
import hvplot.pandas
import holoviews as hv

# interactive plot with selectable gene, celltype
# scatter of E-E distance against the pair's average distance to the promoter,
# with gene and celltype selectable through widgets
pair_scatter = promoter_distance_df.hvplot.scatter(
    "promoter_dist_avg", "enhancer_dist_um", groupby=['gene', 'celltype'], height=800, width=1000
)
# red line: maximum possible E-E distance
max_dist_line = promoter_distance_df.hvplot.line("promoter_dist_avg", "max_dist", color="red")

# overlay both layers; as the last expression of the cell this is what gets displayed
pair_scatter * max_dist_line

## Pseudo-pair correlation function

In [None]:
import seaborn as sns

for (gene, celltype), pairs in promoter_distance_df.groupby(["gene", "celltype"]):

    # histogram of the enhancer-enhancer distances of this (gene, celltype)
    raw_counts, edges = np.histogram(pairs["enhancer_dist_um"], bins=50)
    density = raw_counts.astype(float)
    lower_edges, upper_edges = edges[:-1], edges[1:]
    midpoints = (upper_edges + lower_edges) / 2

    # Divide every bin by a term proportional to the volume of its spherical shell:
    # for a uniform distribution the expected counts are proportional to that volume,
    # so this yields observed/expected.
    # NOTE: the absolute particle density is not accounted for.
    for shell, (inner, outer) in enumerate(zip(lower_edges, upper_edges)):
        density[shell] /= (outer**3 - inner**3)

    # rescale to sum 1 (there are no absolute units anyway)
    density /= density.sum()

    sns.lineplot(x=midpoints, y=density, label=(gene, celltype))



In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

data = pd.read_csv('fig2.csv')

# sort by fov and then by promoter distance, so that the position of a row within its fov
# is the neighbor rank (1 = enhancer closest to the promoter)
data = data.sort_values(['fov', 'distance_um'])

# cumcount numbers the rows of every fov 0, 1, 2, ... in their current (sorted) order and
# returns a Series aligned on the index, so the rank cannot be attached to the wrong row;
# it also avoids np.concat, which only exists in NumPy >= 2.0
data['neighbor_rank'] = data.groupby('fov').cumcount() + 1

data

In [None]:
# one facet per (celltype, gene); within a facet one step histogram (density) of the
# promoter-enhancer distance per neighbor rank
hist_style = dict(element="step", fill=None, alpha=0.7, cumulative=False, stat="density", bins=30)
grid = sns.FacetGrid(data=data, row="celltype", col="gene", hue="neighbor_rank")
grid.map(sns.histplot, "distance_um", **hist_style)

In [None]:
# calculate the histogram statistics in advance so we can draw them with lineplot

def get_stat_df(data, num_bins=25):
    """Histogram ``distance_um`` per (gene, celltype, neighbor_rank) into a long table.

    All groups share the same ``num_bins`` equally wide bins spanning
    0 .. ``data['distance_um'].max()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``gene``, ``celltype``, ``neighbor_rank`` and
        ``distance_um``.
    num_bins : int
        Number of histogram bins.

    Returns
    -------
    pandas.DataFrame
        Two rows per group and bin: one with ``stat == 'probability'`` (fraction
        of the group's values in the bin) and one with
        ``stat == 'probability_cumulative'`` (running sum of those fractions).
        Columns: ``prob``, ``distance_um`` (bin center), ``bin_start``,
        ``bin_end``, ``stat``, ``gene``, ``celltype``, ``neighbor_rank`` and
        ``color`` ('blue' for naive, 'red' for primed).
    """
    group_cols = ['gene', 'celltype', 'neighbor_rank']

    # the bin edges do not depend on the group, so build them once
    edges = np.linspace(0, data['distance_um'].max(), num_bins + 1)
    starts, ends = edges[:-1], edges[1:]
    centers = (starts + ends) / 2

    per_group = []
    for key, group in data.groupby(group_cols):
        counts, _ = np.histogram(group['distance_um'], bins=edges)
        probs = counts / counts.sum()

        # same layout for the plain and the cumulative probabilities
        stat_frames = []
        for stat_name, values in (('probability', probs), ('probability_cumulative', np.cumsum(probs))):
            frame = pd.DataFrame({"prob": values, "distance_um": centers, "bin_start": starts, "bin_end": ends})
            frame['stat'] = stat_name
            stat_frames.append(frame)

        group_stats = pd.concat(stat_frames)
        group_stats[group_cols] = key
        per_group.append(group_stats)

    df_stats = pd.concat(per_group)
    df_stats['color'] = df_stats['celltype'].map({'naive': 'blue', 'primed': 'red'})

    return df_stats

num_bins = 20
df_stats = get_stat_df(data, num_bins)

# only the per-bin probabilities are plotted, not the cumulative ones
df_stats = df_stats[df_stats["stat"] == "probability"]

# divide each bin's probability by a term proportional to the volume of its spherical shell
shell_volume = df_stats["bin_end"] ** 3 - df_stats["bin_start"] ** 3
df_stats["prob_norm"] = df_stats["prob"] / shell_volume

# one line per celltype; facets: gene (columns) x neighbor rank (rows), each with its own y axis
grid = sns.FacetGrid(data=df_stats, col="gene", row="neighbor_rank", hue="celltype", sharey=False)
grid.map(sns.lineplot, "distance_um", "prob_norm", alpha=0.7)

## Nearest - Avg. Neighbor Distance per enhancer

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

data = pd.read_csv('fig2.csv')

# sort by fov and then by promoter distance, so that the position of a row within its fov
# is the neighbor rank (1 = enhancer closest to the promoter)
data = data.sort_values(['fov', 'distance_um'])

# cumcount numbers the rows of every fov 0, 1, 2, ... in their current (sorted) order and
# returns a Series aligned on the index, so the rank cannot be attached to the wrong row;
# it also avoids np.concat, which only exists in NumPy >= 2.0
data['neighbor_rank'] = data.groupby('fov').cumcount() + 1

data

In [None]:
dfis = []

for _, dfi in data.groupby(["fov", "gene", "celltype"]):

    # pixel coordinates (z, y, x) of the promoter (suffix _1) and the enhancers (suffix _2)
    coords_promoter = dfi[[f"{d}_1" for d in "zyx"]].values
    coords_enhancers = dfi[[f"{d}_2" for d in "zyx"]].values

    # per-axis pixel size: least-squares solution of
    # (squared pixel offsets) @ size^2 = squared distance_um.
    # rcond=None selects NumPy's current default cutoff and avoids the FutureWarning
    # that NumPy < 2.0 emits when rcond is omitted
    pixel_size = np.linalg.lstsq((coords_enhancers - coords_promoter)**2, dfi["distance_um"]**2, rcond=None)[0]**(1/2)
    coords_enhancers_um = coords_enhancers * pixel_size

    # row i of d_others: distances from enhancer i to all OTHER enhancers (diagonal removed)
    n_enhancers = len(coords_enhancers_um)
    d = distance_matrix(coords_enhancers_um, coords_enhancers_um)
    d_others = d[~np.eye(n_enhancers, dtype=bool)].reshape(n_enhancers, n_enhancers - 1)

    if n_enhancers > 1:
        nearest_um = d_others.min(axis=1)
        mean_um = d_others.mean(axis=1)
        median_um = np.median(d_others, axis=1)
    else:
        # a lone enhancer has no neighbors: min() over an empty row would raise ValueError,
        # so report missing values instead
        nearest_um = mean_um = median_um = np.full(n_enhancers, np.nan)

    # assign returns a new frame, so the group handed out by groupby is not modified
    dfis.append(
        dfi.assign(
            enhancer_nearest_um=nearest_um,
            enhancer_mean_um=mean_um,
            enhancer_median_um=median_um,
        )
    )

data_with_enhancer_dist = pd.concat(dfis)

# cell output, for the LAST processed group only: mean over all off-diagonal distances
# and mean of the per-enhancer means
d_others.mean(), d_others.mean(axis=1).mean()

In [None]:
# cumulative density histograms per (celltype, gene) facet, one step curve per neighbor rank
cumulative_hist = dict(element="step", fill=None, alpha=0.7, cumulative=True, stat="density")
grid = sns.FacetGrid(data=data_with_enhancer_dist, row="celltype", col="gene", hue="neighbor_rank")

# promoter-enhancer distance
grid.map(sns.histplot, "distance_um", **cumulative_hist)

# mean distance of each enhancer to the other enhancers, drawn on the same axes
# NOTE(review): the hue colors may already be fixed when the FacetGrid is created, in which
# case the "magma" palette context would not recolor these curves - confirm
with sns.color_palette("magma"):
    grid.map(sns.histplot, "enhancer_mean_um", **cumulative_hist)


In [None]:
# long format: one row per enhancer and quantity, where "variable" is either
# "enhancer_nearest_um" (distance to the nearest other enhancer) or
# "distance_um" (distance to the promoter)
d2 = data_with_enhancer_dist.melt(
    id_vars=["gene", "celltype", "neighbor_rank"],
    value_vars=["enhancer_nearest_um", "distance_um"],
)

# cumulative density histograms of both quantities per (neighbor rank, gene) facet
grid = sns.FacetGrid(data=d2, row="neighbor_rank", col="gene", hue="variable")
grid.map(sns.histplot, "value", element="step", fill=None, alpha=0.7, cumulative=True, stat="density")

# figure-level legend for the hue variable, as in the other FacetGrid cells of this notebook;
# plt.legend() would only attach a legend to the current (last drawn) axes and depends on
# pyplot's implicit state
grid.add_legend()