In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
plt.style.use("/home1/smaruj/akitaX1-analyses/figures/plot_styles/global_plotting_style.mplstyle")

from akita_utils.format_io import h5_to_df
from scipy import stats

In [2]:
import sys
sys.path.insert(0, "/home1/smaruj/akitaX1-analyses/utils/")
from reading_averaging_utils import (read_and_average_genomic_exp, average_stat_over_targets, average_stat_over_backgrounds)

In [3]:
# Directory the experiment files for this analysis are read from.
data_dir = "/project/fudenber_735/akitaX1_analyses_data/genomic_disruption/disruption_by_permutation"

# Name of the statistic that the downstream cells read, correlate and plot.
stat_of_analysis_interest = "SCD"

In [4]:
# Load the experiment from data_dir and average the chosen stat over targets.
df = read_and_average_genomic_exp(
    data_dir,
    stat_to_average=stat_of_analysis_interest,
)

reading h5 files to dataframes
averaging over targets
collecting data for SCD


In [5]:
# Preview the loaded frame: one row per site, per-model stat columns, and the overall stat column.
df

Unnamed: 0,chrom,end,start,strand,SCD_m0,SCD_m1,SCD_m2,SCD_m3,SCD_m4,SCD_m5,SCD_m6,SCD_m7,SCD
0,chr1,4403286,4403267,-,0.851074,0.476562,1.153320,2.117188,0.618652,0.324219,0.494141,1.059570,0.886230
1,chr1,4770074,4770055,+,42.156250,34.250000,29.265625,25.500000,27.562500,21.703125,24.765625,32.468750,29.718750
2,chr1,4770199,4770180,-,0.596191,0.125366,0.130981,0.770996,3.062500,0.812500,0.302734,0.756348,0.820312
3,chr1,4770886,4770867,+,0.176636,0.663574,0.180542,0.191895,0.100586,0.154175,0.268311,0.128906,0.233154
4,chr1,4773454,4773435,+,0.275146,0.732910,0.825684,0.532715,0.627930,0.246216,0.731934,0.657715,0.579102
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7555,chr1,59388445,59388426,+,0.350342,0.342773,0.152466,0.317383,0.420166,0.376709,0.268799,0.075989,0.288086
7556,chr1,59388695,59388676,-,0.230835,1.371094,0.929688,0.661621,0.555176,0.268066,0.299316,0.434570,0.593750
7557,chr1,59389409,59389390,+,11.492188,17.484375,14.289062,12.718750,17.375000,13.507812,15.875000,22.437500,15.648438
7558,chr1,59770176,59770157,-,0.107727,0.072754,0.083069,0.053833,0.092102,0.029770,0.109192,0.078125,0.078308


In [12]:
# Ten sites with the highest value of the statistic under analysis, re-indexed 0..9.
# The sort column follows `stat_of_analysis_interest` instead of a hard-coded "SCD",
# so the ranking stays consistent with the rest of the notebook if the stat is changed.
# NOTE(review): assumes the overall column is named exactly like the stat — confirm for non-SCD stats.
top10 = (
    df.sort_values(by=stat_of_analysis_interest, ascending=False)
    .head(10)
    .reset_index(drop=True)
)

In [13]:
# Write the top-10 table as a tab-separated file in the working directory.
top10.to_csv("top10_disruption.tsv", sep="\t")

In [14]:
# Load the additional, independent permutation run for model 0 (no averaging on read).
perm_h5_path = f"{data_dir}/model_0_perm.h5"
stats_to_read = ["SCD", "INS-16", "INS-64"]
df_m0_perm = h5_to_df(perm_h5_path, stats_to_read, average=False)

# Average the stat of interest over targets for model 0, head 1.
df_m0_perm_tg = average_stat_over_targets(
    df_m0_perm,
    model_index=0,
    head_index=1,
    stat=stat_of_analysis_interest,
)

## Disruption scores for specific sites are correlated across random motif permutations

In [None]:
# Model-0 scores from the main run (x) against the independent permutation run (y).
stat_column = f"{stat_of_analysis_interest}_m0"
scores_perm1 = df[stat_column]
scores_perm2 = df_m0_perm_tg[stat_column]

# Pearson correlation between the two runs, rounded for the on-plot annotation.
perm_corr = round(stats.pearsonr(scores_perm1, scores_perm2).statistic, 4)

fig, ax = plt.subplots(figsize=(8, 7))
ax.scatter(x=scores_perm1, y=scores_perm2, alpha=0.2, edgecolors='w', linewidth=0.5, color='blue')

ax.set_xlabel("Disruption Score - Permutation 1")
ax.set_ylabel("Disruption Score - Permutation 2")

ax.grid(True, which='both', linestyle='--', linewidth=0.5, alpha=0.5)

# Annotation is positioned in axes-fraction coordinates (lower-right corner).
ax.text(x=0.71, y=0.05, s=f"Pearson R = {perm_corr}", transform=ax.transAxes, verticalalignment='top')

# plt.savefig("./plots/disruption_score_independent_permutations.pdf", format="pdf", bbox_inches="tight")

## Correlations between models

In [None]:
# Per-site scores predicted by model 0 (x) against model 1 (y).
scores_m0 = df[f"{stat_of_analysis_interest}_m0"]
scores_m1 = df[f"{stat_of_analysis_interest}_m1"]

# Pearson correlation between the two models, rounded for the on-plot annotation.
m0_m1_corr = round(stats.pearsonr(scores_m0, scores_m1).statistic, 4)

fig, ax = plt.subplots(figsize=(8, 7))
ax.scatter(x=scores_m0, y=scores_m1, alpha=0.3, edgecolors='w', linewidth=0.5, color='blue')

ax.set_xlabel("Disruption Score - Model 0")
ax.set_ylabel("Disruption Score - Model 1")

ax.grid(True, which='both', linestyle='--', linewidth=0.5, alpha=0.5)

# Annotation is positioned in axes-fraction coordinates (lower-right corner).
ax.text(x=0.71, y=0.05, s=f"Pearson R = {m0_m1_corr}", transform=ax.transAxes, verticalalignment='top')

# plt.savefig("./plots/disruption_score_model0_model1.pdf", format="pdf", bbox_inches="tight")

## Correlations between all models

In [None]:
num_models = 8

# Symmetric matrix of pairwise Pearson correlations between models
# (also read by the heatmap cell that follows).
models_corr_summary = np.zeros((num_models, num_models))

# Pull each model's column out of the frame once rather than on every loop iteration.
model_stats = [
    np.asarray(df[f"{stat_of_analysis_interest}_m{model_index}"])
    for model_index in range(num_models)
]

fig, axs = plt.subplots(num_models, num_models, figsize=(20, 20), sharex=True, sharey=True, layout="constrained")

# Only the upper triangle (diagonal included) is drawn; the lower-triangle
# axes are intentionally left empty because they would mirror the upper ones.
for row_model in range(num_models):
    for col_model in range(row_model, num_models):
        stat_row = model_stats[row_model]
        stat_col = model_stats[col_model]

        # Column model on x, row model on y, so each panel matches the column
        # titles and the row names that are attached as y-axis labels below.
        axs[row_model, col_model].scatter(x=stat_col, y=stat_row, alpha=0.3)

        # Pearson correlation is symmetric, so fill both halves of the matrix.
        corr = stats.pearsonr(stat_row, stat_col).statistic
        models_corr_summary[row_model, col_model] = corr
        models_corr_summary[col_model, row_model] = corr

names = [f"m {i}" for i in range(num_models)]

# Column names as titles on the top row of panels.
for ax, col in zip(axs[0], names):
    ax.set_title(col, size=12)

# Row names as horizontal y-axis labels on the first column of panels.
for ax, row in zip(axs[:, 0], names):
    ax.set_ylabel(row, rotation=0, fontsize=12, labelpad=20)

fig.suptitle(f"Correlations of {stat_of_analysis_interest} between models", fontsize=16)

In [None]:
# Heatmap of the model-by-model correlation matrix with the value written in each cell.
fig = plt.figure(figsize=(8, 6))

ax = fig.add_subplot(111)
cax = ax.matshow(models_corr_summary, cmap="Wistia", interpolation='nearest')
fig.colorbar(cax)

# Rows and columns of this matrix are models, so label the ticks as models
# (the previous "t_" prefix was the target prefix used by the targets heatmap).
names = [f"m_{i}" for i in range(num_models)]

tick_positions = np.arange(len(names))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(names)
ax.set_yticklabels(names)

# ndenumerate yields (row, col) indices, while matshow draws rows along y and
# columns along x — so the text position is (x=col, y=row).
for (row, col), value in np.ndenumerate(models_corr_summary):
    ax.text(col, row, f"{value:.3f}", va="center", ha="center")

fig.suptitle(f"Correlations of {stat_of_analysis_interest} between models", fontsize=16)

# plt.show() is the documented way to display figures; Figure.show() needs a GUI
# backend and does not display the figure under a non-GUI notebook backend.
plt.show()

## Correlations between targets for model 0

In [None]:
num_targets = 6
model_index = 0

# Symmetric matrix of pairwise Pearson correlations between targets of one model
# (also read by the heatmap cell that follows).
targets_corr_summary = np.zeros((num_targets, num_targets))

# Pull each per-target column (head 1) out of the frame once rather than on
# every loop iteration.
target_stats = [
    np.asarray(df_m0_perm_tg[f"{stat_of_analysis_interest}_h1_m{model_index}_t{target_index}"])
    for target_index in range(num_targets)
]

fig, axs = plt.subplots(num_targets, num_targets, figsize=(15, 15), sharex=True, sharey=True, layout="constrained")

# Only the upper triangle (diagonal included) is drawn; the lower-triangle
# axes are intentionally left empty because they would mirror the upper ones.
for row_target in range(num_targets):
    for col_target in range(row_target, num_targets):
        stat_row = target_stats[row_target]
        stat_col = target_stats[col_target]

        # Column target on x, row target on y, so each panel matches the column
        # titles and the row names that are attached as y-axis labels below.
        axs[row_target, col_target].scatter(x=stat_col, y=stat_row, alpha=0.3)

        # Pearson correlation is symmetric, so fill both halves of the matrix.
        corr = stats.pearsonr(stat_row, stat_col).statistic
        targets_corr_summary[row_target, col_target] = corr
        targets_corr_summary[col_target, row_target] = corr

names = [f"tg {i}" for i in range(num_targets)]

# Column names as titles on the top row of panels.
for ax, col in zip(axs[0], names):
    ax.set_title(col, size=12)

# Row names as horizontal y-axis labels on the first column of panels.
for ax, row in zip(axs[:, 0], names):
    ax.set_ylabel(row, rotation=0, fontsize=12, labelpad=20)

fig.suptitle(f"Correlations of {stat_of_analysis_interest} between targets", fontsize=16)

In [None]:
# Heatmap of the target-by-target correlation matrix with the value written in each cell.
fig = plt.figure(figsize=(8, 6))

ax = fig.add_subplot(111)
cax = ax.matshow(targets_corr_summary, cmap="Wistia", interpolation='nearest')
fig.colorbar(cax)

names = [f"t_{i}" for i in range(num_targets)]

tick_positions = np.arange(len(names))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(names)
ax.set_yticklabels(names)

# ndenumerate yields (row, col) indices, while matshow draws rows along y and
# columns along x — so the text position is (x=col, y=row).
for (row, col), value in np.ndenumerate(targets_corr_summary):
    ax.text(col, row, f"{value:.2f}", va="center", ha="center")

fig.suptitle(f"Correlations of {stat_of_analysis_interest} between targets", fontsize=16)

# plt.show() is the documented way to display figures; Figure.show() needs a GUI
# backend and does not display the figure under a non-GUI notebook backend.
plt.show()