In [1]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import pyfastx
import os
import h5py

In [2]:
def plot_side(arr, ylim=[-2, 2.5], xticks=[], yticks=[0, 2], pic_name=None):
    """Draw the two halves of ``arr`` as mirrored bar tracks on the current axes.

    The first half of ``arr`` is drawn as red bars; the second half is negated
    and drawn as blue bars at the same x positions.

    Parameters
    ----------
    arr : array-like with ``.shape`` and slicing
        1-D profile whose length must be even.
    ylim : sequence of two numbers
        Passed to ``set_ylim``.
    xticks, yticks : sequence of numbers
        Tick positions for the x and y axes (no x ticks by default).
    pic_name : str or None
        If None the figure is shown; otherwise it is saved to this path with a
        transparent background and then closed.

    Raises
    ------
    AssertionError
        If ``arr`` has odd length.
    """
    n_total = arr.shape[0]
    assert n_total % 2 == 0, "arr must have even length."
    half = int(n_total / 2)
    upper, lower = arr[:half], arr[half:]

    ax = plt.gca()
    # width=2 on unit-spaced positions makes neighbouring bars overlap.
    ax.bar(range(upper.shape[0]), upper, width=2, color="r")
    ax.bar(range(lower.shape[0]), -lower, width=2, color="b")

    # Limits are applied before ticks, as in the original call order.
    ax.set_ylim(ylim)
    ax.set_yticks(yticks)
    ax.set_xticks(xticks)
    ax.spines[["right", "top", "bottom"]].set_visible(False)
    ax.set_xlim(-0.5, upper.shape[0] - .5)

    if pic_name is not None:
        plt.savefig(pic_name, transparent=True)
        plt.close()
    else:
        plt.show()

In [3]:
# Remote host name and base directory; used only to build the scp commands
# printed in the next cell.
server = "cbsu"
mnt = "/home2/ayh8/"
# Remote sub-directories (joined onto `mnt`) for the predictions and the input data.
predict_dir = "predictions/ensemble/examples/"
data_dir = "data/gse110638/examples/"
# File names; the same names are used on the remote side and for the local
# copies read from `workdir`.
fasta_fp = "concat_sequence.fna.gz"
procap_fp = "concat_procap.npz"
prediction_fp = "ensemble_examples_prediction.h5"
# Local directory the files are copied into and later loaded from.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
workdir = "/Users/adamhe/github/scratch/examples"

In [6]:
# Create the local working directory (no error if it already exists), then
# print one scp command per remote file. The commands are only printed, not
# executed; they are meant to be run by hand.
os.makedirs(workdir, exist_ok=True)
remote_files = (
    (data_dir, fasta_fp),
    (data_dir, procap_fp),
    (predict_dir, prediction_fp),
)
for remote_dir, remote_name in remote_files:
    remote_path = os.path.join(mnt, remote_dir, remote_name)
    print(f"scp {server}:{remote_path} {workdir}")

scp cbsu:/home2/ayh8/data/gse110638/examples/concat_sequence.fna.gz /Users/adamhe/github/scratch/examples
scp cbsu:/home2/ayh8/data/gse110638/examples/concat_procap.npz /Users/adamhe/github/scratch/examples
scp cbsu:/home2/ayh8/predictions/ensemble/examples/ensemble_examples_prediction.h5 /Users/adamhe/github/scratch/examples


In [8]:
# Load the local copies of the sequence, prediction and PRO-cap files from `workdir`.

# A single FASTA handle is enough: `seqs` is kept as an alias so both names
# remain defined, instead of indexing the same file twice.
fasta = pyfastx.Fasta(os.path.join(workdir, fasta_fp))
seqs = fasta
ids = [rec.name for rec in seqs]

# Open the prediction file once, explicitly read-only, and copy both datasets
# into memory so the file handle is closed when the block exits
# (the original opened the file twice and never closed it).
with h5py.File(os.path.join(workdir, prediction_fp), "r") as prediction_h5:
    tracks = prediction_h5["track"][()]
    quantity = prediction_h5["quantity"][()]

# Keep columns 250-749 and 1250-1749 of the stored array; the context manager
# closes the .npz archive after the array has been read.
with np.load(os.path.join(workdir, procap_fp)) as procap_npz:
    y = procap_npz["arr_0"][:, np.r_[250:750, 1250:1750]]

# The per-id boolean masks built from `ids` are later used to index the rows
# of both the observed and the predicted arrays, so the row counts must agree.
assert len(ids) == y.shape[0] == tracks.shape[0], (
    f"row mismatch: {len(ids)} ids, {y.shape[0]} observed rows, "
    f"{tracks.shape[0]} predicted rows"
)

In [9]:
# Divide every row of the predicted tracks by its sum along axis 1, then
# multiply by the predicted quantity (broadcast by NumPy).
track_values = np.array(tracks)
row_totals = track_values.sum(axis=1, keepdims=True)
y_norm = track_values / row_totals
y_pred_scaled = y_norm * quantity

In [10]:
# Disabled MX1 promoter example (left commented out).
#mx1_coord = "chr21:41425651-41426650"
#mx1_ids = [idx.split("_")[-1] == mx1_coord for idx in ids]
#plot_side(np.mean(y[mx1_ids], axis=0), ylim=[-40, 50],  yticks=[0, 40], pic_name="ensemble_mx1_prom_expt.pdf")
#plot_side(np.mean(y_pred_scaled[mx1_ids], axis=0), ylim=[-4, 5], yticks=[0, 4], pic_name="ensemble_mx1_prom_pred.pdf")

# Example windows written as "chrom:start-end"; every window below covers
# 1000 positions when both ends are counted. Each string is compared against
# the text after the last "_" of every sequence id to select matching rows.
# NOTE(review): the EH38E... labels are presumably ENCODE cCRE accessions — confirm.
ifnar2_coord = "chr21:33229367-33230366"
ints6_coord = "chr13:51452691-51453690"
ttll9_enh_coord = "chr20:31894784-31895783" # EH38E2106518
EH38E2695789_enh_coord = "chr9:70418561-70419560" # EH38E2695789, EH38E2695790
klf9_enh_coord = "chr9:70411667-70412666" # EH38E2695773
klf9_dt_enh_coord = "chr9:70419676-70420675" # EH38E2695794, EH38E2695793
kif3b_enh_coord = "chr20:32285520-32286519" # KIF3B, EH38E2106824, EH38E2106825
EH38E2107427_enh_coord = "chr20:32971851-32972850" # EH38E2107427
NASP_prom_coord = "chr1:45583513-45584512" # NASP
EH38E3485200_enh_coord = "chr22:43187170-43188169"
irf4_prom_coord = "chr6:391151-392150"
rpl10a_prom_coord = "chr6:35467831-35468830"
rpl35_prom_coord = "chr9:124861481-124862480"
irf1_prom_coord = "chr5:132490401-132491400"
irf7_prom_coord = "chr11:615501-616500"
irf8_prom_coord = "chr16:85898601-85899600"

def _coord_mask(coord):
    """Return a list with one bool per entry of ``ids``.

    An entry is True when the text after the last "_" in the id equals
    ``coord`` exactly.
    """
    return [idx.split("_")[-1] == coord for idx in ids]


# One boolean mask per example window; each mask selects the rows of the
# observed / predicted arrays whose sequence id ends with that window.
ifnar2_ids = _coord_mask(ifnar2_coord)
ints6_ids = _coord_mask(ints6_coord)
ttll9_enh_ids = _coord_mask(ttll9_enh_coord)
EH38E2695789_enh_ids = _coord_mask(EH38E2695789_enh_coord)
klf9_enh_ids = _coord_mask(klf9_enh_coord)
klf9_dt_enh_ids = _coord_mask(klf9_dt_enh_coord)
kif3b_enh_ids = _coord_mask(kif3b_enh_coord)
EH38E2107427_enh_ids = _coord_mask(EH38E2107427_enh_coord)
NASP_prom_ids = _coord_mask(NASP_prom_coord)
EH38E3485200_enh_ids = _coord_mask(EH38E3485200_enh_coord)
# Bug fix: this mask used to be built from rpl10a_prom_coord (copy-paste slip),
# which made the IRF4 selection identical to the RPL10A one.
irf4_prom_ids = _coord_mask(irf4_prom_coord)
rpl10a_prom_ids = _coord_mask(rpl10a_prom_coord)
rpl35_prom_ids = _coord_mask(rpl35_prom_coord)
irf1_prom_ids = _coord_mask(irf1_prom_coord)
irf7_prom_ids = _coord_mask(irf7_prom_coord)
irf8_prom_ids = _coord_mask(irf8_prom_coord)

In [11]:
# Number of sequence ids whose window matches the IRF8 promoter (count of True entries).
sum(irf8_prom_ids)

67

In [12]:
# IFNAR2 promoter window: mean over matching rows of `y` (loaded from the procap .npz), saved to PDF.
plot_side(np.mean(y[ifnar2_ids], axis=0), ylim=[-1.5, 15], yticks=[0, 15], pic_name="ensemble_ifnar2_prom_expt.pdf")

In [33]:
# IFNAR2 promoter window: mean over matching rows of the scaled prediction, saved to PDF.
plot_side(np.mean(y_pred_scaled[ifnar2_ids], axis=0), ylim=[-1, 10], yticks=[0, 10], pic_name="ensemble_ifnar2_prom_pred.pdf")

In [10]:
# INTS6 promoter window: mean over matching rows of `y`, saved to PDF (axis range weighted toward the lower track).
plot_side(np.mean(y[ints6_ids], axis=0), ylim=[-100, 16.7], yticks=[-100, 0], pic_name="ensemble_ints6_prom_expt.pdf")

In [55]:
# INTS6 promoter window: mean over matching rows of the scaled prediction, saved to PDF.
plot_side(np.mean(y_pred_scaled[ints6_ids], axis=0), ylim=[-18, 3], yticks=[-18, 0], pic_name="ensemble_ints6_prom_pred.pdf")

In [50]:
# KLF9-DT enhancer window: mean over matching rows of `y`, saved to PDF.
plot_side(np.mean(y[klf9_dt_enh_ids], axis=0), ylim=[-3.5, 7], yticks=[0, 7], pic_name="ensemble_klf9_dt_enh_expt.pdf")

In [51]:
# KLF9-DT enhancer window: mean over matching rows of the scaled prediction, saved to PDF.
plot_side(np.mean(y_pred_scaled[klf9_dt_enh_ids], axis=0), ylim=[-2.5, 5], yticks=[0, 5], pic_name="ensemble_klf9_dt_enh_pred.pdf")

In [60]:
# EH38E2107427 enhancer window: mean over matching rows of `y`, saved to PDF.
plot_side(np.mean(y[EH38E2107427_enh_ids], axis=0), ylim=[-0.8, 0.6], yticks=[0, 0.6], pic_name="ensemble_EH38E2107427_enh_expt.pdf")

In [59]:
# EH38E2107427 enhancer window: mean over matching rows of the scaled prediction, saved to PDF.
plot_side(np.mean(y_pred_scaled[EH38E2107427_enh_ids], axis=0), ylim=[-1.2, 0.9], yticks=[0, 0.9], pic_name="ensemble_EH38E2107427_enh_pred.pdf")

In [69]:
# NASP promoter window: mean over matching rows of `y`, saved to PDF.
plot_side(np.mean(y[NASP_prom_ids], axis=0), ylim=[-5, 50], yticks=[0, 50], pic_name="ensemble_nasp_prom_expt.pdf")

In [68]:
# NASP promoter window: mean over matching rows of the scaled prediction, saved to PDF.
plot_side(np.mean(y_pred_scaled[NASP_prom_ids], axis=0), ylim=[-8, 80], yticks=[0, 80], pic_name="ensemble_nasp_prom_pred.pdf")

In [72]:
# EH38E3485200 enhancer window: mean over matching rows of `y`, saved to PDF.
plot_side(np.mean(y[EH38E3485200_enh_ids], axis=0), ylim=[-1.5, 15], yticks=[0, 15], pic_name="ensemble_EH38E3485200_enh_expt.pdf")

In [73]:
# EH38E3485200 enhancer window: mean over matching rows of the scaled prediction, saved to PDF.
plot_side(np.mean(y_pred_scaled[EH38E3485200_enh_ids], axis=0), ylim=[-0.5, 5], yticks=[0, 5], pic_name="ensemble_EH38E3485200_enh_pred.pdf")

In [96]:
# Rows selected by irf4_prom_ids: mean of `y`, saved to PDF.
# NOTE(review): the upper tick (72) does not coincide with the upper y-limit (100),
# unlike every other panel in this notebook — confirm the intended axis values.
plot_side(np.mean(y[irf4_prom_ids], axis=0), ylim=[-20, 100], yticks=[0, 72], pic_name="ensemble_irf4_prom_expt.pdf")

In [95]:
# Rows selected by irf4_prom_ids: mean of the scaled prediction, saved to PDF.
plot_side(np.mean(y_pred_scaled[irf4_prom_ids], axis=0), ylim=[-6, 30], yticks=[0, 30], pic_name="ensemble_irf4_prom_pred.pdf")

In [99]:
# RPL10A promoter window: mean over matching rows of `y`, saved to PDF.
plot_side(np.mean(y[rpl10a_prom_ids], axis=0), ylim=[-20, 100], yticks=[0, 100], pic_name="ensemble_rpl10a_prom_expt.pdf")

In [98]:
# RPL10A promoter window: mean over matching rows of the scaled prediction, saved to PDF.
plot_side(np.mean(y_pred_scaled[rpl10a_prom_ids], axis=0), ylim=[-6, 30], yticks=[0, 30], pic_name="ensemble_rpl10a_prom_pred.pdf")

In [85]:
# RPL35 promoter window: mean over matching rows of `y`, saved to PDF (axis range weighted toward the lower track).
plot_side(np.mean(y[rpl35_prom_ids], axis=0), ylim=[-24, 16], yticks=[0, -24], pic_name="ensemble_rpl35_prom_expt.pdf")

In [101]:
# RPL35 promoter window: mean over matching rows of the scaled prediction, saved to PDF.
plot_side(np.mean(y_pred_scaled[rpl35_prom_ids], axis=0), ylim=[-6, 4], yticks=[0, -6], pic_name="ensemble_rpl35_prom_pred.pdf")

In [42]:
# IRF1 promoter window: mean over matching rows of `y`, saved to PDF (axis range weighted toward the lower track).
plot_side(np.mean(y[irf1_prom_ids], axis=0), ylim=[-180, 18], yticks=[0, -180], pic_name="ensemble_irf1_prom_expt.pdf")

In [43]:
# IRF1 promoter window: mean over matching rows of the scaled prediction, saved to PDF.
plot_side(np.mean(y_pred_scaled[irf1_prom_ids], axis=0), ylim=[-40, 4], yticks=[0, -40], pic_name="ensemble_irf1_prom_pred.pdf")

In [40]:
# IRF7 promoter window: mean over matching rows of `y`, saved to PDF (axis range weighted toward the lower track).
plot_side(np.mean(y[irf7_prom_ids], axis=0), ylim=[-60, 6], yticks=[0, -60], pic_name="ensemble_irf7_prom_expt.pdf")

In [41]:
# IRF7 promoter window: mean over matching rows of the scaled prediction, saved to PDF.
plot_side(np.mean(y_pred_scaled[irf7_prom_ids], axis=0), ylim=[-20, 2], yticks=[0, -20], pic_name="ensemble_irf7_prom_pred.pdf")

In [54]:
# IRF8 promoter window: mean over matching rows of `y`, saved to PDF.
plot_side(np.mean(y[irf8_prom_ids], axis=0), ylim=[-10, 15], yticks=[0, 15], pic_name="ensemble_irf8_prom_expt.pdf")

In [53]:
# IRF8 promoter window: mean over matching rows of the scaled prediction, saved to PDF (same axis range as the observed panel).
plot_side(np.mean(y_pred_scaled[irf8_prom_ids], axis=0), ylim=[-10, 15], yticks=[0, 15], pic_name="ensemble_irf8_prom_pred.pdf")

In [86]:
# Pearson correlation between the row-averaged observed (`y`) and predicted
# (`y_pred_scaled`) profiles for the RPL35 promoter window.
pearsonr(np.mean(y[rpl35_prom_ids], axis=0), np.mean(y_pred_scaled[rpl35_prom_ids], axis=0))

(0.8007984875720182, 2.1019227884362822e-224)

In [87]:
# Pearson correlation between the row-averaged observed (`y`) and predicted
# (`y_pred_scaled`) profiles for the EH38E2107427 enhancer window.
pearsonr(np.mean(y[EH38E2107427_enh_ids], axis=0), np.mean(y_pred_scaled[EH38E2107427_enh_ids], axis=0))

(0.7878591308729714, 2.5434027939558296e-212)

In [61]:
# Column index of the largest value in the row-averaged prediction for the IRF7 promoter window.
# NOTE(review): if the prediction has 1000 columns like `y`, an index >= 500 lies in the
# second half, which plot_side draws negated — confirm the column count.
y_pred_scaled[irf7_prom_ids].mean(axis=0).argmax()

699