In [1]:
"""
This notebook plots predicted and observed tracks for the rs8050061 diQTL (Fig. 3D)
"""

'\nThis notebook plots predicted and observed tracks for the rs8050061 diQTL (Fig. 3D)\n'

In [1]:
import numpy as np
import pyfastx
import os
import h5py
import sys
sys.path.append("../")
from utils import plot_side

In [2]:
# PRINT COMMANDS TO DOWNLOAD AND EXTRACT DATA

# Set SCRATCH to where you want to download data to
SCRATCH = "/Users/adamhe/github/scratch"

URL = "https://zenodo.org/records/10597358/files"
TAR = "example_tracks_and_deepshap.tar.gz"

# `wget -P` saves the archive inside SCRATCH.
DOWNLOAD_CMD = f"wget {URL}/{TAR} -P {SCRATCH}"
# `-C` makes tar unpack inside SCRATCH too. Without it the archive is extracted
# into whatever the current working directory is, while the data-loading cells
# read from SCRATCH/example_tracks_and_deepshap/.
EXTRACT_CMD = f"tar -xvzf {SCRATCH}/{TAR} -C {SCRATCH}"

print(DOWNLOAD_CMD)
print(EXTRACT_CMD)

wget https://zenodo.org/records/10597358/files/example_tracks_and_deepshap.tar.gz -P /Users/adamhe/github/scratch
tar -xvzf /Users/adamhe/github/scratch/example_tracks_and_deepshap.tar.gz


In [4]:
# Load data

DATA_DIR = os.path.join(SCRATCH, "example_tracks_and_deepshap")

# Experimental
# Keep columns 250-749 and 1250-1749 of every row of the stored array.
# The context manager closes the .npz archive once the slice has been copied out.
with np.load(os.path.join(DATA_DIR, "concat_procap.npz")) as procap:
    y = procap["arr_0"][:, np.r_[250:750, 1250:1750]]

# Predicted
# Read both datasets fully into memory (`[()]`) so the HDF5 file can be closed
# right away instead of staying open for the lifetime of the kernel.
with h5py.File(os.path.join(DATA_DIR, "fold_7_examples_prediction.h5"), "r") as prediction:
    tracks = prediction["track"][()]
    quantity = prediction["quantity"][()]

# Normalise each predicted track to sum to 1 along axis 1, then rescale by `quantity`.
# NOTE(review): assumes `quantity` broadcasts against `y_norm` row-wise
# (e.g. shape (n, 1)) — confirm against the prediction file.
y_norm = tracks / tracks.sum(axis=1, keepdims=True)
y_pred_scaled = y_norm * quantity

In [10]:
# Divide individuals by genotype:

fasta = pyfastx.Fasta(os.path.join(SCRATCH, "example_tracks_and_deepshap/concat_sequence.fna.gz"))
# The last "_"-separated field of each record name is matched against the locus string below.
seq_coords = [seq.name.split("_")[-1] for seq in fasta]

rs8050061_coord = "chr16:80231439-80232438"
rs8050061_seqs = [i for i, coord in enumerate(seq_coords) if coord == rs8050061_coord]

# Sequence index whose base is used as the genotype call
# (presumably the SNP at the centre of the window — TODO confirm).
SNP_INDEX = 500

# Fetch the base once per sequence; every genotype group below reuses this lookup
# instead of re-reading the same FASTA record for each comparison.
snp_base = {i: fasta[i].seq[SNP_INDEX] for i in rs8050061_seqs}


def tracks_with_base(tracks, base):
    """Return the rows of `tracks` for rs8050061 sequences whose SNP base equals `base`.

    Rows are returned as a list, in the order of `rs8050061_seqs`.
    """
    return [tracks[i, :] for i in rs8050061_seqs if snp_base[i] == base]


# "Y" is the IUPAC ambiguity code for C/T, used here for the CT group.
c_pred = tracks_with_base(y_pred_scaled, "C")
c_expt = tracks_with_base(y, "C")
ct_pred = tracks_with_base(y_pred_scaled, "Y")
ct_expt = tracks_with_base(y, "Y")
t_pred = tracks_with_base(y_pred_scaled, "T")
t_expt = tracks_with_base(y, "T")

# Get mean per genotype
# (averaged over sequences, axis 0; an empty group produces NaN with a NumPy warning)

c_pred_mean = np.mean(np.array(c_pred), axis=0)
c_expt_mean = np.mean(np.array(c_expt), axis=0)
ct_pred_mean = np.mean(np.array(ct_pred), axis=0)
ct_expt_mean = np.mean(np.array(ct_expt), axis=0)
t_pred_mean = np.mean(np.array(t_pred), axis=0)
t_expt_mean = np.mean(np.array(t_expt), axis=0)

In [11]:
# Mean predicted track over rs8050061 sequences with base "C" at index 500.
# NOTE(review): the top ytick (0.5) is far below the ylim top (3), unlike the
# experimental plots where they coincide — confirm this is intended.
plot_side(
    c_pred_mean,
    ylim=[-6, 3],
    yticks=[0, 0.5],
    pic_name="img/model_fold_7_rs8050061C_pred.pdf",
)

In [12]:
# Mean experimental track over rs8050061 sequences with base "C" at index 500.
plot_side(
    c_expt_mean,
    ylim=[-0.5, 0.25],
    yticks=[0, 0.25],
    pic_name="img/model_fold_7_rs8050061C_expt.pdf",
)

In [13]:
# Mean predicted track over rs8050061 sequences with base "T" at index 500.
# NOTE(review): the top ytick (0.5) is far below the ylim top (3), unlike the
# experimental plots where they coincide — confirm this is intended.
plot_side(
    t_pred_mean,
    ylim=[-6, 3],
    yticks=[0, 0.5],
    pic_name="img/model_fold_7_rs8050061T_pred.pdf",
)

In [14]:
# Mean experimental track over rs8050061 sequences with base "T" at index 500.
plot_side(
    t_expt_mean,
    ylim=[-0.5, 0.25],
    yticks=[0, 0.25],
    pic_name="img/model_fold_7_rs8050061T_expt.pdf",
)

In [15]:
# Mean experimental track over rs8050061 sequences with base "Y" (IUPAC C/T) at index 500.
plot_side(
    ct_expt_mean,
    ylim=[-1, 0.5],
    yticks=[0, 0.5],
    pic_name="img/model_fold_7_rs8050061CT_expt.pdf",
)

In [16]:
# Mean predicted track over rs8050061 sequences with base "Y" (IUPAC C/T) at index 500.
# NOTE(review): the top ytick (0.5) is far below the ylim top (3), unlike the
# experimental plots where they coincide — confirm this is intended.
plot_side(
    ct_pred_mean,
    ylim=[-6, 3],
    yticks=[0, 0.5],
    pic_name="img/model_fold_7_rs8050061CT_pred.pdf",
)