In [12]:
import numpy as np
import pyfastx
from scipy.stats import pearsonr
import os
import h5py
from utils import plot_side

In [5]:
# PRINT COMMANDS TO DOWNLOAD AND EXTRACT DATA

# Set SCRATCH to where you want to download data to
SCRATCH = "/Users/adamhe/github/scratch"

URL = "https://zenodo.org/records/10597358/files"
TAR = "example_tracks_and_deepshap.tar.gz"
print(f"wget {URL}/{TAR} -P {SCRATCH}")
print(f"tar -cvzf {SCRATCH}/{TAR}")

wget https://zenodo.org/records/10597358/files/example_tracks_and_deepshap.tar.gz -P /Users/adamhe/github/scratch
tar -cvzf /Users/adamhe/github/scratch/example_tracks_and_deepshap.tar.gz


In [6]:
# Load data

# Experimental
y = np.load(
    os.path.join(SCRATCH, "example_tracks_and_deepshap/concat_procap.npz")
)["arr_0"][:, np.r_[250:750, 1250:1750]]

# Predicted
tracks = h5py.File(os.path.join(SCRATCH, "example_tracks_and_deepshap/fold_1_examples_prediction.h5"))["track"]
quantity = h5py.File(os.path.join(SCRATCH, "example_tracks_and_deepshap/fold_1_examples_prediction.h5"))["quantity"]
y_norm = tracks / np.array(tracks).sum(axis=1, keepdims=True)
y_pred_scaled = y_norm * quantity

  y_norm = tracks / np.array(tracks).sum(axis=1, keepdims=True)


In [7]:
# Divide individuals by genotype:

fasta = pyfastx.Fasta(os.path.join(SCRATCH, "example_tracks_and_deepshap/concat_sequence.fna.gz"))
seq_coords = [seq.name.split("_")[-1] for seq in fasta]

rs185220_coord = "chr5:56909030-56910029"
rs185220_seqs = [i for i in range(len(fasta)) if seq_coords[i] == rs185220_coord]

a_pred = [y_pred_scaled[i, :] for i in rs185220_seqs if fasta[i].seq[500] == "A"]
a_expt = [y[i, :] for i in rs185220_seqs if fasta[i].seq[500] == "A"]
ag_pred = [y_pred_scaled[i, :] for i in rs185220_seqs if fasta[i].seq[500] == "R"]
ag_expt = [y[i, :] for i in rs185220_seqs if fasta[i].seq[500] == "R"]
g_pred = [y_pred_scaled[i, :] for i in rs185220_seqs if fasta[i].seq[500] == "G"]
g_expt = [y[i, :] for i in rs185220_seqs if fasta[i].seq[500] == "G"]

# Get mean per genotype

a_pred_mean = np.mean(np.array(a_pred), axis=0)
a_expt_mean = np.mean(np.array(a_expt), axis=0)
ag_pred_mean = np.mean(np.array(ag_pred), axis=0)
ag_expt_mean = np.mean(np.array(ag_expt), axis=0)
g_pred_mean = np.mean(np.array(g_pred), axis=0)
g_expt_mean = np.mean(np.array(g_expt), axis=0)

In [13]:
plot_side(a_pred_mean, ylim=[-1.5, 3], yticks=[0, 3], pic_name="img/model_fold_1_rs185220A_pred.pdf")

In [14]:
plot_side(a_expt_mean, ylim=[-1.5, 3], yticks=[0, 3], pic_name="img/model_fold_1_rs185220A_expt.pdf")

In [15]:
plot_side(g_pred_mean, ylim=[-1.5, 3], yticks=[0, 3], pic_name="img/model_fold_1_rs185220G_pred.pdf")

In [13]:
plot_side(g_expt_mean, ylim=[-1.5, 3], yticks=[0, 3], pic_name="img/model_fold_1_rs185220G_expt.pdf")

In [14]:
plot_side(ag_expt_mean, ylim=[-1.5, 3], yticks=[0, 3], pic_name="img/model_fold_1_rs185220AG_expt.pdf")

In [15]:
plot_side(ag_pred_mean, ylim=[-1.5, 3], yticks=[0, 3], pic_name="img/model_fold_1_rs185220AG_pred.pdf")