In [None]:
# Use the fine-tuned Enformer model to compute pooled and base-level contribution scores for the CD69 locus.
# Code taken/adapted from the Enformer authors: https://github.com/deepmind/deepmind-research/blob/master/enformer/enformer-usage.ipynb
# Corresponds to the gradient tracks in Fig. S1 and Fig. 3D.

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import joblib
import gzip
import kipoiseq
from kipoiseq import Interval
import pyfaidx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import sys
from deeplift import dinuc_shuffle
sys.path.append('../../../enformer_fine_tuning')
import enformer_nomod as enformer
import sonnet as snt
import tensorflow_addons as tfa
from tensorflow import strings as tfs

2023-02-17 17:22:42.068095: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-17 17:22:42.249024: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-17 17:22:42.249051: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-17 17:22:43.179696: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [2]:
# Reference genome FASTA, given as a path relative to this notebook.
fasta_file = "../../../reference_files/hg38_erccpatch.fa"
# Construct a pyfaidx index object for the FASTA; as the cell's last
# expression, its repr is what the cell displays.
# NOTE(review): presumably this also creates the .fai index when it is
# missing — confirm against the pyfaidx documentation.
pyfaidx.Faidx(fasta_file)

Faidx("../../../reference_files/hg38_erccpatch.fa")

In [3]:
class FastaStringExtractor:
    """Fetch upper-case reference sequence for an interval, padding overhang with 'N'."""

    def __init__(self, fasta_file):
        # Open the FASTA and remember the length of every record it contains.
        self.fasta = pyfaidx.Fasta(fasta_file)
        self._chromosome_sizes = {
            name: len(record) for name, record in self.fasta.items()
        }

    def extract(self, interval: Interval, **kwargs) -> str:
        """Return the sequence covered by ``interval`` as an upper-case string.

        Portions of the interval that lie before position 0 or beyond the end
        of the chromosome are filled with 'N'. Extra keyword arguments are
        accepted and ignored.
        """
        chrom_len = self._chromosome_sizes[interval.chrom]

        # Clip the requested window to the chromosome bounds.
        clipped = Interval(
            interval.chrom,
            max(interval.start, 0),
            min(interval.end, chrom_len),
        )

        # pyfaidx expects a 1-based start, hence the +1 on the 0-based start.
        record = self.fasta.get_seq(clipped.chrom, clipped.start + 1, clipped.stop)
        bases = str(record.seq).upper()

        # N padding for whatever part of the request fell outside the chromosome.
        n_left = max(-interval.start, 0)
        n_right = max(interval.end - chrom_len, 0)
        return 'N' * n_left + bases + 'N' * n_right

    def close(self):
        """Close the underlying FASTA handle."""
        return self.fasta.close()

def one_hot_encode(sequence):
    """One-hot encode a DNA string with kipoiseq and return it as float32."""
    encoded = kipoiseq.transforms.functional.one_hot_dna(sequence)
    return encoded.astype(np.float32)

In [4]:
#with strategy.scope():

#SEQUENCE_LENGTH = 196608

class Enformer:
    """Wrapper around the fine-tuned Enformer model restored from a checkpoint.

    Exposes batch prediction and an input-gradient based contribution score.
    """

    # Checkpoint directory used when none is passed to the constructor.
    DEFAULT_CHECKPOINT_DIR = "gs://picard-testing-176520/be_paper_finetuning//models/enformer_fine_tuning_230119_LR15e-05_LR20.001_WD15e-07_WD25e-07_WD25e-07_enformer_fine_tuning_230119/final"

    def __init__(self, checkpoint_dir=None):
        """Build the model and restore the latest checkpoint found in a directory.

        Args:
            checkpoint_dir: Directory searched with ``tf.train.latest_checkpoint``.
                Defaults to ``DEFAULT_CHECKPOINT_DIR``.

        Raises:
            FileNotFoundError: If no checkpoint is found in ``checkpoint_dir``.
            AssertionError: If objects in the model are not matched by the
                checkpoint (raised by ``assert_existing_objects_matched``).
        """
        if checkpoint_dir is None:
            checkpoint_dir = self.DEFAULT_CHECKPOINT_DIR

        model = enformer.Enformer()
        options = tf.train.CheckpointOptions(experimental_io_device="/job:localhost")
        checkpoint = tf.train.Checkpoint(module=model)

        latest = tf.train.latest_checkpoint(checkpoint_dir)
        if latest is None:
            # Fail here with the offending path instead of a generic assertion
            # from the restore status further down.
            raise FileNotFoundError(
                "No checkpoint found in directory: {}".format(checkpoint_dir))

        checkpoint.restore(latest, options).assert_existing_objects_matched()
        self._model = model

    def predict_on_batch(self, inputs):
        """Return the wrapped model's ``predict_on_batch`` output for ``inputs``."""
        return self._model.predict_on_batch(inputs)

    @tf.function
    def contribution_input_grad(self, input_sequence,
                                target_mask, track_index):
        """Compute gradient-times-input contribution scores for one sequence.

        The objective is the mask-weighted sum of the model prediction divided
        by the sum of the mask.

        Args:
            input_sequence: Single un-batched input; a batch axis is added here.
                (The caller in this notebook passes a one-hot array whose
                batched shape prints as (1, 196608, 4).)
            target_mask: Weights multiplied element-wise with the prediction;
                must have a non-zero sum, otherwise the objective is 0/0.
            track_index: Currently unused; the track is selected through the
                non-zero entries of ``target_mask``.

        Returns:
            A tuple ``(scores, grad)`` where ``scores`` is gradient * input
            summed over the last axis (batch axis removed) and ``grad`` is the
            raw gradient with the batch axis still present.
        """
        input_sequence = input_sequence[tf.newaxis]

        target_mask_mass = tf.reduce_sum(target_mask)
        with tf.GradientTape() as tape:
            # The input is not a variable, so it has to be watched explicitly.
            tape.watch(input_sequence)
            pred = self._model.predict_on_batch(input_sequence)

            prediction = tf.reduce_sum(
                target_mask[tf.newaxis] * pred) / target_mask_mass

        grad = tape.gradient(prediction, input_sequence)
        input_grad = grad * input_sequence
        input_grad = tf.squeeze(input_grad, axis=0)

        return tf.reduce_sum(input_grad, axis=-1), grad

# @title `variant_centered_sequences`
#with strategy.scope():
# NOTE(review): this class repeats the FastaStringExtractor defined in an
# earlier cell of this notebook; once this cell runs, this definition is the
# one bound to the name.
class FastaStringExtractor:
    """Read upper-case sequence for an interval from a FASTA, N-padding out-of-range parts."""

    def __init__(self, fasta_file):
        self.fasta = pyfaidx.Fasta(fasta_file)
        # Map each record name to its length for bounds checks in extract().
        self._chromosome_sizes = {
            name: len(record) for name, record in self.fasta.items()
        }

    def extract(self, interval: Interval, **kwargs) -> str:
        """Return the upper-case sequence for ``interval``.

        Coordinates below 0 or above the chromosome length are clipped before
        the lookup and replaced by 'N' in the result. ``kwargs`` are ignored.
        """
        size = self._chromosome_sizes[interval.chrom]
        lower = max(interval.start, 0)
        upper = min(interval.end, size)
        in_bounds = Interval(interval.chrom, lower, upper)

        # pyfaidx expects a 1-based start coordinate.
        fetched = self.fasta.get_seq(
            in_bounds.chrom, in_bounds.start + 1, in_bounds.stop)
        sequence = str(fetched.seq).upper()

        upstream_ns = 'N' * max(-interval.start, 0)
        downstream_ns = 'N' * max(interval.end - size, 0)
        return upstream_ns + sequence + downstream_ns

    def close(self):
        """Close the FASTA file opened by the constructor."""
        return self.fasta.close()

# NOTE(review): same function as the one_hot_encode defined in an earlier cell.
def one_hot_encode(sequence):
    """Return the kipoiseq one-hot encoding of ``sequence`` cast to float32."""
    one_hot_dna = kipoiseq.transforms.functional.one_hot_dna
    return one_hot_dna(sequence).astype(np.float32)


def importance_scores(chrom, start, stop, target_index, mask_indices,
                      sequence_length=196608, pool_width=128):
    """Compute gradient-times-input contribution scores around a genomic interval.

    The interval ``chrom:start-stop`` is resized to ``sequence_length`` bp,
    one-hot encoded and passed through the module-level ``model``. The
    objective differentiated is the mean prediction of track ``target_index``
    over the output bins listed in ``mask_indices``.

    Relies on the module-level names ``model`` and ``fasta_extractor``.

    Args:
        chrom: Chromosome name.
        start: Interval start (cast to int).
        stop: Interval end (cast to int).
        target_index: Column of the prediction matrix to attribute.
        mask_indices: Non-empty iterable of output-bin (row) indices.
        sequence_length: Length the interval is resized to before extraction.
        pool_width: Window and stride of the average pooling applied to the
            absolute per-base scores.

    Returns:
        Tuple of ``(resized_interval, contribution_scores,
        pooled_contribution_scores, base_scores, grad, sequence_one_hot)``:
        the per-base scores, their absolute values average-pooled, the scores
        placed on the one-hot matrix, the squeezed raw gradient and the
        one-hot input itself.

    Raises:
        ValueError: If ``mask_indices`` is empty (the objective would be 0/0).
    """
    mask_rows = list(mask_indices)
    if not mask_rows:
        raise ValueError("mask_indices must contain at least one output bin index")

    target_interval = kipoiseq.Interval(chrom, int(start), int(stop))
    resized_interval = target_interval.resize(sequence_length)
    sequence_one_hot = one_hot_encode(fasta_extractor.extract(resized_interval))

    # Shape diagnostics: batched input, then the per-sequence prediction matrix.
    print(sequence_one_hot[np.newaxis].shape)
    predictions = model.predict_on_batch(sequence_one_hot[np.newaxis])[0]
    print(predictions.shape)

    # 1 at every (bin, track) entry that contributes to the objective.
    target_mask = np.zeros_like(predictions)
    for idx in mask_rows:
        target_mask[idx, target_index] = 1

    # This will take some time since tf.function needs to get compiled.
    contribution_scores, grad = model.contribution_input_grad(
        sequence_one_hot.astype(np.float32), target_mask, target_index)
    contribution_scores = contribution_scores.numpy()

    # Average |score| over consecutive windows of pool_width bases.
    pooled_contribution_scores = tf.nn.avg_pool1d(
        np.abs(contribution_scores)[np.newaxis, :, np.newaxis],
        pool_width, pool_width, 'VALID')[0, :, 0].numpy()

    # Put each position's score on the channel of the observed base; the
    # per-position vector broadcasts across the rows of the transposed one-hot.
    base_scores = (sequence_one_hot.T * contribution_scores).T

    return (resized_interval, contribution_scores, pooled_contribution_scores,
            base_scores, np.squeeze(grad), sequence_one_hot)


In [5]:
def write_out_bedgraph_pooled(pooled_contribution_scores, interval, filename_base,
                              bin_size=128):
    """Write pooled scores as bedGraph lines to ``<filename_base>.pooled.bedGraph``.

    Value ``k`` is written for the window
    ``[interval.start + k * bin_size, interval.start + (k + 1) * bin_size)``
    as a tab-separated ``chrom, start, end, value`` line.

    Args:
        pooled_contribution_scores: Iterable of one value per bin.
        interval: Object exposing ``chrom`` and ``start`` attributes.
        filename_base: Output path without the ``.pooled.bedGraph`` suffix.
        bin_size: Width of each bin in bases.
    """
    chrom = str(interval.chrom)
    start = interval.start

    # The context manager closes the file even if a write fails part-way.
    with open(filename_base + '.pooled.bedGraph', 'w') as out_file:
        for k, value in enumerate(pooled_contribution_scores):
            bin_start = k * bin_size + start
            bin_end = (k + 1) * bin_size + start
            fields = [chrom, str(bin_start), str(bin_end), str(value)]
            out_file.write('\t'.join(fields) + '\n')
    
def write_out_bedgraph_all(contribution_scores, interval, filename_base):
    """Write per-base scores as bedGraph lines to ``<filename_base>.all.bedGraph``.

    Value ``k`` is written for the single-base window
    ``[interval.start + k, interval.start + k + 1)`` as a tab-separated
    ``chrom, start, end, value`` line.

    Args:
        contribution_scores: Iterable of one value per base.
        interval: Object exposing ``chrom`` and ``start`` attributes.
        filename_base: Output path without the ``.all.bedGraph`` suffix.
    """
    chrom = str(interval.chrom)
    start = interval.start

    # The context manager closes the file even if a write fails part-way.
    with open(filename_base + '.all.bedGraph', 'w') as out_file:
        for k, value in enumerate(contribution_scores):
            base_start = start + k
            base_end = start + k + 1
            fields = [chrom, str(base_start), str(base_end), str(value)]
            out_file.write('\t'.join(fields) + '\n')

In [6]:
## Instantiate the Enformer wrapper class defined above; its constructor
## restores the fine-tuned checkpoint.
#with strategy.scope():
model = Enformer()


## center interval at the RE-4 boundaries, corresponding to 9,764,556 - 9,765,505
## NOTE(review): chrom/start/end are not referenced by the later cells visible
## here, which pass literal coordinates to importance_scores — confirm whether
## these three variables are still needed.
chrom = "chr12"
start = 9764556
end = 9765505
#SEQUENCE_LENGTH=196608*2 # add 6 bp to allow for the +/- 0-3 bp shift
#target_length = 196608
## Module-level extractor; importance_scores reads it as a global.
fasta_extractor = FastaStringExtractor(fasta_file)

2023-02-17 17:22:45.183595: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-02-17 17:22:45.183632: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-02-17 17:22:45.183655: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (r-ubuntu): /proc/driver/nvidia/version does not exist
2023-02-17 17:22:45.183984: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# Output bins 442..454 (inclusive) make up the target mask.
mask_indices = list(range(442, 455))

# Track index 1 is labelled "Jurkat resting 1" by the original author.
out = importance_scores("chr12", 9764300, 9765900, 1, mask_indices)  # Jurkat resting 1
resized_int, scores, pooled, base_scores, grad, seq_one_hot = out

write_out_bedgraph_pooled(
    pooled, resized_int, 'enformer_ft_pooledscores_Jurkat_resting.bedGraph')
# The per-base track is written as absolute values.
write_out_bedgraph_all(
    np.abs(scores), resized_int, 'enformer_ft_scores_Jurkat_resting.bedGraph')

np.save("Jurkat_resting.basescores.npy", np.array(base_scores))

(1, 196608, 4)
(896, 3)


In [8]:
# Defined here as well so the cell does not depend on another cell's state.
mask_indices=[442,443,444,445,446,447,448,449,450,451,452,453,454]
# Track index 0; presumably the Jurkat activated track, going by the output
# file names — confirm against the fine-tuning track order.
out_2 = importance_scores("chr12", 9764300, 9765900,0, mask_indices)
resized_int_2, scores_2, pooled_2, base_scores_2,grad_2, seq_one_hot_2 = out_2
# Write with the interval returned by this call (resized_int_2), not the one
# left over from the resting cell.
write_out_bedgraph_pooled(pooled_2,
                          resized_int_2,
                          'enformer_ft_pooledscores_Jurkat_activated.bedGraph')
write_out_bedgraph_all(np.abs(scores_2),
                          resized_int_2,
                          'enformer_ft_scores_Jurkat_activated.bedGraph')

np.save("Jurkat_activated.basescores.npy", np.array(base_scores_2))

(1, 196608, 4)
(896, 3)


In [9]:
# Output bins 442..454 (inclusive) make up the target mask.
mask_indices = list(range(442, 455))

# Track index 2; written out under the "Jurkat_diff" file names below.
out_3 = importance_scores("chr12", 9764300, 9765900, 2, mask_indices)
resized_int_3, scores_3, pooled_3, base_scores_3, grad_3, seq_one_hot_3 = out_3

write_out_bedgraph_pooled(
    pooled_3, resized_int_3, 'enformer_ft_pooledscores_Jurkat_diff.bedGraph')
# Unlike the resting/activated cells, the per-base scores are written signed
# (no np.abs).
write_out_bedgraph_all(
    scores_3, resized_int_3, 'enformer_ft_scores_Jurkat_diff.bedGraph')

np.save("Jurkat_diff.basescores.npy", np.array(base_scores_3))

(1, 196608, 4)
(896, 3)


In [None]:
# NOTE(review): looks like a leftover scratch cell (no execution count) —
# consider removing it from the final notebook.
print('test')