In [1]:
from __future__ import print_function

from optparse import OptionParser
import json
import os

# Restrict OpenBLAS to a single thread. This is set before numpy is imported
# further down, presumably so the limit is in place when the BLAS backend is
# loaded — TODO confirm.
os.environ["OPENBLAS_NUM_THREADS"] = "1"

import pickle
import random

import numpy as np
import pandas as pd
import pysam

import tensorflow as tf

# On TensorFlow 1.x, switch on eager execution explicitly.
if tf.__version__[0] == "1":
    tf.compat.v1.enable_eager_execution()
# Ask TensorFlow which GPU devices it can see; an empty list is printed when
# none are available.
gpus = tf.config.experimental.list_physical_devices("GPU")

print(gpus)

from basenji import seqnn, stream, dna_io

from akita_utils.utils import ut_dense
from akita_utils.seq_gens import (
    symmertic_insertion_seqs_gen,
    reference_seqs_gen,
)
from akita_utils.utils import split_df_equally
from akita_utils.h5_utils import initialize_stat_output_h5, write_stat_metrics_to_h5

2023-10-12 08:59:11.641504: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home1/smaruj/software/GSL/lib:/home1/smaruj/software/HTSLIB/lib
2023-10-12 08:59:11.641552: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


[]


2023-10-12 08:59:13.979647: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home1/smaruj/software/GSL/lib:/home1/smaruj/software/HTSLIB/lib
2023-10-12 08:59:13.979694: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-10-12 08:59:13.979742: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (b02-07.hpc.usc.edu): /proc/driver/nvidia/version does not exist


In [2]:
# Input files: model parameters (JSON), the model file to restore, and the
# tab-separated table of insertion experiments.
# NOTE: the head and model indices are parsed from the text of `model_file`
# in a later cell, so keep the ".../f<model>c0/.../model<head>_best.h5" layout.
params_file = "/project/fudenber_735/tensorflow_models/akita/v2/models/f1c0/train/params.json" 
model_file = "/project/fudenber_735/tensorflow_models/akita/v2/models/f1c0/train/model1_best.h5"
motif_file = "./ctcf_tsv/mismatched_dot.tsv"

In [3]:
# Output directory; created if missing and later passed to
# initialize_stat_output_h5.
out_dir = "/scratch2/smaruj/miniTEST_dots"

In [4]:
# Create the output directory together with any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of isdir() followed by mkdir().
os.makedirs(out_dir, exist_ok=True)

In [5]:
# Genome FASTA that is opened with pysam further down.
genome_fasta = "/project/fudenber_735/genomes/mm10/mm10.fa"

In [6]:
# Prediction settings: batch size, reverse-complement flag and the list of
# sequence shifts that are later passed to build_ensemble.
batch_size = 4
rc = False
shifts = [int(token) for token in "0".split(",")]

# Indices are read straight from the model path:
#   head index  -> first character after the last "model"
#   model index -> last character before the first "c0"
head_index = int(model_file.rpartition("model")[2][0])
model_index = int(model_file.partition("c0")[0][-1])

# Background sequences matching the selected model index.
background_file = f"/project/fudenber_735/tensorflow_models/akita/v2/analysis/mouse_backgrounds/m{model_index}_background_seqs.fa"

# Fixed seed for Python's random module.
random.seed(44)

In [7]:
#################################################################
# read parameters and targets

# Load the parameter JSON and split it into its training and model sections.
with open(params_file) as params_open:
    params = json.load(params_open)
params_train, params_model = params["train"], params["model"]

# An explicitly configured batch size wins; fall back to the training value
# only when none was set.
if batch_size is None:
    batch_size = params_train["batch_size"]

# if targets_file is not None:
#     targets_df = pd.read_csv(targets_file, sep="\t", index_col=0)
#     target_ids = targets_df.identifier
#     target_labels = targets_df.description

#################################################################
# load model
# Build the network from the "model" section of the parameter file.
seqnn_model = seqnn.SeqNN(params_model)
# Restore from model_file, selecting the head parsed from the file name.
seqnn_model.restore(model_file, head_i=head_index)
# Configure the ensemble with the reverse-complement flag and shift list set
# in the configuration cell.
seqnn_model.build_ensemble(rc, shifts)
# Input sequence length as declared in the model parameters.
seq_length = int(params_model["seq_length"])

# dummy target info
# if targets_file is None:
#     num_targets = seqnn_model.num_targets()
#     target_ids = [
#         ti for ti in range(num_targets)
#     ]  # checkpoint? to be sure that the length of the given targets_file is compatible with the requested head?
#     target_labels = [""] * len(target_ids)

2023-10-12 08:59:34.380046: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 sequence (InputLayer)          [(None, 1310720, 4)  0           []                               
                                ]                                                                 
                                                                                                  
 stochastic_reverse_complement   ((None, 1310720, 4)  0          ['sequence[0][0]']               
 (StochasticReverseComplement)  , ())                                                             
                                                                                                  
 stochastic_shift (StochasticSh  (None, 1310720, 4)  0           ['stochastic_reverse_complement[0
 ift)                                                            ][0]']                     

In [8]:
# Load the tab-separated experiment table; its row count is used as the
# number of experiments below.
seq_coords_df = pd.read_csv(motif_file, sep="\t")

In [9]:
# One experiment per row of the coordinates table.
# Warning: this is not the number of predictions, which is this number x5 or x6.
num_experiments = seq_coords_df.shape[0]

print("===================================")
print("Number of experiements = ", num_experiments)

# Open the genome FASTA; the handle stays open for the rest of the notebook
# and is closed in the final cell.
genome_open = pysam.Fastafile(genome_fasta)

# Load the background sequences: every non-header line of the background
# FASTA is one-hot encoded and kept in file order.
# NOTE(review): this treats each line as a complete sequence, i.e. it assumes
# the FASTA is not line-wrapped — confirm against the background file.
background_seqs = []
with open(background_file, "r") as f:
    # Iterate the file lazily instead of reading every line into memory first.
    for line in f:
        seq = line.strip()
        # Skip FASTA headers, and blank lines (e.g. a trailing newline), which
        # would otherwise be appended as an empty sequence.
        if not seq or ">" in seq:
            continue
        background_seqs.append(dna_io.dna_1hot(seq))
        
# Largest background index referenced by the experiment table.
num_insert_backgrounds = seq_coords_df["background_index"].max()

# background_index is used directly as a 0-based position into the reference
# prediction stream, so the largest index must be strictly smaller than the
# number of loaded backgrounds (hence "<=", not "<").
if len(background_seqs) <= num_insert_backgrounds:
    raise ValueError(
        "must provide a background file with at least as many "
        "backgrounds as those specified in the insert seq_coords tsv."
        f"\nThe provided background file has {len(background_seqs)} sequences."
    )

Number of experiements =  540


In [10]:
import h5py

In [11]:
# Names of the statistics to compute; passed both to the output initialiser
# and to write_stat_metrics_to_h5.
stats = ["SCD", "dot-score", "cross-score", "x-score"]

In [12]:
#################################################################
# setup output

# Create the statistics output container from the output directory, model
# file, requested statistics and the experiment table.
stat_h5_outfile = initialize_stat_output_h5(
    out_dir, model_file, stats, seq_coords_df
)

print("stat_h5_outfile initialized")


stat_h5_outfile initialized


In [13]:
# Sanity check: list every entry of the output file together with its shape.
for key in stat_h5_outfile:
    print(key, stat_h5_outfile[key].shape)

SCD_h1_m1 (540, 6)
background_index (540,)
boundary_end (540,)
boundary_index (540,)
boundary_start (540,)
chrom (540,)
cross-score_h1_m1 (540, 6)
dot-score_h1_m1 (540, 6)
end (540,)
exp_id (540,)
flank_bp (540,)
num_ctcf (540,)
orientation (540,)
seq_id (540,)
spacer_bp (540,)
span (540,)
start (540,)
strand (540,)
x-score_h1_m1 (540, 6)


In [14]:
# Number of background sequences that were loaded.
num_backgrounds = len(background_seqs)

# Prediction stream over the reference (background) sequences; indexed by
# background index when scoring each experiment.
refs_stream = stream.PredStreamGen(
    seqnn_model,
    reference_seqs_gen(background_seqs),
    batch_size,
)

# for background_index in range(num_backgrounds):
#     bg_prediction = refs_stream[background_index]
    
#     # save maps for background sequences
#     write_maps_to_h5(bg_prediction,
#                     h5_outfile,
#                     background_index,
#                     head_index,
#                     model_index,
#                     reference=True
#     )
#     print(f"reference {background_index} saved")

In [None]:
# h5_outfile["refmap_h1_m1"][0,:,0].shape

In [15]:
from akita_utils.stats_utils import calculate_scores

In [16]:
# initialize predictions stream for alternate (ctcf-inserted) sequences
preds_stream = stream.PredStreamGen(
    seqnn_model,
    symmertic_insertion_seqs_gen(
        seq_coords_df, background_seqs, genome_open
    ),
    batch_size,
)

for exp_index in range(num_experiments):
    # get predictions
    preds_matrix = preds_stream[exp_index]
    background_index = seq_coords_df.iloc[exp_index].background_index
    ref_matrix = refs_stream[background_index]

    write_stat_metrics_to_h5(preds_matrix,
                            ref_matrix,
                            stat_h5_outfile,
                            exp_index,
                            head_index,
                            model_index,
                            diagonal_offset=2,
                            stat_metrics=stats
                        )
    
    # save maps
    # write_maps_to_h5(preds_matrix,
    #                 h5_outfile,
    #                 altseq_index,
    #                 head_index,
    #                 model_index,
    #                 reference=False
    # )
  



In [19]:
# Spot-check: first row of the stored cross-score dataset. The "h1_m1" suffix
# presumably corresponds to head_index / model_index — verify if those change.
stat_h5_outfile["cross-score_h1_m1"][0,]

array([ 7.9930e-05,  6.5899e-04, -1.0884e-04, -6.0022e-05,  2.7537e-04,
        2.4390e-04], dtype=float16)

In [20]:
# Release the genome FASTA handle and close the statistics output file.
genome_open.close()
stat_h5_outfile.close()