In [2]:
import torch
import os
import datetime

%matplotlib inline

from selene_sdk.utils import load_path
from selene_sdk.utils import parse_configs_and_run
from selene_sdk.utils import DeeperDeepSEA
from selene_sdk.predict._common import predict
from selene_sdk.utils import load_features_list

In [3]:
import sys

import numpy as np
import pandas as pd
import tensorflow as tf
import h5py
from scipy.io import loadmat

In [4]:
# Add the local project directory to the module search path (at index 1),
# presumably so that the PhDeep package imported below can be found.
sys.path.insert(1,'/home/tt419/Projects/DeepLearning/')
import PhDeep.config.log_conf as log_conf
import PhDeep.models.deepsea_fyr.model as fyr
from PhDeep.data_loader.deepsea.data_loader import DataManager

In [5]:
# Feature-name lists, one per model; the numeric suffix follows the naming of
# the corresponding model/data variables used throughout this notebook.
features_504 = load_features_list(
    "/rds-d5/project/who1000/rds-who1000-wgs10k/user/tt419/Epigenome/"
    "output_encode_fold/encode_meta1_features.txt"
)
features_919 = load_features_list(
    "/rds-d5/user/tt419/hpc-work/data-storage/"
    "DeepSea_orig_data/deepsea_train/label_names.txt"
)
features_1846 = load_features_list(
    "/rds-d5/project/who1000/rds-who1000-wgs10k/user/tt419/Epigenome/"
    "Selene_data/selene_ftp_output/distinct_1846_features.txt"
)

In [6]:
# Train / test / validation .mat files for the 919 data set; all three live in
# the same deepsea_train directory.
train_919 = (
    "/home/tt419/Projects/DeepLearning/"
    "DeepSea_data/deepsea_train/train.mat"
)
test_919 = (
    "/home/tt419/Projects/DeepLearning/"
    "DeepSea_data/deepsea_train/test.mat"
)
valid_919 = (
    "/home/tt419/Projects/DeepLearning/"
    "DeepSea_data/deepsea_train/valid.mat"
)

# Sorted, gzipped BED files for the 504 and 1846 data sets.
data_504 = (
    "/rds-d5/project/who1000/rds-who1000-wgs10k/user/tt419/Epigenome/"
    "output_encode_fold/encode_meta1_sorted.bed.gz"
)
data_1846 = (
    "/rds-d5/project/who1000/rds-who1000-wgs10k/user/tt419/Epigenome/"
    "Selene_data/selene_ftp_output/sorted_selene_fullFeatures.bed.gz"
)


In [7]:
# Best checkpoints of the two PyTorch models (.pth.tar).
model_504_path = (
    "/rds-d5/project/who1000/rds-who1000-wgs10k/user/tt419/Epigenome/"
    "output_encode_fold/logs/online_sampler_outputs_503_v1_r1/best_model.pth.tar"
)
model_1846_path = (
    "/rds-d5/project/who1000/rds-who1000-wgs10k/user/tt419/Epigenome/"
    "Selene_data/selene_ftp_output/logs/online_sampler_outputs_base/best_model.pth.tar"
)

# The 919 checkpoint has a different file type (.h5) because that model was
# run with TensorFlow instead of PyTorch.
model_919_path = (
    "/rds-d5/user/tt419/hpc-work/data-storage/"
    "PhDeep_logs/DeepSea_ckpt/modelCheckpoint-0.05066-4-99.h5"
)

In [None]:
# FASTA file of the hg19 reference genome.
ref_hg19 = (
    "/rds-d5/project/who1000/rds-who1000-wgs10k/user/tt419/Epigenome/"
    "Selene_data/selene_ftp_output/male.hg19.fasta"
)

# Load model architectures for Selene models

In [8]:
from selene_sdk.utils import NonStrandSpecific

# Build one NonStrandSpecific-wrapped DeeperDeepSEA per Selene model. Both are
# created with 2000 as the first argument and differ only in the second
# (504 vs. 1846); the 504 variant is constructed first.
model_arch_504, model_arch_1846 = (
    NonStrandSpecific(DeeperDeepSEA(2000, n_outputs))
    for n_outputs in (504, 1846)
)


In [9]:
from selene_sdk.predict import AnalyzeSequences
from selene_sdk.utils import load_features_list

# One AnalyzeSequences instance per Selene model. Each pairs an architecture
# with its checkpoint path and feature list; both use sequence_length=2000 and
# use_cuda=False. The 504 analyzer is constructed first.
analysis_504, analysis_1846 = (
    AnalyzeSequences(
        architecture,
        checkpoint_path,
        sequence_length=2000,
        features=feature_names,
        use_cuda=False)
    for architecture, checkpoint_path, feature_names in (
        (model_arch_504, model_504_path, features_504),
        (model_arch_1846, model_1846_path, features_1846),
    )
)

# Load model for PhDeep

In [10]:
# Notebook switches; `load` decides whether a checkpoint gets restored below.
local = False
load = True

# Log the device each TensorFlow op is placed on.
tf.debugging.set_log_device_placement(True)

# Query the visible GPUs once and derive the count from that list.
avail_gpus = tf.config.list_physical_devices('GPU')
no_avail_gpus = len(avail_gpus)
print("Num GPUs Available: {} ;\n namely: {}".format(no_avail_gpus, avail_gpus))

# Turn on memory growth for every GPU that was found.
for gpu in avail_gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# PhDeep objects: logging/checkpoint configuration, model wrapper, data manager.
lc = log_conf.LogConfig(
    "/home/tt419/Projects/DeepLearning/PhDeep/logs/",
    "DeepSea_ckpt/",
)
mm = fyr.DeepSeaModel(sub_version=0.0, log_config=lc)
dm = DataManager()

Num GPUs Available: 0 ;
 namely: []


In [11]:
# Stays None when `load` is switched off.
load_fp = None

if load:
    lc.set_best_checkpoint(mm.MODEL_VERSION)
    if lc.BEST_CHECKPOINT != -1:
        # A best checkpoint was found: restore the model from it.
        load_fp = lc.CHECKPOINT_DIR + lc.BEST_CHECKPOINT
        mm.load_model(fp=load_fp)
        print(f"Model loaded from {load_fp}")
    else:
        # BEST_CHECKPOINT == -1: build a fresh model and hand the fixed
        # 919 checkpoint path to compile_model instead.
        mm.build_model()
        load_fp = model_919_path
        print("Load failed, no checkpoint found")

# multi_gpu is only enabled when more than one GPU is available.
model = mm.compile_model(multi_gpu=no_avail_gpus > 1, load_fp=load_fp)
model.summary()

set_best_checkpoint -> val: -1
Executing op RandomUniform in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Sub in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Mul in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Add in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarIsInitializedOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op LogicalNot in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Assert in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op AssignVariableOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op Fill in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in device /job:localhost/replica:0/task:0/device:CPU:0
Executing op VarHandleOp in devi

# Load test data
This data can be used for prediction and passed to the "visualize_roc_curves" function in utils/performance_metrics.py.

In [None]:
# Create a data manager and read its validation data (judging by the method
# name); the result is unpacked into val_x and val_y.
dm = DataManager()
val_x, val_y = dm.read_val_data()

### Region selection
It is open to debate whether all models should be benchmarked on the same regions, or only on the same chromosomes (usually 6 & 7 or 8 & 9).

In [None]:
from selene_sdk.sequences import Genome
from selene_sdk.samplers import OnlineSampler

# Reference genome object built from the hg19 FASTA path.
ref_hg19_seq = Genome(ref_hg19)

# Directory that receives the saved "validate" samples. The original call
# passed an empty list here, which is not a path.
# NOTE(review): placeholder location under the working directory - point this
# at the intended output folder.
os_504_output_dir = os.path.join(os.getcwd(), "online_sampler_504_validate")

# Sampler over the 504 data set in "validate" mode with 2000 bp sequences.
# Fixes relative to the original cell: the feature list is `features_504`
# (the name `feature_504` is not defined), the call is now closed, and
# `output_dir` is a directory path.
# NOTE(review): selene documents OnlineSampler as an abstract base class; a
# concrete subclass (e.g. RandomPositionsSampler or IntervalsSampler) may be
# needed here - confirm against the installed selene_sdk version.
os_504 = OnlineSampler(
    ref_hg19_seq,
    data_504,
    features_504,
    sequence_length=2000,
    mode="validate",
    save_datasets=["validate"],
    output_dir=os_504_output_dir,
)

