# DeepBind: Sample Run on the `final` Dataset

# Import necessary packages

In [1]:
import os
# Move the working directory three levels up (presumably to the project root,
# so that the "./Data/..." and "./models/..." paths below resolve -- confirm).
# The path is relative: re-running this cell in the same kernel moves up
# another three levels.
os.chdir("../../../")
# Restrict CUDA to device 0.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
import exoNet
import scanpy as sc
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

Using TensorFlow backend.


# Data Loading and Preprocessing

In [3]:
# Dataset identifier; interpolated into the ./Data/{data_name}/ input paths
# and into the DeepBind model_path further down.
data_name = "final"

In [4]:
# Integer code per RNA symbol: 'N' maps to -1 and the bases A, C, G, U map to
# 0..3 in that order. Handed to exoNet.pep.seq_encoder later in the notebook.
char_encoder = {'N': -1}
char_encoder.update({base: code for code, base in enumerate("ACGU")})

In [5]:
# Read the design matrix of the selected dataset and index its rows by the
# 'id' column (the column itself is kept), then preview the first rows.
design_csv_path = f"./Data/{data_name}/{data_name}.design.mat.shortened.csv"
raw_df = pd.read_csv(design_csv_path).set_index('id', drop=False)
raw_df.head()

Unnamed: 0_level_0,id,seq,dotbracket,element_string,element_string_number
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
chr1_10140_10163_+_ev,chr1_10140_10163_+_ev,UGGGAUUGGGGAUUGGGAUUGGGA,........................,ffffffffffffffffffffffff,0
chr1_10320_10341_+_ev,chr1_10320_10341_+_ev,GGUUGGGAUUGGGGAUUGGGAU,......................,ffffffffffffffffffffff,0
chr1_10381_10400_+_ev,chr1_10381_10400_+_ev,AUUGGGAUUGGGGAUUGGGG,....................,ffffffffffffffffffff,0
chr1_10496_10524_+_ev,chr1_10496_10524_+_ev,GGCCCAGACUGGACUCCUCUUGACACGAG,((.(((...)))...))............,ssissshhhsssiiisstttttttttttt,11100011100000000262799360
chr1_10749_10778_+_ev,chr1_10749_10778_+_ev,GCGGCGCGGCCGCGUCCGCGUCUCUCCGCG,(((((((((......))))).....)))).,ssssssssshhhhhhsssssiiiiisssst,11111000000111111032537088


In [6]:
# Load the stored sequence array and keep only the first four entries of its
# last axis.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only use it on files from a trusted source.
sequences_path = f"./Data/{data_name}/sequences.npy"
encoded_sequences = np.load(sequences_path, allow_pickle=True)
data = encoded_sequences[:, :, :4]
data.shape

(80841, 50, 4)

In [7]:
# Load the stored labels and flatten them to one dimension.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; trusted files only.
labels_path = f"./Data/{data_name}/labels.npy"
labels = np.load(labels_path, allow_pickle=True).reshape(-1)
labels.shape

(80841,)

# Model Instantiation

In [8]:
# Build the DeepBind network. The input dimensions come from `data`
# (axis 1 -> seq_len, axis 2 -> n_channels); the model directory is derived
# from data_name.
deepbind_kwargs = dict(
    seq_len=data.shape[1],
    n_channels=data.shape[2],
    n_classes=2,
    padding='valid',
    use_batchnorm=True,
    lr=0.001,
    model_path=f"./models/DeepBind/{data_name}/",
    dropout_rate=0.1,
)
network = exoNet.models.DeepBind(**deepbind_kwargs)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
data (InputLayer)            (None, 50, 4)             0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 41, 32)            1280      
_________________________________________________________________
batch_normalization_1 (Batch (None, 41, 32)            128       
_________________________________________________________________
re_lu_1 (ReLU)               (None, 41, 32)            0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1, 32)             0         
_________________________________________________________________
flatten_1 (Flatten)  

In [18]:
# Encode the labels with the NO -> 0 / YES -> 1 mapping; the second return
# value of label_encoder is discarded.
labels_enc, _ = exoNet.utils.label_encoder(labels, {"NO": 0, "YES": 1})

In [19]:
# Flatten the encoded labels to one dimension so they can be multiplied
# element-wise with the 1-D argmax of the predictions below.
labels_enc = np.reshape(labels_enc, (-1,))

In [16]:
# NOTE(review): this import sits mid-notebook; it would be clearer in the
# import cell at the top.
import keras
# One-hot version of the encoded labels (two classes).
# NOTE(review): labels_one is not referenced by any other cell visible here.
labels_one = keras.utils.to_categorical(labels_enc, num_classes=2)

In [16]:
# Score every sequence; column 0 is later stored as 'no' and column 1 as 'yes'.
# NOTE(review): in file order this cell comes before the training cell, while
# the execution counts show it was run after it. A top-to-bottom run would
# therefore predict with an untrained network.
predictions = network.model.predict(data)

In [20]:
# Recall of class 1 over the whole dataset: samples that are both labelled 1
# and predicted 1, divided by all samples labelled 1. The 1e-3 term keeps the
# division defined when there are no positives.
predicted_class = np.argmax(predictions, axis=1)
true_positives = np.round(np.clip(labels_enc * predicted_class, 0, 1)).sum()
all_positives = np.round(np.clip(labels_enc, 0, 1)).sum()
true_positives / (all_positives + 1e-3)

0.8354590677641266

In [9]:
# Train on the full dataset with the raw labels and the NO/YES mapping.
# Up to 1000 epochs with batch size 512; early stopping watches val_loss with
# a patience of 10 epochs. save=True presumably persists the model under
# model_path -- confirm against exoNet.
network.train(data,
              labels, 
              le={"NO": 0, "YES": 1},
              n_epochs=1000,
              batch_size=512,
              early_stopping_kwargs={"patience": 10, "monitor": "val_loss"},
              save=True,
              )

Instructions for updating:
Use tf.cast instead.
Train on 64672 samples, validate on 16169 samples
Epoch 1/1000
 - 10s - loss: 0.6622 - acc: 0.6253 - sensitivity: 0.6288 - specificity: 0.6222 - val_loss: 0.9308 - val_acc: 0.5415 - val_sensitivity: 0.1018 - val_specificity: 0.9769
Epoch 2/1000
 - 2s - loss: 0.5669 - acc: 0.7119 - sensitivity: 0.7145 - specificity: 0.7103 - val_loss: 0.8088 - val_acc: 0.5858 - val_sensitivity: 0.2114 - val_specificity: 0.9566
Epoch 3/1000
 - 2s - loss: 0.5284 - acc: 0.7400 - sensitivity: 0.7611 - specificity: 0.7201 - val_loss: 0.7089 - val_acc: 0.6327 - val_sensitivity: 0.3294 - val_specificity: 0.9330
Epoch 4/1000
 - 2s - loss: 0.5024 - acc: 0.7591 - sensitivity: 0.7865 - specificity: 0.7328 - val_loss: 0.6312 - val_acc: 0.6807 - val_sensitivity: 0.4562 - val_specificity: 0.9033
Epoch 5/1000
 - 2s - loss: 0.4890 - acc: 0.7668 - sensitivity: 0.7947 - specificity: 0.7400 - val_loss: 0.5419 - val_acc: 0.7318 - val_sensitivity: 0.6162 - val_specificity: 0.8

In [73]:
# Persist the current weights in this dataset's model directory. The directory
# is derived from data_name (as model_path is when the network is built)
# instead of hard-coding "final", so the cell follows the dataset selected at
# the top of the notebook.
network.model.save_weights(f"./models/DeepBind/{data_name}/weights.h5")

In [14]:
# Restore previously saved weights from this dataset's model directory. The
# directory is derived from data_name (as model_path is when the network is
# built) instead of hard-coding "final".
network.model.load_weights(f"./models/DeepBind/{data_name}/weights.h5")

In [10]:
# os.makedirs('./models/DeepBind/final/', exist_ok=True)

In [10]:
# Persist the model through exoNet's own saver (presumably under the
# model_path given at construction -- confirm).
network.save_model()

In [21]:
# One row per sequence: its id plus the two prediction columns
# (column 0 -> 'no', column 1 -> 'yes'). Rows are matched by position, so
# raw_df and predictions must be in the same order.
no_scores = predictions[:, 0]
yes_scores = predictions[:, 1]
results = pd.DataFrame({
    'id': raw_df['id'].values,
    'no': no_scores,
    'yes': yes_scores,
})
results.head()

Unnamed: 0,id,no,yes
0,chr1_10140_10163_+_ev,0.06592,0.93408
1,chr1_10320_10341_+_ev,0.073277,0.926723
2,chr1_10381_10400_+_ev,0.030033,0.969967
3,chr1_10496_10524_+_ev,0.790471,0.209529
4,chr1_10749_10778_+_ev,0.653369,0.34663


In [61]:
# Export all predictions. The DataFrame index is written too (as an unnamed
# first column) because index=False is not passed.
# NOTE(review): absolute, machine-specific path; consider a configurable output directory.
results.to_csv("/media/pgdrive/sharif/exosomians/data/bams/deepbind.all.predictions.csv")

In [22]:
# Rows whose id contains 'ev', restricted to those at or above the 90th
# percentile of their 'yes' score.
# NOTE(review): str.contains matches 'ev' anywhere in the id, not only as a suffix.
ev_results = results[results['id'].str.contains('ev')]
yes_cutoff = ev_results['yes'].quantile(.9)
ev_extreme_results = ev_results[ev_results['yes'] >= yes_cutoff]
ev_extreme_results.shape

(4023, 3)

In [67]:
# Export the extreme EV rows (the index is written as an unnamed first column).
# NOTE(review): absolute, machine-specific path.
ev_extreme_results.to_csv("/media/pgdrive/sharif/exosomians/data/bams/deepbind.ev.extreme.predictions.csv")

In [23]:
# Rows whose id contains 'ic', restricted to those at or above the 90th
# percentile of their 'no' score.
# NOTE(review): str.contains matches 'ic' anywhere in the id, not only as a suffix.
ic_results = results[results['id'].str.contains('ic')]
no_cutoff = ic_results['no'].quantile(.9)
ic_extreme_results = ic_results[ic_results['no'] >= no_cutoff]
ic_extreme_results.shape

(4062, 3)

In [69]:
# Export the extreme IC rows (the index is written as an unnamed first column).
# NOTE(review): absolute, machine-specific path.
ic_extreme_results.to_csv("/media/pgdrive/sharif/exosomians/data/bams/deepbind.ic.extreme.predictions.csv")

In [24]:
# Look up the sequence string of every extreme EV row by id
# (raw_df is indexed by id).
ev_extreme_ids = ev_extreme_results['id']
ev_extreme_sequences = raw_df.loc[ev_extreme_ids, 'seq']
ev_extreme_sequences.head()

id
chr1_10381_10400_+_ev    AUUGGGAUUGGGGAUUGGGG
chr1_21635_21653_+_ev     GUCCCGUUUCGAGGGAGUC
chr1_56712_56730_+_ev     UUUGUUUACGGUGACAUUU
chr1_85224_85242_+_ev     AUUUGAAUUUUUCUAACUU
chr1_92412_92430_-_ev     UCAGCUCUUUAUUUGAUUG
Name: seq, dtype: object

In [25]:
# Look up the sequence string of every extreme IC row by id
# (raw_df is indexed by id).
ic_extreme_ids = ic_extreme_results['id']
ic_extreme_sequences = raw_df.loc[ic_extreme_ids, 'seq']
ic_extreme_sequences.head()

id
chr3_175356796_175356818_+_ic    CUCCUACGUACUACUUCUGGACA
chr6_22766836_22766857_+_ic       UUAUGUACUUCGGCACAUCGCC
chr19_31189049_31189071_-_ic     UAGAUAUAAAGUUGUUCAUAGCA
chr7_101775745_101775765_+_ic      UACUGUACUCGGUGACGUGGA
chr10_31951106_31951127_+_ic      UACUCUACUUGGGAUAGAAACA
Name: seq, dtype: object

In [26]:
# Encode the extreme EV sequences with char_encoder. The 50 presumably is the
# target sequence length, matching the model's input length -- confirm against
# exoNet.pep.seq_encoder.
ev_extreme_sequences_encoded = exoNet.pep.seq_encoder(ev_extreme_sequences, char_encoder, 50, unknown_char=True)
ev_extreme_sequences_encoded.shape

(4023, 50, 4)

In [27]:
# Encode the extreme IC sequences with char_encoder. The 50 presumably is the
# target sequence length, matching the model's input length -- confirm against
# exoNet.pep.seq_encoder.
ic_extreme_sequences_encoded = exoNet.pep.seq_encoder(ic_extreme_sequences, char_encoder, 50, unknown_char=True)
ic_extreme_sequences_encoded.shape

(4062, 50, 4)

In [34]:
from keras.models import Model

In [39]:
# Sub-model that maps the network input to the output of layer index 3,
# built from the existing model's tensors.
layer3_output = network.model.layers[3].output
conv_model = Model(inputs=network.model.inputs, outputs=layer3_output)
conv_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
data (InputLayer)            (None, 50, 4)             0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 41, 32)            1280      
_________________________________________________________________
batch_normalization_1 (Batch (None, 41, 32)            128       
_________________________________________________________________
re_lu_1 (ReLU)               (None, 41, 32)            0         
Total params: 1,408
Trainable params: 1,344
Non-trainable params: 64
_________________________________________________________________


In [54]:
# Sanity check: the first weight array of layer 1 is identical in the full
# model and in conv_model.
np.array_equal(network.model.layers[1].get_weights()[0], conv_model.layers[1].get_weights()[0])

True

In [55]:
# Activations of conv_model for the extreme EV sequences.
# NOTE(review): the recorded output reports 40221 rows, while the encoded
# input shown earlier has 4023; the stored output looks stale -- re-run to confirm.
positive_motif_scores = conv_model.predict(ev_extreme_sequences_encoded)
positive_motif_scores.shape

(40221, 41, 32)

In [56]:
# Activations of conv_model for the extreme IC sequences.
# NOTE(review): the recorded output reports 40620 rows, while the encoded
# input shown earlier has 4062; the stored output looks stale -- re-run to confirm.
negative_motif_scores = conv_model.predict(ic_extreme_sequences_encoded)
negative_motif_scores.shape

(40620, 41, 32)

In [57]:
# Derive frequency matrices from the EV sequences and their activations.
# The recorded shape (32, 10, 4) presumably means one 10x4 matrix per filter
# -- confirm against exoNet.pop.compute_pfm.
positive_pfm = exoNet.pop.compute_pfm(ev_extreme_sequences_encoded, positive_motif_scores)
positive_pfm.shape

(32, 10, 4)

In [58]:
# Derive frequency matrices from the IC sequences and their activations.
# The recorded shape (32, 10, 4) presumably means one 10x4 matrix per filter
# -- confirm against exoNet.pop.compute_pfm.
negative_pfm = exoNet.pop.compute_pfm(ic_extreme_sequences_encoded, negative_motif_scores)
negative_pfm.shape

(32, 10, 4)

In [59]:
# Store the EV-derived matrices as a .npy file.
# NOTE(review): absolute, machine-specific path.
np.save(arr=positive_pfm, file="/media/pgdrive/sharif/exosomians/data/bams/deepbind.pos.all.pfm.shortened.npy")

In [60]:
# Store the IC-derived matrices as a .npy file.
# NOTE(review): absolute, machine-specific path.
np.save(arr=negative_pfm, file="/media/pgdrive/sharif/exosomians/data/bams/deepbind.neg.all.pfm.shortened.npy")

In [47]:
# Second export of the full prediction table, this time relative to the
# current working directory (the index is written as an unnamed first column).
results.to_csv("./final.deepbind.result.mat.shortened.csv")

In [28]:
# Recompute the extreme EV rows: ids containing 'ev' whose 'yes' score is at
# or above the group's 90th percentile. Same selection as the earlier EV cell.
ev_results = results[results['id'].str.contains('ev')]
yes_cutoff = ev_results['yes'].quantile(.90)
ev_extreme_results = ev_results[ev_results['yes'] >= yes_cutoff]
ev_extreme_results.shape

(4023, 3)

In [29]:
# Recompute the extreme IC rows: ids containing 'ic' whose 'no' score is at
# or above the group's 90th percentile. Same selection as the earlier IC cell.
ic_results = results[results['id'].str.contains('ic')]
no_cutoff = ic_results['no'].quantile(.90)
ic_extreme_results = ic_results[ic_results['no'] >= no_cutoff]
ic_extreme_results.shape

(4062, 3)

In [32]:
# Export the extreme EV rows again.
# NOTE(review): this file name says "extremes", whereas the earlier export of
# the same frame used "extreme"; two near-identical files result. Also an
# absolute, machine-specific path.
ev_extreme_results.to_csv("/media/pgdrive/sharif/exosomians/data/bams/deepbind.ev.extremes.predictions.csv")

In [30]:
# Attach the NO/YES mapping to the network object (presumably read by
# exoNet.pop.interpret when it encodes the string labels -- confirm).
network.label_encoder = {"NO": 0, "YES": 1}

In [31]:
# Display the shape of the encoded EV extremes again, before building labels from it.
ev_extreme_sequences_encoded.shape

(4023, 50, 4)

In [32]:
# Constant label vectors for the two extreme sets: every EV extreme is 'YES',
# every IC extreme is 'NO'; one label per encoded sequence.
n_ev_extremes = ev_extreme_sequences_encoded.shape[0]
n_ic_extremes = ic_extreme_sequences_encoded.shape[0]
ev_extreme_labels = np.array(['YES' for _ in range(n_ev_extremes)])
ic_extreme_labels = np.array(['NO' for _ in range(n_ic_extremes)])

In [81]:
# Rename every layer to "name-<position>".
# NOTE(review): the reason is not visible here (possibly a naming requirement
# of the interpretation backend) -- confirm. This mutates the trained model in place.
for idx, layer in enumerate(network.model.layers):
    layer.name = f"name-{idx}"

In [85]:
# Give the Keras model an explicit name (mutates the model in place).
network.model.name = "mohsen-model"

In [87]:
import tensorflow as tf

In [40]:
# Per-position mean encodings used as DeepLIFT baselines.
# NOTE(review): each baseline is computed from the OPPOSITE set (ev_baseline
# from the IC sequences, ic_baseline from the EV sequences). This looks like a
# deliberate "contrast against the other class" reference, but confirm it is
# not an accidental swap.
ev_baseline = np.mean(ic_extreme_sequences_encoded, axis=0)
ic_baseline = np.mean(ev_extreme_sequences_encoded, axis=0)

In [44]:
# DeepLIFT attributions for both extreme sets, each explained against its
# own baseline array.
ev_extreme_deeplift = exoNet.pop.interpret(network, ev_extreme_sequences_encoded, ev_extreme_labels, method='DeepLIFT', baseline=ev_baseline)
ic_extreme_deeplift = exoNet.pop.interpret(network, ic_extreme_sequences_encoded, ic_extreme_labels, method='DeepLIFT', baseline=ic_baseline)

In [45]:
# Store the DeepLIFT attributions.
# NOTE(review): np.save writes NumPy's .npy format and appends ".npy" when the
# name does not already end with it, so these files land on disk as
# "*.shortened.pkl.npy" and are not pickles. Load them with np.load.
np.save(file="/media/pgdrive/sharif/exosomians/data/bams/deepbind.ev.extreme.deeplift.shortened.pkl", arr=ev_extreme_deeplift)
np.save(file="/media/pgdrive/sharif/exosomians/data/bams/deepbind.ic.extreme.deeplift.shortened.pkl", arr=ic_extreme_deeplift)

In [48]:
# DeepTaylor attributions for both extreme sets (no baseline argument is passed).
ev_extreme_deeptaylor = exoNet.pop.interpret(network, ev_extreme_sequences_encoded, ev_extreme_labels, method="DeepTaylor")
ic_extreme_deeptaylor = exoNet.pop.interpret(network, ic_extreme_sequences_encoded, ic_extreme_labels, method="DeepTaylor")

In [49]:
# Store the DeepTaylor attributions.
# NOTE(review): np.save writes NumPy's .npy format and appends ".npy" when the
# name does not already end with it, so these files land on disk as
# "*.shortened.pkl.npy" and are not pickles. Load them with np.load.
np.save(file="/media/pgdrive/sharif/exosomians/data/bams/deepbind.ev.extreme.deeptaylor.shortened.pkl", arr=ev_extreme_deeptaylor)
np.save(file="/media/pgdrive/sharif/exosomians/data/bams/deepbind.ic.extreme.deeptaylor.shortened.pkl", arr=ic_extreme_deeptaylor)

In [72]:
# Preview the DeepTaylor attributions of the first EV sequence, rows 0..20.
ev_extreme_deeptaylor[0, :21]

array([[0.00607364, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.00884957],
       [0.        , 0.        , 0.00386993, 0.        ],
       [0.        , 0.        , 0.01440077, 0.        ],
       [0.        , 0.        , 0.00809197, 0.        ],
       [0.05417186, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.00903742],
       [0.        , 0.        , 0.        , 0.03092203],
       [0.        , 0.        , 0.04176026, 0.        ],
       [0.        , 0.        , 0.01474026, 0.        ],
       [0.        , 0.        , 0.0270181 , 0.        ],
       [0.        , 0.        , 0.01202146, 0.        ],
       [0.03289767, 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.03975287],
       [0.        , 0.        , 0.        , 0.1966001 ],
       [0.        , 0.        , 0.09341811, 0.        ],
       [0.        , 0.        ,