Dataset

Training 
  Positive : 26974, 
  Negative : 27470

Validation 
  Positive : 2150, 
  Negative : 2136

In [10]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [11]:
import json

import numpy as np
import pandas as pd

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

from tensorflow.keras import layers

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.
Found GPU at: /device:GPU:0


In [12]:
#utitliy functions 

import os
import sys

import datetime

output_path="/content/drive/MyDrive/Colab Notebooks/2/results/TargetNet_training/"

def set_output(string):
    """ set output configurations """
    output, save_prefix = sys.stdout, None
    if output_path is not None:
        save_prefix = output_path
        if not os.path.exists(save_prefix):
            os.makedirs(save_prefix, exist_ok=True)
        output = open( output_path + string + ".txt", "a")

    return output, save_prefix

def Print(string, output, newline=False, timestamp=True):
    """ print to stdout and a file (if given) """
    if timestamp:
        time = datetime.datetime.now()
        line = '\t'.join([str(time.strftime('%m-%d %H:%M:%S')), string])
    else:
        time = None
        line = string

    print(line, file=sys.stderr)
    if newline: print("", file=sys.stderr)

    if not output == sys.stdout:
        print(line, file=output)
        if newline: print("", file=output)

    output.flush()
    return time

In [13]:
from tensorflow.keras import models 

modelDir = '/content/drive/MyDrive/Colab Notebooks/Research/Colab Notebooks/finals/results/the_model9.h5'
   
#Getting the model
model = models.load_model('/content/drive/MyDrive/Colab Notebooks/Research/Colab Notebooks/finals/results/the_model9.h5')



In [14]:
!pip install bio
import sys
import numpy as np
from Bio import pairwise2

class miRNA_CTS_dataset():
    """ Pytorch dataloader for miRNA-CTS pair data """
    def __init__(self, X, labels, set_idxs, set_labels):
        self.X = X
        self.labels = labels
        self.set_idxs = set_idxs
        self.set_labels = set_labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        return self.X[i], self.labels[i], self.set_idxs[i]

def encode_RNA(mirna_seq, mirna_esa, cts_rev_seq, cts_rev_esa):
    """ one-hot encoder for RNA sequences with/without extended seed alignments """
    chars = {"A":0, "C":1, "G":2, "U":3, "-":4}

    x = np.zeros((10, 50), dtype=np.float32)
    for i in range(len(mirna_esa)):
        x[chars[mirna_esa[i]], 5 + i] = 1
    for i in range(10, len(mirna_seq)):
        x[chars[mirna_seq[i]], 5 + i - 10 + len(mirna_esa)] = 1
    for i in range(5):
        x[chars[cts_rev_seq[i]] + len(chars), i] = 1
    for i in range(len(cts_rev_esa)):
        x[chars[cts_rev_esa[i]] + len(chars), i + 5] = 1
    for i in range(15, len(cts_rev_seq)):
        x[chars[cts_rev_seq[i]]  + len(chars), i + 5 - 15 + len(cts_rev_esa)] = 1

    return x
        


def reverse(seq):
    """ reverse the given sequence """
    seq_r = ""
    for i in range(len(seq)):
        seq_r += seq[len(seq) - 1 - i]
    return seq_r


score_matrix = {}  # Allow watson-crick & wobble
for c1 in 'ACGU':
    for c2 in 'ACGU':
        if (c1, c2) in [('A', 'U'), ('U', 'A'), ('G', 'C'), ('C', 'G')]:
            score_matrix[(c1, c2)] = 1
        elif (c1, c2) in [('U', 'G'), ('G', 'U')]:
            score_matrix[(c1, c2)] = 1
        else:
            score_matrix[(c1, c2)] = 0


def extended_seed_alignment(mi_seq, cts_r_seq):
    """ extended seed alignment """
    alignment = pairwise2.align.globaldx(mi_seq[:10], cts_r_seq[5:15], score_matrix, one_alignment_only=True)[0]
    mi_esa = alignment[0]
    cts_r_esa = alignment[1]
    esa_score = alignment[2]
    return mi_esa, cts_r_esa, esa_score


def get_dataset_from_configs(data_cfg,split_idx=None):
  
    FILE = open(data_cfg, "r")
    lines = FILE.readlines()
    FILE.close()

    X, labels, set_idxs, set_labels = [], [], [], []
    set_idx = 0
    for l, line in enumerate(lines[1:]):
      tokens = line.strip().split(",")
      # print(tokens)
      mirna_id, mirna_seq, mrna_id, mrna_seq = tokens[:4]
      label = float(tokens[4]) if len(tokens) > 4 else 0
      
      mirna_seq = mirna_seq.upper().replace("T", "U")
      mrna_seq = mrna_seq.upper().replace("T", "U")
      mrna_rev_seq = reverse(mrna_seq)

      
      for pos in range(len(mrna_rev_seq) - 40 + 1):
          tempX=[]
          mirna_esa, cts_rev_esa, esa_score = extended_seed_alignment(mirna_seq, mrna_rev_seq[pos:pos+40])
          if split_idx not in ["train", "val"] and esa_score < 6: continue
          x = encode_RNA(mirna_seq, mirna_esa,mrna_rev_seq[pos:pos+40], cts_rev_esa).transpose()
          X.append(np.array(x))
          labels.append(tf.expand_dims(np.array(label), 0))
          set_idxs.append(np.array(set_idx))

      set_labels.append(label)
      set_idx += 1

      if set_idx % 5 == 0:
          print('# {} {:.1%}'.format(split_idx, l / len(lines[1:])), end='\r', file=sys.stderr)
    print(' ' * 150, end='\r', file=sys.stderr)

    X = np.array(X)
    labels = np.array(labels)
    dataset = miRNA_CTS_dataset(X, labels, set_idxs, np.array(set_labels))

    return X, labels, set_idxs, set_labels

output, save_prefix = set_output("test_model_log")
device = device_name

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [15]:
#Loading the datasets
testingData =  '/content/drive/MyDrive/Colab Notebooks/2/data/csv/Independent_dataset_new.csv'

ind_X, ind_y, ind_set_idxs, ind_set_labels = get_dataset_from_configs(testingData)



In [20]:
result = model.predict(ind_X)

tempIdx =0
tempVal =[]
final =[]

for i in range(len(ind_set_idxs)):
  if tempIdx == ind_set_idxs[i]:
    tempVal.append(result[i])
  else:
    tempIdx +=1
    maxVal = np.max(tempVal)
    final.append(np.array(maxVal))


count =0
for i in range(len(final)):
  if final[i] > 0.5 :
    count += 1

print("Count", count)

data 47
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
Count 47
