In [None]:
# imports
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import binding_2D_matrix_encoder as b2dme



# parameters
file_path = '../datasets/AGO2_CLASH_Hejret2023.tsv'
model = 'miRBind.h5'
num_of_epochs = 10  # number of epochs - one epoch is one iteation of the entire dataset
alphabet = {"AT": 1, "TA": 1, "GC": 1, "CG": 1, "AU": 1, "UA": 1}

In [5]:
# load the dataset
df = pd.read_csv(file_path, sep='\t')
print("\nDataset loaded successfully.")
print(f"Dataset shape: {df.shape}")
print(f"First few rows of the dataset:\n{df.head()}\n")

# extract ncRNA and gene sequences and the target labels
noncodingRNA = df['noncodingRNA'].values  # assumed nucleotide sequences
genes = df['gene'].values  # assumed nucleotide sequences
labels = df['label'].values  # binary labels (binding or non-binding)

print("Extracted ncRNA, gene sequences, and labels.")
print(f"ncRNA example: {noncodingRNA[0]}")
print(f"Gene example: {genes[0]}")
print(f"Label example: {labels[0]}\n")

# combining sequences as a touple
combined_sequences = list(zip(noncodingRNA, genes, labels))
print(f"Combined sequences at index 0: {combined_sequences[0]}")
print(f"Combined sequences at index 1: {combined_sequences[1]}\n")


Dataset loaded successfully.
Dataset shape: (42460, 3)
First few rows of the dataset:
             noncodingRNA                                               gene  \
0  TGAGGTAGTAGGTTGTGTGGTT  ACTTTTCTTGACTTGCCTGTTTTGGCCATTAGCTGCCTTAAACGTT...   
1  AAAAGCTGGGTTGAGAGGGCGA  CGAACTGACACTGAGCCACAACCCAAACAACCCAGCTCTCCCTAAG...   
2  TCCGAGCCTGGGTCTCCCTCTT  CGTGCGTGGCAAGCTCTCCCGGCTCGGCTCGCTCGGCCTCCCTTAC...   
3  TAGCAGCACATAATGGTTTGTG  GCACATGCCACAGCTCCTGTGAATATTGCCGGCTCCAGAACCGCCG...   
4  CAAAGAATTCTCCTTTTGGGCT  GTCTGGTCTTGAACTCTTGGGCTCAAAGAATTCTCCTGCTTCAGCC...   

   label  
0      1  
1      1  
2      1  
3      1  
4      1  

Extracted ncRNA, gene sequences, and labels.
ncRNA example: TGAGGTAGTAGGTTGTGTGGTT
Gene example: ACTTTTCTTGACTTGCCTGTTTTGGCCATTAGCTGCCTTAAACGTTAACA
Label example: 1

Combined sequences at index 0: ('TGAGGTAGTAGGTTGTGTGGTT', 'ACTTTTCTTGACTTGCCTGTTTTGGCCATTAGCTGCCTTAAACGTTAACA', 1)
Combined sequences at index 1: ('AAAAGCTGGGTTGAGAGGGCGA', 'CGAACTGACACTGAGCCACAACCCA

In [6]:
#obtain training and validation/testing sets
#0.2 => 80% training, 20% validation
training_data, validation_data = train_test_split(combined_sequences, test_size=0.2, random_state=42)

# display data is terminal
print("Training_data:")
for i in range(10):
    print(training_data[i])
print(f"Size: {len(training_data)}\n")

print("validation_data")
for i in range(10):
    print(validation_data[i])
print(f"Size: {len(validation_data)}\n")

Training_data:
('CGCGCCGGGCCCGGGTT', 'GAGAATGGTGAACCCAGGAGGCGGAACTTGCAGTGACCCGAGATCGTGCC', 0)
('ACCAGGAGGCTGAGGCCCCT', 'CTGTCCATCTTAGAGGAGGACAAGGACTGGAGGCCAGCCATCACAATCAA', 0)
('TGAGACCAGGACTGGATGCACC', 'CTGTTATTGAAATATTACAGATGGAAAGAATGAGGCTCAGGGAAGTTAAA', 0)
('CCTCTTCCCCTTGTCTCTCCAG', 'GAAGGCAGGGGACACAGATGGGAGAGATTGAGCCAAGTCAGCCTTCTGTT', 0)
('AATCCTTGGAACCTAGGTGTGAGT', 'GTGGATACTTCCTGGCGGGGCGCTCTATGACCTGGGTAGCAATTGGTGCC', 0)
('GCGACCCATACTTGGTTTCAG', 'CAACAAATTACCTGGGGAGAAGCTGGGCCGAGTTGTGCATATAATCCAAG', 0)
('TTCAAGTAATTCAGGATAGGT', 'ATTAAATTAAAAAATACTGACTGGCTGGCAGGCAGGTGCCATGTCTGGGA', 0)
('TCAGGCTCAGTCCCCTCCCGAT', 'GATTTTTGTGCTGGGGTGGGATTATGGGTCAATGTTGAAGAATTTTTAAG', 1)
('TCCCTGAGACCCTAACTTGTGA', 'AAAACTAACTTAAATTTCAGCATAGGGTTTCAGGGGACATGGCGAAGTTT', 0)
('AACATTCAACGCTGTCGGTGAGT', 'AACACATATGGCCTAGACTACGTACATAACCTAAACCTACTCCAATGCTA', 0)
Size: 33968

validation_data
('CTGAATAGCTGGGACTACAGGT', 'ATATATATATCAACAGTGGTAAATACTGTAGATTTTTATATATATATATA', 0)
('TGACCGATTTCTCCTGGTGTTC', 'TTCTCCATTG

#### Encoders

In [7]:
def sequence_encoder(seq):
    encoded_seq = []
    
    # tterate through the sequence, considering each consecutive pair of nucleotides
    # len(seq) - 1 so that we don't go out of bounds
    for i in range(len(seq) - 1):
        # concatenate the current nucleotide and the next nucleotide to form a base pair
        base_pair = seq[i] + seq[i + 1]
        
        # if base pair is in the 'alphabet' dictionary, append the corresponding value (1)
        if base_pair in alphabet:
            encoded_seq.append(alphabet[base_pair])
        # else append 0 (indicating no valid pairing)
        else:
            encoded_seq.append(0)
    
    # Return the list of encoded values, which is now a numerical representation of the sequence
    return encoded_seq


def encode_data(data_tuple):
    noncodingRNA, gene, label = data_tuple
    
    # Encode both sequences
    encoded_noncodingRNA = sequence_encoder(noncodingRNA)
    encoded_gene = sequence_encoder(gene)
    
    # Return in the same structure (encoded RNA, encoded gene, label untouched)
    return encoded_noncodingRNA, encoded_gene, label


print(training_data[0])
print(encode_data(training_data[0]))
# # Encode the data
# for i in range(10):
#     training_data[i] = encode_data(training_data[i])
#     print(training_data[i])
    
# print("\n")

# for i in range(10):
#     validation_data[i] = encode_data(validation_data[i])
#     print(validation_data[i])

('CGCGCCGGGCCCGGGTT', 'GAGAATGGTGAACCCAGGAGGCGGAACTTGCAGTGACCCGAGATCGTGCC', 0)
([1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0], 0)


In [None]:
# Watson-Crick base pairing rules
alphabet = {"AT": 1, "TA": 1, "GC": 1, "CG": 1, "AU": 1, "UA": 1}

def sequence_interaction_matrix(noncodingRNA, gene):
    # Create an empty matrix to store interactions
    interaction_matrix = np.zeros((len(noncodingRNA), len(gene)), dtype=int)
    
    # Iterate over each base in noncodingRNA
    for i, nc_base in enumerate(noncodingRNA):
        # Compare with each base in the gene
        for j, gene_base in enumerate(gene):
            base_pair = nc_base + gene_base
            interaction_matrix[i, j] = alphabet.get(base_pair, 0)
            
            # Print out the base pair and the corresponding interaction value
            # print(f"{nc_base}{gene_base} = {alphabet.get(base_pair, 0)}")
    
    return interaction_matrix

def encode_data(data_tuple):
    noncodingRNA, gene, label = data_tuple
    
    # Generate the interaction matrix
    interaction_matrix = sequence_interaction_matrix(noncodingRNA, gene)
    
    # Return the interaction matrix along with the label (untouched)
    return interaction_matrix, label

# Example input data (noncodingRNA, gene, label)
data = training_data[0]

# Encode the data
encoded_matrix, label = encode_data(data)

# Print the interaction matrix and label
print(data)
print(f"Interaction Matrix:\n{encoded_matrix}")
print(f"Label: {label}")

('CGCGCCGGGCCCGGGTT', 'GAGAATGGTGAACCCAGGAGGCGGAACTTGCAGTGACCCGAGATCGTGCC', 0)
Interaction Matrix:
[[1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0
  0 0 0 1 0 1 0 0 0 1 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
  1 1 1 0 0 0 0 0 1 0 0 0 1 1]
 [1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0
  0 0 0 1 0 1 0 0 0 1 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
  1 1 1 0 0 0 0 0 1 0 0 0 1 1]
 [1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0
  0 0 0 1 0 1 0 0 0 1 0 1 0 0]
 [1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0
  0 0 0 1 0 1 0 0 0 1 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
  1 1 1 0 0 0 0 0 1 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
  1 1 1 0 0 0 0 0 1 0 0 0 1 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0

In [8]:
# Split into training and validation/testing sets
training_data, validation_data = train_test_split(df, test_size=0.2, random_state=42)

print(f"Size of training set: {len(training_data)}")
print(f"Size of validation set: {len(validation_data)}\n")

# Encode the data using your binding_2D_matrix_encoder's binding_encoding function
def encode_dataset(data):
    # Use the function from the binding_2D_matrix_encoder module
    return b2dme.binding_encoding(data, alphabet)

# Encode the training data and validation data
encoded_training_data, training_labels = encode_dataset(training_data)
encoded_validation_data, validation_labels = encode_dataset(validation_data)

# Print example of encoded data
print(f"Encoded training data shape: {encoded_training_data.shape}")
print(f"Encoded validation data shape: {encoded_validation_data.shape}\n")
print(f"First encoded training example:\n{encoded_training_data[0]}")
print(f"First training label: {training_labels[0]}\n")

Size of training set: 33968
Size of validation set: 8492

Encoded training data shape: (33968, 50, 20, 1)
Encoded validation data shape: (8492, 50, 20, 1)

First encoded training example:
[[[1.]
  [0.]
  [1.]
  [0.]
  [1.]
  [1.]
  [0.]
  [0.]
  [0.]
  [1.]
  [1.]
  [1.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]]

 [[0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [1.]
  [1.]
  [0.]
  [0.]
  [0.]]

 [[1.]
  [0.]
  [1.]
  [0.]
  [1.]
  [1.]
  [0.]
  [0.]
  [0.]
  [1.]
  [1.]
  [1.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]]

 [[0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [1.]
  [1.]
  [0.]
  [0.]
  [0.]]

 [[0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [1.]
  [1.]
  [0.]
  [0.]
  [0.]]

 [[0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0.]
  [0