In [18]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

from src.features.encodings import pse_knc
from src.features.encoder import FeatureEncoder
from src.utils.random_samples import generate_unique_rna_samples

# Prepare the sample dataset

In [19]:
# Project root directory. The default is the original author's checkout; set the
# PSI_PREDICTOR_BASE_PATH environment variable to point at a different location
# without editing the notebook.
BASE_PATH = Path(
    os.environ.get(
        'PSI_PREDICTOR_BASE_PATH',
        '/Users/arish/workspace/research/psi_predictor',
    )
)

In [24]:
def generate_data(n_samples: int = 100000, sequence_length: int = 21) -> pd.DataFrame:
    """Build a synthetic dataset of RNA samples paired with their PseKNC encodings.

    The defaults reproduce the values that were previously hard-coded, so calling
    ``generate_data()`` with no arguments behaves as before.

    Args:
        n_samples: First argument forwarded to ``generate_unique_rna_samples``
            (presumably the number of sequences to generate -- confirm against
            the helper).
        sequence_length: Second argument forwarded to
            ``generate_unique_rna_samples`` (presumably the length of each
            sequence -- confirm against the helper).

    Returns:
        The generated samples in a column named ``sequence``, concatenated
        column-wise with the encoder output multiplied by 100.
    """
    info = pse_knc.get_info('PseKNC')

    samples = generate_unique_rna_samples(n_samples, sequence_length)
    # NOTE(review): 3, 2 and 0.1 are passed positionally to the encoder; they look
    # like PseKNC k / lambda / weight settings -- confirm against FeatureEncoder.pse_knc.
    encodings = FeatureEncoder.pse_knc(pd.Series(samples), info, 3, 2, 0.1) * 100

    return pd.concat([pd.Series(samples, name='sequence'), encodings], axis=1)

In [26]:
# Generate the dataset, report how many rows came back, then persist it so the
# training section can reload it from disk.
generated_data = generate_data()
print(len(generated_data))
generated_data.to_csv(path_or_buf='data.csv', index=False)

100000


# Train the model

In [27]:
# Reload the dataset that was written to data.csv.
data = pd.read_csv(filepath_or_buffer='data.csv')

In [28]:
# Integer code assigned to each nucleotide letter.
NUCLEOTIDE_MAP = {'A': 1, 'C': 2, 'G': 3, 'U': 4}

sequences = data['sequence']
# One list of integer codes per sequence; letters are upper-cased before lookup.
encoded_sequences = [
    [NUCLEOTIDE_MAP[base.upper()] for base in rna]
    for rna in sequences
]

In [29]:
# The nucleotide codes are lookup indices for the Embedding layer, so store them
# as integers rather than floats.
input_data = tf.constant(encoded_sequences, dtype=tf.int32)
# Every column except the raw sequence is a regression target; convert the frame
# to a plain array explicitly before handing it to TensorFlow.
output_data = tf.constant(data.drop(columns='sequence').to_numpy(), dtype=tf.float32)

In [30]:
# Seed the shuffle so the train/test split is identical on every
# Restart Kernel -> Run All.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

indices = np.random.default_rng(SPLIT_SEED).permutation(len(input_data))

train_size = int(TRAIN_FRACTION * len(input_data))

train_indices = indices[:train_size]
test_indices = indices[train_size:]

# Apply the same row selection to inputs and targets so the pairs stay aligned.
input_train, input_test = tf.gather(input_data, train_indices), tf.gather(input_data, test_indices)
output_train, output_test = tf.gather(output_data, train_indices), tf.gather(output_data, test_indices)

In [45]:
# Embedding over the integer-coded sequence, flattened, followed by four identical
# ReLU hidden layers and a linear head with one unit per target column.
model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Embedding(input_dim=5, output_dim=32, input_length=21),
        tf.keras.layers.Flatten(),
        *(tf.keras.layers.Dense(64, activation='relu') for _ in range(4)),
        tf.keras.layers.Dense(output_data.shape[1], activation='linear'),
    ]
)

In [46]:
# The model is trained as a regressor (MSE loss, linear outputs), so track
# regression error metrics as well.
model.compile(
    loss=tf.keras.losses.MeanSquaredError(),
    optimizer=tf.keras.optimizers.legacy.Adam(),
    metrics=[
        # Kept first so the metric at index 1 of model.evaluate() is unchanged.
        # NOTE(review): accuracy is a classification metric and is hard to
        # interpret for continuous targets -- rely on MAE / RMSE below instead.
        'accuracy',
        tf.keras.metrics.MeanAbsoluteError(name='mae'),
        tf.keras.metrics.RootMeanSquaredError(name='rmse'),
    ],
)

In [49]:
# Fit on the training split; the held-out split is passed as validation data so
# its metrics are reported after every epoch.
model.fit(
    x=input_train,
    y=output_train,
    batch_size=32,
    epochs=10,
    validation_data=(input_test, output_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x297d02410>

In [50]:
# Score the held-out split; evaluate() returns the loss followed by the compiled
# metrics, in that order.
evaluation = model.evaluate(x=input_test, y=output_test)
eval_loss = evaluation[0]
eval_accuracy = evaluation[1]
print(f"Evaluation Loss: {eval_loss}")
print(f"Evaluation Accuracy: {eval_accuracy}")

Evaluation Loss: 2.1710333824157715
Evaluation Accuracy: 0.8471500277519226
