In [1]:
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

from src.dataset import load_dataset, Species, Modification

In [2]:
# Load the training dataset selected by Species.human and Modification.psi.
# Later cells read `train_data.samples` (model inputs) and `train_data.targets` (labels).
# NOTE(review): "psi" presumably denotes pseudouridine sites — confirm against src.dataset.
train_data = load_dataset(Species.human, Modification.psi)

In [3]:
# Fixed length that every encoded sequence is padded to.
max_length = 41

# Extract the raw sequence strings once (element 0 of every row in `samples`)
# so the tokenizer is fitted on, and applied to, exactly the same list.
train_texts = [row[0] for row in train_data.samples.values]

# Character-level tokenizer: each distinct character gets its own integer id.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(train_texts)

# Encode every sequence as a list of integer character ids.
train_sequences = tokenizer.texts_to_sequences(train_texts)

# Append zeros at the end ('post') of any sequence shorter than max_length.
train_sequences_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')

In [4]:
# Sanity check: show the integer-encoded, padded sequences that are fed to the model.
train_sequences_padded

array([[1, 1, 3, ..., 2, 4, 1],
       [2, 1, 2, ..., 1, 1, 4],
       [4, 1, 3, ..., 1, 3, 2],
       ...,
       [3, 3, 1, ..., 2, 2, 4],
       [4, 1, 3, ..., 4, 3, 2],
       [4, 4, 2, ..., 4, 1, 2]], dtype=int32)

In [5]:
# Seed Python, NumPy and TensorFlow before any layer is created so that weight
# initialisation, dropout masks and batch shuffling are repeatable between runs
# (some GPU kernels may still introduce small nondeterminism).
SEED = 42
set_random_seed(SEED)

# One embedding row per character id known to the tokenizer, plus one because
# id 0 is never assigned to a character and is used as the padding value.
vocab_size = len(tokenizer.word_index) + 1

model_rnn = Sequential([
    # Learn a 100-dimensional vector for every character id.
    Embedding(input_dim=vocab_size, output_dim=100, input_length=max_length),

    # First recurrent layer; return_sequences=True emits the output at every
    # timestep so that a second recurrent layer can be stacked on top.
    SimpleRNN(100, return_sequences=True),

    # Dropout for regularization.
    Dropout(0.2),

    # Second recurrent layer (same width as the first); only its final
    # output is passed on to the dense head.
    SimpleRNN(100),

    # Dropout for regularization.
    Dropout(0.2),

    # Fully connected hidden layer.
    Dense(64, activation='relu'),

    # Single sigmoid unit for binary classification.
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked for monitoring.
model_rnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 10 epochs, holding out 20% of the data for validation.
# NOTE(review): validation_split takes the LAST 20% of the samples, selected
# before shuffling. If the dataset is ordered by label the validation set will
# be skewed — confirm the ordering of train_data or shuffle it beforehand.
model_rnn.fit(train_sequences_padded, train_data.targets, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2a0bcde50>

In [7]:
# Continue training the already-fitted model for 20 further epochs.
# fit() does not reset the weights, so this resumes from wherever the model
# currently is; re-running this cell trains for another 20 epochs each time.
model_rnn.fit(
    x=train_sequences_padded,
    y=train_data.targets,
    epochs=20,
    validation_split=0.2,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x2a1950990>