<a href="https://colab.research.google.com/github/Ayushn24/DNA-seq-promoter-vs-non-promoter-classification/blob/main/promoter_vs_non_promoter_DNA_seq_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout
import tensorflow as tf

In [2]:
# For reproducibility
random_state = 42
# One call seeds Python's built-in `random`, NumPy and TensorFlow together,
# so no random number generator used during training is left unseeded.
tf.keras.utils.set_random_seed(random_state)

In [19]:
# Load the dataset
# `load_dataset` fetches the "neuralbioinfo/bacterial_promoters" dataset by its
# Hugging Face Hub identifier; the result is indexed by split name below
# (e.g. dataset["train"]).
from datasets import load_dataset
dataset = load_dataset("neuralbioinfo/bacterial_promoters")

In [4]:
# Show the first training record to see which fields each example carries
dataset["train"][0]

{'segment_id': 'PPD000000',
 'ppd_original_SpeciesName': 'Bradyrhizobium japonicum USDA 110',
 'Strand': '+',
 'segment': 'GATTGCTCTGTTTTTTGCGCCCAAGAGACCCTGGCATGCTGGTTGCAAAAGTCTTGATCAAGAAGTCGCCGTCGCAACAGC',
 'class_label': 'promoter',
 'L': 81,
 'prom_class': 'PPD',
 'y': 1}

In [5]:
# Pull the two columns used for modelling out of the training split:
# "segment" holds the DNA string and "y" the integer class label.
sequences, labels = dataset["train"]["segment"], dataset["train"]["y"]

In [6]:
# Show the first sequence and its label, one per line
print(sequences[0], labels[0], sep="\n")

GATTGCTCTGTTTTTTGCGCCCAAGAGACCCTGGCATGCTGGTTGCAAAAGTCTTGATCAAGAAGTCGCCGTCGCAACAGC
1


In [7]:
# Per-segment lengths as recorded in the dataset's "L" column
lengths = dataset["train"]["L"]
# Shortest and longest segment; equal values mean no padding is required
(min(lengths), max(lengths))

(81, 81)

In [8]:
# One-hot channel order is A, C, G, T. Any symbol not listed here
# (e.g. the ambiguity code N) is encoded as an all-zero row.
mapping = {
    'A': [1,0,0,0],
    'C': [0,1,0,0],
    'G': [0,0,1,0],
    'T': [0,0,0,1]
}

# Fixed number of positions in every encoded sequence
max_len = 81

def one_hot_encode(seq):
    """One-hot encode a DNA sequence into a fixed-size ``(max_len, 4)`` array.

    Parameters
    ----------
    seq : str
        Nucleotide sequence. Bases are matched case-insensitively, so
        lowercase (soft-masked) bases are encoded like their uppercase form.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_len, 4)`` with columns ordered A, C, G, T.
        Sequences shorter than ``max_len`` are zero-padded at the end;
        longer sequences are truncated to their first ``max_len`` bases.
        Unknown symbols produce an all-zero row.
    """
    encoded = np.zeros((max_len, 4))
    # zip() with range(max_len) stops at max_len, so an over-long sequence is
    # truncated instead of raising IndexError on the row assignment.
    for i, base in zip(range(max_len), seq):
        encoded[i] = mapping.get(base.upper(), [0,0,0,0])
    return encoded

In [9]:
# Encode every segment and stack the per-sequence matrices into one array
X = np.array(list(map(one_hot_encode, sequences)))
# Class labels as a NumPy vector aligned with the rows of X
y = np.array(labels)

In [10]:
# Sanity-check the array shapes: X is (N, 81, 4) and y is (N,)
print(X.shape, y.shape, sep="\n")

(223092, 81, 4)
(223092,)


In [11]:
# Hold out 30% of the examples for validation. Stratifying on y keeps the
# class ratio the same in both splits. The seed comes from the notebook-level
# `random_state` constant so the value is configured in exactly one place.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=random_state
)

In [12]:
# LSTM baseline: a single recurrent layer reads the (81, 4) one-hot sequence
# and a small dense head with a sigmoid unit produces the binary prediction.
# The input shape is declared with an explicit Input layer; passing
# `input_shape` to the first layer triggers a Keras UserWarning.
model_1 = Sequential()
model_1.add(tf.keras.Input(shape=(81, 4)))
model_1.add(LSTM(64, return_sequences=False))
model_1.add(Dense(32, activation="relu"))
model_1.add(Dropout(0.4))
model_1.add(Dense(1, activation="sigmoid"))

  super().__init__(**kwargs)


In [13]:
# Adam optimizer with binary cross-entropy loss; accuracy is tracked per epoch
model_1.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Print the layer-by-layer architecture
model_1.summary()

In [14]:
# Train model_1 for 10 epochs, evaluating on the validation split after each
history_1 = model_1.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_data=(X_val, y_val),
    verbose=1,
)

Epoch 1/10
[1m4881/4881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 9ms/step - accuracy: 0.5681 - loss: 0.6784 - val_accuracy: 0.6356 - val_loss: 0.6380
Epoch 2/10
[1m4881/4881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 8ms/step - accuracy: 0.6431 - loss: 0.6321 - val_accuracy: 0.6622 - val_loss: 0.6094
Epoch 3/10
[1m4881/4881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 8ms/step - accuracy: 0.6612 - loss: 0.6126 - val_accuracy: 0.6734 - val_loss: 0.5958
Epoch 4/10
[1m4881/4881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 8ms/step - accuracy: 0.6761 - loss: 0.5957 - val_accuracy: 0.6824 - val_loss: 0.5839
Epoch 5/10
[1m4881/4881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 9ms/step - accuracy: 0.6841 - loss: 0.5850 - val_accuracy: 0.6867 - val_loss: 0.5777
Epoch 6/10
[1m4881/4881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 9ms/step - accuracy: 0.6913 - loss: 0.5756 - val_accuracy: 0.6933 - val_loss: 0.5679
Epoch 7/10

In [15]:
# GRU variant of the baseline: a single GRU layer reads the (81, 4) one-hot
# sequence, followed by a dense head with a sigmoid unit for the binary label.
# The input shape is declared with an explicit Input layer; passing
# `input_shape` to the first layer triggers a Keras UserWarning.
model_2 = Sequential()
model_2.add(tf.keras.Input(shape=(81, 4)))
model_2.add(GRU(64, return_sequences=False))
model_2.add(Dense(32, activation="relu"))
model_2.add(Dropout(0.4))
model_2.add(Dense(1, activation="sigmoid"))

  super().__init__(**kwargs)


In [16]:
# Adam optimizer with binary cross-entropy loss; accuracy is tracked per epoch
model_2.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Print the layer-by-layer architecture
model_2.summary()

In [17]:
# Train model_2 for 10 epochs, evaluating on the validation split after each
history_2 = model_2.fit(
    x=X_train,
    y=y_train,
    epochs=10,
    validation_data=(X_val, y_val),
    verbose=1,
)

Epoch 1/10
[1m4881/4881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 9ms/step - accuracy: 0.5065 - loss: 0.6927 - val_accuracy: 0.6131 - val_loss: 0.6570
Epoch 2/10
[1m4881/4881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 8ms/step - accuracy: 0.6289 - loss: 0.6460 - val_accuracy: 0.6795 - val_loss: 0.5887
Epoch 3/10
[1m4881/4881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 8ms/step - accuracy: 0.6844 - loss: 0.5849 - val_accuracy: 0.6849 - val_loss: 0.5783
Epoch 4/10
[1m4881/4881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 8ms/step - accuracy: 0.6955 - loss: 0.5696 - val_accuracy: 0.7046 - val_loss: 0.5569
Epoch 5/10
[1m4881/4881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 8ms/step - accuracy: 0.6994 - loss: 0.5631 - val_accuracy: 0.7098 - val_loss: 0.5500
Epoch 6/10
[1m4881/4881[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 8ms/step - accuracy: 0.7037 - loss: 0.5576 - val_accuracy: 0.7132 - val_loss: 0.5470
Epoch 7/10