In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Dense,
    Bidirectional,
    LSTM,
    Conv1D,
    Dropout,
    BatchNormalization,
    Input,
)
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.preprocessing import LabelEncoder
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset CSV paths used later in
# this notebook all live under that mount point.
print("Mounting Google Drive...")
drive.mount('/content/drive')
print("Drive mounted successfully.")


Mounting Google Drive...
Mounted at /content/drive
Drive mounted successfully.


In [None]:


# --- 2. Vocabulary and Encoding Constants ---

# Five-symbol alphabet: the bases A, C, G, T plus 'N'.
VOCAB = "ACGTN"
NUM_CLASSES = len(VOCAB)

# Number of symbols fed to the model; the symbol right after them is the target.
SEQUENCE_LENGTH = 4

# Lookup tables between a symbol and its position in VOCAB.
char_to_int = dict(zip(VOCAB, range(NUM_CLASSES)))
int_to_char = dict(enumerate(VOCAB))


def preprocess_data(filepath):
    """Load a CSV of nucleotide sequences and build one-hot (X, y) pairs.

    The ``NucleotideSequence`` column is read, leading/trailing ``<`` and
    ``>`` are stripped, the text is upper-cased, and every character that is
    not in the vocabulary is dropped. Each cleaned sequence is then cut into
    sliding windows of ``SEQUENCE_LENGTH + 1`` symbols: the first
    ``SEQUENCE_LENGTH`` symbols are the input and the last one is the target.

    Rows whose sequence is missing are skipped (and counted in a printed
    message) instead of aborting the whole file.

    Args:
        filepath: Path to a CSV file containing a ``NucleotideSequence`` column.

    Returns:
        A tuple ``(X, y)`` of one-hot arrays with shapes
        ``(n, SEQUENCE_LENGTH, NUM_CLASSES)`` and ``(n, NUM_CLASSES)``.
        If no sequence is long enough to produce a window, empty arrays with
        those trailing dimensions are returned. If the file does not exist or
        any other error occurs, the error is printed and ``(None, None)`` is
        returned.
    """
    print(f"Processing {filepath}...")
    try:
        df = pd.read_csv(filepath)

        # A missing sequence is read as NaN (a float). Calling string methods
        # on it would raise and discard the entire file, so skip such rows.
        raw_sequences = df["NucleotideSequence"].dropna().astype(str)
        n_missing = len(df) - len(raw_sequences)
        if n_missing:
            print(f"Skipping {n_missing} row(s) with a missing sequence.")

        window = SEQUENCE_LENGTH + 1  # inputs plus the one target symbol
        x_parts = []
        y_parts = []

        for raw in raw_sequences:
            # Strip the '<' / '>' delimiters, normalise case, and keep only
            # vocabulary symbols, mapping each to its integer id.
            # NOTE(review): dropping a character makes its two neighbours
            # adjacent in the cleaned sequence; confirm that is intended.
            encoded = [
                char_to_int[char]
                for char in raw.strip("<>").upper()
                if char in char_to_int
            ]

            # Too short to yield even one (input, target) pair.
            if len(encoded) < window:
                continue

            # All windows of the sequence at once: row i is encoded[i:i+window].
            windows = np.lib.stride_tricks.sliding_window_view(
                np.asarray(encoded), window
            )
            x_parts.append(windows[:, :SEQUENCE_LENGTH])
            y_parts.append(windows[:, SEQUENCE_LENGTH])

        if not x_parts:
            print(f"Warning: No valid sequences found in {filepath}")
            return (
                np.array([]).reshape(0, SEQUENCE_LENGTH, NUM_CLASSES),
                np.array([]).reshape(0, NUM_CLASSES),
            )

        # Stack the per-sequence windows into single integer arrays.
        X_np = np.concatenate(x_parts)
        y_np = np.concatenate(y_parts)

        # One-hot encode X and y
        X_one_hot = to_categorical(X_np, num_classes=NUM_CLASSES)
        y_one_hot = to_categorical(y_np, num_classes=NUM_CLASSES)

        print(f"Generated X shape: {X_one_hot.shape}")
        print(f"Generated y shape: {y_one_hot.shape}")

        return X_one_hot, y_one_hot

    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        print("Please ensure the Google Drive path is correct and the file exists.")
        return None, None
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller decide
        # (callers check for None before training).
        print(f"An error occurred while processing {filepath}: {e}")
        return None, None


# --- 3. Load and Process Data Using Your Drive Paths ---

# Folder on the mounted Drive that holds all three dataset splits. Change this
# one value if the dataset moves; the per-split paths are derived from it.
DATA_DIR = "/content/drive/MyDrive/gene type project/gene dataset"

train_file = f"{DATA_DIR}/train.csv"
val_file = f"{DATA_DIR}/validation.csv"
test_file = f"{DATA_DIR}/test.csv"

# Each call returns (None, None) if its file could not be processed; the
# training cell checks for that before fitting.
X_train, y_train = preprocess_data(train_file)
X_val, y_val = preprocess_data(val_file)
X_test, y_test = preprocess_data(test_file)



Processing /content/drive/MyDrive/gene type project/gene dataset/train.csv...
Generated X shape: (8036894, 4, 5)
Generated y shape: (8036894, 5)
Processing /content/drive/MyDrive/gene type project/gene dataset/validation.csv...
Generated X shape: (1641930, 4, 5)
Generated y shape: (1641930, 5)
Processing /content/drive/MyDrive/gene type project/gene dataset/test.csv...
Generated X shape: (3009781, 4, 5)
Generated y shape: (3009781, 5)


In [None]:
# --- 4. Define an Improved Model ---

# The network is assembled one layer at a time, in forward order.
model = Sequential()

# Input: SEQUENCE_LENGTH time steps, each a one-hot vector over the vocabulary.
model.add(Input(shape=(SEQUENCE_LENGTH, NUM_CLASSES)))

# Convolution over neighbouring positions to pick up short local patterns.
model.add(Conv1D(filters=128, kernel_size=3, activation="relu", padding="same"))
model.add(BatchNormalization())
model.add(Dropout(0.2))

# Two stacked bidirectional LSTMs; the first keeps the per-step outputs so the
# second one still receives a sequence.
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.3))

# Classification head: one softmax unit per vocabulary symbol (A, C, G, T, N).
model.add(Dense(64, activation="relu"))
model.add(Dense(NUM_CLASSES, activation="softmax"))

# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

model.summary()



In [None]:
# --- 5. Set Up Callbacks (Including Model Saving) ---

# Checkpoint: write the model to disk whenever validation accuracy reaches a
# new maximum, so the file always holds the best epoch seen so far.
checkpoint_callback = ModelCheckpoint(
    filepath="best_dna_model.keras",  # written to the Colab instance's working directory
    save_best_only=True,
    monitor="val_accuracy",
    mode="max",
    verbose=1,
)

# Early stopping: give up after 5 consecutive epochs without a better
# validation accuracy and roll the model back to the best weights.
early_stopping_callback = EarlyStopping(
    restore_best_weights=True,
    monitor="val_accuracy",
    patience=5,
    verbose=1,
)

# --- 6. Train the Model ---

# Shared by training and evaluation so the test pass does not fall back to
# Keras' much smaller default batch size on millions of samples.
BATCH_SIZE = 512

# preprocess_data returns (None, None) when a file could not be processed and
# zero-length arrays when a file produced no windows. Neither can be fed to
# fit/evaluate, so both cases are treated as "data not available".
train_ready = all(
    split is not None and len(split) > 0
    for split in (X_train, y_train, X_val, y_val)
)
test_ready = all(
    split is not None and len(split) > 0 for split in (X_test, y_test)
)

if train_ready:
    print("\n--- Starting Model Training ---")
    history = model.fit(
        X_train,
        y_train,
        epochs=20,  # Upper bound; EarlyStopping usually ends training sooner
        batch_size=BATCH_SIZE,
        validation_data=(X_val, y_val),
        callbacks=[checkpoint_callback, early_stopping_callback],
    )

    # --- 7. Evaluate and Save Final Model ---
    if test_ready:
        print("\n--- Evaluating Model on Test Data ---")
        test_loss, test_accuracy = model.evaluate(
            X_test, y_test, batch_size=BATCH_SIZE
        )
        print(f"Test Loss: {test_loss:.4f}")
        print(f"Test Accuracy: {test_accuracy:.4f}")

    # Save the model as it stands after training (the checkpoint callback has
    # separately written the best-validation-accuracy model).
    model.save("final_dna_model.keras")
    print("\nBest model saved as 'best_dna_model.keras'")
    print("Final model saved as 'final_dna_model.keras'")
    print("You can find these saved models in the Colab file browser.")
else:
    print("\n--- Model Training Skipped ---")
    print("Training was skipped because one or more data files could not be loaded.")


--- Starting Model Training ---
Epoch 1/50
[1m61209/62789[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m1:32[0m 59ms/step - accuracy: 0.3247 - loss: 1.3455