<a href="https://colab.research.google.com/github/BerpyDerpy/DeepTide-ML/blob/main/DeepTide_v0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m1.7/3.3 MB[0m [31m49.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import io
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from Bio import Entrez, SeqIO
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

# Settings
# Contact address attached to Entrez requests. Setting ENTREZ_EMAIL in the
# environment overrides it, so the notebook can be shared and re-run without
# editing this cell; the original address remains the fallback.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "sairithwik2007@gmail.com")
NUM_SEQS_PER_CLASS = 25  # upper bound on records requested per class
CLASSES = ["Cnidaria", "Arthropoda", "Porifera", "Echinodermata"]
OUTPUT_CSV = "dna_dataset.csv"  # on-disk cache of the downloaded sequences

SEQ_LENGTH = 300  # sequences are truncated / 'N'-padded to this many bases
BATCH_SIZE = 8
LEARNING_RATE = 0.001
EPOCHS = 20
# Train on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


In [None]:
def fetch_sequences(term, num_seqs):
    """Search the Entrez nucleotide database and download the hits as FASTA.

    Args:
        term: Entrez query string handed to ``Entrez.esearch``.
        num_seqs: Maximum number of records to request (sent as ``retmax``).

    Returns:
        The text read from the ``Entrez.efetch`` response for the matching
        accessions, or ``""`` when the search returns no IDs.
    """
    print(f"Searching for '{term}'...")
    handle = Entrez.esearch(db="nucleotide", term=term, retmax=num_seqs, idtype="acc")
    try:
        results = Entrez.read(handle)
    finally:
        # Close the search handle even when the response cannot be parsed.
        handle.close()

    id_list = results["IdList"]
    if not id_list:
        return ""

    handle = Entrez.efetch(db="nucleotide", id=id_list, rettype="fasta", retmode="text")
    try:
        return handle.read()
    finally:
        # Same guarantee for the download handle if reading fails midway.
        handle.close()


In [None]:
# Download the dataset once; later runs reuse the cached CSV.
if not os.path.exists(OUTPUT_CSV):
    print("Downloading dataset...")
    all_data = []
    for label in CLASSES:
        search_term = f'("{label}"[Organism]) AND 18S ribosomal RNA[Title] AND 1500:2500[Sequence Length]'
        fasta_data = fetch_sequences(search_term, NUM_SEQS_PER_CLASS)
        # Parse directly from memory: no temp file to clean up, and nothing is
        # left on disk if parsing raises part-way through.
        records = list(SeqIO.parse(io.StringIO(fasta_data), "fasta"))
        if not records:
            print(f"Warning: no sequences retrieved for '{label}'")
        for record in records:
            all_data.append({"sequence": str(record.seq), "label": label})
    if not all_data:
        # Writing an empty CSV would satisfy the os.path.exists() guard above
        # on every later run, so the dataset would never be re-downloaded.
        raise RuntimeError("No sequences were downloaded; refusing to cache an empty dataset.")
    df = pd.DataFrame(all_data)
    df.to_csv(OUTPUT_CSV, index=False)

Downloading dataset...
Searching for '("Cnidaria"[Organism]) AND 18S ribosomal RNA[Title] AND 1500:2500[Sequence Length]'...
Searching for '("Arthropoda"[Organism]) AND 18S ribosomal RNA[Title] AND 1500:2500[Sequence Length]'...
Searching for '("Porifera"[Organism]) AND 18S ribosomal RNA[Title] AND 1500:2500[Sequence Length]'...
Searching for '("Echinodermata"[Organism]) AND 18S ribosomal RNA[Title] AND 1500:2500[Sequence Length]'...


In [None]:
def one_hot_encode(seq):
    """One-hot encode a nucleotide string into a ``(4, len(seq))`` array.

    Rows correspond to A, C, G, T in that order. Matching is case-insensitive,
    so lower-case bases are encoded like their upper-case forms. Any other
    character (for example the 'N' padding symbol) leaves its column all zeros.

    Args:
        seq: Nucleotide sequence as a string.

    Returns:
        A ``numpy.float32`` array of shape ``(4, len(seq))``.
    """
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    encoded = np.zeros((4, len(seq)), dtype=np.float32)
    for i, base in enumerate(seq):
        # Upper-case per character so the column index always matches `seq`.
        row = mapping.get(base.upper())
        if row is not None:
            encoded[row, i] = 1.0
    return encoded

class DNADataset(Dataset):
    """Dataset yielding ``(one-hot sequence, integer label)`` tensor pairs.

    Sequences come from the ``'sequence'`` column of the dataframe and labels
    from its ``'label'`` column, converted with ``label_encoder.transform``.
    One-hot encoding is done lazily, each time an item is requested.
    """

    def __init__(self, dataframe, label_encoder):
        raw_labels = dataframe['label'].values
        self.sequences = dataframe['sequence'].values
        self.labels = label_encoder.transform(raw_labels)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        features = one_hot_encode(self.sequences[idx])
        target = self.labels[idx]
        x = torch.tensor(features, dtype=torch.float32)
        y = torch.tensor(target, dtype=torch.long)
        return x, y


In [None]:
# Load the cached dataset and bring every sequence to exactly SEQ_LENGTH bases:
# longer sequences are truncated, shorter ones are right-padded with 'N'.
df = pd.read_csv(OUTPUT_CSV)
# A missing sequence is read back as NaN and cannot be sliced or encoded.
df = df.dropna(subset=['sequence'])
df['sequence'] = df['sequence'].astype(str).str.slice(0, SEQ_LENGTH).str.ljust(SEQ_LENGTH, 'N')

# Derive the class list from the configured CLASSES instead of repeating it.
classes = [str(name) for name in CLASSES]
print(classes)

label_encoder = LabelEncoder()
# Only fitting is needed here; the encoded values were previously discarded.
label_encoder.fit(classes)
# From here on CLASSES holds the encoder's (sorted) class order.
CLASSES = label_encoder.classes_
print(CLASSES)

# Stratified 80/20 split with a fixed seed so the split is repeatable.
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

train_dataset = DNADataset(train_df, label_encoder)
val_dataset = DNADataset(val_df, label_encoder)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


['Cnidaria', 'Arthropoda', 'Porifera', 'Echinodermata']
['Arthropoda' 'Cnidaria' 'Echinodermata' 'Porifera']


In [None]:
class SimpleDNA_CNN(nn.Module):
    """Single-convolution classifier for one-hot encoded DNA.

    Expects input with 4 channels and ``seq_length`` positions. A
    Conv1d -> ReLU -> MaxPool1d feature extractor feeds a two-layer
    fully connected head that emits ``num_classes`` scores.

    Args:
        num_classes: Size of the output layer.
        seq_length: Sequence length the linear head is sized for.
    """

    def __init__(self, num_classes=4, seq_length=SEQ_LENGTH):
        super().__init__()

        conv = nn.Conv1d(in_channels=4, out_channels=32, kernel_size=5, padding=1)
        self.conv_layer = nn.Sequential(conv, nn.ReLU(), nn.MaxPool1d(kernel_size=2))

        # Run one throwaway sample through the conv stack to find out how many
        # features reach the first linear layer once flattened.
        probe = torch.randn(1, 4, seq_length)
        with torch.no_grad():
            probe_out = self.conv_layer(probe)
        flat_features = probe_out[0].numel()

        hidden = nn.Linear(flat_features, 128)
        head = nn.Linear(128, num_classes)
        self.classifier = nn.Sequential(nn.Flatten(), hidden, nn.ReLU(), head)

    def forward(self, x):
        features = self.conv_layer(x)
        return self.classifier(features)

In [None]:
# Build the model, loss and optimiser from the notebook-level settings.
model = SimpleDNA_CNN(num_classes=len(CLASSES)).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    model.train()
    batch_losses = []
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(DEVICE)
        batch_y = batch_y.to(DEVICE)

        optimizer.zero_grad()
        loss = criterion(model(batch_x), batch_y)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
    # Report the mean of the per-batch losses for this epoch.
    print(f"EPOCHS: {epoch}, Loss: {sum(batch_losses)/len(train_loader):.4f}")


EPOCHS: 0, Loss: 1.2078
EPOCHS: 1, Loss: 0.6782
EPOCHS: 2, Loss: 0.4170
EPOCHS: 3, Loss: 0.2470
EPOCHS: 4, Loss: 0.1293
EPOCHS: 5, Loss: 0.0669
EPOCHS: 6, Loss: 0.0362
EPOCHS: 7, Loss: 0.0238
EPOCHS: 8, Loss: 0.0158
EPOCHS: 9, Loss: 0.0120
EPOCHS: 10, Loss: 0.0094
EPOCHS: 11, Loss: 0.0075
EPOCHS: 12, Loss: 0.0064
EPOCHS: 13, Loss: 0.0054
EPOCHS: 14, Loss: 0.0046
EPOCHS: 15, Loss: 0.0040
EPOCHS: 16, Loss: 0.0035
EPOCHS: 17, Loss: 0.0032
EPOCHS: 18, Loss: 0.0028
EPOCHS: 19, Loss: 0.0026


In [None]:
# Measure top-1 accuracy on the validation split without tracking gradients.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for sequences, labels in val_loader:
        sequences = sequences.to(DEVICE)
        labels = labels.to(DEVICE)
        # The index of the highest score along the class axis is the prediction.
        predicted = model(sequences).argmax(dim=1)
        total += labels.size(0)
        correct += int((predicted == labels).sum())
accuracy = 100 * correct / total
print(f"Validation Accuracy: {accuracy:.2f}%")

Validation Accuracy: 90.00%


In [None]:
# Persist only the model's parameters (its state_dict) to "dna_classifier.pth".
# NOTE(review): the reload cell later in this notebook references
# "DeepTide_v0.pth" — confirm which file name is intended.
torch.save(model.state_dict(), "dna_classifier.pth")

In [None]:
# Rebuild the network with its default arguments so the saved weights fit.
model = SimpleDNA_CNN()

# Prefer the checkpoint name this cell has always loaded. If that file is not
# present, fall back to the one written by the save cell, so the notebook still
# runs top to bottom on a fresh kernel.
checkpoint_path = "DeepTide_v0.pth" if os.path.exists("DeepTide_v0.pth") else "dna_classifier.pth"
# weights_only=True restricts unpickling to tensors / plain containers, which
# is all a state_dict needs.
model.load_state_dict(
    torch.load(checkpoint_path, map_location=torch.device('cpu'), weights_only=True)
)
model.eval()
classes = ['Cnidaria', 'Arthropoda', 'Porifera', 'Echinodermata']
label_encoder = LabelEncoder()
# Only fitting is needed; the transformed values were previously discarded.
label_encoder.fit(classes)
CLASSES = label_encoder.classes_
print(CLASSES)

['Arthropoda' 'Cnidaria' 'Echinodermata' 'Porifera']


In [None]:
seq ="TGAAGACCTAAGCCCATAGCCTACCTGCAATAACCCTAGGGTACCAAATCGTCCGGCTGAGGTCTTGACGAATACTACGGAAATAGTATGATTTAAAGCCGGGGTCTGATTTACATATGTTATCCAAAAGCGCGTGCTCCACGTACGTGACGGCTGTCCCTTCGAGGAATCAGGATTGCCCCGTGGATATTGGACATCTCCAAAAGCTATTCCGACCCTCCCGCCTCGAGAGAAACCGACTAGGCTCTAATCAGTGACTACAGAGTTCGCGCCTTAACAAAGCCCACCACCAGCTAGGCT"
# The model's linear head is sized for SEQ_LENGTH positions, so apply the same
# truncate / 'N'-pad step used on the training data before encoding. Without
# it, a query of any other length fails inside the classifier.
seq = seq[:SEQ_LENGTH].ljust(SEQ_LENGTH, 'N')
encoded_seq = one_hot_encode(seq)
# unsqueeze(0) adds the batch dimension the model expects.
input_tensor = torch.tensor(encoded_seq, dtype=torch.float32).unsqueeze(0).to("cpu")

model.eval()
with torch.no_grad():
    output = model(input_tensor)
    predicted_class = output.argmax(dim=1)
    probabilities = torch.softmax(output, dim=1)
    # Probability assigned to the predicted class, expressed as a percentage.
    confidence = probabilities[0][predicted_class].item()*100

print(output)
print(probabilities)
predicted_label = label_encoder.inverse_transform(predicted_class.cpu().numpy())
print("Predicted class:", predicted_label[0])
print(f"Confidence: {confidence:.4f}")

tensor([[-1.2638,  1.4131, -0.3065,  0.6217]])
tensor([[0.0404, 0.5879, 0.1053, 0.2664]])
Predicted class: Cnidaria
Confidence: 58.7855
