In [None]:
# Unpack the dataset archive into the current working directory.
# NOTE(review): presumably this provides the train_data/, train_labels/, val_data/,
# val_labels/, test_data/ and test_labels/ folders read by the next cell -- confirm
# against the archive contents.
!unzip real_data.zip

In [None]:
import os
import torch

def load_dna_sequences(folder):
    """Read every ``.dna`` file in ``folder`` and return one cleaned string per file.

    Files are visited in sorted filename order. Each file's text is reduced to
    its alphabetic characters only (whitespace, digits and punctuation are
    dropped) and then upper-cased.

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        list[str]: One sequence per ``.dna`` file, in sorted filename order.
    """
    dna_filenames = [name for name in sorted(os.listdir(folder)) if name.endswith('.dna')]

    sequences = []
    for name in dna_filenames:
        with open(os.path.join(folder, name), 'r') as handle:
            raw_text = handle.read()
        # Keep letters only, then normalise the case.
        letters_only = ''.join(ch for ch in raw_text if ch.isalpha())
        sequences.append(letters_only.upper())
    return sequences

# Load every split in a fixed order: each input folder followed by its target folder.
(
    train_data, train_labels,
    val_data, val_labels,
    test_data, test_labels,
) = map(
    load_dna_sequences,
    ('train_data', 'train_labels', 'val_data', 'val_labels', 'test_data', 'test_labels'),
)

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# The tokenizer and the seq2seq model are both loaded from the same pretrained checkpoint.
checkpoint = 'facebook/bart-base'
tokenizer = BartTokenizer.from_pretrained(checkpoint)
model = BartForConditionalGeneration.from_pretrained(checkpoint)

In [None]:
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW

class DNADataset(Dataset):
    """Map-style dataset that pairs input sequences with target sequences.

    Both strings of a pair are tokenized lazily, on item access, with the
    tokenizer supplied at construction time.
    """

    def __init__(self, data, labels, tokenizer):
        # data[i] is the model input, labels[i] is the matching target.
        self.data, self.labels, self.tokenizer = data, labels, tokenizer

    def __len__(self):
        # The dataset size is defined by the number of input sequences.
        return len(self.data)

    def _encode(self, sequence):
        """Tokenize one string to a fixed length of 512 and return its id tensor."""
        encoded = self.tokenizer(
            sequence,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=512,
        )
        # The tokenizer returns a batch of one; squeeze away that leading dimension.
        return encoded.input_ids.squeeze()

    def __getitem__(self, idx):
        """Return ``(input_ids, target_ids)`` for the pair at position ``idx``."""
        source_text, target_text = self.data[idx], self.labels[idx]
        return self._encode(source_text), self._encode(target_text)

In [None]:
# Prepare Datasets
train_dataset = DNADataset(train_data, train_labels, tokenizer)
val_dataset = DNADataset(val_data, val_labels, tokenizer)

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=5, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=5)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# DNADataset pads every target to 512 tokens. Left as-is, those pad positions
# count towards the cross-entropy, so the loss is dominated by trivially
# predictable padding and the reported validation loss looks far better than
# the model really is. Label value -100 is the index the loss ignores.
IGNORE_INDEX = -100
label_pad_id = tokenizer.pad_token_id


def _mask_label_padding(labels):
    """Return a copy of ``labels`` with padding positions replaced by IGNORE_INDEX."""
    return labels.masked_fill(labels == label_pad_id, IGNORE_INDEX)


# NOTE(review): no attention_mask is passed to the model, so the encoder also
# attends to the padding in input_ids. The generation cell feeds inputs the same
# way; if a mask is introduced, add it to training and generation together.
for epoch in range(10):  # Number of epochs
    model.train()
    for input_ids, labels in train_loader:
        input_ids = input_ids.to(device)
        labels = _mask_label_padding(labels.to(device))

        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch} completed")

    # Validation step: mean of the per-batch losses, with padding masked the
    # same way as in training so the two numbers are comparable.
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for input_ids, labels in val_loader:
            input_ids = input_ids.to(device)
            labels = _mask_label_padding(labels.to(device))

            outputs = model(input_ids=input_ids, labels=labels)
            val_loss += outputs.loss.item()

    print(f"Validation loss: {val_loss / len(val_loader)}")


Epoch 0 completed
Validation loss: 0.03717063601422859
Epoch 1 completed
Validation loss: 0.02943184882353403
Epoch 2 completed
Validation loss: 0.027702374722330683
Epoch 3 completed
Validation loss: 0.025820320496629728
Epoch 4 completed
Validation loss: 0.024015144680896282
Epoch 5 completed
Validation loss: 0.023865796205140406
Epoch 6 completed
Validation loss: 0.023635614036098367
Epoch 7 completed
Validation loss: 0.023333009450365545
Epoch 8 completed
Validation loss: 0.02243391638560417
Epoch 9 completed
Validation loss: 0.022137515109773773


In [None]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration

model.eval()  # Set the model to evaluation mode

# Reuse the tokenizer that was loaded alongside the model. The model is
# 'facebook/bart-base', so loading a tokenizer from a different checkpoint here
# would be an unnecessary mismatch (and an extra download).
test_dataset = DNADataset(test_data, test_labels, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=5)  # 5 sequences per generate() call

# Generate predictions
predictions = []

with torch.no_grad():
    for batch_num, (input_ids, _) in enumerate(test_loader):
        print("Batch num:", batch_num)
        input_ids = input_ids.to(model.device)
        # min_length == max_length forces exactly 96 generated tokens per sequence.
        # NOTE(review): decoding options not set here (beam count, repetition
        # constraints, ...) fall back to the checkpoint's generation config --
        # confirm those defaults are appropriate for DNA sequences.
        output = model.generate(input_ids, max_length=96, min_length=96, pad_token_id=tokenizer.pad_token_id)
        # Only the FIRST sequence of each batch is decoded and kept, so
        # `predictions` holds one entry per batch (test examples 0, 5, 10, ...).
        # The evaluation cell relies on this by pairing with test_labels[::5].
        prediction = tokenizer.decode(output[0], skip_special_tokens=True, trim_offsets=False)
        predictions.append(prediction)

Batch num: 0
Batch num: 1
Batch num: 2
Batch num: 3
Batch num: 4
Batch num: 5
Batch num: 6
Batch num: 7
Batch num: 8
Batch num: 9
Batch num: 10
Batch num: 11
Batch num: 12
Batch num: 13
Batch num: 14
Batch num: 15
Batch num: 16
Batch num: 17
Batch num: 18
Batch num: 19
Batch num: 20
Batch num: 21
Batch num: 22
Batch num: 23
Batch num: 24
Batch num: 25
Batch num: 26
Batch num: 27
Batch num: 28
Batch num: 29
Batch num: 30
Batch num: 31
Batch num: 32
Batch num: 33
Batch num: 34
Batch num: 35
Batch num: 36
Batch num: 37
Batch num: 38
Batch num: 39
Batch num: 40
Batch num: 41
Batch num: 42
Batch num: 43
Batch num: 44
Batch num: 45
Batch num: 46
Batch num: 47
Batch num: 48
Batch num: 49
Batch num: 50
Batch num: 51
Batch num: 52
Batch num: 53
Batch num: 54
Batch num: 55
Batch num: 56
Batch num: 57
Batch num: 58
Batch num: 59
Batch num: 60
Batch num: 61
Batch num: 62
Batch num: 63
Batch num: 64
Batch num: 65
Batch num: 66
Batch num: 67
Batch num: 68
Batch num: 69
Batch num: 70
Batch num: 71
Ba

In [None]:
import editdistance

# Assuming 'predictions' and 'test_labels' are lists of strings with your predicted and actual sequences

def calculate_normalized_edit_distance(predictions, true_labels, max_pred_length=96):
    """Return the total edit distance divided by the total reference length.

    Args:
        predictions: Iterable of predicted sequences (strings).
        true_labels: Iterable of reference sequences, paired with
            ``predictions`` by position. Pairing uses ``zip``, so it stops at
            the shorter of the two inputs.
        max_pred_length: Each prediction is cut to this many characters before
            scoring. Defaults to 96, the value that used to be hard-coded here;
            pass ``None`` to score predictions untruncated.

    Returns:
        float: Sum of the per-pair edit distances divided by the sum of the
        reference lengths.

    Raises:
        ValueError: If there is nothing to score (no pairs, or every reference
            is empty), since the normalised distance is undefined in that case.
    """
    total_distance = 0
    total_length = 0

    for pred, true in zip(predictions, true_labels):
        if max_pred_length is not None:
            pred = pred[:max_pred_length]
        # Compute the edit distance for each pair of sequences
        total_distance += editdistance.eval(pred, true)
        total_length += len(true)

    if total_length == 0:
        raise ValueError(
            "Cannot normalise edit distance: no reference characters to compare against"
        )

    # Normalizing the total edit distance by the total length of all sequences
    return total_distance / total_length

# The generation loop keeps one prediction per batch of 5 (the first sequence of
# each batch), so take every 5th test example to line up with `predictions`.
actuals = test_labels[::5]
# Model inputs come from test_data. This previously sliced test_labels, which
# made `inputs` an exact copy of `actuals`.
inputs = test_data[::5]

# Calculate the average edit distance
average_edit_distance = calculate_normalized_edit_distance(predictions, actuals)
print("Average Edit Distance:", average_edit_distance)


Average Edit Distance: 0.023447141094199916


In [None]:
# Show the first example on three lines: prediction (cut to 96 characters),
# then `inputs[0]`, then `actuals[0]`.
print(predictions[0][:96], inputs[0], actuals[0], sep="\n")

GCAGATAAGGCCGTCACTCCGCAGAGTAGTCTGTTAACGCTTTCATTCCAGGACCCATTCATTGCGGTAGCGCCATCTCTTCAGTCTATCTTCACG
GCAGATAAGGCCGTCACTCCGCAGAGTAGTCTGTTAACGCTTTCATTCCAGGACCCATTCATTGCGGTAGCGCCATCTCTTCAGTCCATCTTCACG
GCAGATAAGGCCGTCACTCCGCAGAGTAGTCTGTTAACGCTTTCATTCCAGGACCCATTCATTGCGGTAGCGCCATCTCTTCAGTCCATCTTCACG
