In [None]:
!unzip real_data.zip

In [None]:
import torch
import os

# Passed as `max_length` to encode_sequences for every split, so both the source
# and the target sequences are padded/truncated to this many tokens.
target_length = 250

def encode_sequences(tokenizer, source_sequences, target_sequences, max_length=512):
    """Tokenize paired source/target strings into fixed-length tensors.

    Every sequence is padded or truncated to ``max_length`` tokens.

    Args:
        tokenizer: Callable tokenizer accepting a list of strings together with
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``
            and returning a mapping with ``input_ids`` and ``attention_mask``.
        source_sequences: Iterable of input strings.
        target_sequences: Iterable of target strings, paired by position with
            ``source_sequences``.
        max_length: Padded/truncated token length used for both sides.

    Returns:
        Tuple ``(input_ids, attention_masks, target_ids)``; each is expected to
        be a tensor of shape ``(num_pairs, max_length)``.

    Raises:
        ValueError: If no sequences are given, or if the two inputs differ in
            length (pairing by position would silently drop the extras).
    """
    sources = list(source_sequences)
    targets = list(target_sequences)

    if len(sources) != len(targets):
        raise ValueError(
            f"source/target count mismatch: {len(sources)} sources vs {len(targets)} targets"
        )
    if not sources:
        raise ValueError("encode_sequences received no sequences to encode")

    def _encode(texts):
        # One batched call instead of one call per sequence followed by torch.cat.
        return tokenizer(
            texts,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

    src_batch = _encode(sources)
    tgt_batch = _encode(targets)

    return src_batch['input_ids'], src_batch['attention_mask'], tgt_batch['input_ids']

def load_dna_sequences(folder):
    """Read every ``.dna`` file in ``folder`` and return its letters, upper-cased.

    Files are visited in sorted filename order. For each file, every character
    that is not alphabetic (whitespace, newlines, digits, punctuation) is
    dropped and the remaining text is upper-cased.

    Args:
        folder: Directory to scan for ``.dna`` files.

    Returns:
        List with one cleaned string per ``.dna`` file, in sorted filename order.
    """
    dna_filenames = [name for name in sorted(os.listdir(folder)) if name.endswith('.dna')]

    sequences = []
    for name in dna_filenames:
        with open(os.path.join(folder, name), 'r') as handle:
            raw_text = handle.read()
        # Keep alphabetic characters only, then normalise the case.
        letters_only = ''.join(ch for ch in raw_text if ch.isalpha())
        sequences.append(letters_only.upper())
    return sequences

# Read each split from its folder pair: `<split>_data` and `<split>_labels`.
train_data, train_labels = load_dna_sequences('train_data'), load_dna_sequences('train_labels')
val_data, val_labels = load_dna_sequences('val_data'), load_dna_sequences('val_labels')
test_data, test_labels = load_dna_sequences('test_data'), load_dna_sequences('test_labels')

In [None]:
# The recorded output of this cell shows the install resolving to sentencepiece 0.1.99.
# NOTE(review): the version is not pinned here — consider pinning for reproducible re-runs.
%pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
from transformers import ByT5Tokenizer, T5ForConditionalGeneration

# The model weights and the tokenizer are loaded from the same pretrained checkpoint.
checkpoint_name = 'google/byt5-small'
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)
tokenizer = ByT5Tokenizer.from_pretrained(checkpoint_name)

In [None]:
# Tokenize every split with the same padded length (`target_length`).
train_input_ids, train_attention_masks, train_target_ids = encode_sequences(
    tokenizer, train_data, train_labels, max_length=target_length
)
val_input_ids, val_attention_masks, val_target_ids = encode_sequences(
    tokenizer, val_data, val_labels, max_length=target_length
)
test_input_ids, test_attention_masks, test_target_ids = encode_sequences(
    tokenizer, test_data, test_labels, max_length=target_length
)

In [None]:
from torch.utils.data import TensorDataset, DataLoader

batch_size = 8  # Adjust based on your GPU capacity

# Training split: the only loader that shuffles.
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_target_ids)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Validation split: default (unshuffled) order.
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_target_ids)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Test split: default (unshuffled) order.
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_target_ids)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
import torch

# Check if CUDA is available and set it as the default device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.cuda.empty_cache()

# torch.optim.AdamW replaces the deprecated `transformers.AdamW`. eps and weight_decay
# are passed explicitly because torch's defaults (1e-8, 0.01) differ from the
# defaults of the transformers implementation (1e-6, 0.0).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

model.to(device)

# Label value ignored by the model's cross-entropy loss.
IGNORE_INDEX = -100

num_epochs = 5
for epoch in range(num_epochs):
    # Training loop
    # The validation pass below leaves the model in eval mode, so training mode
    # (dropout enabled) has to be restored at the start of every epoch.
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        # Exclude padded target positions from the loss. masked_fill returns a new
        # tensor, so the dataset's stored target ids are left untouched.
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, IGNORE_INDEX)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            # Same padding mask as in training, so the reported loss covers real tokens only.
            labels = labels.masked_fill(labels == tokenizer.pad_token_id, IGNORE_INDEX)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_eval_loss += loss.item()

    avg_val_loss = total_eval_loss / len(val_loader)
    print(f'Validation Loss: {avg_val_loss}')



Validation Loss: 3.372872178753217
Validation Loss: 1.887516846259435
Validation Loss: 0.5185126985112826
Validation Loss: 0.07291650065841775
Validation Loss: 0.05703257790689046


In [None]:
# Number of tokens to generate for every test example.
prediction_length = 96

predictions = []
actuals = []

model.eval()

for batch in test_loader:
    input_ids, attention_mask, labels = [b.to(device) for b in batch]

    with torch.no_grad():
        # `max_length`/`min_length` also count the decoder start token, so using them
        # with prediction_length yields at most prediction_length - 1 generated tokens.
        # The *_new_tokens variants count generated tokens only.
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            min_new_tokens=prediction_length,
            max_new_tokens=prediction_length,
        )

    # Decode generated ids and reference ids back to text, dropping special tokens.
    decoded_preds = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]
    decoded_labels = [tokenizer.decode(ids, skip_special_tokens=True) for ids in labels]

    predictions.extend(decoded_preds)
    actuals.extend(decoded_labels)

In [None]:
import locale
# Force "UTF-8" as the reported preferred encoding for the rest of the session.
# The replacement keeps the optional `do_setlocale` parameter of the real function,
# so callers using `locale.getpreferredencoding(False)` do not raise TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"
# Presumably provides the `Levenshtein` module imported by the evaluation cell.
# NOTE(review): the version is not pinned — consider pinning for reproducible re-runs.
%pip install levenshtein



In [None]:
from Levenshtein import distance as levenshtein_distance
import numpy as np

# Normalised edit distance per test example: Levenshtein distance / reference length.
distances = [
    levenshtein_distance(pred, actual) / len(actual)
    for pred, actual in zip(predictions, actuals)
    if len(actual) > 0  # To avoid division by zero
]

# Spot-check one prediction against its reference. Guarded so that a test set with
# fewer examples does not raise IndexError.
sample_index = 39
if sample_index < len(predictions):
    print(predictions[sample_index])
    print(actuals[sample_index])

print(distances)

# Compare the decoded lengths of one prediction/reference pair (same guard as above).
length_check_index = 41
if length_check_index < len(predictions):
    print(len(predictions[length_check_index]))
    print(len(actuals[length_check_index]))

# Summary statistics; max()/min() raise ValueError on an empty list, so they are
# only computed when at least one distance exists.
if distances:
    max_distance = max(distances)
    min_distance = min(distances)
    median_distance = np.median(distances)
    average_distance = np.average(distances)
    print(f"Median Levenshtein Distance: {median_distance}")
    print(f"Max Levenshtein Distance: {max_distance}")
    print(f"Min Levenshtein Distance: {min_distance}")
    print(f"Average Levenshtein Distance: {average_distance}")
else:
    print("No non-empty reference sequences; distance statistics are undefined.")

print(len(distances))

CGGGAAGCCCGCCGACAACCGACTGTGGTCTTTGATTTATACTCGGTCACATGATCAATCGCTGACTACGTTCAGATCGTACCGCA#
TGAACGAGTTTGGGAAGCCCGCCGACAACCGACTGTGGTCTTTGATTTATACTCGGTCACATGATCAATCGCTGACTACGTTCAGATCGTACCGCA
[0.010416666666666666, 0.5833333333333334, 0.010416666666666666, 0.052083333333333336, 0.010416666666666666, 0.05154639175257732, 0.010416666666666666, 0.020833333333333332, 0.042105263157894736, 0.031578947368421054, 0.010416666666666666, 0.03125, 0.041237113402061855, 0.23958333333333334, 0.03125, 0.1111111111111111, 0.010416666666666666, 0.020833333333333332, 0.041666666666666664, 0.03125, 0.030927835051546393, 0.03125, 0.052083333333333336, 0.03125, 0.07291666666666667, 0.03125, 0.0425531914893617, 0.010416666666666666, 0.010416666666666666, 0.05319148936170213, 0.4, 0.020833333333333332, 0.4895833333333333, 0.40217391304347827, 0.020833333333333332, 0.19791666666666666, 0.041666666666666664, 0.010416666666666666, 0.21875, 0.11458333333333333, 0.041666666666666664, 0.052083333333333336, 0.14583333333

In [None]:
# Write the model to disk, then load it back from the same directory.
# NOTE(review): the reloaded model is not moved to `device` here — confirm before reuse.
saved_model_dir = 'dna_t5_model'
model.save_pretrained(saved_model_dir)
model = T5ForConditionalGeneration.from_pretrained(saved_model_dir)