In [None]:
import os
import sys

import numpy as np

# The encode_dna helper module is expected in the project's scripts/ folder.
# The notebook may be started from the project root ("scripts") or from a
# sub-directory of it ("../scripts"), so both locations are registered.
# Each entry is added only once, so re-running this cell does not keep
# growing sys.path.
for scripts_dir in ("scripts", os.path.join("..", "scripts")):
    if scripts_dir not in sys.path:
        sys.path.append(scripts_dir)

# A single import is enough: retrying the identical import after a
# ModuleNotFoundError cannot succeed unless sys.path changed in between.
from encode_dna import (
    read_fasta,
    filter_short_sequences,
    encode_kmer_batch,
    one_hot_encode,
    save_numpy_array,
)

# 🧬 1. Load and filter sequences

In [20]:
# Minimum sequence length kept by the filter; also shown in the summary line.
MIN_SEQ_LENGTH = 30

# Candidate locations of the input file, relative to the directory the
# notebook is started from (project root or a sub-directory of it).
potential_paths = [
    "../data/raw/azadirachta_indica.fasta",
    "data/raw/azadirachta_indica.fasta",
    "./data/raw/azadirachta_indica.fasta",
    os.path.join(os.getcwd(), "data/raw/azadirachta_indica.fasta"),
]

# Take the first candidate that is an existing regular file. os.path.isfile
# (unlike os.path.exists) rejects a directory that happens to carry this name.
fasta_path = next(
    (candidate for candidate in potential_paths if os.path.isfile(candidate)),
    None,
)

if fasta_path:
    print(f"Found FASTA file at: {fasta_path}")
else:
    # No candidate matched: ask for the location, then validate the answer
    # here so a typo fails with a clear message instead of inside read_fasta.
    print(f"❌ Error: FASTA file not found. Current directory: {os.getcwd()}")
    fasta_path = input("Please enter the correct path to the FASTA file: ")
    # Pasted paths often carry surrounding whitespace or quotes.
    fasta_path = fasta_path.strip().strip("\"'")
    if not os.path.isfile(fasta_path):
        raise FileNotFoundError(f"FASTA file not found: {fasta_path!r}")

# Read sequences
sequences = read_fasta(fasta_path)
print(f"Total sequences: {len(sequences)}")

# Drop sequences shorter than the threshold
filtered = filter_short_sequences(sequences, min_length=MIN_SEQ_LENGTH)
print(f"Filtered sequences (len ≥ {MIN_SEQ_LENGTH}): {len(filtered)}")

Found FASTA file at: data/raw/azadirachta_indica.fasta
Total sequences: 1
Filtered sequences (len ≥ 30): 1


# 🧪 2. K-mer Encoding

In [21]:
# Encode the filtered sequences with k-mers of length 3. The helper returns
# the encoded array plus a second value that is kept as kmer_vocab.
X_kmer, kmer_vocab = encode_kmer_batch(filtered, k=3)
print(f"K-mer encoded shape: {X_kmer.shape}")

# Destination of the encoded array; the folder is created on demand
output_dir = "data/processed"
output_path = os.path.join(output_dir, "X_kmer.npy")
os.makedirs(output_dir, exist_ok=True)

# Persist the array and report where it was written
save_numpy_array(X_kmer, output_path)
print(f"Successfully saved k-mer encoding to: {output_path}")

K-mer encoded shape: (1, 64)
Successfully saved k-mer encoding to: data/processed\X_kmer.npy


# 🔢 3. One-hot Encoding (optional – for CNNs)
# WARNING: This can consume a lot of RAM if there are too many sequences.

In [22]:
# One-hot encoding is an optional step, so a failure is reported rather than
# stopping the notebook. X_onehot is reset first so that a failed re-run
# cannot leave an array from an earlier execution behind for later cells.
X_onehot = None
try:
    X_onehot = one_hot_encode(filtered)
    print(f"One-hot encoded shape: {X_onehot.shape}")
    # Create output directory if it doesn't exist
    output_dir = "data/processed"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "X_onehot.npy")
    save_numpy_array(X_onehot, output_path)
    # Confirm the destination, mirroring the k-mer cell
    print(f"Successfully saved one-hot encoding to: {output_path}")
except Exception as e:
    print("❌ One-hot encoding failed:", e)

One-hot encoded shape: (1, 160737)


# ✅ Done

In [23]:
# The one-hot step is optional and reports its own failures without stopping
# the notebook, so check which output files are actually on disk before
# claiming that everything was saved.
# NOTE(review): assumes save_numpy_array writes exactly to the path it is
# given — confirm against scripts/encode_dna.py.
expected_outputs = [
    os.path.join("data/processed", file_name)
    for file_name in ("X_kmer.npy", "X_onehot.npy")
]
missing_outputs = [p for p in expected_outputs if not os.path.isfile(p)]

if missing_outputs:
    print(
        "⚠️ Preprocessing finished, but these expected outputs were not found:",
        ", ".join(missing_outputs),
    )
else:
    print("🎉 Preprocessing complete. Encoded data saved.")

🎉 Preprocessing complete. Encoded data saved.
