In [None]:
import pandas as pd
import torch
from Bio import SeqIO
from Bio.Blast import NCBIWWW, NCBIXML
from transformers import BertForSequenceClassification, BertTokenizer

In [None]:
# Load the whole FASTA file into memory so later cells can index into
# `records` by position.
fasta_file = "data/golden_dataset.fasta"
records = [*SeqIO.parse(fasta_file, "fasta")]

# Summarise what was loaded: total count first, then one line per record.
num_records = len(records)
print(f"Loaded {num_records} sequences from the Golden Dataset.")
for record in records:
    seq_length = len(record.seq)
    print(f"- ID: {record.id}, Length: {seq_length} bp")

In [None]:
print("\n--- Testing DNABERT Classifier ---")
model_name = 'zhihan1996/dna_bert_3'
KMER_SIZE = 3          # k used by the checkpoint named above (the "_3" suffix)
ANGLERFISH_INDEX = 2   # position of the Anglerfish record in the FASTA file
MAX_TOKENS = 512       # truncation limit passed to the tokenizer

tokenizer = BertTokenizer.from_pretrained(model_name)
# NOTE(review): if this checkpoint is only pre-trained (not fine-tuned for
# classification), the classification head is freshly initialised and the
# predicted class carries no biological meaning -- confirm before relying on it.
model = BertForSequenceClassification.from_pretrained(model_name)


def seq_to_kmers(sequence, k=KMER_SIZE):
    """Convert a nucleotide string into space-separated overlapping k-mers.

    DNABERT expects its input as k-mer "words" separated by spaces
    (e.g. "ATGC" with k=3 -> "ATG TGC"); a raw contiguous sequence does not
    match any vocabulary entry.

    Args:
        sequence: Nucleotide sequence as a plain string.
        k: Length of each k-mer.

    Returns:
        A single string of overlapping k-mers joined by spaces.

    Raises:
        ValueError: If the sequence is shorter than ``k``.
    """
    if len(sequence) < k:
        raise ValueError(
            f"Sequence of length {len(sequence)} is shorter than k={k}."
        )
    # Upper-case so soft-masked (lower-case) FASTA bases still match the
    # k-mer vocabulary -- assumes the vocabulary is upper-case; TODO confirm.
    sequence = sequence.upper()
    return " ".join(sequence[i:i + k] for i in range(len(sequence) - k + 1))


# Test with the Anglerfish sequence
if len(records) <= ANGLERFISH_INDEX:
    raise IndexError(
        f"Expected at least {ANGLERFISH_INDEX + 1} records in the dataset, "
        f"found {len(records)}."
    )
anglerfish_seq = str(records[ANGLERFISH_INDEX].seq)
anglerfish_kmers = seq_to_kmers(anglerfish_seq)
inputs = tokenizer(
    anglerfish_kmers,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=MAX_TOKENS,
)

# Sanity check: unknown tokens mean the input did not match the vocabulary
# (e.g. ambiguous bases such as N, or a k-mer size mismatch).
unk_count = int((inputs["input_ids"] == tokenizer.unk_token_id).sum().item())
if unk_count:
    print(f"Warning: {unk_count} input token(s) mapped to the unknown token.")

with torch.no_grad():
    outputs = model(**inputs)
    probabilities = torch.softmax(outputs.logits, dim=1)
    confidence, predicted_class_id = torch.max(probabilities, dim=1)

print("Model prediction for Anglerfish sequence:")
print(f"  - Predicted Class ID: {predicted_class_id.item()}")
print(f"  - Confidence: {confidence.item():.4f}")
print("This confirms the model loads and provides an output.")

In [None]:
print("\n--- Testing BLAST Check ---")
print("Running BLAST on the Anglerfish sequence. This may take a moment...")
try:
    # qblast submits the query to NCBI over the network and returns a handle
    # to the XML result; it has to be parsed, not iterated line by line.
    result_handle = NCBIWWW.qblast("blastn", "nt", anglerfish_seq, megablast=True, hitlist_size=1)
    try:
        blast_records = list(NCBIXML.parse(result_handle))
    finally:
        # Release the result handle whether or not parsing succeeded.
        result_handle.close()
except Exception as e:
    # Best-effort demo: report network/service/parsing failures instead of
    # aborting the notebook run.
    print(f"BLAST query failed: {e}")
else:
    if blast_records and blast_records[0].descriptions:
        title = blast_records[0].descriptions[0].title
        print(f"BLAST found a match: '{title}'")
    else:
        print("BLAST found no significant match.")
    # Only claim success when the query actually completed.
    print("This confirms our 'hidden pain point' demo is working.")