# DeepBio-Scan: Large-Scale Atlas Seeding (N=100,000)
## Phase 1: Reference Atlas Generation

**Personas Active:**
- `@Embedder-ML` (Model Logic & Inference)
- `@Data-Ops` (Data Pipeline & Parquet Export)

**Hardware Target:** Google Colab T4 GPU (or better)

# DeepBio-Scan: Colab Seeding Phase
## Phase 1: Reference Atlas Generation

**Personas Active:**
- `@Embedder-ML` (Model Logic & Inference)
- `@Data-Ops` (Data Pipeline & Parquet Export)

**Hardware Target:** Google Colab T4 GPU (or better)

In [None]:
# @Data-Ops: Dependency Setup
# Lines starting with "!" are executed as shell commands by the notebook,
# not as Python statements.
# Fix: Uninstall torch_xla to prevent ABI conflicts with updated libraries on GPU runtimes
# (-y answers the confirmation prompt so the cell never blocks on input).
!pip uninstall -y torch_xla
# transformers is pinned to 4.40.2; every other package listed here is
# upgraded to whatever the newest available release is at install time.
!pip install --upgrade transformers==4.40.2 pandas pyarrow duckdb lancedb accelerate biopython

In [None]:
import os
import torch
import pandas as pd
import duckdb
import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig

# @Embedder-ML: GPU Acceleration Check
# Pick the compute target once; the banner below reports which one was chosen.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Running on: {device.upper()}")
if device != "cuda":
    # No CUDA device visible to PyTorch: inference still works, just slowly.
    print("WARNING: GPU not detected. Embedding will be slow.")

In [None]:
class ColabEmbedder:
    """Turn DNA strings into fixed-width, mean-pooled embedding vectors.

    @Embedder-ML: Wraps a Hugging Face masked-LM checkpoint (by default the
    50M-parameter Nucleotide Transformer v2). The model is loaded once in
    ``__init__`` and placed on CUDA when PyTorch reports a GPU, otherwise on
    the CPU. ``get_embeddings`` does the batched inference.

    Attributes set by ``__init__``:
        device: "cuda" or "cpu".
        hidden_dim: Width of one embedding vector, read from the model config.
        tokenizer: Tokenizer loaded for ``model_name``.
        model: The loaded model, in eval mode, on ``device``.
    """

    def __init__(self, model_name="InstaDeepAI/nucleotide-transformer-v2-50m-multi-species"):
        """
        @Embedder-ML: Initializes the Nucleotide Transformer with GPU optimization.

        Args:
            model_name: Hugging Face Hub identifier of the checkpoint to load.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        print(f"Initializing Model: {model_name}...")

        # NOTE(review): trust_remote_code=True lets the Hub repository supply
        # Python code that runs locally; only use checkpoints you trust.
        config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)

        # @Embedder-ML: Size Auto-Detection
        # The 50M model uses hidden_size=512, while larger ones use 768 or 1280.
        # We assume the config on HuggingFace is now correct (intermediate_size=2048 for 50M).
        # 512 is only a fallback for configs that do not expose hidden_size.
        self.hidden_dim = getattr(config, "hidden_size", 512)
        print(f"Auto-Detected Hidden Dimension: {self.hidden_dim}")

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        self.model = AutoModelForMaskedLM.from_pretrained(
            model_name,
            config=config,
            trust_remote_code=True,
        ).to(self.device)

        # Inference only: switch off dropout and other train-time behavior.
        self.model.eval()

        # Report the device actually in use instead of always claiming "GPU".
        device_label = "GPU" if self.device == "cuda" else "CPU"
        print(f"Model successfully loaded on {device_label}.")

    @staticmethod
    def _clean_sequence(seq):
        """Normalize one raw sequence string before tokenization.

        @Embedder-ML: Sequence Cleaner (Biological Reality)
        Real sequences from NCBI often contain whitespace, newlines, or "N"
        (unknown) characters. The sequence is upper-cased, every whitespace
        character is dropped (not just line breaks), and 'N' is replaced with
        'A' (neutral assumption) for model stability.
        """
        # str.split() with no argument splits on any run of whitespace, so
        # re-joining the pieces removes spaces, tabs, "\n" and "\r" in one go.
        return "".join(seq.upper().split()).replace("N", "A")

    def get_embeddings(self, sequences, batch_size=16, max_length=1000):
        """
        Generates embeddings with Shape Defense logic.

        Each batch is cleaned, tokenized (padded and truncated to
        ``max_length`` tokens), run through the model, and the last hidden
        layer is averaged over the positions the attention mask marks as real
        tokens, so padding does not dilute the vector.

        Args:
            sequences: Iterable of DNA strings.
            batch_size: Number of sequences per forward pass; must be positive.
            max_length: Token limit handed to the tokenizer; longer inputs
                are truncated.

        Returns:
            A float32 NumPy array of shape ``(len(sequences), hidden_dim)``;
            shape ``(0, hidden_dim)`` when ``sequences`` is empty.

        Raises:
            ValueError: If ``batch_size`` is not positive, or if the pooled
                output width differs from ``self.hidden_dim``.
        """
        if batch_size <= 0:
            # A negative step would make the loop below silently do nothing.
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        # Materialize once so generators and other non-sliceable iterables work.
        sequences = list(sequences)
        all_embeddings = []

        # Process in batches
        for start in range(0, len(sequences), batch_size):
            batch = [self._clean_sequence(seq) for seq in sequences[start:start + batch_size]]

            # Tokenization
            inputs = self.tokenizer(
                batch,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs, output_hidden_states=True)

                # Mean Pooling Strategy
                last_hidden_state = outputs.hidden_states[-1]
                # Broadcast the mask across the feature axis so it can zero
                # out the hidden states of padding positions.
                attention_mask = inputs["attention_mask"].unsqueeze(-1).expand(last_hidden_state.size()).float()

                sum_embeddings = torch.sum(last_hidden_state * attention_mask, 1)
                # Clamp guards against division by zero for an all-padding row.
                sum_mask = torch.clamp(attention_mask.sum(1), min=1e-9)
                mean_embeddings = sum_embeddings / sum_mask

                # @Embedder-ML: Shape Defense
                # Verify dimensions are (Batch_Size, Hidden_Dim)
                if mean_embeddings.shape[1] != self.hidden_dim:
                    raise ValueError(f"Shape Mismatch! Expected {self.hidden_dim}, got {mean_embeddings.shape[1]}")

                all_embeddings.append(mean_embeddings.cpu().numpy())

        if not all_embeddings:
            # Same dtype as the non-empty path so callers always see float32.
            return np.empty((0, self.hidden_dim), dtype=np.float32)

        # @Data-Ops: Vector Type Casting (Memory Optimization)
        # LanceDB and 32GB USB drive performance optimization (float32)
        return np.concatenate(all_embeddings, axis=0).astype(np.float32)

In [None]:
# @Data-Ops: File Upload Utility
# Run this cell to upload 'Expedition_DeepSea_Batch.fasta' from your local machine.
try:
    # Only the import is guarded: an ImportError raised later, inside the
    # upload itself, is a real failure and must not be reported as
    # "not running in Colab".
    from google.colab import files
except ImportError:
    print("Not running in Google Colab. Skipping upload widget.")
else:
    print("Initiating Transport Uplink...")
    # Blocks until the browser upload dialog is completed; the result maps
    # each uploaded file name to its content.
    uploaded = files.upload()
    for fn, payload in uploaded.items():
        print(f"Received artifact: {fn} ({len(payload)} bytes)")

In [None]:
# @Data-Ops: Real Data Ingestion
# Looks for a Parquet file at `input_path`. When it is present it becomes `df`;
# when it is missing, a 100-row synthetic table is built instead so the rest of
# the notebook can still be exercised end to end.

import os
import pandas as pd

input_path = "real_deepsea_data.parquet" 

if not os.path.exists(input_path):
    print("Input file not found. Falling back to synthetic test.")

    # @Data-Ops: Data Ingestion Simulation
    # In a real scenario, this would query OBIS or NCBI APIs.
    # The demo table alternates between two species and reuses a single
    # placeholder DNA string (1,000 bases) for every row.
    print("Generating Synthetic Deep-Sea Dataset for Demo...")
    synthetic_data = {
        "id": list(map("seq_{}".format, range(100))),
        "species": ["Grimpoteuthis sp.", "Bathynomus giganteus"] * 50,
        "sequence": ["AGTC" * 250] * 100,  # Placeholder DNA
    }
    df = pd.DataFrame(synthetic_data)
    print(f"Loaded {len(df)} sequences for processing.")
else:
    print(f"Found real dataset: {input_path}")
    df = pd.read_parquet(input_path)
    print(f"Loaded {len(df)} real sequences.")

In [None]:
# @Data-Ops: FASTA to Parquet Conversion
# If you uploaded a FASTA file (e.g., from the DeepBio fetch script), convert it here.
# Otherwise, skip this cell if using a pre-made Parquet.

import os  # imported here too so the cell does not depend on another cell having run

from Bio import SeqIO
import pandas as pd

fasta_input = "Expedition_DeepSea_Batch.fasta" # Upload this file to Colab first

if os.path.exists(fasta_input):
    print(f"Processing FASTA: {fasta_input}")
    records = []
    for record in SeqIO.parse(fasta_input, "fasta"):
        # Header format from fetcher: >Accession | Species | Depth:XM
        parts = [part.strip() for part in record.description.split("|")]

        # Robust parsing: tolerate headers with no "|" separators and fields
        # that are present but blank.
        records.append({
            "id": parts[0] or record.id,
            "species": parts[1] if len(parts) > 1 and parts[1] else "Unknown",
            "sequence": str(record.seq),
        })

    # Name the columns explicitly so a FASTA with zero records still yields
    # the id/species/sequence schema that later cells index into.
    df = pd.DataFrame(records, columns=["id", "species", "sequence"])
    print(f"Converted {len(df)} FASTA records to DataFrame.")

    # `df` is now ready for the embedding cell. The copy written below uses
    # the same path the Real Data Ingestion cell reads, so re-running that
    # cell later picks up this data instead of the synthetic fallback.
    input_path = "real_deepsea_data.parquet"
    df.to_parquet(input_path)
    print("Saved to temporary Parquet for embedding pipeline.")
else:
    print(f"FASTA file {fasta_input} not found. Skipping conversion.")

In [None]:
# @Data-Ops: Pipeline Execution
# Embeds every row of `df` and writes the table, with its vectors, to Parquet.

# Destination of the finished atlas.
output_file = "deepbio_reference_atlas.parquet"

# Step 1 - load the model (default checkpoint).
embedder = ColabEmbedder()

# Step 2 - embed the "sequence" column, 32 sequences per forward pass.
print("Starting embedding process...")
vectors = embedder.get_embeddings(sequences=df["sequence"].tolist(), batch_size=32)

# Step 3 - attach one vector per row, in row order.
df["vector"] = [row for row in vectors]
print(f"Vectors Generated. Shape: {vectors.shape}")

# Step 4 - persist metadata and vectors together (Optimized for LanceDB).
df.to_parquet(output_file)
print(f"SUCCESS: Atlas saved to {output_file}. Ready for USB Transfer.")