# Hyperbolic AMP Navigator

## Multi-Objective Antimicrobial Peptide Optimization

**Partner:** Carlos Brizuela  
**Objective:** Navigate the hyperbolic latent space to design AMPs with high activity and low toxicity

### Key Features
1. Visualize AMPs in hyperbolic (p-adic) embedding space
2. Identify toxic vs non-toxic clusters
3. Sketch the path from toxic to safe regions (drawn here as a straight line between cluster centroids in PCA space, as a stand-in for a geodesic)
4. Run NSGA-II optimization in latent space

In [None]:
# Standard imports
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Circle

# Put the project root on the import path. The root is taken to be two
# directory levels above the current working directory.
project_root = Path.cwd().parents[1]
sys.path.insert(0, str(project_root))

# Project imports are best-effort: a failed import is reported with a
# warning instead of being raised.
try:
    from scripts.optimization.latent_nsga2 import (
        LatentNSGA2,
        OptimizationConfig,
        create_mock_objectives,
    )
except ImportError as exc:
    print(f"Warning: Could not load optimizer: {exc}")
else:
    print("NSGA-II optimizer loaded successfully")

## 1. Load AMP Data

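Load pre-computed hyperbolic embeddings from StarPepDB. If the processed data file is not found, a synthetic two-cluster (toxic / non-toxic) demo dataset is generated instead.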

In [None]:
import torch

# Path to processed data
data_path = project_root / "data" / "processed" / "starpep_hyperbolic.pt"

if data_path.exists():
    # NOTE(security): torch.load unpickles the file, so only load files that
    # come from a trusted source.
    # NOTE(review): recent PyTorch releases may default to weights_only=True,
    # which can reject non-tensor metadata -- confirm against the installed
    # version and the saved file.
    # map_location="cpu" lets a file saved from a GPU session load on a
    # CPU-only machine.
    data = torch.load(data_path, map_location="cpu")
    # detach() drops autograd tracking, which Tensor.numpy() refuses to
    # convert; cpu() guards against tensors that are still on an accelerator.
    embeddings = data['embeddings'].detach().cpu().numpy()
    # NOTE(review): later cells read metadata.columns, so this is presumably
    # a pandas DataFrame -- confirm against the file that was saved.
    metadata = data['metadata']
    print(f"Loaded {len(embeddings)} peptides")
    print(f"Embedding dimension: {embeddings.shape[1]}")
else:
    # Generate synthetic demo data
    print("Data not found. Generating demo data...")
    np.random.seed(42)

    # Single source of truth for the width of the synthetic embeddings
    demo_latent_dim = 16

    # Create two clusters: toxic and non-toxic
    n_toxic = 100
    n_safe = 200

    # Gaussian blobs centred at +1.5 (toxic) and -0.5 (safe) on every axis.
    # The draw order (toxic, safe, activity) is kept so the seeded values
    # stay reproducible.
    toxic_embeddings = np.random.randn(n_toxic, demo_latent_dim) * 0.3 + 1.5
    safe_embeddings = np.random.randn(n_safe, demo_latent_dim) * 0.4 - 0.5

    embeddings = np.vstack([toxic_embeddings, safe_embeddings])
    # Label 1 marks the toxic rows, 0 the safe rows, in the stacking order
    labels = np.array([1] * n_toxic + [0] * n_safe)

    metadata = pd.DataFrame({
        'sequence': [f'PEPTIDE_{i}' for i in range(len(embeddings))],
        'toxic': labels,
        'activity': np.random.rand(len(embeddings)),
    })

    print(f"Generated {len(embeddings)} synthetic peptides")

## 2. Visualize Latent Space

Use PCA to reduce the embeddings to two dimensions for visualization (UMAP could be used as an alternative).

In [None]:
from sklearn.decomposition import PCA

# Fit a two-component PCA on the embeddings and keep the fitted model
# around so other points can be projected with the same transform.
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(embeddings)

# Total share of the variance captured by the two retained components
variance_explained = pca.explained_variance_ratio_.sum()
print(f"Variance explained: {variance_explained:.2%}")

In [None]:
# Scatter the 2D projection, coloured by toxicity label when one is available
fig, ax = plt.subplots(figsize=(10, 8))

if 'toxic' not in metadata.columns:
    # No toxicity labels: draw every point in a single unlabelled series
    ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.6)
else:
    toxic_mask = metadata['toxic'] == 1
    safe_mask = ~toxic_mask

    # Non-toxic points are drawn first, then toxic points on top
    for group_mask, group_color, group_label in (
        (safe_mask, 'green', 'Non-toxic'),
        (toxic_mask, 'red', 'Toxic'),
    ):
        ax.scatter(
            embeddings_2d[group_mask, 0],
            embeddings_2d[group_mask, 1],
            c=group_color, alpha=0.6, label=group_label, s=50,
        )

    # Cluster centroids, computed in the 2D PCA projection
    safe_centroid = embeddings_2d[safe_mask].mean(axis=0)
    toxic_centroid = embeddings_2d[toxic_mask].mean(axis=0)

    ax.scatter(*safe_centroid, c='darkgreen', s=200, marker='*', label='Safe Center')
    ax.scatter(*toxic_centroid, c='darkred', s=200, marker='*', label='Toxic Center')

    # Arrow from the toxic centroid to the safe centroid. This is a straight
    # line in PCA space, not a geodesic computed in the hyperbolic space.
    ax.annotate(
        '',
        xy=safe_centroid,
        xytext=toxic_centroid,
        arrowprops=dict(arrowstyle='->', color='blue', lw=2),
    )
    # Caption placed slightly above the midpoint of the arrow
    midpoint = (safe_centroid + toxic_centroid) / 2
    ax.text(midpoint[0], midpoint[1] + 0.2,
            'Optimization Path', fontsize=10, color='blue')

ax.set_xlabel('PC1', fontsize=12)
ax.set_ylabel('PC2', fontsize=12)
ax.set_title('AMP Hyperbolic Latent Space', fontsize=14)
ax.legend(loc='upper right')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Run NSGA-II Optimization

Optimize latent coordinates to find Pareto-optimal peptides.

In [None]:
# Configure optimization.
# The latent dimension follows the loaded embeddings rather than a
# hard-coded 16: the Pareto latents are later projected with the PCA model
# fitted on `embeddings`, which requires the same number of features.
config = OptimizationConfig(
    latent_dim=embeddings.shape[1],
    population_size=100,
    generations=50,
    seed=42,
)

# Define objectives (using mocks for demo)
objectives = create_mock_objectives()

print("Optimization Config:")
print(f"  Population: {config.population_size}")
print(f"  Generations: {config.generations}")
print(f"  Objectives: {len(objectives)}")

In [None]:
# Build the optimizer from the config and objectives defined above, then
# run it with progress output enabled. The returned collection is kept as
# the Pareto front used by the following cells.
optimizer = LatentNSGA2(config, objectives)
pareto_front = optimizer.run(verbose=True)

front_size = len(pareto_front)
print(f"\nPareto front size: {front_size}")

In [None]:
# Visualize the Pareto front in latent space (left) and objective space (right)

# Stack the latent vectors and project them with the PCA model fitted on
# the original embeddings, so both point sets share the same axes.
pareto_latents = np.array([ind.latent for ind in pareto_front])
pareto_2d = pca.transform(pareto_latents)
objectives_arr = np.array([ind.objectives for ind in pareto_front])

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
latent_ax, objective_ax = axes

# Left panel: original embeddings in gray, Pareto solutions as blue diamonds
latent_ax.scatter(*embeddings_2d.T, alpha=0.3, c='gray', label='Original')
latent_ax.scatter(*pareto_2d.T, c='blue', s=100, marker='D', label='Pareto Front')
latent_ax.set_xlabel('PC1')
latent_ax.set_ylabel('PC2')
latent_ax.set_title('Pareto-Optimal Solutions in Latent Space')
latent_ax.legend()
latent_ax.grid(True, alpha=0.3)

# Right panel: first two objective values of each Pareto solution
objective_ax.scatter(objectives_arr[:, 0], objectives_arr[:, 1], c='blue', s=50)
objective_ax.set_xlabel('Objective 1 (Reconstruction)')
objective_ax.set_ylabel('Objective 2 (Toxicity)')
objective_ax.set_title('Pareto Front in Objective Space')
objective_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Export Best Candidates

Export up to 20 Pareto-optimal latent vectors, together with their objective values, to a CSV file for decoding.

In [None]:
# Tabulate at most the first 20 entries of the Pareto front, in the order
# they were returned. Each row holds a 1-based rank, the first two objective
# values, a third objective (NaN when there are only two), and one z_<j>
# column per latent coordinate.
results = [
    {
        'rank': position,
        'objective_1': ind.objectives[0],
        'objective_2': ind.objectives[1],
        'objective_3': ind.objectives[2] if len(ind.objectives) > 2 else np.nan,
        **{f'z_{j}': z for j, z in enumerate(ind.latent)},
    }
    for position, ind in enumerate(pareto_front[:20], start=1)
]

results_df = pd.DataFrame(results)
print(results_df.head(10))

In [None]:
# Write the results table to <project_root>/results as CSV, creating the
# directory first if it does not exist yet.
output_dir = project_root / 'results'
output_dir.mkdir(parents=True, exist_ok=True)

output_path = output_dir / 'pareto_peptides_brizuela.csv'
results_df.to_csv(output_path, index=False)
print(f"Saved results to {output_path}")

## Summary

This notebook demonstrates:
1. Loading and visualizing AMP embeddings in hyperbolic space
2. Identifying toxic vs non-toxic clusters
3. Running NSGA-II to find Pareto-optimal solutions
4. Exporting candidates for experimental validation

**Next Steps:**
- Decode latent vectors back to sequences using VAE decoder
- Validate predicted peptides with toxicity assays
- Iterate based on experimental feedback