# Dengue Serotype Forecaster

## Hyperbolic Trajectory Analysis for Arboviral Surveillance

**Partner:** Alejandra Rojas (IICS-UNA)  
**Objective:** Predict serotype dominance and identify stable primer regions

### Key Features
1. Track Dengue serotype evolution in hyperbolic space
2. Compute "Hyperbolic Momentum" to forecast future trajectories
3. Identify stable genomic regions for RT-PCR primers
4. Interactive dashboard for surveillance

In [None]:
# Standard imports
import sys
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import FancyArrowPatch
import seaborn as sns

# Treat the directory two levels above the current working directory as the
# project root.
project_root = Path.cwd().parents[1]

# Prepend it to the module search path so it is searched first on import.
sys.path[:0] = [str(project_root)]

print(f"Project root: {project_root}")

## 1. Load Dengue Sequence Data

Load pre-processed sequences or generate demo data.

In [None]:
# Location of the raw dengue FASTA file, relative to the project root
fasta_path = project_root / "data" / "raw" / "dengue_paraguay.fasta"

if not fasta_path.exists():
    print("Data not found. Generating demo data...")
else:
    print(f"Found data at {fasta_path}")
    # Biopython is imported only when there is a file to parse
    from Bio import SeqIO

    # Walk the records one at a time and tally them
    n_seqs = 0
    for _record in SeqIO.parse(fasta_path, 'fasta'):
        n_seqs += 1
    print(f"Total sequences: {n_seqs}")

In [None]:
# Seed NumPy's global generator so the simulated data is reproducible
np.random.seed(42)

# Demo setup: ten yearly time points (2015-2024) for four serotypes
years = list(range(2015, 2025))
serotypes = ['DENV-1', 'DENV-2', 'DENV-3', 'DENV-4']


def _simulate_track(time_points):
    """Simulate one serotype as a noisy drift in 2D, one row per time point."""
    # Draw order matters for reproducibility: start point first, then drift
    origin = np.random.randn(2) * 0.5
    drift = np.random.randn(2) * 0.1

    rows = []
    here = origin.copy()
    for yr in time_points:
        # Constant drift plus a small random perturbation each year
        here = here + drift + np.random.randn(2) * 0.05
        sample_count = np.random.randint(5, 50)
        spread = np.random.rand() * 0.1
        rows.append({
            'year': yr,
            'x': here[0],
            'y': here[1],
            'n_sequences': sample_count,
            'variance': spread,
        })
    return pd.DataFrame(rows)


# One DataFrame per serotype, simulated in the order listed above
trajectories = {name: _simulate_track(years) for name in serotypes}

print(f"Generated trajectories for {len(serotypes)} serotypes over {len(years)} years")

## 2. Visualize Serotype Trajectories

Plot the evolution of each serotype in hyperbolic space.

In [None]:
# Color palette for serotypes (also used by the later plotting cells)
colors = {'DENV-1': '#1f77b4', 'DENV-2': '#ff7f0e',
          'DENV-3': '#2ca02c', 'DENV-4': '#d62728'}

fig, ax = plt.subplots(figsize=(12, 10))

for sero, traj in trajectories.items():
    # Plot trajectory line
    ax.plot(traj['x'], traj['y'], '-o', color=colors[sero],
            alpha=0.7, markersize=8, label=sero)

    # Mark start (square) and end (star)
    ax.scatter(traj['x'].iloc[0], traj['y'].iloc[0],
               c=colors[sero], s=150, marker='s', edgecolor='black', zorder=5)
    ax.scatter(traj['x'].iloc[-1], traj['y'].iloc[-1],
               c=colors[sero], s=200, marker='*', edgecolor='black', zorder=5)

    # Label every other (even) year. The columns are zipped directly instead
    # of using iterrows(): iterrows() converts each mixed int/float row to a
    # float Series, which rendered the labels as "2016.0" instead of "2016".
    for year, x, y in zip(traj['year'], traj['x'], traj['y']):
        if year % 2 == 0:
            ax.annotate(str(int(year)), (x, y), fontsize=8, alpha=0.7)

# Origin marker: added before the legend is built so that its 'Origin'
# label actually appears in the legend.
ax.scatter(0, 0, c='black', s=100, marker='x', label='Origin')

ax.set_xlabel('Hyperbolic X', fontsize=12)
ax.set_ylabel('Hyperbolic Y', fontsize=12)
ax.set_title('Dengue Serotype Evolution in Hyperbolic Space (2015-2024)', fontsize=14)
ax.legend(loc='upper right', fontsize=10)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Compute Hyperbolic Momentum

Calculate velocity vectors to predict future movement.

In [None]:
def compute_velocity(trajectory, window=3):
    """Estimate the recent velocity of a trajectory.

    The displacement between the first and last of the trailing ``window``
    points is measured and converted to a per-step velocity.

    Args:
        trajectory: DataFrame with ``x`` and ``y`` columns. Rows are assumed
            to be ordered oldest to newest with one row per time step.
        window: Number of trailing points to use; clamped to the number of
            rows available.

    Returns:
        dict with keys ``direction`` (unit vector of the displacement, or the
        zero vector when there is no movement), ``magnitude`` (length of the
        displacement across the window) and ``velocity`` (displacement per
        step).

    Raises:
        ValueError: If the trajectory is empty or ``window`` is less than 1.
    """
    window = min(window, len(trajectory))
    if window < 1:
        raise ValueError(
            "compute_velocity requires a non-empty trajectory and window >= 1"
        )

    recent = trajectory.tail(window)
    start = np.array([recent['x'].iloc[0], recent['y'].iloc[0]])
    end = np.array([recent['x'].iloc[-1], recent['y'].iloc[-1]])

    direction = end - start
    magnitude = np.linalg.norm(direction)

    # `window` points span only `window - 1` steps; dividing by `window`
    # would underestimate the per-step velocity. A single point has no
    # elapsed steps, so its velocity is defined as zero.
    steps = window - 1
    velocity = direction / steps if steps > 0 else np.zeros_like(direction)

    return {
        'direction': direction / magnitude if magnitude > 0 else direction,
        'magnitude': magnitude,
        'velocity': velocity
    }

# Velocity estimate for every serotype, keyed by serotype name
velocities = {
    name: compute_velocity(track) for name, track in trajectories.items()
}

# Report the magnitude and unit direction for each serotype
print("Serotype Velocities (Hyperbolic Momentum):")
print("-" * 50)
for sero, vel in velocities.items():
    heading_x, heading_y = vel['direction']
    print(f"{sero}: Magnitude = {vel['magnitude']:.4f}")
    print(f"       Direction = ({heading_x:.3f}, {heading_y:.3f})")

In [None]:
# Draw each serotype's velocity as an arrow starting at its latest position
fig, ax = plt.subplots(figsize=(12, 10))

for sero, traj in trajectories.items():
    shade = colors[sero]

    # Latest recorded position of this serotype
    current = np.array([traj['x'].iloc[-1], traj['y'].iloc[-1]])

    # The stored velocity is tripled so the arrow is long enough to see
    vel = 3 * velocities[sero]['velocity']
    tip = current + vel

    # Star at the current position
    ax.scatter(current[0], current[1], c=shade, s=200, marker='*',
               edgecolor='black', zorder=5, label=sero)

    # Arrow from the current position to the arrow tip
    ax.annotate('', xy=tip, xytext=current,
                arrowprops={'arrowstyle': '->', 'color': shade, 'lw': 3})

    # Faint circle at the arrow tip
    ax.scatter(tip[0], tip[1], c=shade, s=100, marker='o',
               alpha=0.5, edgecolor='black')

ax.set_xlabel('Hyperbolic X', fontsize=12)
ax.set_ylabel('Hyperbolic Y', fontsize=12)
ax.set_title('Serotype Momentum Vectors (2024 → 2025 Forecast)', fontsize=14)
ax.legend(loc='upper right')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Risk Assessment

Identify which serotype is moving towards "high-risk" regions.

In [None]:
def compute_risk_score(trajectory, velocity):
    """Score a serotype's recent movement.

    The score is a weighted sum, scaled by 10, of three terms:
    - divergence (weight 0.4): change in distance from the origin between the
      latest point and the latest point shifted by ``velocity['velocity']``
    - speed (weight 0.4): ``velocity['magnitude']``
    - variance (weight 0.2): mean of the trajectory's ``variance`` column

    Args:
        trajectory: DataFrame with ``x``, ``y`` and ``variance`` columns.
        velocity: dict with ``velocity`` (2-vector) and ``magnitude`` entries.

    Returns:
        dict with ``risk_score``, ``divergence``, ``speed``,
        ``current_distance`` and ``predicted_distance``.
    """
    # Latest position and where one velocity step would move it
    here = np.array([trajectory[axis].iloc[-1] for axis in ('x', 'y')])
    there = here + velocity['velocity']

    # Distances from the origin before and after that step
    dist_now = np.linalg.norm(here)
    dist_next = np.linalg.norm(there)

    # Positive when the step moves away from the origin
    outward = dist_next - dist_now
    pace = velocity['magnitude']
    instability = trajectory['variance'].mean()

    weighted = 0.4 * outward + 0.4 * pace + 0.2 * instability

    return {
        'risk_score': weighted * 10,
        'divergence': outward,
        'speed': pace,
        'current_distance': dist_now,
        'predicted_distance': dist_next
    }

# Score every serotype from its trajectory and its velocity estimate
risk_scores = {
    name: compute_risk_score(track, velocities[name])
    for name, track in trajectories.items()
}

# One row per serotype, ordered from highest to lowest risk score
risk_df = pd.DataFrame(risk_scores).T.sort_values('risk_score', ascending=False)

print("\nRisk Assessment (Highest to Lowest):")
print("=" * 60)
summary_cols = ['risk_score', 'divergence', 'speed']
print(risk_df[summary_cols].round(3))

In [None]:
# Two-panel figure: risk ranking on the left, sequence counts on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
risk_ax, counts_ax = axes

# Left panel: horizontal bars sorted ascending, so the largest score is on top
risk_sorted = risk_df.sort_values('risk_score', ascending=True)
bar_colors = [colors[name] for name in risk_sorted.index]
bars = risk_ax.barh(risk_sorted.index, risk_sorted['risk_score'],
                    color=bar_colors)
risk_ax.set_xlabel('Risk Score')
risk_ax.set_title('Serotype Risk Scores (2025 Forecast)')
# Dashed reference line at a score of zero
risk_ax.axvline(0, color='gray', linestyle='--')

# Right panel: yearly sequence counts, one line per serotype
for sero, traj in trajectories.items():
    counts_ax.plot(traj['year'], traj['n_sequences'], '-o',
                   color=colors[sero], label=sero)
counts_ax.set_xlabel('Year')
counts_ax.set_ylabel('Number of Sequences')
counts_ax.set_title('Serotype Detection Frequency')
counts_ax.legend()
counts_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Primer Stability Analysis

Identify stable genomic regions for RT-PCR primer design.

In [None]:
# Check for primer candidates
primer_path = project_root / "results" / "primer_candidates.csv"

if primer_path.exists():
    primers = pd.read_csv(primer_path)
    print(f"Loaded {len(primers)} primer candidates")
else:
    # Generate demo primer data
    print("Primer data not found. Generating demo data...")
    np.random.seed(42)

    n_primers = 20
    primers = pd.DataFrame({
        'rank': range(1, n_primers + 1),
        'position': np.random.randint(100, 10000, n_primers),
        'sequence': [''.join(np.random.choice(list('ATGC'), 20)) for _ in range(n_primers)],
        'stability_score': np.random.rand(n_primers) * 0.3 + 0.7,
        'conservation_score': np.random.rand(n_primers) * 0.2 + 0.8,
        'gc_content': np.random.rand(n_primers) * 0.2 + 0.4,
        'tm_estimate': np.random.rand(n_primers) * 10 + 55
    })

# The primer plotting cell reads 'combined_score'. Derive it for both the
# demo data and a loaded CSV that lacks it, so the two branches always yield
# the same columns. A CSV that already has the column is left untouched.
score_inputs = ['stability_score', 'conservation_score']
if ('combined_score' not in primers.columns
        and all(col in primers.columns for col in score_inputs)):
    primers['combined_score'] = primers['stability_score'] * primers['conservation_score']

In [None]:
# Print the first ten candidates with their key design metrics
print("\nTop 10 Primer Candidates:")
print("=" * 80)
display_cols = [
    'rank', 'position', 'sequence', 'stability_score',
    'conservation_score', 'gc_content', 'tm_estimate',
]
top_primers = primers.head(10)[display_cols]
print(top_primers.to_string(index=False))

In [None]:
# 2x2 overview of the primer candidates
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(score_ax, gc_ax), (tm_ax, pos_ax) = axes

# Top-left: stability against conservation, colored by rank
scatter = score_ax.scatter(primers['conservation_score'],
                           primers['stability_score'],
                           c=primers['rank'], cmap='RdYlGn_r', s=100)
plt.colorbar(scatter, ax=score_ax, label='Rank')
score_ax.set_xlabel('Conservation Score')
score_ax.set_ylabel('Stability Score')
score_ax.set_title('Primer Stability vs Conservation')
score_ax.grid(True, alpha=0.3)

# Top-right: GC content histogram with a reference line at 0.5
gc_ax.hist(primers['gc_content'], bins=15, edgecolor='black', alpha=0.7)
gc_ax.axvline(0.5, color='green', linestyle='--', label='Optimal (50%)')
gc_ax.set_xlabel('GC Content')
gc_ax.set_ylabel('Count')
gc_ax.set_title('GC Content Distribution')
gc_ax.legend()

# Bottom-left: melting temperature histogram with the 58-62 band shaded
tm_ax.hist(primers['tm_estimate'], bins=15, edgecolor='black', alpha=0.7,
           color='orange')
tm_ax.axvspan(58, 62, alpha=0.3, color='green', label='Optimal Range')
tm_ax.set_xlabel('Melting Temperature (°C)')
tm_ax.set_ylabel('Count')
tm_ax.set_title('Tm Distribution')
tm_ax.legend()

# Bottom-right: combined score against genome position
pos_ax.scatter(primers['position'], primers['combined_score'],
               c='blue', s=100, alpha=0.7)
pos_ax.set_xlabel('Genome Position')
pos_ax.set_ylabel('Combined Score')
pos_ax.set_title('Primer Scores by Genome Position')
pos_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Export Results

Save forecasts and primer candidates for surveillance use.

In [None]:
# Create forecast report
# risk_df is sorted by descending risk_score, so its first index entry is the
# serotype with the highest combined risk score.
top_serotype = risk_df.index[0]

forecast_report = {
    # Stamp the report with the date it was actually generated rather than a
    # fixed literal that goes stale on every re-run.
    'analysis_date': pd.Timestamp.today().date().isoformat(),
    'risk_assessment': risk_df[['risk_score', 'divergence', 'speed']].to_dict(),
    'highest_risk_serotype': top_serotype,
    # The ranking uses the combined risk score, not divergence alone, so the
    # recommendation text says so.
    'recommendation': f"Monitor {top_serotype} closely - showing highest combined risk score."
}

print("\n" + "="*60)
print("FORECAST REPORT")
print("="*60)
print(f"\nHighest Risk Serotype: {forecast_report['highest_risk_serotype']}")
print(f"\nRecommendation: {forecast_report['recommendation']}")

In [None]:
# All outputs go under <project_root>/results, created if it does not exist
output_dir = project_root / 'results'
output_dir.mkdir(parents=True, exist_ok=True)

# Risk table, with the serotype index written as the first CSV column
risk_df.to_csv(output_dir / 'serotype_risk_assessment.csv')

# One CSV per serotype; the hyphen is dropped from the name (DENV-1 -> DENV1)
for sero, traj in trajectories.items():
    file_stem = sero.replace("-", "")
    traj.to_csv(output_dir / f'trajectory_{file_stem}.csv', index=False)

print(f"\nResults saved to {output_dir}")

## Summary

This notebook provides:
1. **Trajectory visualization** of Dengue serotypes in hyperbolic space
2. **Velocity analysis** (Hyperbolic Momentum) for forecasting
3. **Risk assessment** to prioritize surveillance
4. **Primer candidates** for stable RT-PCR design

### Key Findings
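- Serotype trajectories show distinct evolutionary patterns
- Velocity vectors predict near-term serotype dynamics
- Stable primers identified in conserved genomic regions

> **Note:** The trajectories in this notebook are simulated, and the primer candidates are simulated whenever `results/primer_candidates.csv` is absent, so these findings are illustrative until the analysis is run on real sequence data.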

### Next Steps
1. Validate predictions with 2025 surveillance data
2. Test primer candidates in lab assays
3. Integrate with IICS-UNA surveillance dashboard