# Geometric Rotamer Scoring Function

## P-adic Geometry for Protein Side-Chain Optimization

**Partner:** Dr. José Colbes  
**Objective:** Develop a geometric energy term that identifies unstable rotamers invisible to traditional scoring functions

### Key Hypothesis
Rare rotamers that appear stable in Euclidean space may be "fractured" in hyperbolic (p-adic) space, 
predicting instability that Rosetta and other physics-based methods miss.

In [None]:
# Standard imports
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Add project root
# The root is taken to be two directory levels above the current working
# directory, so this cell depends on where the notebook kernel was started.
project_root = Path.cwd().parents[1]
# Prepend (index 0) so modules under the project root are found first on import.
sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")

## 1. Load Rotamer Data

Load chi angle data from PDB structures.

In [None]:
import torch

# Path to processed data
data_path = project_root / "data" / "processed" / "rotamers.pt"

# Two outcomes, both of which define `chi_angles` (NumPy array) and `metadata`:
#   * the processed file exists  -> load it with torch
#   * otherwise                  -> fabricate a small, seeded demo data set
if data_path.exists():
    # NOTE(review): torch.load may unpickle arbitrary objects depending on the
    # installed torch version and its `weights_only` default -- only load
    # files from a trusted source.
    data = torch.load(data_path)
    # The file is indexed with the keys 'chi_angles' and 'metadata';
    # 'chi_angles' must support .numpy() (presumably a torch.Tensor -- confirm).
    chi_angles = data['chi_angles'].numpy()
    metadata = data['metadata']
    print(f"Loaded {len(metadata)} residues")
    print(f"Chi angles shape: {chi_angles.shape}")
else:
    # Generate synthetic demo data
    print("Data not found. Generating demo data...")
    # Fixed seed: the demo data is reproducible, but only as long as the
    # sequence of np.random calls below is left in exactly this order.
    np.random.seed(42)

    n_residues = 500

    # Standard rotamer conformations (gauche+, gauche-, trans)
    # Each entry is a (chi1, chi2) pair in radians.
    rotamer_centers = [
        [np.radians(60), np.radians(60)],    # gauche+/gauche+
        [np.radians(-60), np.radians(-60)],  # gauche-/gauche-
        [np.radians(180), np.radians(180)],  # trans/trans
    ]

    chi_angles = []
    residue_types = []
    is_rare = []

    residue_names = ['LEU', 'ILE', 'VAL', 'PHE', 'TYR', 'TRP', 'MET', 'ARG', 'LYS', 'GLU']

    for i in range(n_residues):
        # 90% common rotamers, 10% rare
        if np.random.rand() < 0.9:
            # Common: one of the three centres plus Gaussian noise (sigma 0.2 rad)
            center = rotamer_centers[np.random.randint(3)]
            noise = np.random.randn(2) * 0.2
            is_rare.append(False)
        else:
            # Rare rotamer: unusual angles
            # Centre drawn uniformly from [-180, 180) degrees, wider noise (sigma 0.3 rad)
            center = [np.radians(np.random.uniform(-180, 180)),
                      np.radians(np.random.uniform(-180, 180))]
            noise = np.random.randn(2) * 0.3
            is_rare.append(True)

        # Four chi slots per residue; only chi1/chi2 are populated, the rest are NaN.
        # The sum centre + noise is not wrapped, so values may fall outside [-pi, pi].
        chi = [center[0] + noise[0], center[1] + noise[1], np.nan, np.nan]
        chi_angles.append(chi)
        # Residue name is chosen independently of the rotamer class.
        residue_types.append(np.random.choice(residue_names))

    chi_angles = np.array(chi_angles)
    # One dict per residue; 'is_rare' is the ground-truth label of the demo data.
    metadata = [
        {'pdb_id': 'DEMO', 'chain_id': 'A', 'residue_id': i,
         'residue_name': residue_types[i], 'is_rare': is_rare[i]}
        for i in range(n_residues)
    ]

    print(f"Generated {n_residues} synthetic residues")
    print(f"Rare rotamers: {sum(is_rare)}")

## 2. Compute P-adic Distances

Map chi angles to hyperbolic space and compute distances.

In [None]:
def padic_valuation(n, p=3):
    """Return the p-adic valuation of ``n``.

    The valuation is the largest exponent ``v`` such that ``p**v`` divides
    ``n``.

    Args:
        n: Integer whose valuation is wanted. Negative integers are accepted
            and have the same valuation as their absolute value.
        p: Base of the valuation; must be at least 2.

    Returns:
        The exponent ``v`` as an int. For ``n == 0`` this function returns 0
        (the mathematical convention would be +infinity; 0 is kept so existing
        callers see unchanged results).

    Raises:
        ValueError: If ``p`` is smaller than 2. ``p == 1`` would divide
            forever and ``p == 0`` would fail with an unrelated
            ZeroDivisionError.
    """
    if p < 2:
        raise ValueError(f"p must be an integer >= 2, got {p!r}")
    if n == 0:
        return 0
    v = 0
    # Strip one factor of p per iteration until n is no longer divisible.
    while n % p == 0:
        v += 1
        n //= p
    return v

def angle_to_index(angle, bins=36):
    """Convert an angle in radians to the index of its equal-width bin.

    The angle is first wrapped into the half-open interval [0, 2*pi), which
    is then divided into ``bins`` equal bins.

    Args:
        angle: Angle in radians; any finite value, including negative angles
            and angles beyond one full turn.
        bins: Number of bins covering the full circle.

    Returns:
        An int in ``range(bins)``.

    Raises:
        ValueError: If ``angle`` is NaN or infinite.
    """
    if not np.isfinite(angle):
        raise ValueError(f"angle must be finite, got {angle!r}")

    full_turn = 2 * np.pi
    # Modulo wraps in constant time; stepping by 2*pi in a loop would take
    # time proportional to |angle| and never finish for an infinite input.
    wrapped = angle % full_turn
    # The trailing `% bins` covers the rounding case where a tiny negative
    # angle wraps to exactly 2*pi and would otherwise index one past the end.
    return int(wrapped / full_turn * bins) % bins

def hyperbolic_distance(chi, max_radius=0.999):
    """Compute a hyperbolic distance-from-origin score for a set of chi angles.

    Each non-NaN angle ``c`` is mapped to the coordinate ``tanh(c / pi)``.
    The Euclidean norm ``r`` of those coordinates is treated as a radius in
    the Poincare ball, and the score is ``2 * arctanh(r)``.

    Args:
        chi: Sequence of chi angles in radians. NaN entries mark missing
            angles and are ignored.
        max_radius: Upper bound applied to ``r`` before ``arctanh``; must lie
            strictly between 0 and 1. With several angles the norm can exceed
            1, where ``arctanh`` is undefined.

    Returns:
        The score as a float; 0.0 when every entry is NaN or ``chi`` is empty.

    Raises:
        ValueError: If ``max_radius`` is not strictly between 0 and 1.
    """
    if not 0.0 < max_radius < 1.0:
        raise ValueError(f"max_radius must be in (0, 1), got {max_radius!r}")

    angles = np.asarray(chi, dtype=float)
    valid = angles[~np.isnan(angles)]
    if valid.size == 0:
        return 0.0

    # Map to Poincare ball
    coords = np.tanh(valid / np.pi)
    # Clamp at max_radius itself. Clamping only once r reaches 1.0 would let
    # radii in (max_radius, 1.0) score higher than radii beyond 1.0, making
    # the score non-monotonic in r.
    r = min(float(np.linalg.norm(coords)), max_radius)

    return float(2 * np.arctanh(r))

In [None]:
# Score every residue: one hyperbolic distance per row of chi angles
hyp_distances = list(map(hyperbolic_distance, chi_angles))

# Per-residue table: metadata columns followed by the score and chi1/chi2
results = pd.DataFrame(metadata).assign(
    hyperbolic_distance=hyp_distances,
    chi1=chi_angles[:, 0],
    chi2=chi_angles[:, 1],
)

print("Hyperbolic distance statistics:")
print(results['hyperbolic_distance'].describe())

## 3. Identify Rare Rotamers

Find residues with high hyperbolic distance (potential instability).

In [None]:
# Residues above the 90th percentile of hyperbolic distance are predicted rare
threshold = np.percentile(hyp_distances, 90)
results['predicted_rare'] = results['hyperbolic_distance'].gt(threshold)

print(f"Threshold (90th percentile): {threshold:.3f}")
print(f"Predicted rare: {results['predicted_rare'].sum()}")

# Score the prediction only when the table carries an 'is_rare' label column
if 'is_rare' in results:
    from sklearn.metrics import confusion_matrix, classification_report

    print("\nClassification Report:")
    print(classification_report(results['is_rare'], results['predicted_rare']))

In [None]:
# Two panels: chi1/chi2 scatter (left) and hyperbolic-distance histogram (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: chi1 against chi2 in degrees, coloured by hyperbolic distance
ax = axes[0]
scatter = ax.scatter(
    np.degrees(results['chi1']),
    np.degrees(results['chi2']),
    c=results['hyperbolic_distance'],
    cmap='RdYlGn_r',
    alpha=0.7,
    s=30,
)
fig.colorbar(scatter, ax=ax, label='Hyperbolic Distance')
ax.set(
    xlabel='Chi1 (degrees)',
    ylabel='Chi2 (degrees)',
    title='Rotamer Distribution (Colored by Hyperbolic Distance)',
)
# Faint guides through the origin
ax.axhline(0, color='gray', linestyle='--', alpha=0.3)
ax.axvline(0, color='gray', linestyle='--', alpha=0.3)

# Right panel: histogram of the scores with the rare-rotamer cut-off marked
ax = axes[1]
ax.hist(results['hyperbolic_distance'], bins=50, edgecolor='black', alpha=0.7)
ax.axvline(threshold, color='red', linestyle='--', label=f'Rare threshold ({threshold:.2f})')
ax.set(
    xlabel='Hyperbolic Distance',
    ylabel='Count',
    title='Distribution of Hyperbolic Distances',
)
ax.legend()

fig.tight_layout()
plt.show()

## 4. Compare with Standard Rotamer Library

Identify residues that deviate from idealized rotamer centroids (a simplified stand-in for Dunbrack library conformations).

In [None]:
# Standard rotamer centers (chi1, chi2 in radians)
# NOTE(review): only five of the nine possible gauche+/gauche-/trans
# combinations for (chi1, chi2) are listed -- confirm this is intentional.
ROTAMER_CENTROIDS = {
    'gauche+': np.array([np.radians(60), np.radians(60)]),
    'gauche-': np.array([np.radians(-60), np.radians(-60)]),
    'trans': np.array([np.radians(180), np.radians(180)]),
    'g+/t': np.array([np.radians(60), np.radians(180)]),
    'g-/t': np.array([np.radians(-60), np.radians(180)]),
}


def find_nearest_rotamer(chi1, chi2):
    """Find the entry of ROTAMER_CENTROIDS closest to ``(chi1, chi2)``.

    Closeness is the Euclidean norm of the two per-angle differences, each
    measured the short way around the circle, so angles that differ by whole
    turns are treated as identical.

    Args:
        chi1: First chi angle in radians (any finite value, or NaN).
        chi2: Second chi angle in radians (any finite value, or NaN).

    Returns:
        A ``(name, distance)`` tuple. ``name`` is a key of ROTAMER_CENTROIDS
        and ``distance`` is the wrapped angular distance in radians. If
        either angle is NaN the result is ``('unknown', np.inf)``.
    """
    if np.isnan(chi1) or np.isnan(chi2):
        return 'unknown', np.inf

    point = np.array([chi1, chi2])
    min_dist = np.inf
    nearest = 'unknown'

    for name, center in ROTAMER_CENTROIDS.items():
        # Wrap each signed difference into [-pi, pi). Unlike
        # min(d, 2*pi - d), this stays correct when the raw difference
        # exceeds a full turn (un-normalised input angles).
        diff = (point - center + np.pi) % (2 * np.pi) - np.pi
        dist = np.linalg.norm(diff)
        # Strict '<' keeps the first centroid in dict order on ties.
        if dist < min_dist:
            min_dist = dist
            nearest = name

    return nearest, min_dist

# Look up the closest library rotamer for each residue's (chi1, chi2) pair
nearest_rotamers = [
    find_nearest_rotamer(chi1, chi2)
    for chi1, chi2 in zip(results['chi1'], results['chi2'])
]

# Split the (name, distance) pairs into two result columns
results['nearest_rotamer'] = [label for label, _ in nearest_rotamers]
results['euclidean_distance'] = [dist for _, dist in nearest_rotamers]

In [None]:
# Keep residues with a finite library distance and a positive hyperbolic score
has_library_distance = results['euclidean_distance'] < np.inf
has_hyperbolic_score = results['hyperbolic_distance'] > 0
valid = results[has_library_distance & has_hyperbolic_score]

# Pearson correlation between the two distance measures
correlation = np.corrcoef(valid['euclidean_distance'],
                          valid['hyperbolic_distance'])[0, 1]

fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(valid['euclidean_distance'], valid['hyperbolic_distance'], alpha=0.5)

# Degree-1 least-squares fit, drawn across the observed Euclidean range
trend = np.poly1d(
    np.polyfit(valid['euclidean_distance'], valid['hyperbolic_distance'], 1)
)
x_line = np.linspace(valid['euclidean_distance'].min(),
                     valid['euclidean_distance'].max(), 100)
ax.plot(x_line, trend(x_line), 'r--', label=f'r = {correlation:.3f}')

ax.set(
    xlabel='Euclidean Distance from Rotamer Library',
    ylabel='Hyperbolic Distance (P-adic)',
    title='Euclidean vs Hyperbolic Rotamer Distance',
)
ax.legend()
ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

print(f"\nCorrelation: {correlation:.3f}")

## 5. The "Rosetta-Blind" Residues

Find residues with LOW Euclidean distance (looks stable) but HIGH hyperbolic distance (actually unstable).

In [None]:
# Cut-offs: median library distance and upper quartile of hyperbolic distance
eucl_threshold = np.percentile(valid['euclidean_distance'], 50)
hyp_threshold = np.percentile(valid['hyperbolic_distance'], 75)

# Rosetta-blind = below the Euclidean cut-off AND above the hyperbolic cut-off
looks_stable = valid['euclidean_distance'].lt(eucl_threshold)
scores_unstable = valid['hyperbolic_distance'].gt(hyp_threshold)
rosetta_blind = valid[looks_stable & scores_unstable]

print(f"Rosetta-Blind Residues: {len(rosetta_blind)}")
print(f"These residues LOOK stable in Euclidean space but are UNSTABLE in hyperbolic space.")
print()
# Show the first ten candidates with their identifiers and both distances
print(
    rosetta_blind[
        ['pdb_id', 'chain_id', 'residue_id', 'residue_name',
         'euclidean_distance', 'hyperbolic_distance']
    ].head(10)
)

In [None]:
# Visualize Rosetta-blind residues
fig, ax = plt.subplots(figsize=(10, 8))

# All points
ax.scatter(valid['euclidean_distance'], valid['hyperbolic_distance'],
           alpha=0.3, c='gray', label='All residues')

# Rosetta-blind region, drawn in DATA coordinates.
# axhspan's xmin/xmax are fractions of the axes width (0-1), not data values,
# so passing threshold/max there misplaces the right edge whenever the x-axis
# does not run exactly from 0 to the maximum (autoscale adds margins).
# fill_between keeps the shaded box aligned with the threshold lines below.
# The box starts at 0 because the Euclidean distance is a norm (never negative).
ax.fill_between([0.0, eucl_threshold],
                hyp_threshold, valid['hyperbolic_distance'].max(),
                alpha=0.2, color='red', label='Rosetta-Blind Zone')

# Highlight Rosetta-blind
ax.scatter(rosetta_blind['euclidean_distance'], rosetta_blind['hyperbolic_distance'],
           c='red', s=100, marker='o', label='Rosetta-Blind', edgecolor='black')

# Threshold guides: horizontal = hyperbolic cut-off, vertical = Euclidean cut-off
ax.axhline(hyp_threshold, color='red', linestyle='--', alpha=0.5)
ax.axvline(eucl_threshold, color='blue', linestyle='--', alpha=0.5)

ax.set_xlabel('Euclidean Distance (Low = Stable)', fontsize=12)
ax.set_ylabel('Hyperbolic Distance (High = Unstable)', fontsize=12)
ax.set_title('Identifying "Rosetta-Blind" Unstable Rotamers', fontsize=14)
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Export Results

Save analysis for further validation.

In [None]:
# Both CSV files are written to <project_root>/results, created if missing
results_dir = project_root / 'results'
output_path = results_dir / 'rotamer_analysis_colbes.csv'
rosetta_blind_path = results_dir / 'rosetta_blind_residues.csv'
results_dir.mkdir(parents=True, exist_ok=True)

# Full per-residue table, then the Rosetta-blind subset
results.to_csv(output_path, index=False)
rosetta_blind.to_csv(rosetta_blind_path, index=False)

print(f"Saved results to:")
print(f"  {output_path}")
print(f"  {rosetta_blind_path}")

## Summary

This notebook demonstrates:
1. Computing p-adic (hyperbolic) distances for rotamer conformations
2. Comparing with standard Dunbrack rotamer library
3. Identifying "Rosetta-Blind" residues that look stable but are geometrically unstable

### The Geometric Energy Term ($E_{geom}$)

$$E_{geom} = w \cdot d_{hyp}(\chi) \cdot \mathbb{1}[d_{eucl} < \theta_{eucl}]$$

Where:
- $d_{hyp}$ = Hyperbolic distance in p-adic space
- $d_{eucl}$ = Euclidean distance from rotamer library
- $w$ = Weighting factor
- $\theta_{eucl}$ = Threshold for "apparent stability"

**Next Steps:**
- Validate predictions with MD simulations
- Integrate $E_{geom}$ into SCWRL/Rosetta scoring