In [None]:
import os
from pathlib import Path
from typing import Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Paths
# Directory scanned for one sub-directory per Foldseek run, named "<query>_vs_<target>";
# each run directory is expected to hold an "alignment.tsv" file.
FOLDSEEK_RUNS_DIR = Path("similarity_scores/foldseek_runs")
# Parquet file with the original similarity scores that the computed alignments are compared against.
ORIGINAL_SCORES_PATH = Path("data/all_similarity_scores.parquet")

## 1. Load Original Scores

In [None]:
# Read the reference similarity scores from parquet.
# The fastparquet engine is requested explicitly for compatibility reasons.
orig_scores = pd.read_parquet(ORIGINAL_SCORES_PATH, engine='fastparquet')

# Report the table dimensions and column names, then preview the first rows
print(
    f"Original scores shape: {orig_scores.shape}",
    f"\nColumns: {orig_scores.columns.tolist()}",
    sep="\n",
)
orig_scores.head()

## 2. Parse My Computed Foldseek Alignments

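Extract the (query, target) system pairs from the `<query>_vs_<target>` sub-directory names in `foldseek_runs`, then parse each run's `alignment.tsv` and keep its best alignment by lDDT.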

In [None]:
# Get list of computed pairs from foldseek_runs directory.
# Path.iterdir() yields entries in arbitrary, filesystem-dependent order, so the
# entries are sorted to keep the pair order (and every output derived from it)
# reproducible across runs and machines.
computed_pairs = []
for d in sorted(FOLDSEEK_RUNS_DIR.iterdir()):
    # Only directories named exactly "<query>_vs_<target>" count as runs
    if d.is_dir() and "_vs_" in d.name:
        parts = d.name.split("_vs_")
        if len(parts) == 2:
            query_system, target_system = parts
            # Keep the run directory alongside the pair so its files can be read later
            computed_pairs.append((query_system, target_system, d))

print(f"Number of computed pairs: {len(computed_pairs)}")
print(f"\nFirst 5 pairs:")
for q, t, _ in computed_pairs[:5]:
    print(f"  {q} vs {t}")

In [None]:
def parse_foldseek_alignment(alignment_file: Path) -> Optional[dict]:
    """Parse a Foldseek ``alignment.tsv`` file and return its best alignment.

    Each data row is read as tab-separated columns in the order
    ``query, target, u, t, lddt[, rmsd[, alnlen]]``. The ``receptor_``
    substring is removed from the query and target names.

    Rows are skipped (not treated as errors) when they have fewer than five
    columns, when a numeric column cannot be parsed (e.g. a header line), or
    when the lDDT value is NaN, since NaN cannot be ranked.

    Args:
        alignment_file: Path to the alignment TSV file.

    Returns:
        A dict with keys ``query_chain``, ``target_chain``, ``u``, ``t``,
        ``lddt``, ``rmsd`` and ``alnlen`` for the row with the highest lDDT
        (``rmsd``/``alnlen`` are None when the column is absent), or None if
        the file does not exist or contains no usable rows.
    """
    if not alignment_file.exists():
        return None

    alignments = []
    with open(alignment_file, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) < 5:
                continue

            try:
                lddt = float(parts[4])
                rmsd = float(parts[5]) if len(parts) > 5 else None
                alnlen = int(parts[6]) if len(parts) > 6 else None
            except ValueError:
                # Header line or malformed numeric field: ignore this row
                continue

            if np.isnan(lddt):
                # A NaN score would make the max() ranking below order-dependent
                continue

            alignments.append({
                'query_chain': parts[0].replace('receptor_', ''),
                'target_chain': parts[1].replace('receptor_', ''),
                'u': parts[2],  # rotation matrix
                't': parts[3],  # translation vector
                'lddt': lddt,
                'rmsd': rmsd,
                'alnlen': alnlen,
            })

    if not alignments:
        return None

    # Return best alignment by lDDT
    return max(alignments, key=lambda x: x['lddt'])

# Test parsing on the first computed pair.
# computed_pairs only contains "<query>_vs_<target>" run directories, so this
# avoids picking an arbitrary (possibly non-run) entry by position and does not
# fail when the runs directory holds fewer than two entries.
if computed_pairs:
    _, _, test_dir = computed_pairs[0]
    test_alignment = parse_foldseek_alignment(test_dir / "alignment.tsv")
    print(f"Test alignment from {test_dir.name}:")
    print(test_alignment)
else:
    print("No computed pairs available to test parsing.")

## 3. Find Overlapping Pairs

Match my computed pairs against the original scores dataset, checking both the (query, target) and the reversed (target, query) ordering.

In [None]:
# Index the original (query, target) pairs in a set so each membership test is a hash lookup
orig_pairs_set = set(zip(orig_scores['query_system'], orig_scores['target_system']))
print(f"Original pairs count: {len(orig_pairs_set)}")

# Keep every computed pair that appears in the original data in either orientation.
# The trailing flag is True only when the swapped (target, query) order is the one that matched.
overlapping = []
for query, target, run_dir in computed_pairs:
    same_direction = (query, target) in orig_pairs_set
    if same_direction or (target, query) in orig_pairs_set:
        overlapping.append((query, target, run_dir, not same_direction))

print(f"\nOverlapping pairs: {len(overlapping)}")
print(f"\nFirst 10 overlapping pairs:")
for q, t, _, rev in overlapping[:10]:
    rev_str = ""
    if rev:
        rev_str = " (reversed)"
    print(f"  {q} vs {t}{rev_str}")

## 4. Compare Foldseek lDDT Scores

In [None]:
# Build comparison dataframe

# Original-table key for each overlapping pair (swapped when the match was reversed)
wanted_pairs = {
    (target, query) if reversed_dir else (query, target)
    for query, target, _, reversed_dir in overlapping
}

# One pass over the original table: remember the position of the FIRST row of each
# wanted pair. This replaces a full boolean-mask scan of orig_scores per pair while
# selecting the same row as the previous `.iloc[0]` on the filtered frame.
first_row_pos = {}
for pos, pair in enumerate(zip(orig_scores['query_system'], orig_scores['target_system'])):
    if pair in wanted_pairs and pair not in first_row_pos:
        first_row_pos[pair] = pos

comparison_data = []

for query, target, run_dir, reversed_dir in overlapping:
    # Parse my alignment
    my_alignment = parse_foldseek_alignment(run_dir / "alignment.tsv")
    if my_alignment is None:
        continue

    # Get original score
    orig_key = (target, query) if reversed_dir else (query, target)
    row_pos = first_row_pos.get(orig_key)
    if row_pos is None:
        continue

    orig_row = orig_scores.iloc[row_pos]

    # Columns missing from the original table fall back to NaN via Series.get
    comparison_data.append({
        'query_system': query,
        'target_system': target,
        'my_lddt': my_alignment['lddt'],
        'orig_protein_lddt_max': orig_row.get('protein_lddt_max', np.nan),
        'my_rmsd': my_alignment['rmsd'],
        'my_alnlen': my_alignment['alnlen'],
        'orig_tanimoto': orig_row.get('topological_tanimoto', np.nan),
        'orig_morgan_tanimoto': orig_row.get('morgan_tanimoto', np.nan),
        'orig_sucos_protein': orig_row.get('sucos_protein', np.nan),
        'orig_sucos_shape': orig_row.get('sucos_shape', np.nan),
        'orig_shape': orig_row.get('shape', np.nan),
        'orig_color': orig_row.get('color', np.nan),
    })

# Build the frame once from the collected records
comparison_df = pd.DataFrame(comparison_data)
print(f"Comparison dataframe shape: {comparison_df.shape}")
comparison_df.head(10)

In [None]:
# Compare lDDT scores
if len(comparison_df) == 0:
    print("No overlapping pairs found for comparison!")
else:
    rule = "=" * 60
    print(rule, "lDDT Score Comparison", rule, sep="\n")

    # Only rows that carry an original protein lDDT value can be compared
    valid = comparison_df['orig_protein_lddt_max'].notna()
    if valid.any():
        my_lddt = comparison_df.loc[valid, 'my_lddt']
        orig_lddt = comparison_df.loc[valid, 'orig_protein_lddt_max']

        # Pearson correlation: off-diagonal entry of the 2x2 correlation matrix
        corr = np.corrcoef(my_lddt, orig_lddt)[0, 1]
        print(f"\nCorrelation: {corr:.4f}")

        # Mean absolute difference between the two score sets
        mae = (my_lddt - orig_lddt).abs().mean()
        print(f"Mean Absolute Error: {mae:.4f}")

        # Per-source mean and standard deviation
        print(f"\nMy lDDT - mean: {my_lddt.mean():.4f}, std: {my_lddt.std():.4f}")
        print(f"Orig lDDT - mean: {orig_lddt.mean():.4f}, std: {orig_lddt.std():.4f}")

In [None]:
# Plot lDDT comparison
if len(comparison_df) > 0:
    # Restrict to rows that have an original protein lDDT value
    valid = comparison_df['orig_protein_lddt_max'].notna()
    if valid.any():
        orig_vals = comparison_df.loc[valid, 'orig_protein_lddt_max']
        my_vals = comparison_df.loc[valid, 'my_lddt']

        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        ax.scatter(orig_vals, my_vals, alpha=0.6, s=50)

        # Identity line spanning the combined range of both score sets
        min_val = min(orig_vals.min(), my_vals.min())
        max_val = max(orig_vals.max(), my_vals.max())
        diagonal = [min_val, max_val]
        ax.plot(diagonal, diagonal, 'r--', label='y=x')

        ax.set_xlabel('Original protein_lddt_max')
        ax.set_ylabel('My Foldseek lDDT')
        ax.set_title(f'lDDT Score Comparison (n={valid.sum()})')
        ax.legend()
        # Equal aspect so the y=x line is drawn at 45 degrees
        ax.set_aspect('equal')
        plt.tight_layout()
        plt.show()

## 5. Detailed Comparison for Sample Pairs

In [None]:
# Show detailed comparison for a few pairs
def _fmt4(value) -> str:
    """Format a number with 4 decimals; missing values (None/NaN/NA) render as 'nan'.

    The parser stores None for rmsd when the alignment file has no rmsd column,
    and formatting None with ':.4f' raises TypeError, so missing values are
    handled explicitly here.
    """
    return "nan" if pd.isna(value) else f"{value:.4f}"


if len(comparison_df) > 0:
    print("Detailed comparison for first 10 pairs:")
    print("=" * 100)

    for _idx, row in comparison_df.head(10).iterrows():
        lddt_diff = row['my_lddt'] - row['orig_protein_lddt_max']
        print(f"\nPair: {row['query_system']} vs {row['target_system']}")
        print(f"  My lDDT: {_fmt4(row['my_lddt'])} | Orig lDDT: {_fmt4(row['orig_protein_lddt_max'])} | Diff: {_fmt4(lddt_diff)}")
        print(f"  My RMSD: {_fmt4(row['my_rmsd'])} | My Alnlen: {row['my_alnlen']}")
        print(f"  Orig Tanimoto: {_fmt4(row['orig_tanimoto'])} | Morgan: {_fmt4(row['orig_morgan_tanimoto'])}")
        print(f"  Orig SuCOS protein: {_fmt4(row['orig_sucos_protein'])} | shape: {_fmt4(row['orig_sucos_shape'])}")

## 6. Summary Statistics

In [None]:
# Compute the summary only when there are rows to describe
summary_stats = comparison_df.describe() if len(comparison_df) > 0 else None

if summary_stats is not None:
    print("Summary Statistics")
    print("=" * 60)

# Jupyter only renders the value of the cell's last top-level expression; an
# expression nested inside the `if` block would be evaluated and discarded.
# A None value renders nothing.
summary_stats