# Protein Sequence Sampling & Inpainting Notebook

## User Guide

This notebook allows you to easily generate protein sequences using our trained Dirichlet Flow Matching (DFM) model. You can run either:

### **Mode 1: Full Sequence Sampling**
- Generate completely new sequences for given protein structures
- Input: PDB ID (e.g., "1abc", "1fcd.C") or PDB file path
- Output: New amino acid sequences that should fold to the input structure

### **Mode 2: Sequence Inpainting** 
- Predict specific amino acids while keeping others fixed
- Input: PDB ID + positions to predict OR template sequence with 'X' for unknowns
- Output: Completed sequences with predictions for masked positions

### **How to Use:**
1. **Choose your mode** by setting `mode = "sampling"` or `mode = "inpainting"`
2. **Set your PDB inputs** in the `pdb_inputs` list (e.g., `["1abc", "1fcd.C"]`)
3. **For inpainting mode**: specify either `mask_positions` or `template_sequence`
4. **Run all cells** - the notebook will handle everything else automatically
5. **Check results** in the output directory and the displayed results

### **Advanced Parameters:**
- `flow_temp`: Temperature for sampling (0.1-1.0, lower = more conservative)
- `steps`: Number of sampling steps (10-50, more = higher quality but slower)
- `ensemble_size`: Number of structural variants (1-10, more = better consensus)
- `structure_noise_mag_std`: Structural noise level (0.0-2.0, higher = more structural diversity)

### **Outputs:**
- **Console**: Sequences printed with confidence scores
- **CSV files**: Detailed results in `{output_dir}/sampling_results.csv`
- **Trajectory files**: For detailed analysis (when sampling <4 proteins)

In [1]:
# =============================================================================
# IMPORTS AND SETUP
# =============================================================================

import os
import sys
import torch
import pandas as pd
import numpy as np
from datetime import datetime
from pathlib import Path

# Add the current directory to Python path
current_dir = os.getcwd()
if current_dir not in sys.path:
    sys.path.append(current_dir)

# Import our sampling and inpainting modules
from training.sample_utils import (
    load_model_distributed, process_input_specification, CustomInputDataset,
    compute_sampling_metrics, save_results_to_files, IDX_TO_AA, THREE_TO_ONE
)
from training.sample import sample_chain
from training.inpainting import sample_chain_inpainting

print("All imports successful!")
print(f"Using device: {'GPU' if torch.cuda.is_available() else 'CPU'}")
print(f"Working directory: {current_dir}")

All imports successful!
Using device: GPU
Working directory: /home/t-alptartici/inverse-folding


In [2]:
# =============================================================================
# USER CONFIGURATION - MODIFY THESE SETTINGS
# =============================================================================

# ===== SAMPLING MODE =====
# Choose: "sampling" (generate new sequences) or "inpainting" (predict masked positions)
mode = "sampling"  # Change to "inpainting" for inpainting mode

# ===== INPUT PROTEINS =====
# List of PDB IDs or file paths to process
# Examples: ["1abc"], ["1fcd.C"], ["1abc", "1def.A"], ["/path/to/protein.pdb"]
pdb_inputs = ["1fcd.C", "1h2s.B"]  # MODIFY THIS: Add your PDB IDs here
verbose = False

# ===== INPAINTING SETTINGS (only used if mode="inpainting") =====
# Option 1: Specify positions to predict (0-indexed, in structure sequence)
mask_positions = [10, 15, 20, 25, 30]  # MODIFY THIS: Positions to predict

# Option 2: Use template sequence with 'X' for positions to predict
# Set to None to use mask_positions instead
template_sequence = None  # Example: "ACDEFXHIKLXNPQXSTVWY" 

# Option 3: Random masking percentage (only if above are None)
#mask_ratio = 0.15  # 15% of positions will be randomly masked

# ===== SAMPLING PARAMETERS =====
flow_temp = 0.1             # Temperature (0.1-1.0, lower = more conservative)
steps = 20                   # Number of sampling steps (10-50)
max_time = 8.0              # Maximum noise level
min_time = 0.0              # Minimum noise level  
dirichlet_concentration = 1.0 # Initial distribution concentration

# ===== ENSEMBLE PARAMETERS =====
ensemble_size = 3                 # Number of structural variants (1-10)
ensemble_consensus_strength = 1.0    # Consensus strength (0=independent, 1=full consensus)
structure_noise_mag_std = 0.1        # Structural noise level (0.0-2.0 Å)
uncertainty_struct_noise_scaling = False  # Scale noise by uncertainty

# ===== OUTPUT SETTINGS =====
output_dir = "./inference_output"   # Output directory
save_probabilities = True          # Save detailed probability distributions
verbose = True                     # Print detailed information

# ===== MODEL AND DATA PATHS =====
model_path = "ckpts/model_316.pt"
split_json = "datasets/cath-4.2/chain_set_splits.json"
map_pkl = "datasets/cath-4.2/chain_set_map_with_b_factors_dssp.pkl"

print("Configuration loaded!")
print(f"   Mode: {mode}")
print(f"   Input proteins: {pdb_inputs}")
print(f"   Ensemble size: {ensemble_size}")
print(f"   Output directory: {output_dir}")
if mode == "inpainting":
    if template_sequence:
        print(f"   Template sequence: {template_sequence}")
    elif mask_positions:
        print(f"   Positions to predict: {mask_positions}")
    else:
        print(f"   Random masking: {mask_ratio*100:.1f}%")

Configuration loaded!
   Mode: sampling
   Input proteins: ['1fcd.C', '1h2s.B']
   Ensemble size: 3
   Output directory: ./inference_output


In [3]:
# =============================================================================
# LOAD MODEL AND VALIDATE SETUP
# =============================================================================

# Create output directory
os.makedirs(output_dir, exist_ok=True)
print(f"Created output directory: {output_dir}")

# Setup device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Validate model path
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model not found at: {model_path}")
print(f"Model path validated: {model_path}")

# Load model and extract parameters
print("Loading model...")
try:
    model, dataset_params = load_model_distributed(model_path, device, None)
    print("Model loaded successfully!")
    print(f"   Model device: {next(model.parameters()).device}")
    if dataset_params:
        print(f"   Dataset parameters extracted: {len(dataset_params)} parameters")
except Exception as e:
    print(f"Error loading model: {e}")
    raise

# Validate input files
if mode != "inpainting" or any("." not in pdb_id for pdb_id in pdb_inputs):
    # Only validate dataset files if we might need them
    for file_path, name in [(split_json, "split JSON"), (map_pkl, "map PKL")]:
        if not os.path.exists(file_path):
            print(f"Warning: {name} not found at: {file_path}")
            print("   (This is okay if using direct PDB files)")
        else:
            print(f"{name} validated: {file_path}")

print("\nSetup complete! Ready to run inference.")
print("="*60)

Created output directory: ./inference_output
Using device: cuda
Model path validated: ckpts/model_316.pt
Loading model...


  from .autonotebook import tqdm as notebook_tqdm


Loading model from: ckpts/model_316.pt
CHECKPOINT PARAMETER EXTRACTION
Found 'args' in checkpoint - extracting model configuration
Args type: dict

All available parameters in checkpoint args:
  af2_chunk_dir: /mnt/default/data/inv_fold/datasets/af2_chunks_with_dssp/
  af2_chunk_limit: None
  alpha_max: 10.0
  alpha_min: 1.0
  alpha_range: 1.0
  alpha_spacing: 0.01
  alternating_pure_batches: False
  architecture: interleaved
  batch: 32
  checkpoint_copy_dir: /tmp
  config_file: None
  debug_mode: False
  device: auto
  dirichlet_multiplier_training: 1.0
  disable_time_conditioning: False
  distributed: False
  dropout: 0.0
  edge_dim_s: 32
  edge_dim_v: 1
  enable_checkpoint_rollback: True
  epoch_timeout: 4200.0
  epochs: 500
  external_checkpoint: None
  fail_fast: False
  flexible_loss_scaling: False
  grad_clip: 5.0
  head_hidden: 256
  heterogeneous_batches: True
  hidden_dim: 640
  hidden_dim_v: 80
  hybrid_fail_fast: False
  hybrid_max_af2_errors: 10
  hybrid_max_pdb_errors: 1

In [4]:
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def run_sampling_for_protein(pdb_input, mode="sampling", mask_positions=None, 
                            template_sequence=None, mask_ratio=0.15):
    """
    Run sampling or inpainting for a single protein.
    
    Args:
        pdb_input: PDB ID (e.g., "1abc", "1fcd.C") or file path
        mode: "sampling" or "inpainting"
        mask_positions: List of positions to predict (for inpainting)
        template_sequence: Template with 'X' for unknowns (for inpainting)
        mask_ratio: Random masking ratio (for inpainting)
    
    Returns:
        Dictionary with results
    """
    from data.graph_builder import GraphBuilder
    
    print(f"\n{'='*60}")
    print(f"Processing: {pdb_input}")
    print(f"Mode: {mode.upper()}")
    print(f"{'='*60}")
    
    # Process input to get protein structure
    try:
        print("Processing input structure...")
        entry, temp_files = process_input_specification(pdb_input, verbose=verbose)
        print(f"Structure loaded: {entry['name']}")
        print(f"   Length: {len(entry['seq'])} residues")
        print(f"   Source: {entry['source']}")
        
        # Build graph using parameters from checkpoint (following training/sample_utils.py pattern)
        print("Building molecular graph...")
        
        # Prepare graph builder parameters exactly like in training/sample_utils.py
        graph_builder_kwargs = {
            'k': dataset_params.get('k_neighbors'),
            'k_farthest': dataset_params.get('k_farthest'),
            'k_random': dataset_params.get('k_random'),
            'max_edge_dist': dataset_params.get('max_edge_dist'),
            'num_rbf_3d': dataset_params.get('num_rbf_3d'),
            'num_rbf_seq': dataset_params.get('num_rbf_seq'),
            'use_virtual_node': dataset_params.get('use_virtual_node', True),
            'no_source_indicator': dataset_params.get('no_source_indicator', False),
            # RBF distance range parameters
            'rbf_3d_min': dataset_params.get('rbf_3d_min'),
            'rbf_3d_max': dataset_params.get('rbf_3d_max'),
            'rbf_3d_spacing': dataset_params.get('rbf_3d_spacing'),
            'verbose': False  # Reduce clutter
        }
        
        # Validate RBF parameters (like in training code)
        if (graph_builder_kwargs['rbf_3d_min'] is None or 
            graph_builder_kwargs['rbf_3d_max'] is None or 
            graph_builder_kwargs['rbf_3d_spacing'] is None):
            raise RuntimeError(
                f"RBF parameters missing from checkpoint. "
                f"Got: rbf_3d_min={graph_builder_kwargs['rbf_3d_min']}, "
                f"rbf_3d_max={graph_builder_kwargs['rbf_3d_max']}, "
                f"rbf_3d_spacing={graph_builder_kwargs['rbf_3d_spacing']}"
            )
        
        # Remove None values to use GraphBuilder defaults (like in training code)
        graph_builder_kwargs = {k: v for k, v in graph_builder_kwargs.items() if v is not None}
        
        builder = GraphBuilder(**graph_builder_kwargs)
        graph_data = builder.build_from_dict(entry, time_param=0.0)
        graph_data = graph_data.to(device)
        print(f"Graph built: {graph_data.num_nodes} nodes, {graph_data.num_edges} edges")
        
        # Run sampling or inpainting
        if mode == "sampling":
            print("Running sequence sampling...")
            
            # Handle ensemble sampling if requested
            if ensemble_size > 1:
                from training.sample_utils import create_structural_ensemble, sample_with_ensemble_consensus
                
                print(f"   Creating ensemble of {ensemble_size} structural variants...")
                batched_ensemble = create_structural_ensemble(
                    entry, ensemble_size=ensemble_size,
                    structure_noise_mag_std=structure_noise_mag_std,
                    uncertainty_struct_noise_scaling=uncertainty_struct_noise_scaling,
                    device=device, args=None, dataset_params=dataset_params
                )
                
                print("   Running ensemble sampling with consensus...")
                predicted_sequence = sample_with_ensemble_consensus(
                    model, batched_ensemble, T=max_time, t_min=min_time, steps=steps,
                    K=21, consensus_strength=ensemble_consensus_strength, device=device,
                    use_virtual_node=dataset_params.get('use_virtual_node', True), args=None
                )
                
                # Convert indices to sequence (IDX_TO_AA is a list, not dict)
                predicted_seq_str = ''.join([THREE_TO_ONE.get(IDX_TO_AA[idx] if 0 <= idx < len(IDX_TO_AA) else 'UNK', 'X') for idx in predicted_sequence])
                final_probabilities = None  # Ensemble doesn't return probabilities
                eval_metrics = {}
                
            else:
                # Single structure sampling
                final_probabilities, predicted_sequence_indices, eval_metrics = sample_chain(
                    model, graph_data, dataset=None, structure_idx=None,
                    T=max_time, t_min=min_time, steps=steps, K=21, verbose=False, args=None  # Reduced verbosity
                )
                
                # Convert indices to amino acid sequence (IDX_TO_AA is a list, not dict)
                predicted_seq_str = ''.join([THREE_TO_ONE.get(IDX_TO_AA[idx] if 0 <= idx < len(IDX_TO_AA) else 'UNK', 'X') for idx in predicted_sequence_indices])
                
        elif mode == "inpainting":
            print("Running sequence inpainting...")
            
            final_probabilities, predicted_sequence_indices, inpainting_mask, alignment_info, eval_metrics = sample_chain_inpainting(
                model, graph_data, T=max_time, t_min=min_time, steps=steps, K=21,
                full_sequence=entry['seq'], structure_sequence=entry['seq'],
                mask_positions=mask_positions, known_sequence=template_sequence,
                mask_ratio=mask_ratio, verbose=False, args=None  # Reduced verbosity
            )
            
            # Convert indices to amino acid sequence (IDX_TO_AA is a list, not dict)
            predicted_seq_str = ''.join([THREE_TO_ONE.get(IDX_TO_AA[idx] if 0 <= idx < len(IDX_TO_AA) else 'UNK', 'X') for idx in predicted_sequence_indices])
            
        else:
            raise ValueError(f"Unknown mode: {mode}")
        
        # Prepare results
        result = {
            'protein_id': entry['name'],
            'original_sequence': entry['seq'],
            'predicted_sequence': predicted_seq_str,  # Now properly converted to amino acid letters
            'sequence_length': len(predicted_seq_str),
            'mode': mode,
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        
        # Add evaluation metrics if available
        if eval_metrics:
            result.update(eval_metrics)
            
        # Add inpainting-specific info
        if mode == "inpainting":
            if mask_positions:
                result['mask_positions'] = mask_positions
            if template_sequence:
                result['template_sequence'] = template_sequence
            result['mask_ratio'] = mask_ratio
        
        # Clean up temp files
        for temp_file in temp_files:
            try:
                if os.path.exists(temp_file):
                    os.remove(temp_file)
            except:
                pass
                
        return result
        
    except Exception as e:
        print(f"Error processing {pdb_input}: {e}")
        return None

def display_results(results):
    """Display results in a nice format."""
    print(f"\n{'='*80}")
    print(f"RESULTS SUMMARY")
    print(f"{'='*80}")
    
    for i, result in enumerate(results, 1):
        if result is None:
            continue
            
        print(f"\nProtein {i}: {result['protein_id']}")
        print(f"   Mode: {result['mode'].upper()}")
        print(f"   Length: {result['sequence_length']} residues")
        
        if result['mode'] == 'inpainting' and 'mask_positions' in result:
            print(f"   Masked positions: {result['mask_positions']}")
            
        print(f"   Original : {result['original_sequence']}")
        print(f"   Predicted: {result['predicted_sequence']}")
        
        if 'accuracy' in result:
            print(f"   Accuracy: {result['accuracy']:.3f}")
        if 'confidence' in result:
            print(f"   Confidence: {result['confidence']:.3f}")
            
    print(f"\n{'='*80}")

print("Helper functions defined!")

Helper functions defined!


In [5]:
# =============================================================================
# RUN INFERENCE - MAIN EXECUTION
# =============================================================================

print("Starting protein sequence inference...")
print(f"   Processing {len(pdb_inputs)} protein(s)")
print(f"   Mode: {mode.upper()}")

# Store all results
all_results = []

# Process each protein
for i, pdb_input in enumerate(pdb_inputs, 1):
    print(f"\nProcessing protein {i}/{len(pdb_inputs)}: {pdb_input}")
    
    result = run_sampling_for_protein(
        pdb_input=pdb_input,
        mode=mode,
        mask_positions=mask_positions if mode == "inpainting" else None,
        template_sequence=template_sequence if mode == "inpainting" else None,
        mask_ratio=mask_ratio if mode == "inpainting" else None
    )
    
    if result:
        all_results.append(result)
        print(f"Successfully processed {pdb_input}")
    else:
        print(f"Failed to process {pdb_input}")

# Display and save results
if all_results:
    display_results(all_results)
    
    # Save to CSV
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = f"{output_dir}/sampling_results_{timestamp}.csv"
    
    df = pd.DataFrame(all_results)
    df.to_csv(csv_filename, index=False)
    print(f"Results saved to: {csv_filename}")
    
    # Display DataFrame
    cols_to_display = ['protein_id', 'original_sequence', 'predicted_sequence', 'confidence', 'accuracy', 'top3_accuracy']
    print(f"\nResults DataFrame:")
    try:
        display(df[cols_to_display])
    except Exception as e:
        pass

else:
    print("No successful results to save.")

print("\nInference complete!")

Starting protein sequence inference...
   Processing 2 protein(s)
   Mode: SAMPLING

Processing protein 1/2: 1fcd.C

Processing: 1fcd.C
Mode: SAMPLING
Processing input structure...
Detected PDB ID with chain: 1fcd.C
Creating temporary directory: /tmp/tmpgojrs70w
Downloading 1fcd from RCSB PDB...
Downloaded to: /tmp/tmpgojrs70w/1fcd.pdb
Extracting chain C...
Extracted chain C to: /tmp/tmpgojrs70w/1fcd_chainC.pdb
Converting to internal dictionary format...
Successfully processed pdb_id_with_chain: 1FCD.C
Structure loaded: 1FCD.C
   Length: 174 residues
   Source: pdb
Building molecular graph...
Auto-detected project root: /home/t-alptartici/inverse-folding
Using cache directory: /home/t-alptartici/inverse-folding/rbf_cache
RBF Manager initialized with 3D range: 1.0-20.0Å
RBF 3D filename: rbf_3d_features16_centers1.0to20.0_table40.0_res0.001_exponential.npy
RBF cache directory: /home/t-alptartici/inverse-folding/rbf_cache
Found existing sequence RBF table: /home/t-alptartici/inverse-foldi

Ensemble sampling: 100%|██████████| 19/19 [00:06<00:00,  2.81it/s]


Successfully processed 1fcd.C

Processing protein 2/2: 1h2s.B

Processing: 1h2s.B
Mode: SAMPLING
Processing input structure...
Detected PDB ID with chain: 1h2s.B
Creating temporary directory: /tmp/tmp3g_hwwwe
Downloading 1h2s from RCSB PDB...
Downloaded to: /tmp/tmp3g_hwwwe/1h2s.pdb
Extracting chain B...
Extracted chain B to: /tmp/tmp3g_hwwwe/1h2s_chainB.pdb
Converting to internal dictionary format...
Successfully processed pdb_id_with_chain: 1H2S.B
Structure loaded: 1H2S.B
   Length: 60 residues
   Source: pdb
Building molecular graph...
Auto-detected project root: /home/t-alptartici/inverse-folding
Using cache directory: /home/t-alptartici/inverse-folding/rbf_cache
RBF Manager initialized with 3D range: 1.0-20.0Å
RBF 3D filename: rbf_3d_features16_centers1.0to20.0_table40.0_res0.001_exponential.npy
RBF cache directory: /home/t-alptartici/inverse-folding/rbf_cache
Graph built: 61 nodes, 929 edges
Running sequence sampling...
   Creating ensemble of 3 structural variants...
Creating en

Ensemble sampling: 100%|██████████| 19/19 [00:05<00:00,  3.29it/s]

Successfully processed 1h2s.B

RESULTS SUMMARY

Protein 1: 1FCD.C
   Mode: SAMPLING
   Length: 174 residues
   Original : EPTAEMLTNNCAGCHGTHGNSVGPASPSIAQMDPMVFVEVMEGFKSGEIASTIMGRIAKGYSTADFEKMAGYFKQQTYQPAKQSFDTALADTGAKLHDKYCEKCHVEGGKPLADEEDYHILAGQWTPYLQYAMSDFREERRPMEKKMASKLRELLKAEGDAGLDALFAFYASQQ
   Predicted: MKSAEELAAGCAKCHGKGGVAPPPSAPPLGNVDPELLLKLLSAAKSGTIPSPSLPEIASKYNEEEAEELAAYLSKLTPVPVKVEYDPELAEKGKELFKKYCAKCHKDGGRPVPDGEGYIPLGGANPARLKKLFELIRNGKIPVDEECREKLDEKLLEKGEAGLEAILAYLASLR

Protein 2: 1H2S.B
   Mode: SAMPLING
   Length: 60 residues
   Original : GAVFIFVGALTVLFGAIAYGEVTAAAATGDAAAVQEAAVSAILGLIILLGINLGLVAATL
   Predicted: MLLLLLFLLLFLLALLVGAAVVVAALATGDVAAVLVALLLAVLGLLLLLGVAAALERLLG

Results saved to: ./inference_output/sampling_results_20250905_212008.csv

Results DataFrame:

Inference complete!





## Usage Examples & Tips

### Example 1: Basic Sampling
```python
# Simple sequence generation
mode = "sampling"
pdb_inputs = ["1abc", "1fcd.C"]
flow_temp = 0.2
steps = 20
```

### Example 2: Inpainting with Specific Positions
```python
# Predict specific positions
mode = "inpainting"
pdb_inputs = ["1fcd.C"]
mask_positions = [10, 15, 20, 25]  # Predict these positions
template_sequence = None
```

### Example 3: Template-Based Inpainting
```python
# Use template with X for unknowns
mode = "inpainting"
pdb_inputs = ["1fcd.C"]
mask_positions = None
template_sequence = "ACDEFXHIKLXNPQXSTVWY"  # X marks positions to predict
```

### Example 4: High-Quality Ensemble Sampling
```python
# Generate multiple structural variants for consensus
mode = "sampling"
pdb_inputs = ["1fcd.C"]
ensemble_size = 5
ensemble_consensus_strength = 1.0
structure_noise_mag_std = 0.2
```

### Parameter Guidelines:
- **flow_temp**: 0.1 (conservative) → 1.0 (diverse)
- **steps**: 10 (fast) → 50 (high quality)
- **ensemble_size**: 1 (single) → 10 (consensus)
- **structure_noise_mag_std**: 0.0 (no noise) → 2.0 Å (high diversity)

In [None]:
# =============================================================================
# QUICK TEST (OPTIONAL) - Test with a small protein
# =============================================================================

# Uncomment and run this cell to test with a small protein first
"""
print("Running quick test with a small protein...")

# Test parameters (smaller/faster)
test_pdb = "1fcd.C"  # Small test protein
test_mode = "sampling"
test_flow_temp = 0.2
test_steps = 10

# Run test
test_result = run_sampling_for_protein(
    pdb_input=test_pdb,
    mode=test_mode,
    mask_positions=None,
    template_sequence=None,
    mask_ratio=0.15
)

if test_result:
    print("Quick test successful!")
    print(f"   Test protein: {test_result['protein_id']}")
    print(f"   Predicted sequence: {test_result['predicted_sequence']}")
else:
    print("Quick test failed!")
"""

print("Quick test cell ready (uncomment to run)")

🧪 Quick test cell ready (uncomment to run)


In [None]:
# =============================================================================
# RESULTS ANALYSIS (OPTIONAL)
# =============================================================================

# This cell provides additional analysis of the results
if 'all_results' in locals() and all_results:
    print("Additional Results Analysis")
    print("="*50)
    
    for result in all_results:
        if result is None:
            continue
            
        print(f"\n{result['protein_id']}:")
        
        # Sequence comparison
        orig_seq = result['original_sequence']
        pred_seq = result['predicted_sequence']
        
        # Calculate sequence identity if we have original
        if orig_seq and pred_seq and len(orig_seq) == len(pred_seq):
            matches = sum(1 for a, b in zip(orig_seq, pred_seq) if a == b)
            identity = matches / len(orig_seq)
            print(f"   Sequence Identity: {identity:.3f} ({matches}/{len(orig_seq)} matches)")
            
            # Show differences
            differences = []
            for i, (orig, pred) in enumerate(zip(orig_seq, pred_seq)):
                if orig != pred:
                    differences.append(f"{i+1}{orig}→{pred}")
            
            if differences and len(differences) <= 10:
                print(f"   Differences: {', '.join(differences)}")
            elif differences:
                print(f"   Differences: {len(differences)} changes (showing first 10): {', '.join(differences[:10])}...")
        
        # Amino acid composition analysis
        if pred_seq:
            aa_counts = {}
            for aa in pred_seq:
                aa_counts[aa] = aa_counts.get(aa, 0) + 1
            
            # Show most common amino acids
            sorted_aa = sorted(aa_counts.items(), key=lambda x: x[1], reverse=True)[:5]
            composition_str = ", ".join([f"{aa}({count})" for aa, count in sorted_aa])
            print(f"   Top amino acids: {composition_str}")
            
        # Additional metrics if available
        if 'confidence' in result:
            print(f"   Average confidence: {result['confidence']:.3f}")
        if 'entropy' in result:
            print(f"   Average entropy: {result['entropy']:.3f}")
            
    print(f"\nAll results saved to CSV files in: {output_dir}")
    
else:
    print("No results to analyze. Run the inference first!")