In [1]:
import os
import subprocess
import re
import csv

In [2]:
# Download and compile USalign.
# -N (timestamping) refreshes USalign.cpp in place on re-runs. Plain wget would
# instead save the new copy as USalign.cpp.1 and leave the old source to be
# compiled again.
# NOTE(review): the source is fetched unpinned from the upstream site, so the
# compiled binary may differ between runs made on different dates.
!wget -N https://zhanggroup.org/US-align/bin/module/USalign.cpp
!g++ -static -O3 -ffast-math -lm -o USalign USalign.cpp
!chmod +x USalign

--2025-06-11 22:24:27--  https://zhanggroup.org/US-align/bin/module/USalign.cpp
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving zhanggroup.org (zhanggroup.org)... 137.132.93.250
Connecting to zhanggroup.org (zhanggroup.org)|137.132.93.250|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 727450 (710K) [text/plain]
Saving to: ‘USalign.cpp’


2025-06-11 22:24:32 (281 KB/s) - ‘USalign.cpp’ saved [727450/727450]



In [3]:
# -o overwrites already-extracted files without prompting, so this cell can be
# re-run (Restart & Run All) without waiting on an interactive confirmation.
!unzip -o pdbs.zip > /dev/null

In [4]:
# --- Run configuration: paths used by the alignment cell below ---
output_csv = "usalign_results.csv"  # CSV file the results are written to
reference_file = "ref.pdb"          # reference pdb every model is aligned against
input_directory = "./pdbs"          # directory that holds the pdb files to align

In [6]:
# Regex patterns for the fields pulled out of USalign's stdout.
rmsd_pattern = re.compile(r"RMSD=\s+([\d.]+)")
tm_score_pattern = re.compile(r"TM-score=\s+([\d.]+)")
# Group 1 captures the numeric rank so the files can be ordered numerically.
# group(0) (the whole match) is unchanged and is still the label written to the CSV.
rank_pattern = re.compile(r"rank_(\d+)_rank_1\.pdb")


def parse_usalign_output(output):
    """Extract the RMSD and TM-score from USalign's text output.

    Only the first occurrence of each field is used.
    NOTE(review): USalign presumably prints one TM-score line per input
    structure; if so, the first one is the score normalized by the first
    argument (the model file) -- confirm this is the intended score.

    Args:
        output: The captured stdout of a USalign run.

    Returns:
        A ``(rmsd, tm_score)`` tuple of strings exactly as printed by USalign,
        or ``None`` if either field is missing.
    """
    rmsd_match = rmsd_pattern.search(output)
    tm_score_match = tm_score_pattern.search(output)
    if rmsd_match and tm_score_match:
        return rmsd_match.group(1), tm_score_match.group(1)
    return None


# Collect the matching files first so they can be processed in a stable order.
# os.listdir() returns entries in arbitrary order, which made the CSV row order
# differ from run to run.
candidates = []
for filename in os.listdir(input_directory):
    rank_match = rank_pattern.match(filename)
    if rank_match is None:
        continue

    file_path = os.path.join(input_directory, filename)
    # Skip directories (or other non-files) that happen to match the pattern.
    if not os.path.isfile(file_path):
        continue

    # NOTE(review): the "ProteinMPNN Rank" column receives the whole matched
    # name (e.g. "rank_3_rank_1.pdb"), not just the number. Kept as-is so
    # existing consumers of the CSV are unaffected -- confirm the intent.
    mpnn_rank = rank_match.group(0)
    candidates.append((int(rank_match.group(1)), filename, mpnn_rank, file_path))

# Sort numerically by rank (filename breaks ties) so rank_2 precedes rank_10.
candidates.sort()

# One [label, rmsd, tm_score] row per successfully aligned file.
results = []
for _, filename, mpnn_rank, file_path in candidates:
    try:
        # Run USalign: model first, reference second.
        completed = subprocess.run(
            ["./USalign", file_path, reference_file],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # A failing file is reported and skipped; the remaining files still run.
        print(f"Error processing file {filename}: {e}")
        if e.stderr:
            # stderr is captured above, so surface it here or the cause is lost.
            print(e.stderr.strip())
        continue

    parsed = parse_usalign_output(completed.stdout)
    if parsed is None:
        print(f"Warning: Could not parse output for file {filename}")
        continue

    rmsd, tm_score = parsed
    results.append([mpnn_rank, rmsd, tm_score])

if not results:
    # Make an empty run visible instead of silently writing a header-only CSV.
    print(f"Warning: no alignment results were collected from {input_directory}")

# Write the results to a CSV file (newline="" is required by the csv module).
with open(output_csv, mode="w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(["ProteinMPNN Rank", "RMSD", "TM-Score"])
    writer.writerows(results)

print(f"Results written to {output_csv}")

Results written to usalign_results.csv
