In [4]:
import os
import yaml
import argparse
import glob

## convert the fasta from chai-1 to fasta required for botlz

In [2]:
def convert_fasta_format(input_file, output_file=None):
    """
    Convert FASTA files by:
    1. Adding chain letters (A for protein, B for ligand)
    2. Changing "ligand" to "smiles" in the header
    3. Removing strings after the "|" symbol
    """
    if output_file is None:
        # Use the same filename if no output file is specified
        output_file = input_file
    
    with open(input_file, "r") as f:
        lines = f.readlines()
    
    new_lines = []
    
    for line in lines:
        line = line.strip()
        if line.startswith(">protein|"):
            # Replace with >A|protein|
            new_lines.append(">A|protein|")
        elif line.startswith(">ligand|"):
            # Replace with >B|smiles|
            new_lines.append(">B|smiles|")
        else:
            new_lines.append(line)
    
    with open(output_file, "w") as f:
        for i, line in enumerate(new_lines):
            f.write(line + "\n")
    
    print(f"Converted: {input_file} -> {output_file}")

# Process a single file
def process_file(file_path):
    convert_fasta_format(file_path)


In [8]:

# Example usage for a single file
input_file = "/home/aoxu/projects/PoseBench/forks/chai-lab/prediction_inputs/plinder/1afb__1__1.A__1.D_1.F/1afb__1__1.A__1.D_1.F.fasta"
output_file = "/home/aoxu/projects/PoseBench/forks/boltz/input/1afb_converted.fasta"
convert_fasta_format(input_file, output_file)

# To process all matching files in a directory:
directory = "/home/aoxu/projects/PoseBench/forks/chai-lab/prediction_inputs/plinder/"
for root, dirs, files in os.walk(directory):
    for dir in dirs:
            file_path = os.path.join(root, dir, f"{dir}.fasta")
            output_file = f"/home/aoxu/projects/PoseBench/forks/boltz/input/plinder/{dir}.fasta"
            convert_fasta_format(file_path, output_file)

Converted: /home/aoxu/projects/PoseBench/forks/chai-lab/prediction_inputs/plinder/1afb__1__1.A__1.D_1.F/1afb__1__1.A__1.D_1.F.fasta -> /home/aoxu/projects/PoseBench/forks/boltz/input/1afb_converted.fasta
Converted: /home/aoxu/projects/PoseBench/forks/chai-lab/prediction_inputs/plinder/6au0__1__1.A__1.B/6au0__1__1.A__1.B.fasta -> /home/aoxu/projects/PoseBench/forks/boltz/input/plinder/6au0__1__1.A__1.B.fasta
Converted: /home/aoxu/projects/PoseBench/forks/chai-lab/prediction_inputs/plinder/5m7u__1__1.A_1.B__1.D/5m7u__1__1.A_1.B__1.D.fasta -> /home/aoxu/projects/PoseBench/forks/boltz/input/plinder/5m7u__1__1.A_1.B__1.D.fasta
Converted: /home/aoxu/projects/PoseBench/forks/chai-lab/prediction_inputs/plinder/2yr6__1__1.A__1.D_1.E_1.F/2yr6__1__1.A__1.D_1.E_1.F.fasta -> /home/aoxu/projects/PoseBench/forks/boltz/input/plinder/2yr6__1__1.A__1.D_1.E_1.F.fasta
Converted: /home/aoxu/projects/PoseBench/forks/chai-lab/prediction_inputs/plinder/3vph__1__1.A_1.B_1.C_1.D__1.F_1.H_1.Q/3vph__1__1.A_1.B_1.

## convert the .fasta to yaml required for botlz

In [None]:
def convert_fasta_to_yaml(fasta_file, yaml_file=None):
    """Convert a FASTA file to YAML format for Boltz"""
    
    if yaml_file is None:
        yaml_file = os.path.splitext(fasta_file)[0] + '.yaml'
    
    # Parse the FASTA file
    protein_sequence = None
    smiles_string = None
    
    with open(fasta_file, 'r') as f:
        current_section = None
        for line in f:
            line = line.strip()
            if not line:
                continue
                
            if line.startswith('>'):
                if 'A|protein|' in line:
                    current_section = 'protein'
                elif 'B|smiles|' in line:
                    current_section = 'smiles'
            elif current_section == 'protein':
                protein_sequence = line
                current_section = None
            elif current_section == 'smiles':
                smiles_string = line
                current_section = None
    
    # Create YAML structure
    yaml_data = {
        'version': 1,
        'sequences': [
            {
                'protein': {
                    'id': 'A',
                    'sequence': protein_sequence
                }
            },
            {
                'ligand': {
                    'id': 'B',
                    'smiles': smiles_string
                }
            }
        ],
        'properties': [
            {
                'affinity': {
                    'binder': 'B'
                }
            }
        ]
    }
    
    # Write YAML file
    with open(yaml_file, 'w') as f:
        yaml.dump(yaml_data, f, default_flow_style=False)
    
    print(f"Converted {fasta_file} to {yaml_file}")
    return yaml_file

def batch_convert(input_dir, output_dir=None):
    """Convert all FASTA files in a directory"""
    if output_dir is None:
        output_dir = input_dir
        
    os.makedirs(output_dir, exist_ok=True)
    
    fasta_files = glob.glob(os.path.join(input_dir, '*.fasta'))
    
    for fasta_file in fasta_files:
        base_name = os.path.basename(fasta_file)
        name_without_ext = os.path.splitext(base_name)[0]
        yaml_file = os.path.join(output_dir, f"{name_without_ext}.yaml")
        convert_fasta_to_yaml(fasta_file, yaml_file)


## generate yaml file for plidner set 

In [None]:
input = "/home/aoxu/projects/PoseBench/forks/boltz/input/pinder/fasta"
output = "/home/aoxu/projects/PoseBench/forks/boltz/input/plinder/yaml"
if os.path.isdir(input):
    batch_convert(input, output)
else:
    convert_fasta_to_yaml(input, output)

## generate yaml file for runsNposes set 

In [3]:
# To process all matching files in a directory:
directory = "/home/aoxu/projects/PoseBench/data/runsNposes_archive/zenodo_downloads/ground_truth/"
for root, dirs, files in os.walk(directory):
    for dir in dirs:
            file_path = os.path.join(root, dir, f"sequences.fasta")
            output_file = f"/home/aoxu/projects/PoseBench/forks/boltz/input/runsNposes/{dir}.fasta"
            convert_fasta_format(file_path, output_file)

FileNotFoundError: [Errno 2] No such file or directory: '/home/aoxu/projects/PoseBench/forks/boltz/input/runsNposes/8ou2__1__1.A__1.C.fasta'

In [None]:
# To process all matching files in a directory:
directory = "/home/aoxu/projects/PoseBench/data/runsNposes_archive/zenodo_downloads/ground_truth/"
for root, dirs, files in os.walk(directory):
    for dir in dirs:
            file_path = os.path.join(root, dir, f"sequences.fasta")
            output_file = f"/home/aoxu/projects/PoseBench/forks/boltz/input/runsNposes/{dir}.fasta"
            convert_fasta_format(file_path, output_file)
input = "/home/aoxu/projects/PoseBench/forks/boltz/input/runsNposes/fasta"
output = "/home/aoxu/projects/PoseBench/forks/boltz/input/runsNposes/yaml"
if os.path.isdir(input):
    batch_convert(input, output)
else:
    convert_fasta_to_yaml(input, output)