In [None]:
APPEND_JOB = False #False starts a new batch file, True adds this job to the existing one
"""
INPUT PARAMETERS
"""
"""ProteinMPNN"""
NAME = "" #Job name; when left empty the name of the PDB is used
PDB = "" #A pdb (upload not implemented) or a folder of folded proteins in pdb format
NUM_SEQ = 1
FIXED = "" #Positions to keep, as a pymol-formatted selection e.g. "10-15+17+20-54". Incompatible with pLDDT_THRESHOLD when a folder is given as input
FIXED_CHAIN = "A" #To be implemented: possibility to fix more than one chain
pLDDT_THRESHOLD = 100 #Residues with pLDDT >= threshold are considered fixed. Needs pLDDT stored in the b values, so the input must come from AlphaFold or OmegaFold
#Advanced
SAMPLING_T = "0.1"
"""FOLD"""
FOLD = "OF" #Either AF (AlphaFold) or OF (OmegaFold)
#AlphaFold
#Advanced
NUM_RELAX = 0 #Number of best-ranked models to relax with amber
NUM_RECYCLE = 3 #3 is the default and recommended value
RAND_SEED = 0
ONLY_FIRST = True #Compare only the best folding of each generated sequence
#OmegaFold
MODEL = 1 #Model 2 currently crashes due to lack of GPU memory
#Ranking
METRIC = "pLDDT" #One of "pLDDT", "pTM", "RMSD". pTM is not available in omegafold
PYMOL_BEST = 10 #Number of best models put in a pymol session, aligned with the original protein and colored by pLDDT

"""
CODE
"""
#Quick refinement of input
ONE_JOB = not APPEND_JOB
#Expand the short fold codes; any other value is left untouched
FOLD = {"AF": "AlphaFold", "OF": "OmegaFold"}.get(FOLD, FOLD)
#An empty selection is passed on as "-"
FIXED = FIXED or "-"
    
import os,shutil #WE USE ABSOLUTE PATHS
#CUDA
ENVIRONMENT = "ProteinEnv"
#General
MODELS_OUTPUT_FOLDER = "scratch" #Where to store output (subfolders will be created inside)
INSTALLATION_FOLDER = "data"
#GENERAL FOLDERS
NOTEBOOKS_FOLDER = os.getcwd()
PIPELINES_FOLDER = os.path.join(NOTEBOOKS_FOLDER,"Pipelines")
PDBs_FOLDER = os.path.join(NOTEBOOKS_FOLDER,"PDBs")
HELP_FOLDER = os.path.join(NOTEBOOKS_FOLDER,"HelpScripts")
USER_NAME = os.path.basename(os.path.dirname(NOTEBOOKS_FOLDER))
HOME_FOLDER = f"/home/{USER_NAME}"
DATA_FOLDER = f"/data/{USER_NAME}"
SCRATCH_FOLDER = f"/scratch/{USER_NAME}"
#MODIFIABLE FOLDERS
INSTALLATION_FOLDER = DATA_FOLDER
MODELS_OUTPUT_FOLDER = os.path.join(SCRATCH_FOLDER,"ProteinOutput")
if not os.path.exists(MODELS_OUTPUT_FOLDER):
    os.mkdir(MODELS_OUTPUT_FOLDER)
#MODELS
PMPNN_FOLDER = os.path.join(INSTALLATION_FOLDER,"ProteinMPNN")
COLAB_FOLD_FOLDER = os.path.join(INSTALLATION_FOLDER,"localcolabfold")
OMEGA_FOLDER = os.path.join(INSTALLATION_FOLDER,"OmegaFold")
OUT_FOLDER = os.path.join(MODELS_OUTPUT_FOLDER,"ProteinMPNN")
if not os.path.exists(OUT_FOLDER):
    os.mkdir(OUT_FOLDER)

#This is used throughout to generate file and directory names so that old outputs are never overwritten
def unique_name(directory,root,ext = "",fullpath=0,w=3):
    """Return the first numbered name "<root>_<NNN><ext>" not yet present in directory.

    The counter starts at 1 and is zero-padded to at least w digits.
    Nothing is created on disk; only existence is checked.

    directory: folder in which existing names are looked up
    root: base of the generated name
    ext: suffix appended after the counter (e.g. ".sh")
    fullpath: if truthy, the result is joined with directory
    w: minimum number of digits of the counter
    """
    counter = 1
    while True:
        candidate = f"{root}_{counter:0>{w}}{ext}"
        if not os.path.exists(os.path.join(directory, candidate)):
            break
        counter += 1
    return os.path.join(directory, candidate) if fullpath else candidate

"""
SET JOB NAME AND CREATE OUTPUT DIRECTORIES
Every job gets its own numbered folder inside OUT_FOLDER
The generated shell scripts are stored in its "bash_files" subfolder
"""
#Trailing path separators are stripped first, otherwise a folder given as "name/" would produce an empty job name
JOB_BASE = NAME if NAME != "" else os.path.splitext(os.path.basename(PDB.rstrip(os.sep)))[0]
JOB = unique_name(OUT_FOLDER,JOB_BASE)
JOB_FOLDER = os.path.join(OUT_FOLDER,JOB)
os.mkdir(JOB_FOLDER) #Plain mkdir on purpose: the name was just checked to be unused
BASH_FOLDER = os.path.join(JOB_FOLDER,"bash_files")
os.makedirs(BASH_FOLDER, exist_ok=True)
#Output from console will also be redirected to log files
pmpnn_log_file = os.path.join(JOB_FOLDER,"pmpnn.log")
fold_log_file = os.path.join(JOB_FOLDER,f"{FOLD}.log")
ranking_log_file = os.path.join(JOB_FOLDER,"ranking.log")

"""ProteinMPNN"""
#Parse folder for multiple pdb input
parsed_pdbs_jsonl_file = os.path.join(JOB_FOLDER,f"{JOB}_parsed_pdbs.jsonl")
parse_py_file = os.path.join(PMPNN_FOLDER,"helper_scripts/parse_multiple_chains.py")
fixed_jsonl_file = os.path.join(JOB_FOLDER,f"{JOB}_fixed_pos.jsonl")
sele_csv_file = os.path.join(JOB_FOLDER,"pmpnn_sele.csv")

#Copy the input structure(s) into BASH_FOLDER
#pdb_file is the path of the single input pdb; it stays "" when a folder is given,
#so it is always defined and never picks up a value left over from a previous run of the cell
pdb_file = ""
pdb_temp = PDB if PDB.endswith(".pdb") else PDB + ".pdb"
pdb_temp_file = os.path.join(PDBs_FOLDER,pdb_temp)
if os.path.isfile(pdb_temp_file): #PDB is a file inside PDBs_FOLDER
    pdb_file = os.path.join(BASH_FOLDER,pdb_temp)
    shutil.copy(pdb_temp_file,pdb_file)
else: #otherwise it must be a folder (relative to the notebooks folder, or absolute)
    pdb_folder = os.path.join(os.getcwd(),PDB)
    if not os.path.isdir(pdb_folder):
        raise FileNotFoundError(f"PDB input not found: neither the file {pdb_temp_file} nor the folder {pdb_folder} exists")
    pdb_files = [pdb for pdb in os.listdir(pdb_folder) if pdb.endswith(".pdb")]
    #A separate loop variable is used so that pdb_file is not overwritten with a bare file name
    for pdb_name in pdb_files:
        shutil.copy(os.path.join(pdb_folder,pdb_name),os.path.join(BASH_FOLDER,pdb_name))

#Scripts called by the generated ProteinMPNN shell script
fixed_py_file = os.path.join(HELP_FOLDER,"make_fixed_dict.py") #Implementation of fixed positions
pmpnn_py_file = os.path.join(PMPNN_FOLDER,"protein_mpnn_run.py")

#Options forwarded to protein_mpnn_run.py
pmpnn_options = " ".join([
    f"--num_seq_per_target {NUM_SEQ}",
    f"--sampling_temp {SAMPLING_T}",
])

#Three steps: fixed positions, parsing of the pdbs, sequence generation
pmpnn_cmd = f"""
echo Determining fixed positions
python {fixed_py_file} {BASH_FOLDER} norfd {pLDDT_THRESHOLD} {FIXED} {FIXED_CHAIN} {fixed_jsonl_file} {sele_csv_file}
echo Parsing multiple pbds 
python {parse_py_file} --input_path {BASH_FOLDER} --output_path {parsed_pdbs_jsonl_file}
echo Running model
python {pmpnn_py_file} --jsonl_path {parsed_pdbs_jsonl_file} --fixed_positions_jsonl {fixed_jsonl_file} --out_folder {JOB_FOLDER} {pmpnn_options}
"""
pmpnn_sh_file = unique_name(BASH_FOLDER,"pmpnn",".sh",1)
with open(pmpnn_sh_file,'w') as sh_out:
    sh_out.write(pmpnn_cmd)
os.chmod(pmpnn_sh_file, 0o755) #Make the script executable

"""FOLD"""
#Fail early with a clear message: an unknown model would otherwise leave fold_sh_file empty and crash in os.chmod
if FOLD not in ("AlphaFold","OmegaFold"):
    raise ValueError(f'Unknown FOLD "{FOLD}": choices are "AF" (AlphaFold) and "OF" (OmegaFold)')
FOLD_OUT_FOLDER = os.path.join(JOB_FOLDER,FOLD)
os.makedirs(FOLD_OUT_FOLDER, exist_ok=True)
PMPNN_FA_FOLDER = os.path.join(JOB_FOLDER,"seqs") #This is the folder where pmpnn outputs fasta files
queries_csv_file = os.path.join(JOB_FOLDER,"queries.csv") #These will contain the queries
queries_fasta_file = os.path.join(JOB_FOLDER,"queries.fasta")
fa_to_csv_fasta_py_file = os.path.join(HELP_FOLDER,"fa_to_csv_fasta.py")

fold_sh_file = ""
if FOLD == "AlphaFold":
    #Only options that differ from the defaults are passed to colabfold_batch
    af2_options = ""
    if NUM_RELAX > 0: af2_options += f" --amber --use-gpu-relax --num-relax {NUM_RELAX}" #assumed to run on GPU
    if NUM_RECYCLE != 3: af2_options += f" --num-recycle {NUM_RECYCLE}"
    if RAND_SEED != 0: af2_options += f" --random-seed {RAND_SEED}"

    colabfold_batch_file = os.path.join(COLAB_FOLD_FOLDER,"colabfold-conda/bin/colabfold_batch")
    fold_sh_file = unique_name(BASH_FOLDER,"alphafold",".sh",1)
    af2_cmd = f"""
    echo Generating queries.csv file from .fa output
    python {fa_to_csv_fasta_py_file} {PMPNN_FA_FOLDER} {queries_csv_file} {queries_fasta_file}
    echo Initializing model...
    {colabfold_batch_file} {queries_csv_file} {FOLD_OUT_FOLDER} {af2_options}
    """
    with open(fold_sh_file,'w') as af2_sh:
        af2_sh.write(af2_cmd)
elif FOLD == "OmegaFold":
    #Explicit lookup: indexing a list would silently accept 0 and negative values
    of_weights = {1: "release1.pt", 2: "release2.pt"}
    if MODEL not in of_weights:
        raise ValueError(f"Unknown OmegaFold MODEL {MODEL}: choices are 1 and 2")
    of_options = ""
    of_options += f" --model {MODEL}"
    weights_pt = of_weights[MODEL]
    weights_pt_file = os.path.join(OMEGA_FOLDER,weights_pt)
    of_options += f" --weights_file {weights_pt_file}"
    fold_sh_file = unique_name(BASH_FOLDER,"omegafold",".sh",1)
    of_cmd = f"""
    echo Generating queries.csv file from .fa output
    python {fa_to_csv_fasta_py_file} {PMPNN_FA_FOLDER} {queries_csv_file} {queries_fasta_file}
    echo Initializing model...
    omegafold {queries_fasta_file} {FOLD_OUT_FOLDER} {of_options}
    """
    with open(fold_sh_file,'w') as of_sh:
        of_sh.write(of_cmd)
os.chmod(fold_sh_file, 0o755) #Give execution rights

"""RANK EVERYTHING"""
#"-" is passed to the ranking script when no single input pdb is set
rank_pdb = pdb_file or "-"
rank_output_csv_file = os.path.join(JOB_FOLDER,f"{JOB_BASE}.csv")
rank_pse_file = os.path.join(JOB_FOLDER,f"{JOB_BASE}_{FOLD}.pse")
rank_py_file = os.path.join(HELP_FOLDER,f"rank_{FOLD}.py")
#Positional arguments of the ranking script, in the order it receives them
rank_args = [
    rank_py_file,
    fold_log_file,
    queries_csv_file,
    NUM_SEQ,
    sele_csv_file,
    FOLD_OUT_FOLDER,
    rank_pdb,
    rank_output_csv_file,
    METRIC,
    rank_pse_file,
    PYMOL_BEST,
    ONLY_FIRST,
]
rank_cmd = "python " + " ".join(str(arg) for arg in rank_args)
rank_sh_file = unique_name(BASH_FOLDER,f"rank_{FOLD}",".sh",1)
with open(rank_sh_file,'w') as sh_out:
    sh_out.write(rank_cmd)
os.chmod(rank_sh_file, 0o755) #Make the script executable

"""COMBINE ALL IN A UNIQUE PIPELINE"""
#The pipeline script first echoes the parameters of the job, then activates the environment
#and runs the three generated scripts in order: ProteinMPNN, fold, ranking.
#The output of each script is piped through tee, so it is shown and also written to its log file.
pipeline_sh_file = unique_name(BASH_FOLDER,"pipeline",".sh",1)
#Parameters of both fold models are echoed, whichever one is selected in FOLD
pipeline_cmd = f"""
echo Pipeline: {os.path.basename(pipeline_sh_file)}
echo
echo Name: {NAME}
echo PDB: {PDB}


echo ProteinMPNN
echo   NUM SEQUENCES PER TARGET: {NUM_SEQ}
echo   FIXED: {FIXED}
echo   FIXED CHAIN: {FIXED_CHAIN} 
echo   pLDDT THR: {pLDDT_THRESHOLD}
echo   SAMPLING T: {SAMPLING_T}
echo
echo FOLD MODEL: {FOLD}
echo   AlphaFold
echo     NUM RELAX: {NUM_RELAX}
echo     NUM RECYCLE: {NUM_RECYCLE}
echo     RANDOM GENERATOR SEED: {RAND_SEED}
echo   OmegaFold
echo     MODEL: {MODEL}

source activate {ENVIRONMENT}
echo
echo ProteinMPNN
{pmpnn_sh_file} | tee {pmpnn_log_file}
echo
echo {FOLD}
{fold_sh_file} | tee {fold_log_file}
echo 
echo Ranking...
{rank_sh_file} | tee {ranking_log_file}
echo 
echo Job done
echo {JOB_FOLDER}
"""
#Write the script to disk
with open(pipeline_sh_file,'w') as pipeline_sh:
    pipeline_sh.write(pipeline_cmd)
os.chmod(pipeline_sh_file, 0o755) #Give execution rights

#The batch file lists the pipeline scripts to run, one path per line
batch_sh_file = os.path.join(PIPELINES_FOLDER,"pmpnn_batch.sh")
os.makedirs(PIPELINES_FOLDER, exist_ok=True) #open() below would fail if the folder were missing
#Append only when requested and when there is a batch file to append to; otherwise start a new one
append_to_batch = not ONE_JOB and os.path.exists(batch_sh_file)
with open(batch_sh_file,"a" if append_to_batch else "w") as pmpnn_batch_sh:
    if append_to_batch:
        pmpnn_batch_sh.write("\n") #Separate this pipeline from the previous ones
    pmpnn_batch_sh.write(pipeline_sh_file)
os.chmod(batch_sh_file, 0o755) #Give execution rights

print("Run using next cell")
print(f"Single job:\n{pipeline_sh_file}")
print(f"Batch:\n{batch_sh_file}")
print(f"\nOutput of this pipeline will be in:\n{JOB_FOLDER}")

In [None]:
%%bash
# Run the batch script, which lists the generated pipeline scripts one per line.
# NOTE(review): the location is hard-coded; presumably it matches PIPELINES_FOLDER only
# when the notebooks live in /home/$USER/ProteinNotebooks - confirm before running elsewhere.
/home/$USER/ProteinNotebooks/Pipelines/pmpnn_batch.sh

In [None]:
"""
MAKE SLURM FILE
"""
PACKAGE_MANAGER = "mamba"
GPU = "gpu" #One of "gpu" (first available), "t4", "v100", "v100-32g", "a100"

#The slurm script loads the two modules, then runs every pipeline listed in the batch file
slurm_file = unique_name(PIPELINES_FOLDER,"pmpnn_slurm",".sh",1)
with open(batch_sh_file,"r") as batch_in:
    all_pipelines = batch_in.readlines()
module_lines = [f"module load {PACKAGE_MANAGER}\n", f"module load {GPU}\n"]
with open(slurm_file,"w") as slurm_out:
    slurm_out.writelines(module_lines + all_pipelines)
os.chmod(slurm_file, 0o755) #Make the script executable

#Instructions for submitting the script, with the job.sh content to paste
job_instructions = f"""
ScienceApps > Jobs > JobComposer > New Job > From Default Template    
Edit Job name from Job Options.
Replace job.sh (Open in Editor) with the following (adapt required time hh:mm:ss) then Save:
      
#!/usr/bin/bash
#SBATCH --gpus=1
#SBATCH --mem=7800
#SBATCH --time=23:59:00
#SBATCH --output=job.out      
{slurm_file}
"""
print(job_instructions)

In [None]:
#Developer notes only: this cell holds two bare strings (naming conventions and cleanup steps)
#that are not assigned to any name and are not used by the other cells.
"""
Conventions for names
Paths to files end with _type_file
Paths to folders end with _FOLDER
Opened files .type end with _type
Content of .sh end with _cmd
Content of .py end with _script
"""

"""
Cleanup instructions
Delete data/bash_files
Delete outputs folder: contains log files
"""

**Instructions**
---
---

Use `contigs` to define continuous chains. Use a `:` to define multiple contigs and a `/` to define multiple segments within a contig.
For example:

**unconditional**
- `contigs='100'` - diffuse **monomer** of length 100
- `contigs='50:100'` - diffuse **hetero-oligomer** of lengths 50 and 100
- `contigs='50'` `symmetry='cyclic'` `order=2` - make two copies of the defined contig(s) and add a symmetry constraint, for **homo-oligomeric** diffusion.

**binder design**
- `contigs='A:50'` `pdb='4N5T'` - diffuse a **binder** of length 50 to chain A of defined PDB.
- `contigs='E6-155:70-100'` `pdb='5KQV'` `hotspot='E64,E88,E96'` - diffuse a **binder** of length 70 to 100 (sampled randomly) to chain E and defined hotspot(s).

**motif scaffolding**
 - `contigs='40/A163-181/40'` `pdb='5TPN'`
 - `contigs='A3-30/36/A33-68'` `pdb='6MRR'` - diffuse a loop of length 36 between two segments of defined PDB ranges.

**partial diffusion**
- `contigs=''` `pdb='6MRR'` - noise all coordinates
- `contigs='A1-10'` `pdb='6MRR'` - keep first 10 positions fixed, noise the rest
- `contigs='A'` `pdb='1SSC'` - fix chain A, noise the rest

*hints and tips*
- `pdb=''` leave blank to get an upload prompt
- `contigs='50-100'` use dash to specify a range of lengths to sample from