In [None]:
#Copyright © 2024 LOCBP @ University of Zürich
#Distributed under MIT license

#True: append this job to the existing batch script; False: the batch script is rewritten to contain only this job
APPEND_JOB = False 
"""
INPUT PARAMETERS
"""
#Do not use the words "design" or "rank" in names or PDB file names: they are keywords the pipeline code relies on

"""RFdiffusion"""
NAME = "" #Job name; if empty, the PDB base name is used. No whitespace allowed
PDB = "" #PDB file name (with or without ".pdb"), looked up in the "PDBs" folder next to the notebook (upload not implemented)
CONTIGS = "" #RFdiffusion contig string; the surrounding [] is added automatically
INPAINT = "" #Same format as contigs; determines which amino acids of the original pdb to mask
NUM_DESIGNS = 25 #Number of designs generated by the main RFdiffusion run
"""
In this notebook, a first RFdiffusion scan is made, with the number of designs equal to INPAINT_AUTO_NUM_DESIGNS.
No ProteinMPNN/folding is performed on these preliminary designs.
Instead, the generated structures are used to determine which amino acids of the fixed chains tend to be close to the designed ones (distance <= INPAINT_AUTO_DISTANCE).
If an amino acid is close to the designed ones at least INPAINT_AUTO_MIN_OCCURENCY times, it is chosen to be inpainted, unless it is part of the selection to be excluded (INPAINT_AUTO_EXCLUDE).
The resulting list of amino acids to be inpainted is combined with the one in INPAINT, and the rest of the pipeline proceeds as in the basic RFdiffusion notebook.
"""
INPAINT_AUTO_NUM_DESIGNS = 10 #Number of designs in the preliminary RFdiffusion scan
INPAINT_AUTO_MIN_OCCURENCY = INPAINT_AUTO_NUM_DESIGNS/5 #Converted to int further down, before it is used
INPAINT_AUTO_DISTANCE = 5 #Å
INPAINT_AUTO_EXCLUDE = "" #Pymol formatted selection
#Advanced
ACTIVE_SITE = False #True if you're scaffolding a very small motif
STEPS = 50 #Changing this is not recommended
PARTIAL_STEPS = 0 #Steps of partial diffusion. Contig string must be exactly the same length as the input protein (see official documentation)
REPRODUCIBLE = False #Passed to RFdiffusion as inference.deterministic
REPRODUCIBILITY_NUMBER = 0 #Passed to RFdiffusion as inference.design_startnum
"""ProteinMPNN"""
NUM_SEQ_PER_TARGET = 2
FIXED = "rfd" #Can be either "" (none) or "rfd" (scaffold is fixed) or a pymol-formatted selection e.g. "10-15+17+20-54" (Only with fixed contigs)
FIXED_CHAIN = "A" #To be implemented: possibility to fix more than one chain
#Advanced
SAMPLING_T = "0.1" #Passed to ProteinMPNN as --sampling_temp
"""FOLD"""
FOLD = "OF" #Choices are AF for AlphaFold and OF for OmegaFold
#AlphaFold
ONLY_FIRST = True #Only compare the best folding of each sequence generated 
#Advanced
NUM_RELAX = 0 #How many of the best-ranked models do you want to relax with amber?
NUM_RECYCLE = 3 #Default (and recommended) is 3
RAND_SEED = 0 #Only passed to AlphaFold when different from 0
#OmegaFold
MODEL = 2 #Model 2 can only be used with V100-32GB or A100-80GB
#Ranking
METRIC = "pLDDT" #"pLDDT" or "pTM" or "RMSD" (Fold) or "score" or "global_score" (PMPNN). pTM not available in omegafold
RMSD_ALIGNMENT = 'super' #'cealign' or 'align' or 'super' or 'fit'
PYMOL_BEST = min(max(5,int(NUM_DESIGNS/5)),20) #Create a pymol session containing the N best designs (a fifth of NUM_DESIGNS, kept between 5 and 20)
#Refolding and reranking for models other than alphafold (0 disables the extra AlphaFold step)
FOLD_BEST_WITH_ALPHAFOLD = PYMOL_BEST 
#DNA
DNA=True #Generates DNA sequences for Homo Sapiens and E coli

"""
CODE
"""
#Quick refinement of input
#A single job replaces the batch script; otherwise the job is appended to it
ONE_JOB = not APPEND_JOB
#Expand the short fold-model codes; any other value is left untouched
FOLD = {"AF": "AlphaFold", "OF": "OmegaFold"}.get(FOLD, FOLD)
#Empty selections become "-" so that they still fill their slot when passed as positional arguments to the helper scripts
INPAINT = "-" if INPAINT == "" else INPAINT
INPAINT_AUTO_EXCLUDE = "-" if INPAINT_AUTO_EXCLUDE == "" else INPAINT_AUTO_EXCLUDE
#The default is computed with a true division, so truncate it to a whole number of occurrences
INPAINT_AUTO_MIN_OCCURENCY = int(INPAINT_AUTO_MIN_OCCURENCY)

import os,shutil #WE USE ABSOLUTE PATHS
#CUDA
ENVIRONMENT = "ProteinEnv" #Environment activated at the start of every generated pipeline
#GENERAL FOLDERS
NOTEBOOKS_FOLDER = os.getcwd()
PIPELINES_FOLDER = os.path.join(NOTEBOOKS_FOLDER,"Pipelines")
PDBs_FOLDER = os.path.join(NOTEBOOKS_FOLDER,"PDBs")
HELP_FOLDER = os.path.join(NOTEBOOKS_FOLDER,"HelpScripts")
#The user name is taken from the folder that contains the notebooks folder
USER_NAME = os.path.basename(os.path.dirname(NOTEBOOKS_FOLDER))
HOME_FOLDER = f"/home/{USER_NAME}"
DATA_FOLDER = f"/data/{USER_NAME}"
SCRATCH_FOLDER = f"/scratch/{USER_NAME}"
#MODIFIABLE FOLDERS
INSTALLATION_FOLDER = DATA_FOLDER #Where the models are installed
MODELS_OUTPUT_FOLDER = os.path.join(SCRATCH_FOLDER,"ProteinOutput") #Where to store output (subfolders will be created inside)
#makedirs also creates missing parents and does not fail when the folder already exists
os.makedirs(MODELS_OUTPUT_FOLDER, exist_ok=True)
#MODELS
RFD_FOLDER = os.path.join(INSTALLATION_FOLDER,"RFdiffusion")
PMPNN_FOLDER = os.path.join(INSTALLATION_FOLDER,"ProteinMPNN")
OMEGA_FOLDER = os.path.join(INSTALLATION_FOLDER,"OmegaFold")
COLAB_FOLD_FOLDER = os.path.join(INSTALLATION_FOLDER,"localcolabfold")
OUT_FOLDER = os.path.join(MODELS_OUTPUT_FOLDER,"RFdiffusion_InPaint")
os.makedirs(OUT_FOLDER, exist_ok=True)

#This will be used throughout to generate file and directory names to avoid overriding old outputs
def unique_name(directory,root,ext = "",fullpath=0,w=3):
    """Return the first "<root>_<NNN><ext>" name that does not exist in directory.

    The counter starts at 1 and is zero-padded to w digits. When fullpath is
    truthy the name is returned joined to directory, otherwise only the bare
    file/folder name (including ext) is returned. Nothing is created on disk.
    """
    index = 1
    while True:
        candidate = f"{root}_{index:0>{w}}{ext}"
        if not os.path.exists(os.path.join(directory, candidate)):
            break
        index += 1
    return os.path.join(directory, candidate) if fullpath else candidate

"""
SET JOB NAME AND CREATE OUTPUT DIRECTORIES
Each job gets its own numbered folder inside OUT_FOLDER
A "RunTime" subfolder holds the generated scripts and intermediate files
"""
#The job is named after NAME or, when NAME is empty, after the PDB file without its extension
if NAME != "":
    JOB_BASE = NAME
else:
    JOB_BASE = os.path.splitext(os.path.basename(PDB))[0]
JOB = unique_name(OUT_FOLDER,JOB_BASE) #The numbered suffix keeps earlier runs with the same name untouched
JOB_FOLDER = os.path.join(OUT_FOLDER,JOB)
os.mkdir(JOB_FOLDER)
RUNTIME_FOLDER = os.path.join(JOB_FOLDER,"RunTime")
if not os.path.exists(RUNTIME_FOLDER):
    os.mkdir(RUNTIME_FOLDER)
#Console output of every pipeline stage is also redirected to a log file in the job folder
(
    pre_rfd_log_file,
    inpaint_log_file,
    rfd_log_file,
    pmpnn_log_file,
    fold_log_file,
    ranking_log_file,
    dna_log_file,
) = (
    os.path.join(JOB_FOLDER,f"_{stage}.log")
    for stage in ("pre_rfd","inpaint","rfd","pmpnn","fold","ranking","dna")
)

"""
PRELIMINARY RFdiffusion to determine inpainted positions
"""
PRE_RFD_JOB_FOLDER = os.path.join(JOB_FOLDER,"RFdiffusion-InPaint")
#Options are Hydra-style key=value overrides; the bracketed ones are single-quoted to protect them from the shell
pre_rfd_options = f"'contigmap.contigs=[{CONTIGS}]'"
if INPAINT != "-": pre_rfd_options += f" 'contigmap.inpaint_seq=[{INPAINT}]'" #"-" stands for "no user-defined inpaint"
pdb_file = PDB
if PDB != "": 
    pdb_temp = PDB if PDB.endswith(".pdb") else PDB + ".pdb"
    pdb_temp_file = os.path.join(PDBs_FOLDER,pdb_temp)
    pdb_file = os.path.join(RUNTIME_FOLDER,pdb_temp)
    shutil.copy(pdb_temp_file,pdb_file) #copy original input file into destination folder
    pre_rfd_options += f" inference.input_pdb={pdb_file}"
prefix = os.path.join(PRE_RFD_JOB_FOLDER,f"{JOB}_pre_design") #polyG Designs will be .._pre_design_0.pdb, .._pre_design_1.pdb, ...
pre_rfd_options += f" inference.output_prefix={prefix}"
pre_rfd_options += f" inference.num_designs={INPAINT_AUTO_NUM_DESIGNS}"
pre_rfd_options += f" inference.deterministic={REPRODUCIBLE}"
pre_rfd_options += f" inference.design_startnum={REPRODUCIBILITY_NUMBER}"
#An override must be ONE shell token: with spaces around "=" the shell passes "key", "=" and "value"
#as three separate arguments, which Hydra cannot parse
if STEPS != 50: pre_rfd_options += f" diffuser.T={STEPS}"
if PARTIAL_STEPS > 0: pre_rfd_options += f" diffuser.partial_T={PARTIAL_STEPS}"
#Absolute checkpoint path, so the script does not depend on the directory it is launched from
#NOTE(review): assumes the checkpoints live in <RFdiffusion>/models - confirm against the installation
if ACTIVE_SITE: pre_rfd_options += f" inference.ckpt_override_path={os.path.join(RFD_FOLDER,'models','ActiveSite_ckpt.pt')}"

inference_py_file = os.path.join(RFD_FOLDER,"scripts","run_inference.py")
pre_rfd_cmd = f"""
{inference_py_file} {pre_rfd_options}
"""
pre_rfd_sh_file = os.path.join(RUNTIME_FOLDER,"pre_rfd.sh")
with open(pre_rfd_sh_file ,"w") as pre_rfd_sh:
    pre_rfd_sh.write(pre_rfd_cmd)
os.chmod(pre_rfd_sh_file, 0o755) #By default, the bash is created without execution rights, we add them here


"""
RFDiffusion
"""
RFD_JOB_FOLDER = os.path.join(JOB_FOLDER,"RFdiffusion")
#Options are Hydra-style key=value overrides; the bracketed one is single-quoted to protect it from the shell
rfd_options = f"'contigmap.contigs=[{CONTIGS}]'"
#####Inpaint will be handled afterwards
pdb_file = PDB
if PDB != "": 
    pdb_temp = PDB if PDB.endswith(".pdb") else PDB + ".pdb"
    pdb_temp_file = os.path.join(PDBs_FOLDER,pdb_temp)
    pdb_file = os.path.join(RUNTIME_FOLDER,pdb_temp)
    shutil.copy(pdb_temp_file,pdb_file) #copy original input file into destination folder
    rfd_options += f" inference.input_pdb={pdb_file}"
prefix = os.path.join(RFD_JOB_FOLDER,f"{JOB}_design") #polyG Designs will be .._design_0.pdb, .._design_1.pdb, ...
rfd_options += f" inference.output_prefix={prefix}"
rfd_options += f" inference.num_designs={NUM_DESIGNS}"
rfd_options += f" inference.deterministic={REPRODUCIBLE}"
rfd_options += f" inference.design_startnum={REPRODUCIBILITY_NUMBER}"
#An override must be ONE shell token: with spaces around "=" the shell passes "key", "=" and "value"
#as three separate arguments, which Hydra cannot parse
if STEPS != 50: rfd_options += f" diffuser.T={STEPS}"
if PARTIAL_STEPS > 0: rfd_options += f" diffuser.partial_T={PARTIAL_STEPS}"
#Absolute checkpoint path, so the script does not depend on the directory it is launched from
#NOTE(review): assumes the checkpoints live in <RFdiffusion>/models - confirm against the installation
if ACTIVE_SITE: rfd_options += f" inference.ckpt_override_path={os.path.join(RFD_FOLDER,'models','ActiveSite_ckpt.pt')}"

inference_py_file = os.path.join(RFD_FOLDER,"scripts","run_inference.py")
rfd_cmd = f"""
{inference_py_file} {rfd_options}
"""
rfd_sh_file = os.path.join(RUNTIME_FOLDER,"rfd.sh")
with open(rfd_sh_file ,"w") as rfd_sh:
    rfd_sh.write(rfd_cmd)
os.chmod(rfd_sh_file, 0o755) #By default, the bash is created without execution rights, we add them here

"""
Determine which positions to inpaint from the preliminary designs
find_inpaint.py (helper script, not part of this notebook) receives the path of rfd_sh_file so that the inpaint flag can be added to it
"""
inpaint_py_file = os.path.join(HELP_FOLDER,"find_inpaint.py")
inpaint_csv_file = os.path.join(JOB_FOLDER,JOB_BASE+"_inpaint.csv")
inpaint_pse_file = os.path.join(JOB_FOLDER,JOB_BASE+"_inpaint.pse")
inpaint_sh_file = os.path.join(RUNTIME_FOLDER,"find_inpaint.sh")
#The helper reads its arguments by position, so the free-text ones are double-quoted:
#a contig with a chain break ("/0 "), a selection containing spaces or an empty CONTIGS
#would otherwise be split or dropped by the shell and shift every following argument
inpaint_cmd = f"""
python {inpaint_py_file} {PRE_RFD_JOB_FOLDER} {pre_rfd_log_file} {rfd_sh_file} "{CONTIGS}" "{INPAINT}" {INPAINT_AUTO_NUM_DESIGNS} {INPAINT_AUTO_DISTANCE} {INPAINT_AUTO_MIN_OCCURENCY} "{INPAINT_AUTO_EXCLUDE}" {pdb_file} {inpaint_csv_file} {inpaint_pse_file}
"""
with open(inpaint_sh_file,'w') as inpaint_sh:
    inpaint_sh.write(inpaint_cmd)
os.chmod(inpaint_sh_file, 0o755) #Give execution rights

"""ProteinMPNN"""
#Parse folder for multiple pdb input
parsed_pdbs_jsonl_file = os.path.join(RUNTIME_FOLDER,f"{JOB}_parsed_pdbs.jsonl")
parse_py_file = os.path.join(PMPNN_FOLDER,"helper_scripts/parse_multiple_chains.py")
fixed_jsonl_file = os.path.join(RUNTIME_FOLDER,f"{JOB}_fixed_pos.jsonl")
sele_csv_file = os.path.join(RUNTIME_FOLDER,"pmpnn_sele.csv")

#IMPLEMENTATION OF FIXED POSITIONS
fixed_py_file = os.path.join(HELP_FOLDER,"make_fixed_dict.py")

#Set options
pmpnn_options = f"--num_seq_per_target {NUM_SEQ_PER_TARGET}"
pmpnn_options += f" --sampling_temp {SAMPLING_T}"
pmpnn_py_file = os.path.join(PMPNN_FOLDER,"protein_mpnn_run.py")

pmpnn_sh_file = os.path.join(RUNTIME_FOLDER,"rfd_pmpnn.sh")
#FIXED is double-quoted: "" (no fixed positions) is a documented value and, unquoted, it would
#disappear from the command line and shift the positional arguments that follow it
#NOTE(review): confirm that make_fixed_dict.py treats an empty string as "nothing fixed"
#NOTE(review): the literal 100 is handed to make_fixed_dict.py as-is; its meaning is not visible from this notebook
pmpnn_cmd = f"""
echo Determining fixed positions
python {fixed_py_file} {RFD_JOB_FOLDER} {rfd_log_file} 100 "{FIXED}" {FIXED_CHAIN} {fixed_jsonl_file} {sele_csv_file}
echo Parsing multiple pbds 
python {parse_py_file} --input_path {RFD_JOB_FOLDER} --output_path {parsed_pdbs_jsonl_file}
echo Running model
python {pmpnn_py_file} --jsonl_path {parsed_pdbs_jsonl_file} --fixed_positions_jsonl {fixed_jsonl_file} --out_folder {JOB_FOLDER} {pmpnn_options}
"""
with open(pmpnn_sh_file,'w') as pmpnn_sh:
    pmpnn_sh.write(pmpnn_cmd)
os.chmod(pmpnn_sh_file, 0o755) #Give execution rights

"""FOLD"""
#Validate before anything is created: an unknown model would otherwise leave fold_sh_file empty
#and only fail later, at chmod, with an unrelated-looking "file not found" error
if FOLD not in ("AlphaFold","OmegaFold"):
    raise ValueError(f'FOLD must be "AF" (AlphaFold) or "OF" (OmegaFold), got {FOLD!r}')
FOLD_OUT_FOLDER = os.path.join(JOB_FOLDER,FOLD)
if not os.path.exists(FOLD_OUT_FOLDER):
    os.mkdir(FOLD_OUT_FOLDER)
PMPNN_FA_FOLDER = os.path.join(JOB_FOLDER,"seqs") #This is the folder where pmpnn outputs fasta files
queries_csv_file = os.path.join(RUNTIME_FOLDER,JOB_BASE+"_queries.csv") #These will contain the queries
queries_fasta_file = os.path.join(JOB_FOLDER,JOB_BASE+"_queries.fasta")
fa_to_csv_fasta_py_file = os.path.join(HELP_FOLDER,"fa_to_csv_fasta.py")

fold_sh_file = ""
if FOLD == "AlphaFold":
    #Only non-default settings are added to the colabfold_batch command line
    af2_options = ""
    if NUM_RELAX > 0: af2_options += f" --amber --use-gpu-relax --num-relax {NUM_RELAX}" #assumed to run on GPU
    if NUM_RECYCLE != 3: af2_options += f" --num-recycle {NUM_RECYCLE}"
    if RAND_SEED != 0: af2_options += f" --random-seed {RAND_SEED}"

    colabfold_batch_file = os.path.join(COLAB_FOLD_FOLDER,"colabfold-conda/bin/colabfold_batch")
    fold_sh_file = os.path.join(RUNTIME_FOLDER,"rfd_pmpnn_af2.sh")
    af2_cmd = f"""
    echo Generating queries.csv file from .fa output
    python {fa_to_csv_fasta_py_file} {PMPNN_FA_FOLDER} {queries_csv_file} {queries_fasta_file}
    echo Initializing model...
    {colabfold_batch_file} {queries_csv_file} {FOLD_OUT_FOLDER} {af2_options}
    """
    with open(fold_sh_file,'w') as af2_sh:
        af2_sh.write(af2_cmd)
elif FOLD == "OmegaFold":
    #Only weights for models 1 and 2 are known here; reject anything else instead of silently
    #picking an empty or wrong file name (0 and negative numbers used to slip through the list lookup)
    if MODEL not in (1,2):
        raise ValueError(f"MODEL must be 1 or 2 for OmegaFold, got {MODEL!r}")
    of_options = ""
    of_options += f" --model {MODEL}"
    weights_pt = ["","release1.pt","release2.pt"][MODEL]
    weights_pt_file = os.path.join(OMEGA_FOLDER,weights_pt)
    of_options += f" --weights_file {weights_pt_file}"
    fold_sh_file = os.path.join(RUNTIME_FOLDER,"rfd_pmpnn_of.sh")
    of_cmd = f"""
    echo Generating queries.csv file from .fa output
    python {fa_to_csv_fasta_py_file} {PMPNN_FA_FOLDER} {queries_csv_file} {queries_fasta_file}
    echo Initializing model...
    omegafold {queries_fasta_file} {FOLD_OUT_FOLDER} {of_options}
    """
    with open(fold_sh_file,'w') as of_sh:
        of_sh.write(of_cmd)
os.chmod(fold_sh_file, 0o755) #Give execution rights

"""RANK EVERYTHING"""
#"-" tells the ranking script that there is no input structure
rank_pdb = "-" if pdb_file == "" else pdb_file
rank_output_csv_file = os.path.join(JOB_FOLDER,f"{JOB_BASE}_{FOLD}.csv")
rank_best_fasta_file = os.path.join(JOB_FOLDER,f"{JOB_BASE}_Best{PYMOL_BEST}.fasta")
rank_pse_file = os.path.join(JOB_FOLDER,f"{JOB_BASE}_{FOLD}.pse")
rank_py_file = os.path.join(HELP_FOLDER,f"rank_{FOLD}.py") #One ranking helper per fold model
#The helper takes positional arguments, in exactly this order
rank_arguments = [
    fold_log_file,
    queries_csv_file,
    NUM_SEQ_PER_TARGET,
    sele_csv_file,
    FOLD_OUT_FOLDER,
    rank_pdb,
    rank_output_csv_file,
    METRIC,
    RMSD_ALIGNMENT,
    rank_pse_file,
    PYMOL_BEST,
    rank_best_fasta_file,
    ONLY_FIRST,
]
rank_cmd = " ".join(str(argument) for argument in ["python", rank_py_file] + rank_arguments)
rank_sh_file = os.path.join(RUNTIME_FOLDER,f"rank_{FOLD}.sh")
with open(rank_sh_file,'w') as rank_sh:
    rank_sh.write(rank_cmd)
os.chmod(rank_sh_file, 0o755) #Give execution rights

"""DNA"""
#The folder and the script are always prepared; the DNA flag only decides whether the pipeline runs the script
DNA_FOLDER=os.path.join(JOB_FOLDER,"DNA")
if not os.path.exists(DNA_FOLDER):
    os.mkdir(DNA_FOLDER)
dna_prefix = os.path.join(DNA_FOLDER,JOB_BASE)
dna_encoder_py_file = os.path.join(HELP_FOLDER,"dna_encoder.py")
#The encoder is fed the fasta file with the best sequences written by the ranking step
dna_encoder_cmd = " ".join(["python", dna_encoder_py_file, rank_best_fasta_file, dna_prefix])
dna_encoder_sh_file = os.path.join(RUNTIME_FOLDER,"dna_encoder.sh")
with open(dna_encoder_sh_file,'w') as dna_encoder_sh:
    dna_encoder_sh.write(dna_encoder_cmd)
os.chmod(dna_encoder_sh_file, 0o755) #Give execution rights

"""Additional step: fold best ones with AlphaFold"""
#Only needed when the first folding round used another model and at least one design is requested
if FOLD != "AlphaFold" and FOLD_BEST_WITH_ALPHAFOLD > 0:
    refold_log_file = os.path.join(JOB_FOLDER,"_af2_best_fold.log")
    reranking_log_file = os.path.join(JOB_FOLDER,"_reranking.log")

    ALPHAFOLD_OUT_FOLDER = os.path.join(JOB_FOLDER,"AlphaFold")
    if not os.path.exists(ALPHAFOLD_OUT_FOLDER):
        os.mkdir(ALPHAFOLD_OUT_FOLDER)

    #best_queries.py is given the first ranking and writes the csv with the queries to refold
    best_queries_py_file = os.path.join(HELP_FOLDER,"best_queries.py")
    best_queries_csv_file = os.path.join(RUNTIME_FOLDER,"best_queries.csv")

    #Same AlphaFold settings as in the main folding step: only non-default values are passed
    af2_options = ""
    if NUM_RELAX > 0:
        af2_options += f" --amber --use-gpu-relax --num-relax {NUM_RELAX}" #assumed to run on GPU
    if NUM_RECYCLE != 3:
        af2_options += f" --num-recycle {NUM_RECYCLE}"
    if RAND_SEED != 0:
        af2_options += f" --random-seed {RAND_SEED}"

    colabfold_batch_file = os.path.join(COLAB_FOLD_FOLDER,"colabfold-conda/bin/colabfold_batch")
    af2_refolding_sh_file = os.path.join(RUNTIME_FOLDER,"rfd_pmpnn_refold_af2.sh")
    af2_refolding_cmd = f"""
    echo Selecting best queries {FOLD} output
    python {best_queries_py_file} {queries_csv_file} {rank_output_csv_file} {FOLD_BEST_WITH_ALPHAFOLD} {best_queries_csv_file}
    echo Initializing model...
    {colabfold_batch_file} {best_queries_csv_file} {ALPHAFOLD_OUT_FOLDER} {af2_options}
    """
    with open(af2_refolding_sh_file,'w') as refold_sh:
        refold_sh.write(af2_refolding_cmd)
    os.chmod(af2_refolding_sh_file, 0o755) #Give execution rights

    """RERANK EVERYTHING"""
    #rank_py_file and rank_cmd are reused on purpose: the first ranking script is already on disk at this point
    rerank_output_csv_file = os.path.join(JOB_FOLDER,f"{JOB_BASE}_AlphaFold.csv")
    rerank_pse_file = os.path.join(JOB_FOLDER,f"{JOB_BASE}_AlphaFold.pse")
    rank_py_file = os.path.join(HELP_FOLDER,"rank_AlphaFold.py")
    rank_cmd = f"python {rank_py_file} {refold_log_file} {best_queries_csv_file} {NUM_SEQ_PER_TARGET} {sele_csv_file} {ALPHAFOLD_OUT_FOLDER} {rank_pdb} {rerank_output_csv_file} {METRIC} {RMSD_ALIGNMENT} {rerank_pse_file} {PYMOL_BEST}  {rank_best_fasta_file} {ONLY_FIRST}"
    rerank_sh_file = os.path.join(RUNTIME_FOLDER,"rerank_af2.sh")
    with open(rerank_sh_file,'w') as rerank_sh:
        rerank_sh.write(rank_cmd)
    os.chmod(rerank_sh_file, 0o755) #Give execution rights

"""COMBINE ALL IN A UNIQUE PIPELINE"""
#Full path of a not-yet-existing, numbered pipeline script inside the RunTime folder
pipeline_sh_file = unique_name(RUNTIME_FOLDER,"RFdiffusion_InPaint_pipeline",".sh",1)
#Shell snippet that echoes the job settings; it opens the pipeline script and is also
#written as-is to the job's configuration file at the end of this cell
print_options_cmd = f"""
echo Pipeline: {os.path.basename(pipeline_sh_file)}
echo
echo "Name: {NAME}"
echo "PDB: {PDB}"
echo "PDB PATH: {pdb_file}"
echo
echo "Determination of inpainted positions"
echo "  NUM DESIGNS: {INPAINT_AUTO_NUM_DESIGNS}"
echo "  MAX DISTANCE: {INPAINT_AUTO_DISTANCE}"
echo "  MIN OCCURANCY: {INPAINT_AUTO_MIN_OCCURENCY}"
echo "  EXCLUDE: {INPAINT_AUTO_EXCLUDE}"
echo
echo "RFdiffusion"
echo "  CONTIGS: {CONTIGS}"
echo "  NUM DESIGNS: {NUM_DESIGNS}"
echo "  ACTIVE SITE: {ACTIVE_SITE}"
echo "  STEPS: {STEPS}"
echo "  PARTIAL STEPS: {PARTIAL_STEPS}"
echo
echo "ProteinMPNN"
echo "  NUM SEQUENCES PER TARGET: {NUM_SEQ_PER_TARGET}"
echo "  FIXED: {FIXED}"
echo "  FIXED CHAIN: {FIXED_CHAIN}"
echo "  SAMPLING T: {SAMPLING_T}"
echo
echo "FOLD MODEL: {FOLD}"
echo "  AlphaFold"
echo "    NUM RELAX: {NUM_RELAX}"
echo "    NUM RECYCLE: {NUM_RECYCLE}"
echo "    RANDOM GENERATOR SEED: {RAND_SEED}"
echo "  OmegaFold"
echo "    MODEL: {MODEL}"
echo
echo "Alignment algorithm: {RMSD_ALIGNMENT}"
"""
#The pipeline script is assembled from sections, in execution order:
#settings echo, the fixed stages, the optional stages and finally the packaging of the results.
#Each stage runs its own script and duplicates the console output into its log file with tee
pipeline_sections = [print_options_cmd]
pipeline_sections.append(f"""
source activate {ENVIRONMENT}
echo
echo Preliminary RFdiffusion
echo
{pre_rfd_sh_file} | tee {pre_rfd_log_file}
echo Inpainting determination
echo
{inpaint_sh_file} | tee {inpaint_log_file}
echo
echo RFDiffusion
{rfd_sh_file} | tee {rfd_log_file}
echo
echo ProteinMPNN
{pmpnn_sh_file} | tee {pmpnn_log_file}
echo
echo {FOLD}
{fold_sh_file} | tee {fold_log_file}
echo 
echo Ranking...
{rank_sh_file} | tee {ranking_log_file}
""")
#Optional stage: DNA encoding of the best sequences
if DNA:
    pipeline_sections.append(f"""
echo
echo DNA
{dna_encoder_sh_file} | tee {dna_log_file}
""")
#Optional stage: refold the best designs with AlphaFold when another model did the first folding
if FOLD != "AlphaFold" and FOLD_BEST_WITH_ALPHAFOLD > 0:
    pipeline_sections.append(f"""
echo
echo Refolding best {FOLD_BEST_WITH_ALPHAFOLD} proteins with AlphaFold
echo
echo AlphaFold
{af2_refolding_sh_file} | tee {refold_log_file}
echo 
echo Ranking...
{rerank_sh_file} | tee {reranking_log_file}
""")
pipeline_sections.append("""
echo 
echo Job done
""")
#Last stage: zip the job folder and tell the user where the results are
zip_py_file = os.path.join(HELP_FOLDER,"zip_results.py")
pipeline_sections.append(f"""
echo
echo Zipping results
python {zip_py_file} {JOB_FOLDER}
echo 
echo Results in:
echo {JOB_FOLDER}
""")
pipeline_cmd = "".join(pipeline_sections)
with open(pipeline_sh_file,'w') as pipeline_sh:
    pipeline_sh.write(pipeline_cmd)
os.chmod(pipeline_sh_file, 0o755) #Give execution rights

#The batch script lists one pipeline script per line
batch_sh_file = os.path.join(PIPELINES_FOLDER,"rfd_batch.sh")
#Pipelines already queued are kept only when appending to an existing batch script;
#in every other case the batch script ends up containing just the new pipeline
appending = (not ONE_JOB) and os.path.exists(batch_sh_file)
previous_pipelines = []
if appending:
    with open(batch_sh_file,"r") as queued_sh:
        previous_pipelines = queued_sh.readlines()
with open(batch_sh_file,"w") as rfd_batch_sh:
    if appending:
        rfd_batch_sh.writelines(previous_pipelines)
        rfd_batch_sh.write("\n")
    rfd_batch_sh.write(pipeline_sh_file)
os.chmod(batch_sh_file, 0o755) #Give execution rights

#Keep a record of the job settings next to the results (same text the pipeline echoes at start-up)
config_txt_file = os.path.join(JOB_FOLDER,f"{JOB_BASE}_config.txt")
with open(config_txt_file,'w') as config_txt:
    config_txt.write(print_options_cmd)

#Tell the user what was generated and where the output will appear
for message in (
    "Run using next cell",
    f"Single job:\n{pipeline_sh_file}",
    f"Batch:\n{batch_sh_file}",
    f"\nOutput of this pipeline will be in:\n{JOB_FOLDER}",
):
    print(message)

In [None]:
%%bash
# Runs the batch script written by the previous cell.
# The path is hard-coded: it assumes the notebooks live in /home/$USER/ProteinNotebooks
/home/$USER/ProteinNotebooks/Pipelines/rfd_batch.sh

In [None]:
"""
MAKE SLURM FILE
"""
#Module loaded with "module load" at the top of the generated SLURM wrapper script
PACKAGE_MANAGER = "mamba"
#"gpu" and "high-memory" are handled specially below; any other value is inserted as-is into "#SBATCH --gpus=<GPU>:1"
#NOTE(review): the default is upper case while the names listed below are lower case - confirm which spelling the cluster expects
GPU = "A100" 
"""
GPUs available are:
t4      0.022 CHF/h
v100    0.057 CHF/h
a100    0.081 CHF/h
These can be allocated by setting GPU = "t4", "v100", "v100-32g" or "a100"
By setting GPU="gpu", the first available will be used
By setting GPU="high-memory", either v100-32gb or a100 are selected. 
Important: OmegaFold model 2 only works with high-memory GPUs, sometimes crashes even with 32 GB
"""

#Relies on unique_name, PIPELINES_FOLDER and batch_sh_file defined in the first cell
slurm_file = unique_name(PIPELINES_FOLDER,"rfd_ip_slurm",".sh",1)
#The wrapper is a snapshot of the batch script as it is now, preceded by a GPU check and the module load
with open(batch_sh_file,"r") as rfd_batch_sh:
    all_pipelines = rfd_batch_sh.readlines()
with open(slurm_file,"w") as slurm_bash:
    #"exit 1" (not a bare "exit"): a bare exit returns the status of the preceding echo, i.e. 0,
    #so a job that could not find its GPU would be reported as successful
    slurm_bash.write("""# Check if nvidia-smi is available
if ! command -v nvidia-smi &> /dev/null
then
    echo "Could not load GPU correctly: nvidia-smi could not be found"
    exit 1
fi
gpu_type=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader)
echo "GPU Type: $gpu_type"\n
""")
    slurm_bash.write(f"module load {PACKAGE_MANAGER}\n")
    slurm_bash.writelines(all_pipelines)
os.chmod(slurm_file, 0o755)

#Translate the GPU choice into the matching #SBATCH request line(s);
#anything that is not a special keyword is used as the GPU type name
special_gpu_requests = {
    "high-memory": "#SBATCH --gpus=1\n#SBATCH --constraint=\"GPUMEM32GB|GPUMEM80GB\"",
    "gpu": "#SBATCH --gpus=1",
}
gpu_line = special_gpu_requests.get(GPU, f"#SBATCH --gpus={GPU}:1")

#Suggested job name: "<parent folder of the job>: <job folder>"
job_comps=JOB_FOLDER.split('/')
job_name="{}: {}".format(job_comps[-2], job_comps[-1])

#Instructions for submitting the wrapper script through the job composer
print(f"""
ScienceApps > Jobs > JobComposer > New Job > From Default Template    
Edit Job name from Job Options. Suggested:

{job_name}       

Replace job.sh (Open in Editor) with the following (adapt required time hh:mm:ss) then Save:
      
#!/usr/bin/bash
{gpu_line}
#SBATCH --mem=15650
#SBATCH --time=23:59:00
#SBATCH --output=job.out      
{slurm_file}
""")

In [None]:
"""
Conventions for names
Paths to files end with _type_file
Paths to folders end with _FOLDER
Opened files .type end with _type
Content of .sh end with _cmd
Content of .py end with _script
"""

"""
Cleanup instructions
Delete data/bash_files
Delete outputs folder: contains log files
"""

**Instructions**
---
---

Use `contigs` to define continuous chains. Use a `:` to define multiple contigs and a `/` to define multiple segments within a contig.
For example:

**unconditional**
- `contigs='100'` - diffuse **monomer** of length 100
- `contigs='50:100'` - diffuse **hetero-oligomer** of lengths 50 and 100
- `contigs='50'` `symmetry='cyclic'` `order=2` - make two copies of the defined contig(s) and add a symmetry constraint, for **homo-oligomeric** diffusion.

**binder design**
- `contigs='A:50'` `pdb='4N5T'` - diffuse a **binder** of length 50 to chain A of defined PDB.
- `contigs='E6-155:70-100'` `pdb='5KQV'` `hotspot='E64,E88,E96'` - diffuse a **binder** of length 70 to 100 (sampled randomly) to chain E and defined hotspot(s).

**motif scaffolding**
 - `contigs='40/A163-181/40'` `pdb='5TPN'`
 - `contigs='A3-30/36/A33-68'` `pdb='6MRR'` - diffuse a loop of length 36 between two segments of defined PDB ranges.

**partial diffusion**
- `contigs=''` `pdb='6MRR'` - noise all coordinates
- `contigs='A1-10'` `pdb='6MRR'` - keep first 10 positions fixed, noise the rest
- `contigs='A'` `pdb='1SSC'` - fix chain A, noise the rest

*hints and tips*
- `pdb=''` leave blank to get an upload prompt
- `contigs='50-100'` use dash to specify a range of lengths to sample from