In [None]:
#Copyright © 2024 LOCBP @ University of Zürich
#Distributed under MIT license

APPEND_JOB = False #True: add this job to the existing batch script; False: the batch script will contain only this job
"""
INPUT PARAMETERS
"""
#AlphaFold
NAME = "test" #No whitespaces
SEQUENCE = "" #Instead of providing the sequence, you can provide the relative path to a csv file. Important: it must contain two columns named "id" and "sequence"
#Advanced
NUM_RELAX = 0 #How many of the best-ranked models do you want to relax with amber?
NUM_RECYCLE = 3 #Default (and recommended) is 3
RAND_SEED = 0 #0 means no --random-seed option is passed to AlphaFold
#Pymol
ONLY_FIRST = True #Only compare the best folding of each sequence generated
#DNA
DNA=True #Generates DNA sequences for Homo Sapiens and E coli

"""
CODE
"""
ONE_JOB = not APPEND_JOB
import os #WE USE ABSOLUTE PATHS
#CUDA
ENVIRONMENT = "ProteinEnv"
#GENERAL FOLDERS
NOTEBOOKS_FOLDER = os.getcwd()
PIPELINES_FOLDER = os.path.join(NOTEBOOKS_FOLDER,"Pipelines")
PDBs_FOLDER = os.path.join(NOTEBOOKS_FOLDER,"PDBs")
HELP_FOLDER = os.path.join(NOTEBOOKS_FOLDER,"HelpScripts")
#The user name is taken from the directory that contains the notebooks folder
USER_NAME = os.path.basename(os.path.dirname(NOTEBOOKS_FOLDER))
HOME_FOLDER = f"/home/{USER_NAME}"
DATA_FOLDER = f"/data/{USER_NAME}"
SCRATCH_FOLDER = f"/scratch/{USER_NAME}"
#MODIFIABLE FOLDERS
INSTALLATION_FOLDER = DATA_FOLDER #Where the models are installed
MODELS_OUTPUT_FOLDER = os.path.join(SCRATCH_FOLDER,"ProteinOutput") #Where to store output (subfolders will be created inside)
#makedirs also creates missing parent folders and does not fail if the folder already exists
os.makedirs(MODELS_OUTPUT_FOLDER, exist_ok=True)
#MODELS
COLAB_FOLD_FOLDER = os.path.join(INSTALLATION_FOLDER,"localcolabfold")
PROTEIN_OUTPUT_FOLDER = os.path.join(MODELS_OUTPUT_FOLDER,"ProteinOutput")
OUT_FOLDER = os.path.join(MODELS_OUTPUT_FOLDER,"AlphaFold")
os.makedirs(OUT_FOLDER, exist_ok=True)

#This will be used throughout to generate file and directory names to avoid overriding old outputs
def unique_name(directory, root, ext="", fullpath=0, w=3):
    """Return the first "<root>_<NNN><ext>" name that does not exist in directory.

    Counting starts at 1 and the counter is left-padded with zeros to at
    least ``w`` characters. Nothing is created on disk; the caller is expected
    to create the file or folder afterwards.

    directory: folder in which existing names are checked
    root:      name prefix placed before the counter
    ext:       suffix appended after the counter (e.g. ".sh")
    fullpath:  if truthy, return the name joined to directory
    w:         minimum width of the zero-padded counter
    """
    counter = 1
    while True:
        candidate = f"{root}_{counter:0>{w}}{ext}"
        if not os.path.exists(os.path.join(directory, candidate)):
            break
        counter += 1
    return os.path.join(directory, candidate) if fullpath else candidate

#SET JOB NAME AND CREATE OUTPUT DIRECTORIES
#Each run gets its own numbered folder (NAME_001, NAME_002, ...) inside OUT_FOLDER
#The scripts generated below are stored in its "RunTime" subfolder
JOB = unique_name(OUT_FOLDER, NAME)
JOB_FOLDER = os.path.join(OUT_FOLDER, JOB)
RUNTIME_FOLDER = os.path.join(JOB_FOLDER, "RunTime")
os.mkdir(JOB_FOLDER)
if not os.path.exists(RUNTIME_FOLDER):
    os.mkdir(RUNTIME_FOLDER)
#Output from console will also be redirected to log files (one per stage)
af2_log_file, af2_pse_log_file, dna_log_file = (
    os.path.join(JOB_FOLDER, log_name)
    for log_name in ("_af2.log", "_af2_pse.log", "_dna.log")
)

#ALPHAFOLD
#Queries come either from a user-provided csv (path relative to the working directory)
#or from a one-row csv written here from NAME and SEQUENCE
if not SEQUENCE.endswith(".csv"):
    queries_csv_file = os.path.join(JOB_FOLDER, "af2_queries.csv") #This will contain the queries
    with open(queries_csv_file, "w") as queries_csv:
        queries_csv.write("id,sequence\n" + f"{NAME},{SEQUENCE}")
else:
    queries_csv_file = os.path.join(os.getcwd(), SEQUENCE)

#Only settings that differ from the defaults are added to the command line
af2_options = "".join(
    option
    for enabled, option in (
        (NUM_RELAX > 0, f" --amber --use-gpu-relax --num-relax {NUM_RELAX}"), #assumed to run on GPU
        (NUM_RECYCLE != 3, f" --num-recycle {NUM_RECYCLE}"),
        (RAND_SEED != 0, f" --random-seed {RAND_SEED}"),
    )
    if enabled
)

colabfold_batch_file = os.path.join(COLAB_FOLD_FOLDER, "colabfold-conda/bin/colabfold_batch")
af2_sh_file = unique_name(RUNTIME_FOLDER, "af2", ".sh", 1)
af2_cmd = "\n".join(
    [
        "",
        "echo Initializing model...",
        f"{colabfold_batch_file} {queries_csv_file} {JOB_FOLDER} {af2_options}",
        "",
    ]
)
with open(af2_sh_file, "w") as af2_sh:
    af2_sh.write(af2_cmd)
os.chmod(af2_sh_file, 0o755) #Give execution rights

#PSE CREATION
#Writes a script that calls the make_pse_af2.py helper with the job folder,
#the .pse path, the .fasta path and the ONLY_FIRST flag
af2_pse_py_file = os.path.join(HELP_FOLDER, "make_pse_af2.py")
af2_fasta_file = os.path.join(JOB_FOLDER, f"{NAME}.fasta")
af2_pse_file = os.path.join(JOB_FOLDER, f"{NAME}.pse")
af2_pse_cmd = " ".join(
    [
        "python",
        af2_pse_py_file,
        JOB_FOLDER,
        af2_pse_file,
        af2_fasta_file,
        str(ONLY_FIRST),
    ]
)
#NOTE(review): the "pmpnn_af2_rank" prefix looks inherited from another pipeline - confirm
af2_pse_sh_file = unique_name(RUNTIME_FOLDER, "pmpnn_af2_rank", ".sh", 1)
with open(af2_pse_sh_file, "w") as pse_sh:
    pse_sh.write(af2_pse_cmd)
os.chmod(af2_pse_sh_file, 0o755) #Give execution rights

#DNA
#Writes a script that calls the dna_encoder.py helper with the job's .fasta path
#and an output prefix inside the job's "DNA" subfolder
DNA_FOLDER = os.path.join(JOB_FOLDER, "DNA")
if not os.path.exists(DNA_FOLDER):
    os.mkdir(DNA_FOLDER)
dna_prefix = os.path.join(DNA_FOLDER, NAME)
dna_encoder_py_file = os.path.join(HELP_FOLDER, "dna_encoder.py")
dna_encoder_sh_file = os.path.join(RUNTIME_FOLDER, "dna_encoder.sh")
dna_encoder_cmd = " ".join(("python", dna_encoder_py_file, af2_fasta_file, dna_prefix))
with open(dna_encoder_sh_file, "w") as dna_encoder_sh:
    dna_encoder_sh.write(dna_encoder_cmd)
os.chmod(dna_encoder_sh_file, 0o755) #Give execution rights

#COMBINE ALL IN A UNIQUE PIPELINE
pipeline_sh_file = unique_name(RUNTIME_FOLDER, "pipeline", ".sh", 1)

#Header echoing the job options; it is also saved as the job configuration file
print_options_cmd = "\n".join(
    [
        "",
        f"echo Pipeline: {os.path.basename(pipeline_sh_file)}",
        "echo",
        f"echo Name: {NAME}",
        f"echo SEQUENCE: {SEQUENCE}",
        "",
    ]
)

#Each stage script is run in turn with its console output copied to a log file
_pipeline_lines = [
    "",
    f"source activate {ENVIRONMENT}",
    "echo",
    "echo AlphaFold",
    f"{af2_sh_file} | tee {af2_log_file}",
    "echo",
    "echo PyMol",
    f"{af2_pse_sh_file} | tee {af2_pse_log_file}",
]
if DNA:
    _pipeline_lines += [
        "echo",
        "echo DNA",
        f"{dna_encoder_sh_file} | tee {dna_log_file}",
    ]
_pipeline_lines += [
    "echo ",
    "echo Job done",
    f"echo {JOB_FOLDER}",
    "",
]
pipeline_cmd = print_options_cmd + "\n".join(_pipeline_lines)

with open(pipeline_sh_file, "w") as pipeline_sh:
    pipeline_sh.write(pipeline_cmd)
os.chmod(pipeline_sh_file, 0o755) #Give execution rights

#write configuration file
config_txt_file = os.path.join(JOB_FOLDER, f"{NAME}_config.txt")
with open(config_txt_file, "w") as config_txt:
    config_txt.write(print_options_cmd)

#REGISTER THE PIPELINE IN THE BATCH SCRIPT
#The batch script contains one pipeline script path per line.
#With APPEND_JOB (i.e. not ONE_JOB) and an existing batch script the new pipeline
#is added at the end; otherwise the batch script is (re)written with this pipeline only.
os.makedirs(PIPELINES_FOLDER, exist_ok=True) #The folder may not exist yet; open() would fail without it
batch_sh_file = os.path.join(PIPELINES_FOLDER,"af2_batch.sh")
if not ONE_JOB and os.path.exists(batch_sh_file):
    #Append mode keeps the previous pipelines without reading and rewriting them
    with open(batch_sh_file,"a") as af2_batch_sh:
        af2_batch_sh.write("\n" + pipeline_sh_file)
else:
    with open(batch_sh_file,"w") as af2_batch_sh:
        af2_batch_sh.write(pipeline_sh_file)
os.chmod(batch_sh_file, 0o755) #Give execution rights

print("Run using next cell")
print(f"Single job:\n{pipeline_sh_file}")
print(f"Batch:\n{batch_sh_file}")
print(f"\nOutput of this pipeline will be in:\n{JOB_FOLDER}")

In [None]:
%%bash
# Runs the batch script, i.e. every pipeline listed in it.
# NOTE: the path is hard-coded and assumes the notebooks live in /home/$USER/ProteinNotebooks.
/home/$USER/ProteinNotebooks/Pipelines/af2_batch.sh

In [None]:
#MAKE SLURM FILE
PACKAGE_MANAGER = "mamba"
GPU = "gpu"
#GPUs available are:
#T4      0.022 CHF/h
#V100    0.057 CHF/h
#A100    0.081 CHF/h
#These can be allocated by setting GPU = "T4", "V100", or "A100"
#By setting GPU="gpu", the first available will be used
#By setting GPU="high-memory", either v100-32gb or a100 are selected.
#Important: OmegaFold model 2 only works with high-memory GPUs. Sometimes it fails with 32G as well, so it is safer to use A100

#The slurm script is: a GPU availability check, the module load line,
#and then a copy of every pipeline currently listed in the batch script
slurm_file = unique_name(PIPELINES_FOLDER,"af2_slurm",".sh",1)
with open(batch_sh_file,"r") as batch_sh:
    all_pipelines = batch_sh.readlines()
with open(slurm_file,"w") as slurm_bash:
    #"exit 1" (not a bare "exit", which would return the status of the echo, i.e. 0)
    #so that a job without a usable GPU is reported as failed
    slurm_bash.write("""# Check if nvidia-smi is available
if ! command -v nvidia-smi &> /dev/null
then
    echo "Could not load GPU correctly: nvidia-smi could not be found"
    exit 1
fi
gpu_type=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader)
echo "GPU Type: $gpu_type"\n
""")
    slurm_bash.write(f"module load {PACKAGE_MANAGER}\n")
    slurm_bash.writelines(all_pipelines)
os.chmod(slurm_file, 0o755) #Give execution rights

#Translate the GPU setting into the #SBATCH line(s) requesting the GPU
if GPU == "gpu":
    gpu_line = "#SBATCH --gpus=1"
elif GPU == "high-memory":
    gpu_line = '#SBATCH --gpus=1\n#SBATCH --constraint="GPUMEM32GB|GPUMEM80GB"'
else:
    gpu_line = "#SBATCH --gpus={}:1".format(GPU)

#Suggested job name: "<parent folder>: <job folder>"
job_comps = JOB_FOLDER.split('/')
job_name = "{}: {}".format(job_comps[-2], job_comps[-1])

#Instructions and job.sh content to paste in the job composer
#(trailing spaces inside the literals are part of the original text)
print("\n".join([
    "",
    "ScienceApps > Jobs > JobComposer > New Job > From Default Template    ",
    "Edit Job name from Job Options. Suggested:",
    "",
    f"{job_name}       ",
    "",
    "Replace job.sh (Open in Editor) with the following (adapt required time hh:mm:ss) then Save:",
    "      ",
    "#!/usr/bin/bash",
    f"{gpu_line}",
    "#SBATCH --mem=7800",
    "#SBATCH --time=23:59:00",
    "#SBATCH --output=job.out      ",
    f"{slurm_file}",
    "",
]))

In [None]:
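"""
Naming conventions
Variables holding the path to a file end with _<type>_file (e.g. af2_sh_file)
Variables holding the path to a folder end with _FOLDER
Open file handles for a .<type> file end with _<type> (e.g. af2_sh)
Variables holding the content of a .sh file end with _cmd
Variables holding the content of a .py file end with _script
"""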

"""
Cleanup instructions
Delete data/bash_files
Delete outputs folder: contains log files
"""