<img src="https://raw.githubusercontent.com/sokrypton/ColabFold/main/.github/ColabFold_Marv_Logo_Small.png" height="200" align="right" style="height:240px">

## ColabFold v1.5.2-patch: AlphaFold2 using MMseqs2

Easy-to-use protein structure and complex prediction using [AlphaFold2](https://www.nature.com/articles/s41586-021-03819-2) and [AlphaFold2-multimer](https://www.biorxiv.org/content/10.1101/2021.10.04.463034v1). Sequence alignments/templates are generated through [MMseqs2](https://mmseqs.com) and [HHsearch](https://github.com/soedinglab/hh-suite). For more details, see the <a href="#Instructions">bottom</a> of the notebook, check out the [ColabFold GitHub](https://github.com/sokrypton/ColabFold) and read our manuscript.
Old versions: [v1.4](https://colab.research.google.com/github/sokrypton/ColabFold/blob/v1.4.0/AlphaFold2.ipynb), [v1.5.1](https://colab.research.google.com/github/sokrypton/ColabFold/blob/v1.5.1/AlphaFold2.ipynb)

[Mirdita M, Schütze K, Moriwaki Y, Heo L, Ovchinnikov S, Steinegger M. ColabFold: Making protein folding accessible to all.
*Nature Methods*, 2022](https://www.nature.com/articles/s41592-022-01488-1)

-----------

### News
- <b><font color='green'>2023/07/31: The ColabFold MSA server is back to normal. It was using an older DB (UniRef30 2202/PDB70 220313) from the 27th ~8:30 AM CEST to the 31st ~11:10 AM CEST.</font></b>
- <b><font color='green'>2023/06/12: New databases! UniRef30 updated to 2023_02 and PDB to 230517. We now use PDB100 instead of PDB70 (see [notes](#pdb100)).</font></b>
- <b><font color='green'>2023/06/12: We introduced a new default pairing strategy: Previously, for multimer predictions with more than 2 chains, we only pair if all sequences taxonomically match ("complete" pairing). The new default "greedy" strategy pairs any taxonomically matching subsets.</font></b>

In [1]:
#@title Install dependencies
%%time
import os
from sys import version_info
python_version = f"{version_info.major}.{version_info.minor}"

USE_AMBER = True
USE_TEMPLATES = True
PYTHON_VERSION = python_version

if not os.path.isfile("COLABFOLD_READY"):
  print("installing colabfold...")
  os.system("pip install -q --no-warn-conflicts 'colabfold[alphafold-minus-jax] @ git+https://github.com/sokrypton/ColabFold'")
  os.system("pip install --upgrade dm-haiku")
  os.system("ln -s /usr/local/lib/python3.*/dist-packages/colabfold colabfold")
  os.system("ln -s /usr/local/lib/python3.*/dist-packages/alphafold alphafold")
  # patch for jax > 0.3.25
  os.system("sed -i 's/weights = jax.nn.softmax(logits)/logits=jnp.clip(logits,-1e8,1e8);weights=jax.nn.softmax(logits)/g' alphafold/model/modules.py")
  os.system("touch COLABFOLD_READY")

if USE_AMBER or USE_TEMPLATES:
  if not os.path.isfile("CONDA_READY"):
    print("installing conda...")
    os.system("wget -qnc https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh")
    os.system("bash Mambaforge-Linux-x86_64.sh -bfp /usr/local")
    os.system("mamba config --set auto_update_conda false")
    os.system("touch CONDA_READY")

if USE_TEMPLATES and not os.path.isfile("HH_READY") and USE_AMBER and not os.path.isfile("AMBER_READY"):
  print("installing hhsuite and amber...")
  os.system(f"mamba install -y -c conda-forge -c bioconda kalign2=2.04 hhsuite=3.3.0 openmm=7.7.0 python='{PYTHON_VERSION}' pdbfixer")
  os.system("touch HH_READY")
  os.system("touch AMBER_READY")
else:
  if USE_TEMPLATES and not os.path.isfile("HH_READY"):
    print("installing hhsuite...")
    os.system(f"mamba install -y -c conda-forge -c bioconda kalign2=2.04 hhsuite=3.3.0 python='{PYTHON_VERSION}'")
    os.system("touch HH_READY")
  if USE_AMBER and not os.path.isfile("AMBER_READY"):
    print("installing amber...")
    os.system(f"mamba install -y -c conda-forge openmm=7.7.0 python='{PYTHON_VERSION}' pdbfixer")
    os.system("touch AMBER_READY")

installing colabfold...
installing conda...
installing hhsuite and amber...
CPU times: user 327 ms, sys: 60.7 ms, total: 388 ms
Wall time: 1min 32s


In [2]:
#Imports and default config

from google.colab import drive
from google.colab import files
import os
import re
import hashlib
import random

from pydrive.drive import GoogleDrive
from pydrive.auth import GoogleAuth
from google.colab import auth
from oauth2client.client import GoogleCredentials
import sys
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from Bio import BiopythonDeprecationWarning
warnings.simplefilter(action='ignore', category=BiopythonDeprecationWarning)
from pathlib import Path
from colabfold.download import download_alphafold_params, default_data_dir
from colabfold.utils import setup_logging
from colabfold.batch import get_queries, run, set_model_type
from colabfold.plot import plot_msa_v2

import os
import numpy as np
# Probe for Tesla K80 GPUs: the shell pipeline prints how many lines of the
# nvidia-smi report mention "Tesla K80".
try:
  with os.popen('nvidia-smi | grep "Tesla K80" | wc -l') as k80_pipe:
    K80_chk = k80_pipe.read()
except (OSError, ValueError):
  # The probe is best-effort: if it cannot be run or read, assume no K80.
  K80_chk = "0"

# Interpret the output as a number. A substring test such as `"1" in K80_chk`
# would miss a count of 2 (or any other count without the digit 1).
try:
  k80_count = int(K80_chk.strip())
except ValueError:
  k80_count = 0

if k80_count > 0:
  print("WARNING: found GPU Tesla K80: limited to total length < 1000")
  # Drop the memory-related settings if they are present in the environment.
  os.environ.pop("TF_FORCE_UNIFIED_MEMORY", None)
  os.environ.pop("XLA_PYTHON_CLIENT_MEM_FRACTION", None)

from colabfold.colabfold import plot_protein
from pathlib import Path
import matplotlib.pyplot as plt

# pdbfixer only imports when the interpreter's site-packages directory under
# /usr/local is on sys.path, so put it first when AMBER relaxation is enabled.
site_packages_dir = f"/usr/local/lib/python{python_version}/site-packages/"
if USE_AMBER and site_packages_dir not in sys.path:
    sys.path.insert(0, site_packages_dir)

# Mount Google Drive; the input and result folders used later live below it.
drive.mount('/content/drive')
print("You are logged into Google Drive and are good to go!")

Mounted at /content/drive
You are logged into Google Drive and are good to go!


In [3]:
# Methods and more
def add_hash(x,y):
  """Return ``x`` followed by "_" and the first five hex digits of SHA-1(``y``).

  Args:
    x: String prefix.
    y: String that is encoded and hashed.
  """
  short_digest = hashlib.sha1(y.encode()).hexdigest()[:5]
  return "_".join((x, short_digest))

# Tell whether a path is still unused.
def check(folder):
  """Return True when ``folder`` does not exist, False when it already exists."""
  return not os.path.exists(folder)

def set_basic_config():
  """Return the prediction settings that are shared by every job.

  The settings are plain local values; edit them here to change a run. The
  choices listed beside each value are the options it is meant to take.

  Returns:
    dict with the keys max_msa, use_templates, custom_template_path,
    num_relax, msa_mode, num_recycles, recycle_early_stop_tolerance,
    num_seeds, use_dropout, pair_mode, pairing_strategy, save_all,
    save_recycles, model_type and num_models.
  """
  # ---- Prediction settings ----
  num_relax = 0             # choices: 0, 1, 5
  template_mode = "none"    # choices: "none", "pdb100", "custom"
  num_seeds = 1             # choices: 1, 2, 4, 8, 16
  use_dropout = False
  num_models = 1

  # ---- Save settings ----
  save_all = False
  save_recycles = False

  # Templates are switched on for "pdb100" only. No custom template path is
  # ever supplied here, so "custom" behaves like "none".
  use_templates = template_mode == "pdb100"
  custom_template_path = None

  # ---- MSA settings ----
  msa_mode = "mmseqs2_uniref_env"   # choices: "mmseqs2_uniref_env", "mmseqs2_uniref", "single_sequence", "custom"
  pair_mode = "unpaired_paired"     # choices: "unpaired_paired", "paired", "unpaired"

  # ---- Advanced settings ----
  # "auto": alphafold2_ptm for monomers, alphafold2_multimer_v3 for complexes.
  # Any model type may be used regardless of whether the input is a monomer or a complex.
  model_type = "auto"       # choices: "auto", "alphafold2_ptm", "alphafold2_multimer_v1", "alphafold2_multimer_v2", "alphafold2_multimer_v3"

  # "auto": 20 recycles (tol=0.5) for alphafold2_multimer_v3, otherwise 3 recycles (tol=0.0).
  recycles_choice = "auto"  # choices: "auto", "0", "1", "3", "6", "12", "24", "48"
  tolerance_choice = "auto" # choices: "auto", "0.0", "0.5", "1.0"

  # "greedy" pairs any taxonomically matching subsets; "complete" requires
  # all sequences to match in one line.
  pairing_strategy = "greedy"  # choices: "greedy", "complete"

  max_msa_choice = "auto"   # choices: "auto", "512:1024", "256:512", "64:128", "32:64", "16:32"

  # "auto" is passed on as None; every other choice is converted to its real type.
  if recycles_choice == "auto":
    num_recycles = None
  else:
    num_recycles = int(recycles_choice)

  if tolerance_choice == "auto":
    recycle_early_stop_tolerance = None
  else:
    recycle_early_stop_tolerance = float(tolerance_choice)

  max_msa = None if max_msa_choice == "auto" else max_msa_choice

  return {
      "max_msa": max_msa,
      "use_templates": use_templates,
      "custom_template_path": custom_template_path,
      "num_relax": num_relax,
      "msa_mode": msa_mode,
      "num_recycles": num_recycles,
      "recycle_early_stop_tolerance": recycle_early_stop_tolerance,
      "num_seeds": num_seeds,
      "use_dropout": use_dropout,
      "pair_mode": pair_mode,
      "pairing_strategy": pairing_strategy,
      "save_all": save_all,
      "save_recycles": save_recycles,
      "model_type": model_type,
      "num_models": num_models,
  }

In [4]:
def get_config(fasta_file_path: str, results_dir: str) -> dict:
  """Build the full job configuration for one FASTA file.

  Args:
    fasta_file_path: Path of the FASTA file; handed to ``get_queries``.
    results_dir: Root directory for results. The job writes into a
      sub-directory named after the FASTA file.

  Returns:
    dict holding ``queries``, ``is_complex`` and ``result_dir`` plus every
    entry returned by ``set_basic_config()``.
  """
  queries, is_complex = get_queries(fasta_file_path)

  # Remove only the final extension. Cutting at the first dot would map
  # "a.v1.fasta" and "a.v2.fasta" to the same job name and result directory.
  jobname = os.path.splitext(os.path.basename(fasta_file_path))[0]
  result_dir = os.path.join(results_dir, jobname)

  config = dict(
      queries = queries,
      is_complex = is_complex,
      result_dir = result_dir
      )
  config.update(set_basic_config())
  return config

In [5]:

def run_job(config):
  """Run one prediction described by ``config`` and return what ``run`` returns.

  Args:
    config: Mapping with the keys ``result_dir``, ``queries``, ``is_complex``,
      ``model_type``, ``max_msa``, ``use_templates``, ``custom_template_path``,
      ``num_relax``, ``msa_mode``, ``num_models``, ``num_recycles``,
      ``recycle_early_stop_tolerance``, ``num_seeds``, ``use_dropout``,
      ``pair_mode``, ``pairing_strategy``, ``save_all`` and ``save_recycles``.

  Returns:
    The value returned by ``run(...)``. It used to be discarded; callers that
    ignore the return value are unaffected.
  """
  result_dir = config["result_dir"]

  # Resolve the configured model type against whether the input is a complex.
  model_type = set_model_type(config["is_complex"], config["model_type"])

  # The cluster profile is switched off only for a multimer model combined
  # with an explicit max_msa setting.
  use_cluster_profile = not ("multimer" in model_type and config["max_msa"] is not None)

  # Weights are stored in, and later read from, the current working directory.
  download_alphafold_params(model_type, Path("."))

  return run(
      queries=config["queries"],
      result_dir=result_dir,
      use_templates=config["use_templates"],
      custom_template_path=config["custom_template_path"],
      num_relax=config["num_relax"],
      msa_mode=config["msa_mode"],
      model_type=model_type,
      num_models=config["num_models"],
      num_recycles=config["num_recycles"],
      recycle_early_stop_tolerance=config["recycle_early_stop_tolerance"],
      num_seeds=config["num_seeds"],
      use_dropout=config["use_dropout"],
      model_order=[1,2,3,4,5],
      is_complex=config["is_complex"],
      data_dir=Path("."),
      keep_existing_results=False,
      rank_by="auto",
      pair_mode=config["pair_mode"],
      pairing_strategy=config["pairing_strategy"],
      stop_at_score=float(100),
      zip_results=False,
      save_all=config["save_all"],
      max_msa=config["max_msa"],
      use_cluster_profile=use_cluster_profile,
      save_recycles=config["save_recycles"],
  )

In [None]:
from typing import List

from glob import glob
from tqdm.auto import tqdm
import multiprocessing

# Google Drive folder (below the /content/drive mount point) that is searched
# for the input "*.fasta" files.
sequences_dir = "/content/drive/MyDrive/Colabfold/sequences"
# Google Drive folder used as the root for results; each job gets its own
# sub-directory inside it.
results_dir = "/content/drive/MyDrive/Colabfold/results"

def process_prediction(fasta_file, results_dir):
  """Announce, configure and run the prediction for a single FASTA file.

  Args:
    fasta_file: Path of the FASTA file to predict.
    results_dir: Root directory under which the job's results are written.
  """
  fasta_name = os.path.basename(fasta_file)
  print(f"COMPUTING RESULTS FOR {fasta_name}")
  run_job(get_config(fasta_file, results_dir=results_dir))

workers_limit = 2  # Maximum number of jobs running in parallel

# One (fasta_file, results_dir) argument pair per FASTA file in sequences_dir.
prediction_args = [(fasta_path, results_dir) for fasta_path in glob(sequences_dir+"/*.fasta")]

# NOTE(review): every worker ends up in run_job, which downloads model weights
# into the same working directory. Confirm concurrent workers cannot clash there.
with multiprocessing.Pool(processes=workers_limit) as pool:
    pool.starmap(process_prediction, prediction_args)



COMPUTING RESULTS FOR euskadiensis_alpha16mannotransfeoch1like.fasta


Downloading alphafold2 weights to .:  36%|███▌      | 1.25G/3.47G [00:34<00:58, 41.1MB/s]