In [2]:
import os
import re
import json
import numpy as np

def get_protein_token_probs(json_path, protein_chain='A', ligand_chain='B', threshold=0.5):
    """
    Find protein tokens whose strongest predicted contact with the ligand
    reaches a cutoff.

    The JSON file at json_path must provide two keys:
      * 'contact_probs'   - matrix indexed as [token, token]
      * 'token_chain_ids' - one chain ID per token, in token order

    Tokens whose chain ID equals protein_chain are scored against every token
    whose chain ID equals ligand_chain. A token's score is the largest contact
    probability in that set, floored at 0.0 (so NaN entries are skipped, and a
    file with no ligand tokens scores every protein token as 0.0).

    Args:
        json_path      (str)  : path to fold_alphafold_XXXX_Y_full_data_N.json
        protein_chain  (str)  : chain ID for protein (e.g. 'A')
        ligand_chain   (str)  : chain ID for ligand (e.g. 'B')
        threshold      (float): keep tokens with max_contact_prob >= threshold

    Returns:
        List[dict]: one entry per retained protein token, in token order:
          {
            "protein_token": <int>,      # index into token_chain_ids
            "max_contact_prob": <float>
          }

    Raises:
        KeyError: if either required key is missing from the JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    contact_probs = np.asarray(data['contact_probs'], dtype=float)
    token_chain_ids = data['token_chain_ids']

    # Token indices for each chain
    protein_tokens = [i for i, c in enumerate(token_chain_ids) if c == protein_chain]
    ligand_tokens = [i for i, c in enumerate(token_chain_ids) if c == ligand_chain]

    if not protein_tokens:
        return []

    if ligand_tokens:
        # Rows = protein tokens, columns = ligand tokens.
        pair_probs = contact_probs[protein_tokens][:, ligand_tokens]
        # fmax ignores NaN and initial=0.0 supplies the floor, so this is the
        # row-wise equivalent of "start at 0.0, keep anything strictly larger".
        max_probs = np.fmax.reduce(pair_probs, axis=1, initial=0.0)
    else:
        # Nothing to compare against: every protein token scores the floor.
        max_probs = np.zeros(len(protein_tokens))

    return [
        {"protein_token": p_tkn, "max_contact_prob": float(max_prob)}
        for p_tkn, max_prob in zip(protein_tokens, max_probs)
        if max_prob >= threshold
    ]


def process_alphafold_dir(root_dir,
                          protein_chain='A',
                          ligand_chain='B',
                          threshold=0.5,
                          output_json='alphafold_contacts.json'):
    """
    Recursively scans 'root_dir', finds JSON files matching the pattern:
       fold_alphafold_XXXX_Y_full_data_N.json
    For each file, calls get_protein_token_probs and stores its result in a
    dictionary keyed by "[XXXX-Y, model N]". The whole dictionary is written
    to output_json once, after the scan, and a confirmation line is printed.

    Directories and files are visited in sorted order, so the key order of
    the output is reproducible across runs and filesystems. If two matching
    files in different subfolders yield the same label, the one visited later
    overwrites the earlier one.

    Args:
        root_dir      (str)  : top-level directory containing subfolders
        protein_chain (str)  : chain ID for the protein (default 'A')
        ligand_chain  (str)  : chain ID for the ligand (default 'B')
        threshold     (float): include only tokens with max_contact_prob >= threshold
        output_json   (str)  : filename for the final JSON output

    Returns:
        None
    """
    # Regex to match filenames like: fold_alphafold_6sqz_d_full_data_0.json
    pattern = re.compile(r"^fold_alphafold_([0-9a-zA-Z]{4})_([A-Za-z0-9])_full_data_(\d+)\.json$")

    all_results = {}  # { "[6sqz-d, model 0]": [...per-token results...], ... }

    for subdir, dirnames, files in os.walk(root_dir):
        # os.walk yields entries in arbitrary order; sorting dirnames in place
        # makes the (top-down) descent order deterministic as well.
        dirnames.sort()
        for filename in sorted(files):
            match = pattern.match(filename)
            if not match:
                continue

            # e.g. ('6sqz', 'd', '0')
            pdb_code, chain_char, model_idx = match.groups()
            label = f"[{pdb_code}-{chain_char}, model {model_idx}]"

            all_results[label] = get_protein_token_probs(
                os.path.join(subdir, filename),
                protein_chain=protein_chain,
                ligand_chain=ligand_chain,
                threshold=threshold
            )

    # Write all results to a single JSON file
    with open(output_json, 'w') as out_f:
        json.dump(all_results, out_f, indent=2)

    print(f"Done! Results saved to {output_json}")


if __name__ == "__main__":
    # Driver for the JSON export. Runs when this code is executed directly,
    # whether as a notebook cell or saved as a standalone script.
    #
    # Things to adjust before running:
    #   - 'root_dir': path to your actual Alphafold directory
    #   - 'protein_chain', 'ligand_chain', 'threshold': optional tweaks

    root_dir = "Alphafold"  # top-level folder containing subfolders
    process_alphafold_dir(
        root_dir,
        protein_chain='A', ligand_chain='B',
        threshold=0.5,
        output_json='alphafold_contacts.json',
    )


Done! Results saved to alphafold_contacts.json


In [3]:
import os
import re
import json
import numpy as np
import csv

def get_protein_token_probs(json_path, protein_chain='A', ligand_chain='B', threshold=0.5):
    """
    Find protein tokens whose strongest predicted contact with the ligand
    reaches a cutoff, returned as tuples.

    NOTE: an earlier cell of this notebook defines a function with the same
    name that returns dicts; running this cell rebinds the name to this
    tuple-returning version.

    The JSON file at json_path must provide two keys:
      * 'contact_probs'   - matrix indexed as [token, token]
      * 'token_chain_ids' - one chain ID per token, in token order

    Tokens whose chain ID equals protein_chain are scored against every token
    whose chain ID equals ligand_chain. A token's score is the largest contact
    probability in that set, floored at 0.0 (so NaN entries are skipped, and a
    file with no ligand tokens scores every protein token as 0.0).

    Args:
        json_path      (str)  : path to the AlphaFold full_data JSON file
        protein_chain  (str)  : chain ID for protein (e.g. 'A')
        ligand_chain   (str)  : chain ID for ligand (e.g. 'B')
        threshold      (float): keep tokens with max_contact_prob >= threshold

    Returns:
        List[Tuple[int, float]]: (protein_token, max_contact_prob) pairs for
        the retained tokens, in token order.

    Raises:
        KeyError: if either required key is missing from the JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    contact_probs = np.asarray(data['contact_probs'], dtype=float)
    token_chain_ids = data['token_chain_ids']

    # Token indices for each chain
    protein_tokens = [i for i, c in enumerate(token_chain_ids) if c == protein_chain]
    ligand_tokens = [i for i, c in enumerate(token_chain_ids) if c == ligand_chain]

    if not protein_tokens:
        return []

    if ligand_tokens:
        # Rows = protein tokens, columns = ligand tokens.
        pair_probs = contact_probs[protein_tokens][:, ligand_tokens]
        # fmax ignores NaN and initial=0.0 supplies the floor, so this is the
        # row-wise equivalent of "start at 0.0, keep anything strictly larger".
        max_probs = np.fmax.reduce(pair_probs, axis=1, initial=0.0)
    else:
        # Nothing to compare against: every protein token scores the floor.
        max_probs = np.zeros(len(protein_tokens))

    return [
        (p_tkn, float(max_prob))
        for p_tkn, max_prob in zip(protein_tokens, max_probs)
        if max_prob >= threshold
    ]


def process_alphafold_dir_to_csv(root_dir,
                                 protein_chain='A',
                                 ligand_chain='B',
                                 threshold=0.5,
                                 output_csv='alphafold_contacts.csv'):
    """
    Recursively scans 'root_dir', finds JSON files matching:
       fold_alphafold_XXXX_Y_full_data_N.json
    For each file, calls get_protein_token_probs (expected here to return
    (protein_token, max_contact_prob) pairs) and writes one CSV row per pair
    with columns:
      pdb_code, chain_label, model_idx, protein_token, max_contact_prob

    Directories and files are visited in sorted order, so the row order of
    the CSV is reproducible across runs and filesystems. The file is written
    once, after the scan, and a summary line with the row count is printed.

    Args:
        root_dir      (str)  : top-level directory containing subfolders
        protein_chain (str)  : chain ID for the protein (default 'A')
        ligand_chain  (str)  : chain ID for the ligand (default 'B')
        threshold     (float): include only tokens >= this max probability
        output_csv    (str)  : filename for the final CSV output

    Returns:
        None
    """
    # Matches filenames like: fold_alphafold_6sqz_d_full_data_0.json
    pattern = re.compile(r"^fold_alphafold_([0-9a-zA-Z]{4})_([A-Za-z0-9])_full_data_(\d+)\.json$")

    # Rows are accumulated in memory so the final count can be reported.
    csv_rows = []

    for subdir, dirnames, files in os.walk(root_dir):
        # os.walk yields entries in arbitrary order; sorting dirnames in place
        # makes the (top-down) descent order deterministic as well.
        dirnames.sort()
        for filename in sorted(files):
            match = pattern.match(filename)
            if not match:
                continue

            # e.g. ('6sqz', 'd', '0')
            pdb_code, chain_char, model_idx = match.groups()

            token_probs = get_protein_token_probs(
                os.path.join(subdir, filename),
                protein_chain=protein_chain,
                ligand_chain=ligand_chain,
                threshold=threshold
            )

            # One row for each protein token that passed the threshold
            csv_rows.extend(
                (pdb_code, chain_char, model_idx, p_token, max_prob)
                for (p_token, max_prob) in token_probs
            )

    # newline='' lets the csv module control line endings itself
    with open(output_csv, 'w', newline='') as out_f:
        writer = csv.writer(out_f)
        writer.writerow(["pdb_code", "chain_label", "model_idx", "protein_token", "max_contact_prob"])
        writer.writerows(csv_rows)

    print(f"Done! Wrote {len(csv_rows)} rows to {output_csv}")


if __name__ == "__main__":
    # Driver for the CSV export. Runs when this code is executed directly,
    # whether as a notebook cell or saved as a standalone script.
    # Adjust the arguments below as needed before running.

    root_dir = "Alphafold"  # top-level directory with subfolders
    process_alphafold_dir_to_csv(
        root_dir,
        protein_chain='A', ligand_chain='B',
        threshold=0.5,
        output_csv='alphafold_contacts.csv',
    )


Done! Wrote 9430 rows to alphafold_contacts.csv
