In [6]:
import os
import csv

def load_alltokens_predictions(predictions_csv):
    """
    Reads the 'unfiltered' CSV with columns:
      pdb_code, chain_label, model_idx, protein_token, max_contact_prob, predicted_bind
    Returns a nested dict:
      preds[(pdb_code, chain_label, model_idx)][token] = probability (float)
    """
    preds_dict = {}
    with open(predictions_csv, 'r', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            pdb_code      = row['pdb_code']
            chain_label   = row['chain_label']
            model_idx     = row['model_idx']
            token_str     = row['protein_token']
            prob_str      = row['max_contact_prob']

            token = int(token_str)
            prob  = float(prob_str)

            key = (pdb_code, chain_label, model_idx)
            if key not in preds_dict:
                preds_dict[key] = {}
            preds_dict[key][token] = prob
    return preds_dict

def load_experimental_csv(exp_csv, token_field='NO', bind_field='ATP Binding Site'):
    """
    Reads an experimental file, e.g. 'XXXX_aligned.csv', with columns like:
      NO, ATP Binding Site, ...
    Where:
      NO -> integer token/residue ID that matches 'protein_token'
      ATP Binding Site -> 'Y' or 'N'
    
    Returns a dict: exp_dict[token] = 1/0
    """
    exp_dict = {}
    with open(exp_csv, 'r', newline='') as f:
        reader = csv.DictReader(f)
        for row in reader:
            if token_field not in row or bind_field not in row:
                continue  # or raise an error if columns missing
            token_str = row[token_field].strip()
            bind_str  = row[bind_field].strip().upper()
            if not token_str:
                continue
            token = int(token_str)
            y = 1 if bind_str == 'Y' else 0
            exp_dict[token] = y
    return exp_dict

def compute_brier_score(preds_by_token, exp_by_token):
    """
    Brier = (1/N)* sum( (p_i - y_i)^2 ), 
    p_i = predicted probability, y_i = 0 or 1
    We iterate only over tokens present in 'exp_by_token' (the ground truth).
    If a token is missing in preds_by_token, assume p=0.
    
    Returns brier_score or None if no tokens found in experimental data.
    """
    tokens = exp_by_token.keys()
    n = len(tokens)
    if n == 0:
        return None

    total = 0.0
    for t in tokens:
        y = exp_by_token[t]
        p = preds_by_token.get(t, 0.0)
        total += (p - y)**2
    return total / n

def main(
    predictions_csv='alphafold_alltokens.csv',
    experimental_dir='Experimental BS',
    output_csv='brier_scores.csv'
):
    """
    1) Load the unfiltered predictions from 'predictions_csv'.
    2) For each CSV in 'experimental_dir' that ends with '_aligned.csv',
       parse out pdb_code (and optionally chain_label) from filename.
    3) Load that experimental data, then find all (pdb_code, chain_label, model_idx)
       combos in the predictions that match. For each combo, compute Brier score.
    4) Write results to 'output_csv'.
    """

    # 1) Load predictions
    preds_dict = load_alltokens_predictions(predictions_csv)
    # preds_dict[(pdb_code, chain_label, model_idx)] = {token -> probability}

    results = []  # each row = [pdb_code, chain_label, model_idx, brier_score]
    
    # 2) We'll assume each experimental file is named like "3wdl_aligned.csv"
    #    or "3wdl_b_aligned.csv". Adjust logic if your naming is different.
    for fname in os.listdir(experimental_dir):
        if not fname.endswith("_aligned.csv"):
            continue
        
        path = os.path.join(experimental_dir, fname)
        
        # Example parse: "3wdl_b_aligned.csv" -> base = "3wdl_b"
        # If your file lacks a chain label, you might only see e.g. "3wdl_aligned.csv".
        base = fname.replace("_aligned.csv","")
        parts = base.split("_")
        
        # Minimal approach: if there's only 1 part, we assume there's no chain label
        if len(parts) == 1:
            pdb_code = parts[0]
            chain_label = ""  # or some placeholder
        else:
            pdb_code = parts[0]
            chain_label = parts[1]
        
        # 3) Load the experimental data
        exp_dict = load_experimental_csv(path)  # {token -> 0/1}
        
        # Find all (pdb_code, chain_label, model_idx) that match
        # (If your chain_label in predictions is different, adjust.)
        candidate_keys = []
        for (pc, ch, m_idx) in preds_dict.keys():
            # If your experimental filename doesn't have chain_label, maybe skip matching on chain
            # For example, only match on pc == pdb_code
            # Or if the chain is empty in the experimental file, do pc match + optional chain check
            # We'll do a simple example:
            if pc == pdb_code and ch == chain_label:
                candidate_keys.append((pc, ch, m_idx))
        
        # compute Brier for each
        for key in candidate_keys:
            pc, ch, m_idx = key
            preds_by_token = preds_dict[key]
            bscore = compute_brier_score(preds_by_token, exp_dict)
            if bscore is not None:
                results.append([pc, ch, m_idx, bscore])
    
    # 4) Write results
    with open(output_csv, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["pdb_code","chain_label","model_idx","brier_score"])
        for row in results:
            writer.writerow(row)
    
    print(f"Done! Wrote {len(results)} Brier scores to {output_csv}")

if __name__ == "__main__":
    # Example usage
    main(
        predictions_csv="alphafold_alltokens.csv",
        experimental_dir="Experimental BS",
        output_csv="brier_scores.csv"
    )


Done! Wrote 0 Brier scores to brier_scores.csv


In [7]:
import os
import re
import json
import csv
import numpy as np

def get_protein_token_probs_no_filter(json_path, protein_chain='A', ligand_chain='B'):
    """
    Reads an AlphaFold full_data JSON, identifies tokens belonging to protein_chain vs. ligand_chain,
    then for EACH protein token (with no probability threshold) computes the max contact probability
    to ANY ligand token. Returns a list of dicts, each with:
      {
        "protein_token": <int>,
        "max_contact_prob": <float>
      }
    """
    with open(json_path, 'r') as f:
        data = json.load(f)

    contact_probs = np.array(data['contact_probs'])  # shape [num_tokens, num_tokens]
    token_chain_ids = data['token_chain_ids']        # length = num_tokens

    protein_tokens = [i for i, c in enumerate(token_chain_ids) if c == protein_chain]
    ligand_tokens  = [i for i, c in enumerate(token_chain_ids) if c == ligand_chain]
    
    results = []
    for p_tkn in protein_tokens:
        # Find the maximum contact probability to any ligand token
        if len(ligand_tokens) == 0:
            # If there's no ligand tokens at all, max_prob = 0
            max_prob = 0.0
        else:
            max_prob = 0.0
            for l_tkn in ligand_tokens:
                prob = contact_probs[p_tkn, l_tkn]
                if prob > max_prob:
                    max_prob = prob
        
        # We ALWAYS include the token, with no threshold filter
        results.append({
            "protein_token": p_tkn,
            "max_contact_prob": float(max_prob)
        })
    
    # Sort by token index so they appear "in order"
    results.sort(key=lambda x: x["protein_token"])
    return results

def process_alphafold_dir_no_filter(
    root_dir,
    protein_chain='A',
    ligand_chain='B',
    threshold=0.5,
    output_csv='alphafold_alltokens.csv'
):
    """
    Recursively scans 'root_dir', finds JSON files matching pattern:
      fold_alphafold_XXXX_Y_full_data_N.json
    For each file:
      - extracts per-protein-token max ligand contact probability (no filtering),
      - adds a Y/N label (predicted_bind) if >= threshold,
      - writes rows to a master CSV file with columns:
          pdb_code, chain_label, model_idx, protein_token, max_contact_prob, predicted_bind
    
    Args:
        root_dir      (str)  : top-level directory with subfolders
        protein_chain (str)  : chain ID for protein (default 'A')
        ligand_chain  (str)  : chain ID for ligand (default 'B')
        threshold     (float): if max_contact_prob >= threshold → predicted_bind=Y, else N
        output_csv    (str)  : the final CSV filename
    """
    pattern = re.compile(r"^fold_alphafold_([0-9a-zA-Z]{4})_([A-Za-z0-9])_full_data_(\d+)\.json$")
    
    # We'll append all rows to a list, then write to CSV at the end.
    all_rows = []
    
    for subdir, _, files in os.walk(root_dir):
        for filename in files:
            match = pattern.match(filename)
            if match:
                pdb_code   = match.group(1)  # e.g. '6sqz'
                chain_char = match.group(2)  # e.g. 'd'
                model_idx  = match.group(3)  # e.g. '0'
                
                json_path = os.path.join(subdir, filename)
                
                token_info = get_protein_token_probs_no_filter(
                    json_path, 
                    protein_chain=protein_chain,
                    ligand_chain=ligand_chain
                )
                
                # Build rows
                for item in token_info:
                    p_tkn = item["protein_token"]
                    max_prob = item["max_contact_prob"]
                    predicted_bind = "Y" if max_prob >= threshold else "N"
                    
                    row = [
                        pdb_code,        # e.g. '6sqz'
                        chain_char,      # e.g. 'd'
                        model_idx,       # e.g. '0'
                        p_tkn,           # token index
                        max_prob,        # float
                        predicted_bind   # 'Y' or 'N'
                    ]
                    all_rows.append(row)
    
    # Write to CSV
    with open(output_csv, 'w', newline='') as out_f:
        writer = csv.writer(out_f)
        # header
        writer.writerow([
            "pdb_code",
            "chain_label",
            "model_idx",
            "protein_token",
            "max_contact_prob",
            "predicted_bind"
        ])
        # rows
        for row in all_rows:
            writer.writerow(row)
    
    print(f"Done! Wrote {len(all_rows)} rows to {output_csv}")

if __name__ == "__main__":
    # Example usage:
    process_alphafold_dir_no_filter(
        root_dir="Alphafold",
        protein_chain='A',
        ligand_chain='B',
        threshold=0.5,  # <--- set your desired threshold
        output_csv='alphafold_alltokens_0.5.csv'
    )


Done! Wrote 173470 rows to alphafold_alltokens_0.5.csv


In [13]:
import os
import csv

def load_alltokens_predictions(predictions_csv):
    """
    Reads an unfiltered CSV with columns:
      pdb_code, chain_label, model_idx, protein_token, max_contact_prob, predicted_bind
    Returns a nested dict:
      preds[(pdb_code, chain_label, model_idx)][token] = probability
    """
    preds_dict = {}

    print(f"Loading predictions from: {predictions_csv}")
    with open(predictions_csv, 'r', newline='') as f:
        reader = csv.DictReader(f)
        print("Columns found in predictions CSV:", reader.fieldnames)
        
        row_count = 0
        for row in reader:
            row_count += 1
            pdb_code    = row['pdb_code']
            chain_label = row['chain_label']
            model_idx   = row['model_idx']
            token_str   = row['protein_token']
            prob_str    = row['max_contact_prob']

            # Convert to correct types
            token = int(token_str)
            prob  = float(prob_str)

            key = (pdb_code, chain_label, model_idx)
            if key not in preds_dict:
                preds_dict[key] = {}
            preds_dict[key][token] = prob
        
        print(f"Total rows loaded in predictions: {row_count}")
    return preds_dict

def load_experimental_csv(exp_csv, token_field='NO', bind_field='ATP Binding Site'):
    """
    Reads an experimental file, e.g. 'XXXX_aligned.csv', with columns like:
      NO, ATP Binding Site
    Where NO -> integer token ID, ATP Binding Site -> 'Y' or 'N'.
    Returns a dict: exp_dict[token] = 1/0.
    """
    exp_dict = {}

    print(f"Loading experimental data from: {exp_csv}")
    with open(exp_csv, 'r', newline='') as f:
        reader = csv.DictReader(f)
        print("Columns found in experimental CSV:", reader.fieldnames)
        
        row_count = 0
        for row in reader:
            row_count += 1
            # Check columns
            if token_field not in row or bind_field not in row:
                print(f"  Missing expected columns {token_field}, {bind_field} in row -> skipping.")
                continue

            token_str = row[token_field].strip()
            bind_str  = row[bind_field].strip().upper()
            if not token_str:
                continue

            token = int(token_str)
            y = 1 if bind_str == 'Y' else 0
            exp_dict[token] = y
        
        print(f"Total rows loaded in experimental file: {row_count}")
        print(f"Number of unique tokens in experimental: {len(exp_dict)}")
    return exp_dict

def compute_brier_score(preds_by_token, exp_by_token):
    """
    Brier = (1/N)* sum( (p_i - y_i)^2 ), 
    p_i = predicted probability, y_i = 0 or 1
    We iterate only over tokens present in 'exp_by_token'.
    If a token is missing in preds_by_token, assume p=0.
    """
    tokens = exp_by_token.keys()
    n = len(tokens)
    if n == 0:
        print("  No experimental tokens found -> returning None")
        return None

    total = 0.0
    missing_count = 0
    for t in tokens:
        y = exp_by_token[t]
        p = preds_by_token.get(t, 0.0)
        if t not in preds_by_token:
            missing_count += 1
        total += (p - y)**2

    brier = total / n
    print(f"  Brier score computed on {n} tokens (missing {missing_count} from predictions). Brier = {brier:.4f}")
    return brier

def main(
    predictions_csv='alphafold_alltoken_debug.csv',
    experimental_dir='Experimental BS',
    output_csv='brier_scores.csv'
):
    """
    1) Load the unfiltered predictions from 'predictions_csv'.
    2) For each CSV in 'experimental_dir' that ends with '_aligned.csv',
       parse out pdb_code (ignore chain label, if any).
    3) Load that experimental data, then find all predictions that have matching pdb_code
       (any chain_label, any model_idx).
    4) Compute Brier score for each matching key, write to 'output_csv'.
    """
    print("=== Step 1: Load all predictions ===")
    preds_dict = load_alltokens_predictions(predictions_csv)
    print(f"Found {len(preds_dict)} unique (pdb_code, chain_label, model_idx) keys in predictions.")

    results = []  # each row = [pdb_code, chain_label, model_idx, brier_score]
    
    print("\n=== Step 2: Checking experimental files in directory:", experimental_dir, "===")
    if not os.path.isdir(experimental_dir):
        print(f"ERROR: Directory '{experimental_dir}' does not exist.")
        return

    exp_files = [f for f in os.listdir(experimental_dir) if f.endswith("_aligned.csv")]
    print(f"Found {len(exp_files)} experimental CSVs matching '_aligned.csv'")

    for fname in exp_files:
        path = os.path.join(experimental_dir, fname)
        base = fname.replace("_aligned.csv","")

        # We'll treat everything before "_aligned" as the pdb_code, ignoring chain label
        # For instance, if you had "3wdl_b_aligned.csv", this becomes "3wdl_b".
        # But we only want "3wdl" as the code, ignoring "_b".
        #
        # If your naming is truly just "3wdl_aligned.csv" with no chain,
        # then base = "3wdl" => pdb_code="3wdl".
        #
        # If you do have chain in the name, you'll need to decide how to handle it.
        # We'll do a simple approach: we take only the *first* segment if there's an underscore.
        parts = base.split("_")
        pdb_code = parts[0]  # ignore anything after the first underscore
        print(f"\n--- Processing {fname} -> parsed pdb_code='{pdb_code}' (ignoring chain)")

        # 3) Load experimental data
        exp_dict = load_experimental_csv(path)  # {token -> 0/1}
        if len(exp_dict) == 0:
            print("  Experimental file has no valid data -> skipping.")
            continue

        # Find all relevant combos in preds_dict
        # We ignore the chain_label from the experimental side
        candidate_keys = []
        for (pc, ch, m_idx) in preds_dict.keys():
            if pc == pdb_code:
                candidate_keys.append((pc, ch, m_idx))
        
        print(f"  Found {len(candidate_keys)} candidate keys to compute Brier score.")
        for key in candidate_keys:
            pc, ch, m_idx = key
            print(f"  Computing Brier for key={key} ...")
            preds_by_token = preds_dict[key]
            bscore = compute_brier_score(preds_by_token, exp_dict)
            if bscore is not None:
                results.append([pc, ch, m_idx, bscore])

    print("\n=== Step 4: Writing results to:", output_csv, "===")
    with open(output_csv, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["pdb_code","chain_label","model_idx","brier_score"])
        for row in results:
            writer.writerow(row)
    print(f"Done! Wrote {len(results)} Brier scores to {output_csv}.")

if __name__ == "__main__":
    main(
        predictions_csv="alphafold_alltokens_debug.csv",
        experimental_dir="Experimental BS",
        output_csv="brier_scores.csv"
    )


=== Step 1: Load all predictions ===
Loading predictions from: alphafold_alltokens_debug.csv
Columns found in predictions CSV: ['pdb_code', 'chain_label', 'model_idx', 'protein_token', 'max_contact_prob', 'predicted_bind']
Total rows loaded in predictions: 173470
Found 445 unique (pdb_code, chain_label, model_idx) keys in predictions.

=== Step 2: Checking experimental files in directory: Experimental BS ===
Found 88 experimental CSVs matching '_aligned.csv'

--- Processing 3wdl_aligned.csv -> parsed pdb_code='3wdl' (ignoring chain)
Loading experimental data from: Experimental BS/3wdl_aligned.csv
Columns found in experimental CSV: ['Prot.ID', 'NO', 'Residue', 'ATP Binding Site']
Total rows loaded in experimental file: 261
Number of unique tokens in experimental: 261
  Found 5 candidate keys to compute Brier score.
  Computing Brier for key=('3wdl', 'b', '4') ...
  Brier score computed on 261 tokens (missing 1 from predictions). Brier = 0.0714
  Computing Brier for key=('3wdl', 'b', '3'

In [14]:
import os
import re
import json
import csv
import numpy as np

def get_protein_token_probs_no_filter(json_path, protein_chain='A', ligand_chain='B'):
    """
    Reads an AlphaFold full_data JSON, identifies tokens belonging to protein_chain vs. ligand_chain,
    then for EACH protein token (with no threshold) computes the max contact probability
    to ANY ligand token. Returns a list of dicts, each with:
      {
        "protein_token": <int>,
        "max_contact_prob": <float>
      }
    """
    with open(json_path, 'r') as f:
        data = json.load(f)

    contact_probs = np.array(data['contact_probs'])  # shape [num_tokens, num_tokens]
    token_chain_ids = data['token_chain_ids']        # length = num_tokens

    protein_tokens = [i for i, c in enumerate(token_chain_ids) if c == protein_chain]
    ligand_tokens  = [i for i, c in enumerate(token_chain_ids) if c == ligand_chain]
    
    results = []
    for p_tkn in protein_tokens:
        if len(ligand_tokens) == 0:
            max_prob = 0.0
        else:
            max_prob = 0.0
            for l_tkn in ligand_tokens:
                prob = contact_probs[p_tkn, l_tkn]
                if prob > max_prob:
                    max_prob = prob

        results.append({
            "protein_token": p_tkn,
            "max_contact_prob": float(max_prob)
        })
    
    results.sort(key=lambda x: x["protein_token"])
    return results

def process_alphafold_dir_no_filter(
    root_dir,
    protein_chain='A',
    ligand_chain='B',
    threshold=0.5,
    output_csv='alphafold_alltokens_debug.csv'
):
    """
    1) Recursively scans 'root_dir' for JSONs matching pattern:
         fold_alphafold_XXXX_Y_full_data_N.json
       Extracts per-protein-token max ligand contact probability (no filtering).
    2) Adds a Y/N label (predicted_bind) if >= threshold, else N.
    3) Writes rows to CSV.
    4) If '1j09' wasn't found at all in normal pass, we explicitly search for
       fold_alphafold_1j09_X_full_data_N.json files and do the same extraction.
    """
    # Normal pattern for everything except 1j09 (or including 1j09 if it matches):
    pattern = re.compile(r"^fold_alphafold_([0-9a-zA-Z]{4})_([A-Za-z0-9])_full_data_(\d+)\.json$")
    
    # We'll store all CSV rows here:
    all_rows = []
    
    # ------------------
    # Normal Extraction
    # ------------------
    for subdir, _, files in os.walk(root_dir):
        for filename in files:
            match = pattern.match(filename)
            if match:
                pdb_code   = match.group(1)  # e.g. '6sqz'
                chain_char = match.group(2)  # e.g. 'd'
                model_idx  = match.group(3)  # e.g. '0'
                
                json_path = os.path.join(subdir, filename)
                
                token_info = get_protein_token_probs_no_filter(
                    json_path, 
                    protein_chain=protein_chain,
                    ligand_chain=ligand_chain
                )
                
                for item in token_info:
                    p_tkn = item["protein_token"]
                    max_prob = item["max_contact_prob"]
                    predicted_bind = "Y" if max_prob >= threshold else "N"
                    
                    row = [
                        pdb_code,
                        chain_char,
                        model_idx,
                        p_tkn,
                        max_prob,
                        predicted_bind
                    ]
                    all_rows.append(row)

    # Check if 1j09 is already in the extracted rows
    has_1j09 = any(row[0].lower() == "1j09" for row in all_rows)
    
    if not has_1j09:
        print("NOTE: '1j09' not found via normal pattern. Attempting explicit extraction for 1j09.")
        
        # A special pattern just for 1j09, e.g.: fold_alphafold_1j09_A_full_data_0.json
        # We'll assume single-letter chain, numeric model index
        special_pattern = re.compile(r"^fold_alphafold_1j09_([A-Za-z0-9])_full_data_(\d+)\.json$")
        
        for subdir, _, files in os.walk(root_dir):
            for filename in files:
                match_1j09 = special_pattern.match(filename)
                if match_1j09:
                    chain_char = match_1j09.group(1)
                    model_idx  = match_1j09.group(2)
                    
                    json_path = os.path.join(subdir, filename)
                    
                    # Extract real data for 1j09
                    token_info = get_protein_token_probs_no_filter(
                        json_path,
                        protein_chain=protein_chain,
                        ligand_chain=ligand_chain
                    )
                    
                    for item in token_info:
                        p_tkn = item["protein_token"]
                        max_prob = item["max_contact_prob"]
                        predicted_bind = "Y" if max_prob >= threshold else "N"
                        
                        row = [
                            "1j09",
                            chain_char,
                            model_idx,
                            p_tkn,
                            max_prob,
                            predicted_bind
                        ]
                        all_rows.append(row)

    # Finally, write everything out
    with open(output_csv, 'w', newline='') as out_f:
        writer = csv.writer(out_f)
        writer.writerow([
            "pdb_code",
            "chain_label",
            "model_idx",
            "protein_token",
            "max_contact_prob",
            "predicted_bind"
        ])
        writer.writerows(all_rows)
    
    print(f"Done! Wrote {len(all_rows)} rows to {output_csv}")

if __name__ == "__main__":
    process_alphafold_dir_no_filter(
        root_dir="Alphafold",
        protein_chain='A',
        ligand_chain='B',
        threshold=0.5,
        output_csv='alphafold_alltokens_debug.csv'
    )


Done! Wrote 173470 rows to alphafold_alltokens_debug.csv
