In [2]:
import os
import tarfile
import json
import csv
from pathlib import Path

# Setup the working directory where jobs are stored
base_dir = Path("/mnt/home/bulat/apps/af3")  # Adjust if needed
archive_suffix = "_output.tar.gz"
output_summary = []

# Identify job output archives
archives = list(base_dir.glob(f"job_*{archive_suffix}"))

# Temporary extraction path
extraction_dir = base_dir / "extracted_temp"
extraction_dir.mkdir(exist_ok=True)

# Process each archive
for archive_path in archives:
    job_id = archive_path.stem.split("_")[1]
    extract_path = extraction_dir / f"job_{job_id}"
    extract_path.mkdir(exist_ok=True)

    # Extract archive
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall(path=extract_path)

    # Search for summary_confidences.json and ligands
    for root, _, files in os.walk(extract_path):
        root_path = Path(root)
        for file in files:
            if file.endswith("_summary_confidences.json"):
                job_name = root_path.parts[-2] if len(root_path.parts) > 1 else "unknown"
                summary_path = root_path / file
                data_path = summary_path.with_name(summary_path.name.replace("summary_confidences.json", "data.json"))

                # Extract average confidence
                with open(summary_path) as f:
                    conf_data = json.load(f)
                avg_conf = conf_data.get("mean_plddt", "NA")

                # Extract ligand info if available
                ligands = []
                if data_path.exists():
                    with open(data_path) as f:
                        model_input = json.load(f)
                    for entity in model_input.get("sequences", []):
                        if "ligand" in entity:
                            ligands.append(entity["ligand"].get("ccdCodes", entity["ligand"].get("smiles", [""]))[0])

                output_summary.append({
                    "Job ID": job_id,
                    "Model Folder": str(root_path),
                    "Avg pLDDT": avg_conf,
                    "Ligands": ", ".join(ligands) if ligands else "None"
                })

import pandas as pd

df = pd.DataFrame(output_summary)
print(df)
# Or write to a CSV:
df.to_csv("alphafold_summary.csv", index=False)


     Job ID                                       Model Folder Avg pLDDT  \
0  24715679  /mnt/home/bulat/apps/af3/extracted_temp/job_24...        NA   
1  24715679  /mnt/home/bulat/apps/af3/extracted_temp/job_24...        NA   

  Ligands  
0  HEM, N  
1  HEM, N  


In [3]:
import json
import pandas as pd
from pathlib import Path
import tarfile

# --- Configuration ---
base_dir = Path("/mnt/home/bulat/apps/af3")
archives = list(base_dir.glob("job_*_output.tar.gz"))
extraction_dir = base_dir / "extracted_temp"
extraction_dir.mkdir(exist_ok=True)
summary_data = []

for archive in archives:
    job_id = archive.stem.split("_")[1]
    job_extract_path = extraction_dir / f"job_{job_id}"
    job_extract_path.mkdir(exist_ok=True)

    with tarfile.open(archive, "r:gz") as tar:
        tar.extractall(path=job_extract_path)

    for summary_path in job_extract_path.rglob("*_summary_confidences.json"):
        job_folder = summary_path.parent
        data_path = job_folder / summary_path.name.replace("summary_confidences.json", "data.json")

        try:
            with open(summary_path) as f:
                summary = json.load(f)

            with open(data_path) as f:
                job_input = json.load(f)

            avg_plddt = summary.get("ptm", "NA")
            iptm = summary.get("iptm", "NA")
            rank_score = summary.get("ranking_score", "NA")
            disordered = summary.get("fraction_disordered", "NA")

            ligands = []
            for seq in job_input.get("sequences", []):
                if "ligand" in seq:
                    ligands.append(seq["ligand"].get("ccdCodes", seq["ligand"].get("smiles", [""]))[0])

            summary_data.append({
                "Job ID": job_id,
                "Folder": str(job_folder),
                "Avg pLDDT": avg_plddt,
                "ipTM": iptm,
                "Ranking Score": rank_score,
                "Disordered Fraction": disordered,
                "Ligands": ", ".join(ligands) if ligands else "None"
            })

        except Exception as e:
            print(f"❌ Error processing {summary_path}: {e}")

# --- Save Summary ---
df = pd.DataFrame(summary_data)
df.to_csv(base_dir / "af3_summary.csv", index=False)
print("✅ Summary saved to af3_summary.csv")


✅ Summary saved to af3_summary.csv


In [7]:
import os
import json
import csv
from pathlib import Path
from statistics import mean

base_dir = Path("/mnt/home/bulat/apps/af3/extracted_output/scratch/bulat/24715679")
base_out = Path("/mnt/home/bulat/apps/af3")
summary_rows = []

# Traverse all folders like seed-1_sample-*
for model_dir in base_dir.glob("**/seed-*_sample-*"):
    model_name = model_dir.parent.name
    sample_id = model_dir.name

    # Parse confidences
    conf_path = model_dir / "confidences.json"
    avg_plddt = "NA"
    if conf_path.exists():
        with open(conf_path) as f:
            conf_data = json.load(f)
        if "atom_plddts" in conf_data:
            avg_plddt = round(mean(conf_data["atom_plddts"]), 2)

    # Parse ranking_score
    summary_path = model_dir / "summary_confidences.json"
    ranking_score = "NA"
    if summary_path.exists():
        with open(summary_path) as f:
            summary_data = json.load(f)
        ranking_score = summary_data.get("ranking_score", "NA")

    # Ligand info from top-level *_data.json
    data_json_path = model_dir.parent / f"{model_name}_data.json"
    ligands = []
    if data_json_path.exists():
        with open(data_json_path) as f:
            data_json = json.load(f)
        for entry in data_json.get("sequences", []):
            if "ligand" in entry:
                ligand = entry["ligand"]
                ligands.append(ligand.get("ccdCodes", ligand.get("smiles", [""]))[0])

    summary_rows.append({
        "Model Name": model_name,
        "Sample ID": sample_id,
        "Avg pLDDT": avg_plddt,
        "Ranking Score": ranking_score,
        "Ligands": ", ".join(ligands) if ligands else "None"
    })

# Write to CSV
csv_path = base_out / "per_structure_summary.csv"
with open(csv_path, "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=summary_rows[0].keys())
    writer.writeheader()
    writer.writerows(summary_rows)

print(f"✅ Summary written to: {csv_path}")

✅ Summary written to: /mnt/home/bulat/apps/af3/per_structure_summary.csv
