In [None]:
# Import necessary libraries
import glob
import json
import os

import pandas as pd

# --- User configuration -----------------------------------------------------
# Folder containing the JSON files to process; replace with the actual path.
json_folder = "path/to/your/json/folder"
# Base name (without extension) of the final CSV file; replace as needed.
csv_name = "protease_results"
# The CSV is written into the JSON folder itself, with a ".csv" extension.
csv_file_path = os.path.join(json_folder, f"{csv_name}.csv")


def extract_values(json_file):
    """Read pTM, ipTM and the mean pLDDT from one JSON score file.

    Args:
        json_file: Path to a JSON file whose top-level object may contain
            the keys "ptm", "iptm" and "plddt" (a list of numbers).

    Returns:
        A dict with the keys "ptm", "iptm" and "avg_plddt". A score that is
        missing from the file, or stored as JSON null, is reported as 0.
        "avg_plddt" is the arithmetic mean of the "plddt" list, or 0 when
        that list is missing, null or empty.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(json_file, encoding="utf-8") as file:
        data = json.load(file)

    # dict.get(key, 0) only falls back when the key is absent, so an explicit
    # null would come through as None and break arithmetic on the scores.
    scores = {}
    for key in ("ptm", "iptm"):
        value = data.get(key)
        scores[key] = 0 if value is None else value

    # Guard against a missing/null/empty list to avoid dividing by zero.
    plddt_scores = data.get("plddt") or []
    scores["avg_plddt"] = sum(plddt_scores) / len(plddt_scores) if plddt_scores else 0

    return scores


# Collect every JSON file in the folder. glob returns paths in arbitrary,
# filesystem-dependent order, so sort them to make the CSV reproducible.
json_files = sorted(glob.glob(os.path.join(json_folder, "*.json")))

# Column order of the output CSV.
result_columns = ["Protein_1", "Protein_2", "pTM", "ipTM", "Average_pLDDT", "Ranking_confidence"]

# Markers used to parse file names of the form
# "<protein_1>_vs_<protein_2>_scores<anything>.json".
split_1 = "_scores"  # everything from this marker onwards is discarded
split_2 = "_vs_"  # separates the two protein names

# Gather one plain dict per file and build the DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all previous
# rows on every iteration.
rows = []
for json_file_path in json_files:
    file_name = os.path.splitext(os.path.basename(json_file_path))[0]
    file_name_short = file_name.split(split_1)[0]
    protein_names = file_name_short.split(split_2)

    # Files whose name does not contain the "_vs_" separator are skipped.
    if len(protein_names) < 2:
        continue

    # Only the first two "_vs_"-separated parts are used; trailing
    # underscores are stripped from the second protein name.
    protein_2 = protein_names[1].rstrip("_")

    values = extract_values(json_file_path)
    rows.append(
        {
            "Protein_1": protein_names[0],
            "Protein_2": protein_2,
            "pTM": values["ptm"],
            "ipTM": values["iptm"],
            "Average_pLDDT": values["avg_plddt"],
            # Weighted combination of the two scores: 0.2 * pTM + 0.8 * ipTM.
            "Ranking_confidence": 0.2 * values["ptm"] + 0.8 * values["iptm"],
        }
    )

# Passing `columns` keeps the header (and its order) even when no file matched.
result_data = pd.DataFrame(rows, columns=result_columns)

# Saving the results
result_data.to_csv(csv_file_path, index=False)