In [3]:
import os
import papermill as pm
import glob
import logging

# Define the base directory
base_dir = os.getcwd()

# Create papermill and output directories
papermill_dir = os.path.join(base_dir, "papermill")
output_dir = os.path.join(papermill_dir, "extrema_1pt0LD_torsion_notebooks")
os.makedirs(output_dir, exist_ok=True)

# Set up logging
log_file = os.path.join(papermill_dir, "extrema_1pt0LD_torsion_runner.log")
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)

# List of proteins to skip
SKIP_PROTEINS = ["AMPC", "DRD4"]


def find_files(protein_dir):
    protein_name = os.path.basename(protein_dir)

    # Use more flexible glob patterns
    active_sdf = glob.glob(
        os.path.join(
            protein_dir, f"{protein_name}_std_dudez_1pt0LD*ligand*poses_lib_sorted.sdf"
        )
    )
    decoy_sdf = glob.glob(
        os.path.join(
            protein_dir, f"{protein_name}_std_extrema_1pt0LD*decoy*poses_lib_sorted.sdf"
        )
    )
    active_strain = glob.glob(
        os.path.join(
            protein_dir,
            f"{protein_name}_std_dudez_1pt0LD*ligand*poses_lib_sorted_tstrain.csv",
        )
    )
    decoy_strain = glob.glob(
        os.path.join(
            protein_dir,
            f"{protein_name}_std_extrema_1pt0LD*decoy*poses_lib_sorted_tstrain.csv",
        )
    )

    # Check if we found exactly one file for each type
    if (
        len(active_sdf) != 1
        or len(decoy_sdf) != 1
        or len(active_strain) != 1
        or len(decoy_strain) != 1
    ):
        raise ValueError(f"Unexpected number of files found for {protein_name}")

    return active_sdf[0], decoy_sdf[0], active_strain[0], decoy_strain[0]


def run_notebook(protein_dir):
    protein_name = os.path.basename(protein_dir)

    if protein_name in SKIP_PROTEINS:
        logging.info(f"Skipping {protein_name} as it requires special handling")
        print(f"Skipping {protein_name} as it requires special handling")
        return

    try:
        active_sdf, decoy_sdf, active_strain, decoy_strain = find_files(protein_dir)
        output_notebook = os.path.join(
            output_dir, f"{protein_name}_extrema_1pt0_output.ipynb"
        )
        pm.execute_notebook(
            "input_for_dudez_analysis_papermill.ipynb",
            output_notebook,
            parameters={
                "title_suffix": f"{protein_name}_extrema_1pt0",
                "file_path_sdf_active": active_sdf,
                "file_path_sdf_decoy": decoy_sdf,
                "file_path_strain_active": active_strain,
                "file_path_strain_decoy": decoy_strain,
            },
        )
        logging.info(f"Completed analysis for {protein_name}")
        print(f"Completed analysis for {protein_name}")
    except ValueError as ve:
        logging.error(f"Error finding files for {protein_name}: {str(ve)}")
        print(f"Error finding files for {protein_name}: {str(ve)}")
    except Exception as e:
        logging.error(f"Error processing {protein_name}: {str(e)}")
        print(f"Error processing {protein_name}: {str(e)}")


# Find all protein directories
protein_dirs = [
    d for d in glob.glob(os.path.join(base_dir, "[A-Z0-9]*")) if os.path.isdir(d)
]

# Run the notebook for each protein
total_proteins = len(protein_dirs)
for i, protein_dir in enumerate(protein_dirs, 1):
    protein_name = os.path.basename(protein_dir)
    print(f"Processing protein {i} of {total_proteins}: {protein_name}")
    run_notebook(protein_dir)

print(f"All proteins processed. Output notebooks are in {output_dir}")
print(f"Check the log file at {log_file} for details.")
print(f"Note: AMPC and DRD4 were skipped and require special handling.")

Processing protein 1 of 43: MK01


Executing:   0%|          | 0/132 [00:00<?, ?cell/s]

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


Completed analysis for MK01
Processing protein 2 of 43: ADA


Executing:   0%|          | 0/132 [00:00<?, ?cell/s]

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


KeyboardInterrupt: 