In [1]:
from rdkit import Chem
import pandas as pd
import pickle
import numpy as np
import subprocess
import datetime
import os
from glob import glob


In [None]:
import os
import subprocess
import datetime
import pandas as pd
import numpy as np
import pickle
from pathlib import Path

# ──────────────────────────────────────────────────────────────
def unique_dir_name():
    """Return the current local time as a ``DD-MM-YYYY_HH-MM-SS`` string.

    The value has one-second resolution, so two calls made within the same
    second return the same name.
    """
    timestamp_format = "%d-%m-%Y_%H-%M-%S"
    current_time = datetime.datetime.now()
    return current_time.strftime(timestamp_format)

# ──────────────────────────────────────────────────────────────
def smiles_to_embeddings(smiles, gpu, kpgt_root, env_python, model_path, config_name="base"):
    """Compute embeddings for SMILES strings by running the KPGT scripts.

    A scratch dataset directory named by ``unique_dir_name()`` is created under
    ``<kpgt_root>/datasets``. The SMILES are written there as a CSV with a
    constant ``Class`` column of zeros, then two scripts from
    ``<kpgt_root>/scripts`` are run with ``env_python``:
    ``preprocess_downstream_dataset.py`` followed by ``extract_features.py``.
    The ``fps`` array stored in the resulting ``.npz`` file is returned.

    The scratch directory is always removed, including when a script fails,
    so failed runs do not accumulate under ``<kpgt_root>/datasets``.

    Args:
        smiles: Iterable of SMILES strings, one embedding row is expected per
            entry (the row order is whatever the KPGT scripts produce).
        gpu: Value forwarded to ``extract_features.py`` as ``--gpu``.
        kpgt_root: Root directory of the KPGT checkout.
        env_python: Python interpreter used to run the KPGT scripts.
        model_path: Forwarded to ``extract_features.py`` as ``--model_path``.
        config_name: Forwarded to ``extract_features.py`` as ``--config``.

    Returns:
        The ``fps`` array loaded from the embeddings ``.npz`` file.

    Raises:
        FileExistsError: If the scratch directory already exists (for example
            two calls within the same second).
        subprocess.CalledProcessError: If either KPGT script exits non-zero.
        FileNotFoundError: If no embeddings file was produced.
    """
    # shutil is not imported at notebook level, so it is imported locally.
    import shutil

    # Materialise once so generators and other unsized iterables are accepted.
    smiles = list(smiles)

    kpgt_root = Path(kpgt_root)
    datasets_root = kpgt_root / "datasets"
    script_dir = kpgt_root / "scripts"

    folder = unique_dir_name()
    datasets_dir = datasets_root / folder
    # exist_ok is deliberately left off: reusing an existing directory would
    # mix this run's files with another run's, and the cleanup below would
    # then delete data this call does not own.
    datasets_dir.mkdir(parents=True)

    try:
        csv_path = datasets_dir / f"{folder}.csv"
        pd.DataFrame({"Class": [0] * len(smiles), "smiles": smiles}).to_csv(csv_path, index=False)

        # Run the scripts from their own directory via ``cwd`` instead of
        # os.chdir, so the notebook kernel's working directory is never changed.
        subprocess.run([
            env_python,
            str(script_dir / "preprocess_downstream_dataset.py"),
            "--data_path", str(datasets_root),
            "--dataset", folder
        ], check=True, cwd=str(script_dir))

        print("🧠 Extracting features...")

        subprocess.run([
            env_python,
            str(script_dir / "extract_features.py"),
            "--config", config_name,
            "--model_path", str(model_path),
            "--data_path", str(datasets_root),
            "--gpu", str(gpu),
            "--dataset", folder
        ], check=True, cwd=str(script_dir))

        # NOTE(review): assumes extract_features.py names its output after the
        # config (kpgt_<config>.npz) — confirm against the KPGT script. The
        # previously hard-coded name is kept as a fallback so the default
        # behaviour is unchanged either way.
        embeddings_npz = datasets_dir / f"kpgt_{config_name}.npz"
        if not embeddings_npz.exists():
            embeddings_npz = datasets_dir / "kpgt_base.npz"

        # Close the archive before the directory is deleted; indexing "fps"
        # reads the array fully into memory.
        with np.load(embeddings_npz) as data:
            fps_array = data["fps"]
    finally:
        # Clean up on success and on failure alike.
        shutil.rmtree(datasets_dir)

    return fps_array

# ──────────────────────────────────────────────────────────────
def compute_kpgt_embeddings_for_dataset(csv_paths, output_fp_cache_path, gpu, kpgt_root, env_python, model_path):
    """Embed every unique SMILES found in a set of CSV files and pickle the result.

    The ``smiles`` column of each CSV is read, the unique values across all
    files are sorted, embeddings are computed in a single
    ``smiles_to_embeddings`` call, and a ``{smiles: embedding_row}`` dict is
    written to ``output_fp_cache_path`` with ``pickle``.

    Args:
        csv_paths: Iterable of CSV file paths, each with a ``smiles`` column.
        output_fp_cache_path: Destination path of the pickled dict.
        gpu: Forwarded to ``smiles_to_embeddings``.
        kpgt_root: Forwarded to ``smiles_to_embeddings``.
        env_python: Forwarded to ``smiles_to_embeddings``.
        model_path: Forwarded to ``smiles_to_embeddings``.

    Raises:
        ValueError: If no SMILES were collected (e.g. ``csv_paths`` is empty),
            or if the number of embedding rows differs from the number of
            unique SMILES, in which case the mapping would be misaligned.
        KeyError: If a CSV has no ``smiles`` column.
    """
    all_smiles = set()
    for path in csv_paths:
        df = pd.read_csv(path)
        # Empty cells are read as float NaN; left in, they make sorted() fail
        # when compared against strings, so they are dropped here.
        all_smiles.update(df["smiles"].dropna().astype(str))

    if not all_smiles:
        # Typically a glob pattern that matched nothing; fail before starting
        # the expensive external pipeline on zero molecules.
        raise ValueError(
            "No SMILES found: csv_paths is empty or the 'smiles' columns contain no values."
        )

    # Sorting makes the SMILES -> row assignment deterministic across runs.
    all_smiles = sorted(all_smiles)
    print(f"🧬 Total unique SMILES: {len(all_smiles)}")

    fps_array = smiles_to_embeddings(all_smiles, gpu=gpu, kpgt_root=kpgt_root,
                                     env_python=env_python, model_path=model_path)

    # The mapping below pairs SMILES and rows by position, so a count mismatch
    # would silently assign embeddings to the wrong molecules.
    if len(fps_array) != len(all_smiles):
        raise ValueError(
            f"Embedding count ({len(fps_array)}) does not match the number of "
            f"unique SMILES ({len(all_smiles)})."
        )

    smiles_to_fp = dict(zip(all_smiles, fps_array))

    with open(output_fp_cache_path, "wb") as f:
        pickle.dump(smiles_to_fp, f)

    print(f"✅ Embeddings saved to: {output_fp_cache_path}")

In [None]:
import os
from glob import glob

# --- Set up root paths ---
# The repository root is taken to be three levels above the notebook's
# working directory.
SCRIPT_DIR = os.getcwd()
REPO_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, "..", "..", ".."))
cv_root = os.path.join(REPO_ROOT, "data", "cv", "raw_h37rv_nr", "folds")
h37rv_root = os.path.join(cv_root, "h37rv")

# --- KPGT setup ---
# "~" is expanded per user, so the paths are not tied to one home directory.
KPGT_ROOT = os.path.expanduser("~/predator/KPGT")
ENV_PYTHON = os.path.expanduser("~/miniconda3/envs/KPGT/bin/python")
MODEL_PATH = os.path.join(KPGT_ROOT, "models", "pretrained", "base", "base.pth")

# --- Find CSVs ---
csv_paths = glob(os.path.join(h37rv_root, "h37rv_*.csv"))
print("📄 Found", len(csv_paths), "CSV files for H37Rv")

# --- Compute embeddings ---
# The cache is written next to the fold CSVs it was computed from.
compute_kpgt_embeddings_for_dataset(
    csv_paths=csv_paths,
    output_fp_cache_path=os.path.join(h37rv_root, "kpgt_embeddings_cache.pkl"),
    gpu=0,
    kpgt_root=KPGT_ROOT,
    env_python=ENV_PYTHON,
    model_path=MODEL_PATH
)


📄 Found 25 CSV files for H37Rv
🧬 Total unique SMILES: 14187


Using backend: pytorch
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.


constructing graphs


Using backend: pytorch
Using backend: pytorch
Using backend: pytorchUsing backend: pytorch

Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorchUsing backend: pytorch

Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    3.5s
[Parallel(n_jobs=32)]: Done 744 tasks      | elapsed:    4.8s
[Parallel(n_jobs=32)]: Done 2144 tasks      | elapsed:    6.8s
[Parallel(n_jobs=32)]: Done 3944 tasks      | elapsed:    9.3s
[Parallel(n_jo

saving graphs
extracting fingerprints
saving fingerprints
extracting molecular descriptors
🧠 Extracting features.../14187


Using backend: pytorch


The extracted features were saved at /home/malves/predator/KPGT/datasets//16-04-2025_01-21-09/kpgt_base.npz
✅ Embeddings saved to: /home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/h37rv/kpgt_embeddings_cache.pkl


In [None]:
import os
from glob import glob

# --- KPGT setup ---
KPGT_ROOT = os.path.expanduser("~/predator/KPGT")
ENV_PYTHON = os.path.expanduser("~/miniconda3/envs/KPGT/bin/python")
MODEL_PATH = os.path.join(KPGT_ROOT, "models", "pretrained", "base", "base.pth")

# --- Root paths ---
# SCRIPT_DIR and REPO_ROOT are recomputed here rather than reused from an
# earlier cell; the repository root is three levels above the working directory.
SCRIPT_DIR = os.getcwd()
REPO_ROOT = os.path.abspath(
    os.path.join(SCRIPT_DIR, os.pardir, os.pardir, os.pardir)
)

cv_root = os.path.join(REPO_ROOT, "data", "cv", "raw_h37rv_nr", "folds")
raw_root = os.path.join(cv_root, "raw")

# --- Collect the RAW fold CSVs ---
csv_paths = glob(os.path.join(raw_root, "raw_*.csv"))
print("📄 Found", len(csv_paths), "CSV files for RAW")

# --- Embed all unique SMILES and cache them alongside the fold CSVs ---
compute_kpgt_embeddings_for_dataset(
    csv_paths,
    os.path.join(raw_root, "kpgt_embeddings_cache.pkl"),
    gpu=0,
    kpgt_root=KPGT_ROOT,
    env_python=ENV_PYTHON,
    model_path=MODEL_PATH,
)


📄 Found 25 CSV files for RAW
🧬 Total unique SMILES: 18780


Using backend: pytorch
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.


constructing graphs


Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
[Parallel(n_jobs=32)]: Done 136 tasks      | elapsed:    3.4s
[Parallel(n_jobs=32)]: Done 936 tasks      | elapsed:    5.1s
[Parallel(n_jobs=32)]: Done 2336 tasks      | elapsed:    7.0s
[Parallel(n_jobs=32)]: Done 4136 tasks      | elapsed:    9.5s
[Parallel(n_jo

saving graphs
extracting fingerprints
saving fingerprints
extracting molecular descriptors
🧠 Extracting features.../18780


Using backend: pytorch


The extracted features were saved at /home/malves/predator/KPGT/datasets//16-04-2025_00-47-00/kpgt_base.npz
✅ Embeddings saved to: /home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/raw/kpgt_embeddings_cache.pkl


In [None]:
# NOTE: this cell uses `os`, `glob` and REPO_ROOT as defined by earlier cells.

# --- KPGT setup ---
KPGT_ROOT = os.path.expanduser("~/predator/KPGT")
ENV_PYTHON = os.path.expanduser("~/miniconda3/envs/KPGT/bin/python")
MODEL_PATH = os.path.join(KPGT_ROOT, "models", "pretrained", "base", "base.pth")

# --- Locate the NR fold directory ---
cv_root = os.path.join(REPO_ROOT, "data", "cv", "raw_h37rv_nr", "folds")
nr_root = os.path.join(cv_root, "nr")

# --- Collect the NR fold CSVs ---
csv_paths = glob(os.path.join(nr_root, "nr_*.csv"))
print("📄 Found", len(csv_paths), "CSV files for NR")

# --- Embed all unique SMILES and cache them alongside the fold CSVs ---
compute_kpgt_embeddings_for_dataset(
    csv_paths,
    os.path.join(nr_root, "kpgt_embeddings_cache.pkl"),
    gpu=0,
    kpgt_root=KPGT_ROOT,
    env_python=ENV_PYTHON,
    model_path=MODEL_PATH,
)


📄 Found 25 CSV files for NR
🧬 Total unique SMILES: 18402


Using backend: pytorch
[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.


constructing graphs


Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
Using backend: pytorch
[Parallel(n_jobs=32)]: Done 162 tasks      | elapsed:    3.3s
[Parallel(n_jobs=32)]: Done 1032 tasks      | elapsed:    4.8s
[Parallel(n_jobs=32)]: Done 2432 tasks      | elapsed:    6.9s
[Parallel(n_jobs=32)]: Done 4232 tasks      | elapsed:    9.8s
[Parallel(n_j

saving graphs
extracting fingerprints
saving fingerprints
extracting molecular descriptors
🧠 Extracting features.../18402


Using backend: pytorch


The extracted features were saved at /home/malves/predator/KPGT/datasets//16-04-2025_01-36-42/kpgt_base.npz
✅ Embeddings saved to: /home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/nr/kpgt_embeddings_cache.pkl
