In [None]:
from rdkit import Chem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
import pandas as pd
import pickle
import os
from glob import glob

def compute_fingerprints_for_dataset(csv_paths, output_fp_cache_path):
    all_smiles = set()
    for path in csv_paths:
        df = pd.read_csv(path)
        all_smiles.update(df["smiles"])
    print(f"🧪 Total unique SMILES: {len(all_smiles)}")

    generator = GetMorganGenerator(radius=2, fpSize=2048)
    fps = {
        smi: generator.GetFingerprint(Chem.MolFromSmiles(smi))
        for smi in all_smiles if Chem.MolFromSmiles(smi) is not None
    }

    with open(output_fp_cache_path, "wb") as f:
        pickle.dump(fps, f)
    print(f"✅ Fingerprints saved to: {output_fp_cache_path}")


In [None]:
import os
from glob import glob

# Get repo root based on known structure: we are in pipeline/5_h37rv_nr_raw
SCRIPT_DIR = os.getcwd()
REPO_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, "..", "..", ".."))
cv_root = os.path.join(REPO_ROOT, "data", "cv", "raw_h37rv_nr", "folds")

# --- Raw ---
root = os.path.join(cv_root, "raw")
csv_paths = glob(os.path.join(root, "raw_*.csv"))
print("📄 Found", len(csv_paths), "CSV files for", root)
compute_fingerprints_for_dataset(csv_paths, os.path.join(root, "fingerprint_cache.pkl"))

# --- H37Rv ---
root = os.path.join(cv_root, "h37rv")
csv_paths = glob(os.path.join(root, "h37rv_*.csv"))
compute_fingerprints_for_dataset(csv_paths, os.path.join(root, "fingerprint_cache.pkl"))

# --- NR ---
root = os.path.join(cv_root, "nr")
csv_paths = glob(os.path.join(root, "nr_*.csv"))
compute_fingerprints_for_dataset(csv_paths, os.path.join(root, "fingerprint_cache.pkl"))


📄 Found 25 CSV files for /home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/raw
🧪 Total unique SMILES: 18780
✅ Fingerprints saved to: /home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/raw/fingerprint_cache.pkl
🧪 Total unique SMILES: 14187
✅ Fingerprints saved to: /home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/h37rv/fingerprint_cache.pkl
🧪 Total unique SMILES: 18402
✅ Fingerprints saved to: /home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/nr/fingerprint_cache.pkl


In [6]:
import pickle
import sys
sys.path.append("../../..")
with open("../../../data/cv/raw_h37rv_nr/folds/h37rv/fingerprint_cache.pkl", "rb") as f:
    data = pickle.load(f)
    
data

{'CC1(COCc2ccc(OC(F)(F)F)cc2)Cn2cc([N+](=O)[O-])nc2S1': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f13a04dbb50>,
 'O=C(N/N=C/c1ccccc1)c1ccncc1': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f135bfe4450>,
 'COc1nc(C)nc(/N=C/c2ccc[nH]2)n1': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f135bfe7790>,
 'CC1(C)[C@@H]2CC[C@@]1([C@H](O)CN1CCN(c3ccccc3)CC1)C(=O)C2': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f135bfe4770>,
 'C/C(=N\\NC(=O)COc1cccc2ccccc12)c1ccc(-n2c(C)ccc2C)cc1': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f135bfe7970>,
 'COc1ccc2nc(C)cc(NC(=O)CN3CC(CNCC4CCCCC4)OC3=O)c2c1': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f13a0372c50>,
 'CCOC(=O)c1cc(COc2cc(C(F)(F)F)nc3cc(C(F)(F)F)ccc23)on1': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f13a0470860>,
 'CC(=O)O[C@@H]1[C@H](OC(C)=O)[C@H](OC(C)=O)CO[C@H]1SCC(N)=O': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7f13a0472570>,
 'Cc1ccc(C)n1NC(=O)c1ccncc1': <rdkit.