In [5]:
from rdkit import Chem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
import pandas as pd
import pickle

def compute_fingerprints_for_dataset(csv_paths, output_fp_cache_path):
    all_smiles = set()
    for path in csv_paths:
        df = pd.read_csv(path)
        all_smiles.update(df["smiles"])
    print(f"🧪 Total unique SMILES: {len(all_smiles)}")

    generator = GetMorganGenerator(radius=2, fpSize=2048)
    fps = {
        smi: generator.GetFingerprint(Chem.MolFromSmiles(smi))
        for smi in all_smiles if Chem.MolFromSmiles(smi) is not None
    }

    with open(output_fp_cache_path, "wb") as f:
        pickle.dump(fps, f)
    print(f"✅ Fingerprints saved to: {output_fp_cache_path}")


In [6]:
import os
from glob import glob



#Raw
root = "/home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/raw"
csv_paths = glob(os.path.join(root, "raw_*.csv"))
print("📄 Found", len(csv_paths), "CSV files for", root)
compute_fingerprints_for_dataset(csv_paths, "/home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/raw/fingerprint_cache.pkl")

# H37Rv
root = "/home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/h37rv"
csv_paths = glob(os.path.join(root, "h37rv_*.csv"))
compute_fingerprints_for_dataset(csv_paths, "/home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/h37rv/fingerprint_cache.pkl")

# NR
root = "/home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/nr"
csv_paths = glob(os.path.join(root, "nr_*.csv"))
compute_fingerprints_for_dataset(csv_paths, "/home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/nr/fingerprint_cache.pkl")


📄 Found 25 CSV files for /home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/raw
🧪 Total unique SMILES: 18780
✅ Fingerprints saved to: /home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/raw/fingerprint_cache.pkl
🧪 Total unique SMILES: 14187
✅ Fingerprints saved to: /home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/h37rv/fingerprint_cache.pkl
🧪 Total unique SMILES: 18402
✅ Fingerprints saved to: /home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/nr/fingerprint_cache.pkl


In [7]:
import pickle
with open("/home/malves/predinhib_mtb/data/cv/raw_h37rv_nr/folds/h37rv/fingerprint_cache.pkl", "rb") as f:
    data = pickle.load(f)
    
data

{'CC1(COCc2ccc(OC(F)(F)F)cc2)Cn2cc([N+](=O)[O-])nc2S1': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fa4919e4b30>,
 'O=C(N/N=C/c1ccccc1)c1ccncc1': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fa4919e4950>,
 'COc1nc(C)nc(/N=C/c2ccc[nH]2)n1': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fa4900b0bd0>,
 'CC1(C)[C@@H]2CC[C@@]1([C@H](O)CN1CCN(c3ccccc3)CC1)C(=O)C2': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fa4900b23e0>,
 'C/C(=N\\NC(=O)COc1cccc2ccccc12)c1ccc(-n2c(C)ccc2C)cc1': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fa4900b1ee0>,
 'COc1ccc2nc(C)cc(NC(=O)CN3CC(CNCC4CCCCC4)OC3=O)c2c1': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fa4900b3420>,
 'CCOC(=O)c1cc(COc2cc(C(F)(F)F)nc3cc(C(F)(F)F)ccc23)on1': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fa4900b2660>,
 'CC(=O)O[C@@H]1[C@H](OC(C)=O)[C@H](OC(C)=O)CO[C@H]1SCC(N)=O': <rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x7fa4900b3f60>,
 'Cc1ccc(C)n1NC(=O)c1ccncc1': <rdkit.