# 0) Modules & Functions

In [None]:
import os
import pandas as pd
from rdkit import Chem
import subprocess
import hashlib
import tempfile
import base64
import numpy as np
from itertools import combinations
from scipy.stats import pearsonr
from tqdm import tqdm
import time
from collections import defaultdict
from rdkit.Chem import AllChem
from rdkit import RDLogger
import warnings
import re
from rdkit.Chem import inchi
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from rdkit.Chem import Descriptors
from rdkit import Chem
from rdkit.Chem import Draw

# Disable RDKit warnings
RDLogger.DisableLog('rdApp.*')

warnings.filterwarnings("ignore", category=UserWarning, module="rdkit")
warnings.filterwarnings("ignore", category=FutureWarning, module="rdkit")
warnings.filterwarnings("ignore", category=DeprecationWarning, module="rdkit")

class MMPGenerator:
    def __init__(self, df_input, output_csv, mmpa_dir='../mmpa',
                 symmetric=True, max_heavy=14, max_ratio=0.7, verbose=True):
        self.df_original = df_input.copy()
        self.output_csv = output_csv
        self.mmpa_dir = mmpa_dir
        self.symmetric = symmetric
        self.max_heavy = max_heavy
        self.max_ratio = max_ratio
        self.verbose = verbose

    def _get_inchikey(self, smiles):
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            return Chem.inchi.MolToInchiKey(mol)
        return self.encode_string(smiles)

    def encode_string(self, s):
        return base64.urlsafe_b64encode(s.encode()).decode()

    def decode_string(self, b64):
        return base64.urlsafe_b64decode(b64.encode()).decode()

    def run(self):
        self.df_original['ID'] = [self._get_inchikey(smi) for smi in self.df_original['SMILES']]
        y_map = self.df_original.set_index('ID')['Y'].to_dict()

        with tempfile.TemporaryDirectory() as tmp:
            smi_path = os.path.join(tmp, 'input.smi')
            frag_path = os.path.join(tmp, 'fragmented.txt')
            mmps_path = os.path.join(tmp, 'mmps.csv')
            smirks_path = os.path.join(tmp, 'smirks.txt')
            cansmirks_path = os.path.join(tmp, 'cansmirks.txt')

            self.df_original[['SMILES', 'ID']].to_csv(smi_path, index=False, sep=' ', header=False)

            print("0) Fragments generation")
            with open(frag_path, 'w') as out:
                subprocess.run(['python', f'{self.mmpa_dir}/rfrag.py'], stdin=open(smi_path), stdout=out)

            print("1) Indexing")
            cmd = ['python', f'{self.mmpa_dir}/indexing.py']
            if self.symmetric:
                cmd.append('-s')
            if self.max_heavy:
                cmd.extend(['-m', str(self.max_heavy)])
            if self.max_ratio:
                cmd.extend(['-r', str(self.max_ratio)])

            with open(mmps_path, 'w') as out:
                subprocess.run(cmd, stdin=open(frag_path), stdout=out)

            with open(mmps_path) as f:
                lines = [line.strip() for line in f if line.strip()]


            splits = [line.split(',') for line in lines]
            # max_len = max(len(s) for s in splits) # error as nothing in max so nothing in split, and not used...

            df = pd.DataFrame(splits, columns=['L_SMILES', 'R_SMILES', 'L_ID', 'R_ID', 'SMIRKS', 'CORE'])


            df['L_Y'] = df['L_ID'].map(y_map)
            df['R_Y'] = df['R_ID'].map(y_map)
            df['Delta_Y'] = df['R_Y'] - df['L_Y']

            df = df[df['SMIRKS'].apply(lambda x: isinstance(x, str) and '>>' in x)]

            df['__row'] = range(len(df))
            df[['SMIRKS', '__row']].to_csv(smirks_path, index=False, sep=' ', header=False)

            print("2) Canonical SMIRKS generation")
            with open(cansmirks_path, 'w') as out:
                subprocess.run(['python', f'{self.mmpa_dir}/cansmirk.py'], stdin=open(smirks_path), stdout=out)


            canon_df = pd.read_csv(cansmirks_path, sep=' ', names=['Canonical_SMIRKS', 'index'])


            df = df.merge(canon_df, left_on='__row', right_on='index').drop(columns=['__row', 'index'])

            df[['L_sub', 'R_sub']] = df['Canonical_SMIRKS'].str.split('>>', expand=True)


            df['L_sub_ID'] = [self.encode_string(k) for k in df['L_sub'].tolist()]
            df['R_sub_ID'] = [self.encode_string(k) for k in df['R_sub'].tolist()]
            df['SMIRKS_ID'] = [self.encode_string(k) for k in df['Canonical_SMIRKS'].tolist()]
            df['CORE_ID'] = [self.encode_string(k) for k in df['CORE'].tolist()]
            df = df.drop_duplicates()


            df[['L_SMILES', 'R_SMILES', 'L_ID', 'R_ID', 'SMIRKS', 'CORE', 'L_Y', 'R_Y', 'Delta_Y', 'L_sub', 'R_sub', 'L_sub_ID', 'R_sub_ID', 'SMIRKS_ID', 'CORE_ID']].to_csv(
                self.output_csv, index=False
            )


class MMPAugmentorFixed:
    def __init__(self, df, min_common=4, pearson_thresh=0.3, crmsd_thresh=0.4):
        self.df = df.copy()
        self.min_common = min_common
        self.pearson_thresh = pearson_thresh
        self.crmsd_thresh = crmsd_thresh
        self.series = {}
        self.pair_scores = []
        self.filtered_pairs = []
        self.augmented_data = []

    def _extract_series(self):
        start = time.time()
        self.series = {
            core: group for core, group in self.df.groupby("CORE")
        }

    def _compute_pairwise_scores(self):
        start = time.time()
        self.pair_scores = []
        series_items = list(self.series.items())
        total_combinations = len(series_items) * (len(series_items) - 1) // 2
        for (core1, df1), (core2, df2) in tqdm(combinations(series_items, 2), desc="3) Computing pairwise correlations", total=total_combinations):
            subs1 = set(df1["L_sub"])
            subs2 = set(df2["L_sub"])
            common = subs1 & subs2
            if len(common) < self.min_common:
                continue

            merged = pd.merge(
                df1, df2,
                left_on=["L_sub", "R_sub"],
                right_on=["L_sub", "R_sub"],
                suffixes=('_1', '_2')
            )
            if len(merged) < self.min_common:
                continue

            y1 = merged['Delta_Y_1'].values
            y2 = merged['Delta_Y_2'].values
            crmsd = np.sqrt(np.mean((y1 - y2) ** 2))
            try:
                corr = pearsonr(y1, y2)[0]
            except:
                corr = np.nan

            self.pair_scores.append((core1, core2, crmsd, corr, len(merged)))

    def _filter_pairs(self):
        start = time.time()
        self.filtered_pairs = [
            (s1, s2) for s1, s2, rmsd, corr, n in self.pair_scores
            if rmsd <= self.crmsd_thresh and (not np.isnan(corr) and corr >= self.pearson_thresh)
        ]

    def _augment(self):
        start = time.time()
        augmented_entries = []

        for s1, s2 in tqdm(self.filtered_pairs, desc="4) Augmenting data"):
            df1 = self.series[s1]
            df2 = self.series[s2]

            tf1 = df1[["L_sub", "R_sub", "Delta_Y"]].to_dict("records")
            tf2 = df2[["L_sub", "R_sub", "Delta_Y"]].to_dict("records")

            df1_dict = defaultdict(list)
            for _, row in df1.iterrows():
                df1_dict[row["L_sub"]].append(row.to_dict())

            df2_dict = defaultdict(list)
            for _, row in df2.iterrows():
                df2_dict[row["L_sub"]].append(row.to_dict())

            for entry in tf1:
                l_sub = entry["L_sub"]
                for base in df2_dict.get(l_sub, []):
                    r_sub = entry["R_sub"]
                    delta = entry["Delta_Y"]
                    new_y = base["L_Y"] + delta
                    smirks_new = l_sub + ">>" + r_sub
                    augmented_entries.append({
                        "CORE": s1,
                        "L_sub": l_sub,
                        "R_sub": r_sub,
                        "L_Y": base["L_Y"],
                        "R_Y": new_y,
                        "Delta_Y": delta,
                        "AUG": True,
                        "L_SMILES": base.get("L_SMILES"),
                        "L_ID": base.get("L_ID"),
                        "L_sub_ID": base.get("L_sub_ID"),
                        "R_sub_ID": base64.urlsafe_b64encode(r_sub.encode()).decode(),
                        "SMIRKS": smirks_new,
                        "SMIRKS_ID": base64.urlsafe_b64encode(smirks_new.encode()).decode(),
                        "CORE_ID": base64.urlsafe_b64encode(s1.encode()).decode()
                    })

            for entry in tf2:
                l_sub = entry["L_sub"]
                for base in df1_dict.get(l_sub, []):
                    r_sub = entry["R_sub"]
                    delta = entry["Delta_Y"]
                    new_y = base["L_Y"] + delta
                    smirks_new = l_sub + ">>" + r_sub
                    augmented_entries.append({
                        "CORE": s2,
                        "L_sub": l_sub,
                        "R_sub": r_sub,
                        "L_Y": base["L_Y"],
                        "R_Y": new_y,
                        "Delta_Y": delta,
                        "AUG": True,
                        "L_SMILES": base.get("L_SMILES"),
                        "L_ID": base.get("L_ID"),
                        "L_sub_ID": base.get("L_sub_ID"),
                        "R_sub_ID": base64.urlsafe_b64encode(r_sub.encode()).decode(),
                        "SMIRKS": smirks_new,
                        "SMIRKS_ID": base64.urlsafe_b64encode(smirks_new.encode()).decode(),
                        "CORE_ID": base64.urlsafe_b64encode(s2.encode()).decode()
                    })

        self.augmented_data = pd.DataFrame(augmented_entries)


    def run(self):
        self._extract_series()
        self._compute_pairwise_scores()
        self._filter_pairs()
        self._augment()

        original = self.df.copy()
        original["AUG"] = False
        return pd.concat([original, self.augmented_data], ignore_index=True)
    
    def get_pair_scores_df(self):
        """
        Return a DataFrame of scaffold pair scores (cRMSD, Pearson, common MMP count)
        """
        return pd.DataFrame(
            self.pair_scores,
            columns=["Scaffold_1", "Scaffold_2", "cRMSD", "Pearson", "N_common"]
        )

    def get_augmented_only(self):
        """
        Return only the augmented (predicted) entries.
        """
        return self.augmented_data.copy()
    

def analyze_scaffold_pair_scores(df):
    summary = {
        "Total Pairs": len(df),
        "Mean cRMSD": df["cRMSD"].mean(),
        "Median cRMSD": df["cRMSD"].median(),
        "Std cRMSD": df["cRMSD"].std(),
        "Mean Pearson": df["Pearson"].mean(),
        "Median Pearson": df["Pearson"].median(),
        "Std Pearson": df["Pearson"].std(),
        "Mean N_common": df["N_common"].mean(),
        "Median N_common": df["N_common"].median()
    }

    high_corr = df["Pearson"] > 0.7
    low_crmsd = df["cRMSD"] < 0.5
    enough_common = df["N_common"] >= 5
    strong_pairs = df[high_corr & low_crmsd & enough_common]

    summary.update({
        "High Pearson (>0.7)": high_corr.sum(),
        "Low cRMSD (<0.5)": low_crmsd.sum(),
        "N_common ≥ 5": enough_common.sum(),
        "Strong Pairs (all 3)": len(strong_pairs)
    })

    top_corr = df.sort_values("Pearson", ascending=False).head(5)
    top_low_crmsd = df.sort_values("cRMSD").head(5)
    top_common = df.sort_values("N_common", ascending=False).head(5)

    return {
        "summary": summary,
        "top_corr": top_corr,
        "top_low_crmsd": top_low_crmsd,
        "top_common": top_common,
        "strong_pairs": strong_pairs
    }




def fast_apply_transformation(transformation, l_smiles, rxn_cache, core_cache, heavy_cache, core_smarts, failure_tracker=None):
    if pd.isna(transformation) or pd.isna(l_smiles):
        return None

    # --- Cache reaction and Δheavy ---
    if transformation not in rxn_cache:
        try:
            rxn = AllChem.ReactionFromSmarts(transformation)
            left_smi, right_smi = transformation.split(">>")
            left_mol = Chem.MolFromSmarts(left_smi)
            right_mol = Chem.MolFromSmarts(right_smi)
            delta_heavy = right_mol.GetNumHeavyAtoms() - left_mol.GetNumHeavyAtoms()
            rxn_cache[transformation] = (rxn, delta_heavy)
        except:
            return None
    else:
        rxn, delta_heavy = rxn_cache[transformation]

    # --- Cache L_SMILES heavy atom count ---
    if l_smiles not in heavy_cache:
        mol_l = Chem.MolFromSmiles(l_smiles)
        if mol_l is None:
            return None
        n_heavy_l = mol_l.GetNumHeavyAtoms()
        heavy_cache[l_smiles] = (mol_l, n_heavy_l)
    else:
        mol_l, n_heavy_l = heavy_cache[l_smiles]

    # --- Cache core mol ---
    if core_smarts not in core_cache:
        core_mol = Chem.MolFromSmarts(core_smarts)
        if core_mol is None:
            return None
        core_cache[core_smarts] = core_mol
    else:
        core_mol = core_cache[core_smarts]

    # --- Run reaction ---
    try:
        products = rxn.RunReactants((mol_l,))
    except Exception:
        return None

    if failure_tracker is not None:
        failure_tracker["total"] += 1
        if not products:
            failure_tracker["empty_prodsets"] += 1

    all_products = []
    for prod_set in products:
        if not prod_set and failure_tracker is not None:
            failure_tracker["empty_prodsets"] += 1
        for prod in prod_set:
            if prod is None:
                continue
            try:
                if not prod.HasSubstructMatch(core_mol):
                    continue
            except:
                continue
            n_heavy_r = prod.GetNumHeavyAtoms()
            if n_heavy_r - n_heavy_l != delta_heavy:
                continue
            all_products.append(Chem.MolToSmiles(prod, isomericSmiles=True))

    return all_products if all_products else None

def update_predicted_rows(df):
    df = df.drop_duplicates()
    mask = df["AUG"] == True
    indices = df[mask].index

    rxn_cache = {}
    heavy_cache = {}
    core_cache = {}

    new_rows = []
    failure_tracker = {"total": 0, "empty_prodsets": 0}

    for idx in tqdm(indices, desc="5) Updating predicted rows"):
        row = df.loc[idx]
        l_smiles = row["L_SMILES"]
        smirks = row["SMIRKS"]
        core = row["CORE"]

        r_smiles_list = fast_apply_transformation(
            smirks, l_smiles, rxn_cache, core_cache, heavy_cache, core,
            failure_tracker=failure_tracker
        )

        if r_smiles_list:
            for r_smiles in r_smiles_list:
                mol = Chem.MolFromSmiles(r_smiles)
                if mol:
                    r_id = Chem.InchiToInchiKey(Chem.MolToInchi(mol))
                    new_row = row.copy()
                    new_row["R_SMILES"] = r_smiles
                    new_row["R_ID"] = r_id
                    new_rows.append(new_row)

    df_non_aug = df[~mask]
    df_aug_expanded = pd.DataFrame(new_rows)
    df_final = pd.concat([df_non_aug, df_aug_expanded], ignore_index=True)

    # Print failure rate
    total = failure_tracker["total"]
    failed = failure_tracker["empty_prodsets"]
    if total > 0:
        print(f"⚠️  Empty product sets in {failed} / {total} ({100 * failed / total:.2f}%) of transformations")

    return df_final








def prepare_and_plot_prediction_vs_experiment_complete_with_output(imputed_df, std_threshold=0.5):

    print("6) Preparing output file")

    l_df = imputed_df[["L_SMILES", "L_Y", "AUG"]]
    r_df = imputed_df[["R_SMILES", "R_Y", "AUG"]]
    l_df.columns = ["SMILES", "Y", "AUG"]
    r_df.columns = ["SMILES", "Y", "AUG"]
    clean_df = pd.concat([l_df, r_df])


    # Count total before
    total_before = len(clean_df)

    # Drop rows with missing or invalid SMILES
    clean_df = clean_df.dropna(subset=["SMILES"]).copy()
    clean_df = clean_df[clean_df["SMILES"].apply(lambda x: Chem.MolFromSmiles(x) is not None)]

    # Count total after
    total_after = len(clean_df)

    # Print failure rate
    fail_pct = 100 * (total_before - total_after) / total_before
    print(f"Invalid SMILES removed: {total_before - total_after} / {total_before} ({fail_pct:.2f}%)")

    tqdm.pandas(desc="7) Standardizing SMILES")
    clean_df["SMILES"] = clean_df["SMILES"].progress_apply(
        lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x), isomericSmiles=True)
    )

    # Step 8) InChIKey generation
    tqdm.pandas(desc="8) Generating InChIKeys")
    clean_df['InChIKey'] = clean_df["SMILES"].progress_apply(smiles_to_inchikey)


    # Insert InChIKey as first column
    cols = ['InChIKey'] + [col for col in clean_df.columns if col != 'InChIKey']
    clean_df = clean_df[cols]


    clean_df_exp = clean_df[clean_df["AUG"] != True].copy()
    clean_df_pred = clean_df[clean_df["AUG"] == True].copy()

    # Normalize column names
    clean_df_exp = clean_df_exp.rename(columns={"R_SMILES": "SMILES", "R_Y": "Y"})
    clean_df_pred = clean_df_pred.rename(columns={"R_SMILES": "SMILES", "R_Y": "Y"})

    # Compute median and std per SMILES in predicted
    grouped_pred = clean_df_pred.groupby("InChIKey")["Y"].agg(["median", "std"]).reset_index()
    grouped_pred.columns = ["InChIKey", "Y", "STD"]
    grouped_pred["AUG"] = True

    # Re-add SMILES by mapping InChIKey → first SMILES in original pred dataframe
    inchikey_to_smiles = clean_df_pred.dropna(subset=["SMILES"]).drop_duplicates("InChIKey").set_index("InChIKey")["SMILES"].to_dict()
    grouped_pred["SMILES"] = grouped_pred["InChIKey"].map(inchikey_to_smiles)
    grouped_pred = grouped_pred[["InChIKey", "SMILES", "Y", "STD", "AUG"]]
    grouped_pred = grouped_pred[grouped_pred["STD"]<= std_threshold]

    # Final prediction rows
    exp_out = clean_df_exp[['InChIKey', "SMILES", "Y"]].copy()
    exp_out["STD"] = None
    exp_out["AUG"] = False
    exp_out = exp_out[["InChIKey", "SMILES", "Y", "STD", "AUG"]]

    exp_out = exp_out.dropna(how='all')
    grouped_pred = grouped_pred.dropna(how='all')

    # Remove predicted entries with InChIKey already in experimental
    known_keys = set(exp_out['InChIKey'])
    grouped_pred = grouped_pred[~grouped_pred['InChIKey'].isin(known_keys)]


    output_df = pd.concat(
    [exp_out.drop_duplicates("InChIKey"), grouped_pred.drop_duplicates("InChIKey")],
    ignore_index=True
    )

    output_df = output_df[~((output_df['AUG'] == True) & (output_df['InChIKey'].isin(output_df.loc[output_df['AUG'] == False, 'InChIKey'])))]

    return output_df


# Compute InChIKey from SMILES
def smiles_to_inchikey(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    try:
        return inchi.MolToInchiKey(mol)
    except:
        return None
    

def assign_set_from_inchikey(output_df, reference_df, smiles_col="SMILES", inchikey_col="InChIKey", set_col="SET"):
    # Compute InChIKey for each SMILES in output_df
    output_df[inchikey_col] = output_df[smiles_col].apply(
        lambda smi: Chem.inchi.MolToInchiKey(Chem.MolFromSmiles(smi)) if Chem.MolFromSmiles(smi) else None
    )
    
    # Create lookup from reference_df
    inchikey_map = dict(zip(reference_df[inchikey_col], reference_df[set_col]))
    
    # Map SET
    output_df[set_col] = output_df[inchikey_col].map(inchikey_map)
    
    return output_df


def smiles_to_morgan(smiles, radius=2, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(n_bits)
    return np.array(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits))


from itertools import combinations
from pathlib import Path

base_dirs = ["../data/noaug", "../data/aug", "../data/test", "../data/frag"]
combinations_set = ["ABCD", "ABCE", "ABDE", "ACDE","BCDE", "ALL"]
combinations_second = ["STL", "MTL"]

# Create main dirs
for base in base_dirs:
    Path(base).mkdir(parents=True, exist_ok=True)
    # Create subdirs

    
    for combo in combinations_set:
        if base != "../data/frag":
            for combo_2 in combinations_second:
                Path(f"{base}/{combo}/{combo_2}").mkdir(parents=True, exist_ok=True)
        else:
            Path(f"{base}/{combo}").mkdir(parents=True, exist_ok=True)


# 1) Parameters

In [3]:
max_heavy = 15
max_ratio = 0.3
min_common = 4
pearson_thresh = 0.3
crmsd_thresh = 0.6
std_threshold = 0.6

# 2) Augment

---------------------------------------------------------------------------
-> Need to add a way to handle when no pairs or fragments are generated from mmpa
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[3], line 17
     13 df_test = df[~df["SET"].isin(train_set)]
     15 output_csv = f'../data/frag/{set_ID}/{dataset}'
---> 17 MMPGenerator(df_train, output_csv=output_csv, symmetric=True, max_heavy=max_heavy, max_ratio=max_ratio, verbose=True).run()
     18 augmentor = MMPAugmentorFixed(pd.read_csv(output_csv, on_bad_lines="skip") , min_common=min_common, pearson_thresh=pearson_thresh, crmsd_thresh=crmsd_thresh)
     19 final_df2 = augmentor.run()

Cell In[1], line 90
     86     lines = [line.strip() for line in f if line.strip()]
     89 splits = [line.split(',') for line in lines]
---> 90 max_len = max(len(s) for s in splits)
     92 df = pd.DataFrame(splits, columns=['L_SMILES', 'R_SMILES', 'L_ID', 'R_ID', 'SMIRKS', 'CORE'])
     95 df['L_Y'] = df['L_ID'].map(y_map)

ValueError: max() arg is an empty sequence


---------------------------------------------------------------------------
-> Add time to process, per file, in a log
---------------------------------------------------------------------------



---------------------------------------------------------------------------
-> Process the full fill in another notebook
---------------------------------------------------------------------------



---------------------------------------------------------------------------
-> Process the full files and the sub files to create MTL datasets
---------------------------------------------------------------------------







In [None]:
iterated_datasets = [i for i in os.listdir('../data/aug/BC/') if i.endswith('.parquet')]

for dataset in tqdm(os.listdir('../data/exp/STL/'), desc = "Processing datasets"):
    print("Dataset:", dataset)
    if dataset not in iterated_datasets:
        df = pd.read_parquet(f'../data/exp/STL/{dataset}')

        output_csv = f'../data/frag/ALL/{dataset}'
            
        MMPGenerator(df, output_csv=output_csv, symmetric=True, max_heavy=max_heavy, max_ratio=max_ratio, verbose=True).run()
        augmentor = MMPAugmentorFixed(pd.read_csv(output_csv, on_bad_lines="skip") , min_common=min_common, pearson_thresh=pearson_thresh, crmsd_thresh=crmsd_thresh)
        final_df2 = augmentor.run()
        imputed_df = update_predicted_rows(final_df2)
        clean_df = prepare_and_plot_prediction_vs_experiment_complete_with_output(imputed_df, std_threshold=std_threshold)
        output_df = assign_set_from_inchikey(clean_df, df)
        # Find InChIKeys in df_train not in output_df
        missing_rows = df[~df['InChIKey'].isin(output_df['InChIKey'])]
        missing_rows['STD'] = None
        missing_rows['AUG'] = False
        missing_rows = missing_rows[['InChIKey', 'SMILES', 'Y', 'STD', 'AUG', 'SET']]
        full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
        full_aug_df.to_parquet(f'../data/aug/ALL/STL/{dataset}', index=False)

Processing datasets:   0%|          | 0/1268 [00:00<?, ?it/s]

Dataset: oneADMET_LR-STL---pIC$_{50}$ TGFR1 (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 27747525/27747525 [10:18<00:00, 44880.99it/s]
4) Augmenting data: 100%|██████████| 447/447 [00:20<00:00, 21.32it/s]
5) Updating predicted rows: 100%|██████████| 54780/54780 [00:59<00:00, 919.05it/s] 


⚠️  Empty product sets in 0 / 54780 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 323470 (0.00%)


7) Standardizing SMILES: 100%|██████████| 323470/323470 [03:39<00:00, 1472.18it/s]
8) Generating InChIKeys: 100%|██████████| 323470/323470 [06:01<00:00, 894.25it/s]
  output_df = pd.concat(
Processing datasets:   0%|          | 1/1268 [25:24<536:24:27, 1524.13s/it]

Dataset: oneADMET_LR-STL---pK$_{i}$ CXCR3 (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 874503/874503 [00:04<00:00, 180883.65it/s]
4) Augmenting data: 100%|██████████| 157/157 [00:00<00:00, 295.24it/s]
5) Updating predicted rows: 100%|██████████| 8351/8351 [00:05<00:00, 1628.70it/s]


⚠️  Empty product sets in 0 / 8351 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 51342 (0.00%)


7) Standardizing SMILES: 100%|██████████| 51342/51342 [00:05<00:00, 9350.92it/s] 
8) Generating InChIKeys: 100%|██████████| 51342/51342 [00:08<00:00, 5757.17it/s]
  output_df = pd.concat(
Processing datasets:   0%|          | 2/1268 [26:50<238:32:12, 678.30s/it] 

Dataset: oneADMET_LR-STL---pIC$_{50}$ AMPN (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 1488675/1488675 [00:31<00:00, 47735.30it/s]
4) Augmenting data: 100%|██████████| 23/23 [00:00<00:00, 49.28it/s]
5) Updating predicted rows: 100%|██████████| 2045/2045 [00:03<00:00, 580.86it/s]


⚠️  Empty product sets in 0 / 2045 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 46488 (0.00%)


7) Standardizing SMILES: 100%|██████████| 46488/46488 [00:18<00:00, 2559.30it/s]
8) Generating InChIKeys: 100%|██████████| 46488/46488 [00:33<00:00, 1370.52it/s]
  output_df = pd.concat(
Processing datasets:   0%|          | 3/1268 [28:50<148:46:41, 423.40s/it]

Dataset: oneADMET_LR-STL---pIC$_{50}$ ANM6 (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 153181/153181 [00:03<00:00, 50017.21it/s]
4) Augmenting data: 100%|██████████| 2/2 [00:00<00:00, 82.20it/s]
5) Updating predicted rows: 100%|██████████| 140/140 [00:00<00:00, 2111.87it/s]


⚠️  Empty product sets in 0 / 140 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 14616 (0.00%)


7) Standardizing SMILES: 100%|██████████| 14616/14616 [00:05<00:00, 2535.92it/s]
8) Generating InChIKeys: 100%|██████████| 14616/14616 [00:10<00:00, 1442.16it/s]
Processing datasets:   0%|          | 4/1268 [29:32<95:51:05, 273.00s/it] 

Dataset: oneADMET_LR-STL---pIC$_{50}$ MP2K1 (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 12061416/12061416 [04:21<00:00, 46076.74it/s]
4) Augmenting data: 100%|██████████| 338/338 [00:05<00:00, 64.88it/s]
5) Updating predicted rows: 100%|██████████| 17089/17089 [00:22<00:00, 759.30it/s]


⚠️  Empty product sets in 0 / 17089 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 185778 (0.00%)


7) Standardizing SMILES: 100%|██████████| 185778/185778 [01:53<00:00, 1633.59it/s]
8) Generating InChIKeys: 100%|██████████| 185778/185778 [03:13<00:00, 960.85it/s]
  output_df = pd.concat(
Processing datasets:   0%|          | 5/1268 [44:52<177:34:20, 506.14s/it]

Dataset: oneADMET_LR-STL---pIC$_{50}$ CAH12 (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 26335/26335 [00:00<00:00, 42378.56it/s]
4) Augmenting data: 100%|██████████| 12/12 [00:00<00:00, 91.90it/s]
5) Updating predicted rows: 100%|██████████| 464/464 [00:02<00:00, 213.79it/s]


⚠️  Empty product sets in 0 / 464 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 8380 (0.00%)


7) Standardizing SMILES: 100%|██████████| 8380/8380 [00:03<00:00, 2131.27it/s]
8) Generating InChIKeys: 100%|██████████| 8380/8380 [00:06<00:00, 1229.05it/s]
  output_df = pd.concat(
Processing datasets:   0%|          | 6/1268 [45:14<119:47:31, 341.72s/it]

Dataset: oneADMET_LR-STL---pIC$_{50}$ SC6A2 (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 34715278/34715278 [13:15<00:00, 43615.41it/s]
4) Augmenting data: 100%|██████████| 332/332 [00:19<00:00, 17.24it/s]
5) Updating predicted rows: 100%|██████████| 38243/38243 [00:55<00:00, 693.97it/s]


⚠️  Empty product sets in 0 / 38243 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 631898 (0.00%)


7) Standardizing SMILES: 100%|██████████| 631898/631898 [04:38<00:00, 2271.79it/s]
8) Generating InChIKeys: 100%|██████████| 631898/631898 [07:44<00:00, 1359.11it/s]
  output_df = pd.concat(
Processing datasets:   1%|          | 7/1268 [1:21:43<331:13:33, 945.61s/it]

Dataset: oneADMET_LR-STL---pIC$_{50}$ GCN5 (YEAST).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 15225/15225 [00:00<00:00, 55457.81it/s]
4) Augmenting data: 100%|██████████| 1/1 [00:00<00:00, 213.27it/s]
5) Updating predicted rows: 100%|██████████| 16/16 [00:00<00:00, 468.94it/s]


⚠️  Empty product sets in 0 / 16 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 2048 (0.00%)


7) Standardizing SMILES: 100%|██████████| 2048/2048 [00:01<00:00, 1884.59it/s]
8) Generating InChIKeys: 100%|██████████| 2048/2048 [00:01<00:00, 1121.17it/s]
Processing datasets:   1%|          | 8/1268 [1:21:56<226:59:07, 648.53s/it]

Dataset: oneADMET_LR-STL---pIC$_{50}$ KC1D (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 5305653/5305653 [02:11<00:00, 40390.44it/s]
4) Augmenting data: 100%|██████████| 253/253 [00:28<00:00,  8.78it/s]
5) Updating predicted rows: 100%|██████████| 35467/35467 [00:59<00:00, 600.77it/s]


⚠️  Empty product sets in 0 / 35467 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 245796 (0.00%)


7) Standardizing SMILES: 100%|██████████| 245796/245796 [03:19<00:00, 1230.34it/s]
8) Generating InChIKeys: 100%|██████████| 245796/245796 [05:50<00:00, 701.85it/s]
  output_df = pd.concat(
Processing datasets:   1%|          | 9/1268 [1:40:24<277:06:44, 792.38s/it]

Dataset: oneADMET_LR-STL---pIC$_{50}$ MAL32 (YEAST).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 1431/1431 [00:00<00:00, 57675.96it/s]
4) Augmenting data: 0it [00:00, ?it/s]
5) Updating predicted rows: 0it [00:00, ?it/s]


6) Preparing output file
Invalid SMILES removed: 0 / 588 (0.00%)


7) Standardizing SMILES: 100%|██████████| 588/588 [00:00<00:00, 1276.29it/s]
8) Generating InChIKeys: 100%|██████████| 588/588 [00:00<00:00, 883.43it/s]
Processing datasets:   1%|          | 10/1268 [1:40:33<192:18:17, 550.32s/it]

Dataset: oneADMET_LR-STL---pK$_{i}$ CRFR1 (RAT).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 466095/466095 [00:08<00:00, 54965.92it/s]
4) Augmenting data: 100%|██████████| 17/17 [00:00<00:00, 55.88it/s]
5) Updating predicted rows: 100%|██████████| 1240/1240 [00:01<00:00, 864.74it/s] 


⚠️  Empty product sets in 0 / 1240 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 22172 (0.00%)


7) Standardizing SMILES: 100%|██████████| 22172/22172 [00:13<00:00, 1631.48it/s]
8) Generating InChIKeys: 100%|██████████| 22172/22172 [00:20<00:00, 1075.94it/s]
  output_df = pd.concat(
Processing datasets:   1%|          | 11/1268 [1:42:02<142:54:38, 409.29s/it]

Dataset: oneADMET_LR-STL---pIC$_{50}$ CRTN (STAAE).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 298378/298378 [00:08<00:00, 34219.81it/s]
4) Augmenting data: 100%|██████████| 313/313 [00:06<00:00, 50.28it/s]
5) Updating predicted rows: 100%|██████████| 25867/25867 [00:20<00:00, 1270.61it/s]


⚠️  Empty product sets in 0 / 25867 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 40092 (0.00%)


7) Standardizing SMILES: 100%|██████████| 40092/40092 [00:20<00:00, 1946.15it/s]
8) Generating InChIKeys: 100%|██████████| 40092/40092 [00:34<00:00, 1179.17it/s]
  output_df = pd.concat(
Processing datasets:   1%|          | 12/1268 [1:44:15<113:27:47, 325.21s/it]

Dataset: oneADMET_LR-STL---pK$_{i}$ DRD1 (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 8571870/8571870 [02:50<00:00, 50206.68it/s]
4) Augmenting data: 100%|██████████| 374/374 [00:10<00:00, 34.75it/s]
5) Updating predicted rows: 100%|██████████| 35243/35243 [01:13<00:00, 478.77it/s]


⚠️  Empty product sets in 0 / 35243 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 115692 (0.00%)


7) Standardizing SMILES: 100%|██████████| 115692/115692 [01:05<00:00, 1762.11it/s]
8) Generating InChIKeys: 100%|██████████| 115692/115692 [01:51<00:00, 1035.71it/s]
  output_df = pd.concat(
Processing datasets:   1%|          | 13/1268 [1:54:16<142:26:10, 408.58s/it]

Dataset: oneADMET_LR-STL---pIC$_{50}$ DGLA (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 335790/335790 [00:09<00:00, 35315.81it/s]
4) Augmenting data: 100%|██████████| 88/88 [00:00<00:00, 110.02it/s]
5) Updating predicted rows: 100%|██████████| 1009/1009 [00:12<00:00, 82.26it/s] 


⚠️  Empty product sets in 0 / 1009 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 41462 (0.00%)


7) Standardizing SMILES: 100%|██████████| 41462/41462 [00:19<00:00, 2118.12it/s]
8) Generating InChIKeys: 100%|██████████| 41462/41462 [00:31<00:00, 1306.89it/s]
  output_df = pd.concat(
Processing datasets:   1%|          | 14/1268 [1:56:06<110:57:23, 318.54s/it]

Dataset: oneADMET_LR-STL---pK$_{d}$ SC6A3 (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 414505/414505 [00:07<00:00, 58761.81it/s]
4) Augmenting data: 100%|██████████| 3/3 [00:00<00:00, 25.54it/s]
5) Updating predicted rows: 100%|██████████| 350/350 [00:00<00:00, 573.74it/s]


⚠️  Empty product sets in 0 / 350 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 11096 (0.00%)


7) Standardizing SMILES: 100%|██████████| 11096/11096 [00:06<00:00, 1609.76it/s]
8) Generating InChIKeys: 100%|██████████| 11096/11096 [00:11<00:00, 989.67it/s] 
  output_df = pd.concat(
Processing datasets:   1%|          | 15/1268 [1:56:52<82:16:32, 236.39s/it] 

Dataset: oneADMET_LR-STL---pK$_{i}$ HGFA (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 20301/20301 [00:00<00:00, 41910.09it/s]
4) Augmenting data: 0it [00:00, ?it/s]
5) Updating predicted rows: 0it [00:00, ?it/s]


6) Preparing output file
Invalid SMILES removed: 0 / 6780 (0.00%)


7) Standardizing SMILES: 100%|██████████| 6780/6780 [00:06<00:00, 1091.90it/s]
8) Generating InChIKeys: 100%|██████████| 6780/6780 [00:10<00:00, 664.35it/s]
Processing datasets:   1%|▏         | 16/1268 [1:57:28<61:14:56, 176.12s/it]

Dataset: oneADMET_LR-STL---pIC$_{50}$ CCR6 (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 390286/390286 [00:10<00:00, 36645.41it/s]
4) Augmenting data: 100%|██████████| 5/5 [00:01<00:00,  3.33it/s]
5) Updating predicted rows: 100%|██████████| 1536/1536 [00:22<00:00, 68.50it/s] 


⚠️  Empty product sets in 0 / 1536 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 92868 (0.00%)


7) Standardizing SMILES: 100%|██████████| 92868/92868 [00:58<00:00, 1583.51it/s]
8) Generating InChIKeys: 100%|██████████| 92868/92868 [01:37<00:00, 954.79it/s]
  output_df = pd.concat(
Processing datasets:   1%|▏         | 17/1268 [2:02:31<74:23:19, 214.07s/it]

Dataset: oneADMET_LR-STL---pIC$_{50}$ KPCA (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 2305878/2305878 [00:07<00:00, 293071.34it/s]
4) Augmenting data: 100%|██████████| 78/78 [00:00<00:00, 355.14it/s]
5) Updating predicted rows: 100%|██████████| 2829/2829 [00:03<00:00, 869.16it/s] 


⚠️  Empty product sets in 0 / 2829 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 70052 (0.00%)


7) Standardizing SMILES: 100%|██████████| 70052/70052 [00:08<00:00, 8232.38it/s]
8) Generating InChIKeys: 100%|██████████| 70052/70052 [00:13<00:00, 5105.48it/s]
  output_df = pd.concat(
Processing datasets:   1%|▏         | 18/1268 [2:06:33<77:18:21, 222.64s/it]

Dataset: oneADMET_LR-STL---pIC$_{50}$ ACHA3 (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 41041/41041 [00:00<00:00, 253647.94it/s]
4) Augmenting data: 100%|██████████| 2/2 [00:00<00:00, 284.25it/s]
5) Updating predicted rows: 100%|██████████| 120/120 [00:00<00:00, 9884.65it/s]


⚠️  Empty product sets in 0 / 120 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 13960 (0.00%)


7) Standardizing SMILES: 100%|██████████| 13960/13960 [00:00<00:00, 14517.52it/s]
8) Generating InChIKeys: 100%|██████████| 13960/13960 [00:01<00:00, 8582.61it/s]
Processing datasets:   1%|▏         | 19/1268 [2:06:42<55:00:34, 158.55s/it]

Dataset: oneADMET_LR-STL---pIC$_{50}$ EDNRB (PIG).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 2366400/2366400 [00:10<00:00, 225444.39it/s]
4) Augmenting data: 100%|██████████| 80/80 [00:00<00:00, 82.06it/s]
5) Updating predicted rows: 100%|██████████| 13698/13698 [00:14<00:00, 918.17it/s]


⚠️  Empty product sets in 0 / 13698 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 220272 (0.00%)


7) Standardizing SMILES: 100%|██████████| 220272/220272 [00:28<00:00, 7815.72it/s]
8) Generating InChIKeys: 100%|██████████| 220272/220272 [00:43<00:00, 5015.05it/s]
  output_df = pd.concat(
Processing datasets:   2%|▏         | 20/1268 [2:09:12<54:01:01, 155.82s/it]

Dataset: oneADMET_LR-STL---pK$_{i}$ CAH2 (HUMAN).parquet
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pears