# 0) Modules & Functions

In [None]:
import os
import pandas as pd
from rdkit import Chem
import subprocess
import hashlib
import tempfile
import base64
import numpy as np
from itertools import combinations
from scipy.stats import pearsonr
from tqdm import tqdm
import time
from collections import defaultdict
from rdkit.Chem import AllChem
from rdkit import RDLogger
import warnings
import re
from rdkit.Chem import inchi
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from rdkit.Chem import Descriptors

# Disable RDKit warnings
RDLogger.DisableLog('rdApp.*')

warnings.filterwarnings("ignore", category=UserWarning, module="rdkit")
warnings.filterwarnings("ignore", category=FutureWarning, module="rdkit")
warnings.filterwarnings("ignore", category=DeprecationWarning, module="rdkit")

class MMPGenerator:
    """Build a matched-molecular-pair (MMP) table by driving the external mmpa scripts.

    ``run`` pipes the input molecules through ``rfrag.py``, ``indexing.py`` and
    ``cansmirk.py`` (looked up in ``mmpa_dir``), joins the resulting pairs with
    the ``Y`` values of the input frame and writes the table to ``output_csv``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must provide ``SMILES`` and ``Y`` columns; it is copied on construction.
    output_csv : str
        Destination path of the generated pair table.
    mmpa_dir : str
        Directory containing the three mmpa scripts.
    symmetric : bool
        When true, ``-s`` is passed to ``indexing.py``.
    max_heavy, max_ratio
        When truthy, passed to ``indexing.py`` as ``-m`` / ``-r``.
    verbose : bool
        When true, the step banners are printed.
    """

    # Field order of the comma-separated lines read back from indexing.py.
    _MMP_COLUMNS = ['L_SMILES', 'R_SMILES', 'L_ID', 'R_ID', 'SMIRKS', 'CORE']
    # Column order of the CSV written by run().
    _OUTPUT_COLUMNS = [
        'L_SMILES', 'R_SMILES', 'L_ID', 'R_ID', 'SMIRKS', 'CORE',
        'L_Y', 'R_Y', 'Delta_Y', 'L_sub', 'R_sub',
        'L_sub_ID', 'R_sub_ID', 'SMIRKS_ID', 'CORE_ID',
    ]

    def __init__(self, df_input, output_csv, mmpa_dir='../mmpa',
                 symmetric=True, max_heavy=14, max_ratio=0.7, verbose=True):
        self.df_original = df_input.copy()
        self.output_csv = output_csv
        self.mmpa_dir = mmpa_dir
        self.symmetric = symmetric
        self.max_heavy = max_heavy
        self.max_ratio = max_ratio
        self.verbose = verbose

    def _get_inchikey(self, smiles):
        """Return the InChIKey of ``smiles``; fall back to its base64 form if unparsable."""
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            return Chem.inchi.MolToInchiKey(mol)
        return self.encode_string(smiles)

    def encode_string(self, s):
        """Encode ``s`` as URL-safe base64 text."""
        return base64.urlsafe_b64encode(s.encode()).decode()

    def decode_string(self, b64):
        """Inverse of :meth:`encode_string`."""
        return base64.urlsafe_b64decode(b64.encode()).decode()

    def _log(self, message):
        """Print ``message`` only when the instance was created with ``verbose`` set."""
        if self.verbose:
            print(message)

    @staticmethod
    def _pipe(cmd, in_path, out_path):
        """Run ``cmd`` with ``in_path`` as stdin and ``out_path`` as stdout.

        Both files are closed when the subprocess returns. The exit status is
        not checked, so a failing script simply yields an empty or partial file.
        """
        with open(in_path) as src, open(out_path, 'w') as dst:
            subprocess.run(cmd, stdin=src, stdout=dst)

    def _write_empty_output(self):
        """Write a header-only CSV so downstream readers still find every column."""
        pd.DataFrame(columns=self._OUTPUT_COLUMNS).to_csv(self.output_csv, index=False)

    def run(self):
        """Execute the fragmentation / indexing / canonicalisation pipeline.

        Side effects: adds an ``ID`` column to ``self.df_original`` and writes
        ``self.output_csv``. When no usable pair is produced, a header-only CSV
        is written instead of raising.
        """
        self.df_original['ID'] = [self._get_inchikey(smi) for smi in self.df_original['SMILES']]
        # With duplicated IDs the last Y wins (plain dict semantics).
        y_map = self.df_original.set_index('ID')['Y'].to_dict()

        with tempfile.TemporaryDirectory() as tmp:
            smi_path = os.path.join(tmp, 'input.smi')
            frag_path = os.path.join(tmp, 'fragmented.txt')
            mmps_path = os.path.join(tmp, 'mmps.csv')
            smirks_path = os.path.join(tmp, 'smirks.txt')
            cansmirks_path = os.path.join(tmp, 'cansmirks.txt')

            self.df_original[['SMILES', 'ID']].to_csv(smi_path, index=False, sep=' ', header=False)

            self._log("0) Fragments generation")
            self._pipe(['python', f'{self.mmpa_dir}/rfrag.py'], smi_path, frag_path)

            self._log("1) Indexing")
            cmd = ['python', f'{self.mmpa_dir}/indexing.py']
            if self.symmetric:
                cmd.append('-s')
            if self.max_heavy:
                cmd.extend(['-m', str(self.max_heavy)])
            if self.max_ratio:
                cmd.extend(['-r', str(self.max_ratio)])
            self._pipe(cmd, frag_path, mmps_path)

            with open(mmps_path) as f:
                splits = [line.strip().split(',') for line in f if line.strip()]

            if not splits:
                # Nothing came back from indexing.py: no pairs to report.
                self._write_empty_output()
                return

            df = pd.DataFrame(splits, columns=self._MMP_COLUMNS)

            df['L_Y'] = df['L_ID'].map(y_map)
            df['R_Y'] = df['R_ID'].map(y_map)
            df['Delta_Y'] = df['R_Y'] - df['L_Y']

            # Keep only rows whose SMIRKS field really is a "left>>right" string;
            # copy so the helper column below is not set on a view.
            df = df[df['SMIRKS'].apply(lambda x: isinstance(x, str) and '>>' in x)].copy()
            if df.empty:
                self._write_empty_output()
                return

            # '__row' is a temporary key used to join the canonical SMIRKS back.
            df['__row'] = range(len(df))
            df[['SMIRKS', '__row']].to_csv(smirks_path, index=False, sep=' ', header=False)

            self._log("2) Canonical SMIRKS generation")
            self._pipe(['python', f'{self.mmpa_dir}/cansmirk.py'], smirks_path, cansmirks_path)

            canon_df = pd.read_csv(cansmirks_path, sep=' ', names=['Canonical_SMIRKS', 'index'])
            df = df.merge(canon_df, left_on='__row', right_on='index').drop(columns=['__row', 'index'])

            df[['L_sub', 'R_sub']] = df['Canonical_SMIRKS'].str.split('>>', expand=True)

            df['L_sub_ID'] = [self.encode_string(k) for k in df['L_sub'].tolist()]
            df['R_sub_ID'] = [self.encode_string(k) for k in df['R_sub'].tolist()]
            df['SMIRKS_ID'] = [self.encode_string(k) for k in df['Canonical_SMIRKS'].tolist()]
            df['CORE_ID'] = [self.encode_string(k) for k in df['CORE'].tolist()]
            df = df.drop_duplicates()

            df[self._OUTPUT_COLUMNS].to_csv(self.output_csv, index=False)


class MMPAugmentorFixed:
    """Transfer transformations between MMP series whose effects agree.

    The input table is grouped by ``CORE`` into series. Every pair of series is
    scored on the transformations (``L_sub``, ``R_sub``) they share, using the
    RMSD and the Pearson correlation of their ``Delta_Y`` values. For pairs that
    pass the thresholds, each transformation of one series is applied to the
    matching left-hand molecules of the other, producing predicted rows
    flagged with ``AUG=True``.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair table with at least ``CORE``, ``L_sub``, ``R_sub``, ``Delta_Y``
        and ``L_Y`` columns; it is copied on construction.
    min_common : int
        Minimum number of shared transformations required to score a pair.
    pearson_thresh : float
        Minimum Pearson correlation for a pair to be kept.
    crmsd_thresh : float
        Maximum RMSD between the two ``Delta_Y`` vectors for a pair to be kept.
    """

    def __init__(self, df, min_common=4, pearson_thresh=0.3, crmsd_thresh=0.4):
        self.df = df.copy()
        self.min_common = min_common
        self.pearson_thresh = pearson_thresh
        self.crmsd_thresh = crmsd_thresh
        self.series = {}
        self.pair_scores = []
        self.filtered_pairs = []
        # Starts as an empty frame so get_augmented_only() has one return type
        # whether or not run() has been called.
        self.augmented_data = pd.DataFrame()

    def _extract_series(self):
        """Populate ``self.series`` with one sub-frame per distinct ``CORE``."""
        self.series = {
            core: group for core, group in self.df.groupby("CORE")
        }

    def _compute_pairwise_scores(self):
        """Fill ``self.pair_scores`` with (core1, core2, cRMSD, Pearson, n_shared) tuples."""
        self.pair_scores = []
        series_items = list(self.series.items())
        total_combinations = len(series_items) * (len(series_items) - 1) // 2
        pairs = tqdm(
            combinations(series_items, 2),
            desc="3) Computing pairwise correlations",
            total=total_combinations,
        )
        for (core1, df1), (core2, df2) in pairs:
            # Cheap pre-filter on shared left substituents before the merge.
            common = set(df1["L_sub"]) & set(df2["L_sub"])
            if len(common) < self.min_common:
                continue

            merged = pd.merge(df1, df2, on=["L_sub", "R_sub"], suffixes=('_1', '_2'))
            if len(merged) < self.min_common:
                continue

            y1 = merged['Delta_Y_1'].values
            y2 = merged['Delta_Y_2'].values
            crmsd = np.sqrt(np.mean((y1 - y2) ** 2))
            try:
                corr = pearsonr(y1, y2)[0]
            except (ValueError, TypeError):
                # e.g. fewer than two shared points; scored as "no correlation".
                corr = np.nan

            self.pair_scores.append((core1, core2, crmsd, corr, len(merged)))

    def _filter_pairs(self):
        """Keep the scored pairs that satisfy both the cRMSD and the Pearson threshold."""
        self.filtered_pairs = [
            (s1, s2) for s1, s2, rmsd, corr, n in self.pair_scores
            if rmsd <= self.crmsd_thresh and not np.isnan(corr) and corr >= self.pearson_thresh
        ]

    @staticmethod
    def _encode(text):
        """URL-safe base64 of ``text`` (same scheme as the *_ID columns)."""
        return base64.urlsafe_b64encode(text.encode()).decode()

    @classmethod
    def _transfer(cls, source_df, target_df, core_label):
        """Apply every transformation of ``source_df`` to matching rows of ``target_df``.

        A target row matches when its ``L_sub`` equals the transformation's
        ``L_sub``. The predicted right-hand value is the target's ``L_Y`` plus
        the source ``Delta_Y``. Returns a list of row dicts.
        """
        bases_by_l_sub = defaultdict(list)
        for base in target_df.to_dict("records"):
            bases_by_l_sub[base["L_sub"]].append(base)

        entries = []
        for transform in source_df[["L_sub", "R_sub", "Delta_Y"]].to_dict("records"):
            l_sub = transform["L_sub"]
            for base in bases_by_l_sub.get(l_sub, []):
                r_sub = transform["R_sub"]
                delta = transform["Delta_Y"]
                smirks_new = l_sub + ">>" + r_sub
                entries.append({
                    # NOTE(review): the row is labelled with the core of the series
                    # that supplied the transformation, while L_SMILES/L_ID come
                    # from the other series — confirm this is the intended core.
                    "CORE": core_label,
                    "L_sub": l_sub,
                    "R_sub": r_sub,
                    "L_Y": base["L_Y"],
                    "R_Y": base["L_Y"] + delta,
                    "Delta_Y": delta,
                    "AUG": True,
                    "L_SMILES": base.get("L_SMILES"),
                    "L_ID": base.get("L_ID"),
                    "L_sub_ID": base.get("L_sub_ID"),
                    "R_sub_ID": cls._encode(r_sub),
                    "SMIRKS": smirks_new,
                    "SMIRKS_ID": cls._encode(smirks_new),
                    "CORE_ID": cls._encode(core_label),
                })
        return entries

    def _augment(self):
        """Build ``self.augmented_data`` from every retained series pair, in both directions."""
        augmented_entries = []
        for s1, s2 in tqdm(self.filtered_pairs, desc="4) Augmenting data"):
            df1 = self.series[s1]
            df2 = self.series[s2]
            augmented_entries.extend(self._transfer(df1, df2, s1))
            augmented_entries.extend(self._transfer(df2, df1, s2))

        self.augmented_data = pd.DataFrame(augmented_entries)

    def run(self):
        """Run the full pipeline and return original rows (AUG=False) plus predicted rows."""
        self._extract_series()
        self._compute_pairwise_scores()
        self._filter_pairs()
        self._augment()

        original = self.df.copy()
        original["AUG"] = False
        return pd.concat([original, self.augmented_data], ignore_index=True)

    def get_pair_scores_df(self):
        """
        Return a DataFrame of scaffold pair scores (cRMSD, Pearson, common MMP count)
        """
        return pd.DataFrame(
            self.pair_scores,
            columns=["Scaffold_1", "Scaffold_2", "cRMSD", "Pearson", "N_common"]
        )

    def get_augmented_only(self):
        """
        Return only the augmented (predicted) entries.
        """
        return self.augmented_data.copy()
    

def analyze_scaffold_pair_scores(df):
    """Summarise a scaffold-pair score table.

    ``df`` needs ``cRMSD``, ``Pearson`` and ``N_common`` columns. The result is
    a dict with a ``summary`` of descriptive statistics and threshold counts,
    the five best rows by each criterion (``top_corr``, ``top_low_crmsd``,
    ``top_common``) and ``strong_pairs``, the rows meeting all three cut-offs
    (Pearson > 0.7, cRMSD < 0.5, N_common >= 5).
    """
    crmsd_col = df["cRMSD"]
    pearson_col = df["Pearson"]
    common_col = df["N_common"]

    # Boolean masks for the three "strong pair" criteria.
    is_high_corr = pearson_col > 0.7
    is_low_crmsd = crmsd_col < 0.5
    has_enough_common = common_col >= 5
    strong = df[is_high_corr & is_low_crmsd & has_enough_common]

    stats = {}
    stats["Total Pairs"] = len(df)
    stats["Mean cRMSD"] = crmsd_col.mean()
    stats["Median cRMSD"] = crmsd_col.median()
    stats["Std cRMSD"] = crmsd_col.std()
    stats["Mean Pearson"] = pearson_col.mean()
    stats["Median Pearson"] = pearson_col.median()
    stats["Std Pearson"] = pearson_col.std()
    stats["Mean N_common"] = common_col.mean()
    stats["Median N_common"] = common_col.median()
    stats["High Pearson (>0.7)"] = is_high_corr.sum()
    stats["Low cRMSD (<0.5)"] = is_low_crmsd.sum()
    stats["N_common ≥ 5"] = has_enough_common.sum()
    stats["Strong Pairs (all 3)"] = len(strong)

    best_by_corr = df.sort_values("Pearson", ascending=False).head(5)
    best_by_crmsd = df.sort_values("cRMSD").head(5)
    best_by_common = df.sort_values("N_common", ascending=False).head(5)

    return {
        "summary": stats,
        "top_corr": best_by_corr,
        "top_low_crmsd": best_by_crmsd,
        "top_common": best_by_common,
        "strong_pairs": strong,
    }




def fast_apply_transformation(transformation, l_smiles, rxn_cache, core_cache, heavy_cache, core_smarts, failure_tracker=None):
    """Apply a SMIRKS transformation to a molecule and return the plausible product SMILES.

    Parameters
    ----------
    transformation : str
        Reaction SMARTS of the form ``left>>right``.
    l_smiles : str
        SMILES of the molecule to transform.
    rxn_cache, core_cache, heavy_cache : dict
        Caller-owned memo tables, filled in place. They map, respectively,
        transformation -> (reaction, heavy-atom delta), core SMARTS -> query
        molecule, and SMILES -> (molecule, heavy-atom count).
    core_smarts : str
        SMARTS every accepted product must contain as a substructure.
    failure_tracker : dict, optional
        When given, its ``"total"`` and ``"empty_prodsets"`` counters are
        incremented in place.

    Returns
    -------
    list[str] or None
        Isomeric SMILES of the products that contain the core and whose
        heavy-atom count changed by exactly the amount implied by the
        transformation. ``None`` is returned when either input is missing,
        when anything fails to parse or react, or when no product qualifies.
    """
    if pd.isna(transformation) or pd.isna(l_smiles):
        return None

    # --- Cache reaction and heavy-atom delta ---
    if transformation not in rxn_cache:
        try:
            rxn = AllChem.ReactionFromSmarts(transformation)
            left_smi, right_smi = transformation.split(">>")
            left_mol = Chem.MolFromSmarts(left_smi)
            right_mol = Chem.MolFromSmarts(right_smi)
            delta_heavy = right_mol.GetNumHeavyAtoms() - left_mol.GetNumHeavyAtoms()
        except Exception:
            # Unparsable transformation (bad SMARTS, wrong number of '>>' parts,
            # or a side that parsed to None): best-effort, treated as "no result".
            return None
        rxn_cache[transformation] = (rxn, delta_heavy)
    else:
        rxn, delta_heavy = rxn_cache[transformation]

    # --- Cache the parsed left molecule and its heavy-atom count ---
    if l_smiles not in heavy_cache:
        mol_l = Chem.MolFromSmiles(l_smiles)
        if mol_l is None:
            return None
        n_heavy_l = mol_l.GetNumHeavyAtoms()
        heavy_cache[l_smiles] = (mol_l, n_heavy_l)
    else:
        mol_l, n_heavy_l = heavy_cache[l_smiles]

    # --- Cache core query molecule ---
    if core_smarts not in core_cache:
        core_mol = Chem.MolFromSmarts(core_smarts)
        if core_mol is None:
            return None
        core_cache[core_smarts] = core_mol
    else:
        core_mol = core_cache[core_smarts]

    # --- Run reaction ---
    try:
        products = rxn.RunReactants((mol_l,))
    except Exception:
        return None

    if failure_tracker is not None:
        failure_tracker["total"] += 1
        if not products:
            failure_tracker["empty_prodsets"] += 1

    all_products = []
    for prod_set in products:
        if not prod_set and failure_tracker is not None:
            failure_tracker["empty_prodsets"] += 1
        for prod in prod_set:
            if prod is None:
                continue
            try:
                if not prod.HasSubstructMatch(core_mol):
                    continue
            except Exception:
                # The substructure search failed on this product; skip it.
                continue
            # Reject products whose size change differs from what the
            # transformation itself implies.
            n_heavy_r = prod.GetNumHeavyAtoms()
            if n_heavy_r - n_heavy_l != delta_heavy:
                continue
            all_products.append(Chem.MolToSmiles(prod, isomericSmiles=True))

    return all_products if all_products else None

def update_predicted_rows(df):
    """Materialise the right-hand molecule of every augmented row.

    After de-duplication, each row with ``AUG == True`` has its ``SMIRKS``
    applied to ``L_SMILES`` through ``fast_apply_transformation``. One output
    row is produced per parsable product, with ``R_SMILES`` and ``R_ID``
    (InChIKey) filled in. Augmented rows yielding no product are dropped.
    Non-augmented rows are passed through. The share of transformations that
    gave an empty product set is printed.
    """
    df = df.drop_duplicates()
    is_aug = df["AUG"] == True
    aug_index = df[is_aug].index

    # Memo tables shared by all calls below.
    rxn_cache, heavy_cache, core_cache = {}, {}, {}
    failure_tracker = {"total": 0, "empty_prodsets": 0}
    expanded = []

    for idx in tqdm(aug_index, desc="5) Updating predicted rows"):
        row = df.loc[idx]
        candidates = fast_apply_transformation(
            row["SMIRKS"], row["L_SMILES"], rxn_cache, core_cache, heavy_cache, row["CORE"],
            failure_tracker=failure_tracker
        )
        if not candidates:
            continue

        for r_smiles in candidates:
            mol = Chem.MolFromSmiles(r_smiles)
            if not mol:
                continue
            r_id = Chem.InchiToInchiKey(Chem.MolToInchi(mol))
            filled = row.copy()
            filled["R_SMILES"] = r_smiles
            filled["R_ID"] = r_id
            expanded.append(filled)

    df_final = pd.concat([df[~is_aug], pd.DataFrame(expanded)], ignore_index=True)

    # Print failure rate
    total = failure_tracker["total"]
    failed = failure_tracker["empty_prodsets"]
    if total > 0:
        print(f"⚠️  Empty product sets in {failed} / {total} ({100 * failed / total:.2f}%) of transformations")

    return df_final








def prepare_and_plot_prediction_vs_experiment_complete_with_output(imputed_df, std_threshold=0.5):
    """Flatten the pair table into one row per molecule, experimental and predicted.

    Despite its name this function does not plot anything. It stacks the left
    and right molecules of ``imputed_df``, removes unparsable SMILES,
    canonicalises them and computes InChIKeys. Rows are then split by ``AUG``:

    * experimental rows (``AUG != True``) are kept as-is, first occurrence per
      InChIKey;
    * predicted rows (``AUG == True``) are aggregated per InChIKey into the
      median ``Y`` and its standard deviation ``STD``. Only predictions with
      ``STD <= std_threshold`` and an InChIKey absent from the experimental
      rows are kept.

    Parameters
    ----------
    imputed_df : pandas.DataFrame
        Needs ``L_SMILES``, ``L_Y``, ``R_SMILES``, ``R_Y`` and ``AUG`` columns.
    std_threshold : float
        Upper bound on the per-molecule standard deviation of predictions.

    Returns
    -------
    pandas.DataFrame
        Columns ``InChIKey``, ``SMILES``, ``Y``, ``STD``, ``AUG``.
    """
    print("6) Preparing output file")

    l_df = imputed_df[["L_SMILES", "L_Y", "AUG"]]
    r_df = imputed_df[["R_SMILES", "R_Y", "AUG"]]
    l_df.columns = ["SMILES", "Y", "AUG"]
    r_df.columns = ["SMILES", "Y", "AUG"]
    clean_df = pd.concat([l_df, r_df])

    # Count total before
    total_before = len(clean_df)

    # Drop rows with missing or invalid SMILES
    clean_df = clean_df.dropna(subset=["SMILES"]).copy()
    clean_df = clean_df[clean_df["SMILES"].apply(lambda x: Chem.MolFromSmiles(x) is not None)]

    # Count total after
    total_after = len(clean_df)

    # Print failure rate (an empty input would otherwise divide by zero)
    removed = total_before - total_after
    fail_pct = 100 * removed / total_before if total_before else 0.0
    print(f"Invalid SMILES removed: {removed} / {total_before} ({fail_pct:.2f}%)")

    tqdm.pandas(desc="7) Standardizing SMILES")
    clean_df["SMILES"] = clean_df["SMILES"].progress_apply(
        lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x), isomericSmiles=True)
    )

    # Step 8) InChIKey generation
    tqdm.pandas(desc="8) Generating InChIKeys")
    clean_df['InChIKey'] = clean_df["SMILES"].progress_apply(smiles_to_inchikey)

    # Insert InChIKey as first column
    cols = ['InChIKey'] + [col for col in clean_df.columns if col != 'InChIKey']
    clean_df = clean_df[cols]

    clean_df_exp = clean_df[clean_df["AUG"] != True].copy()
    clean_df_pred = clean_df[clean_df["AUG"] == True].copy()

    # Compute median and std per InChIKey in predicted
    grouped_pred = clean_df_pred.groupby("InChIKey")["Y"].agg(["median", "std"]).reset_index()
    grouped_pred.columns = ["InChIKey", "Y", "STD"]
    grouped_pred["AUG"] = True

    # Re-add SMILES by mapping InChIKey -> first SMILES in the predicted frame
    inchikey_to_smiles = (
        clean_df_pred.dropna(subset=["SMILES"])
        .drop_duplicates("InChIKey")
        .set_index("InChIKey")["SMILES"]
        .to_dict()
    )
    grouped_pred["SMILES"] = grouped_pred["InChIKey"].map(inchikey_to_smiles)
    grouped_pred = grouped_pred[["InChIKey", "SMILES", "Y", "STD", "AUG"]]
    # NOTE: a molecule predicted only once has STD = NaN, which fails this
    # comparison, so single-observation predictions are discarded here.
    grouped_pred = grouped_pred[grouped_pred["STD"] <= std_threshold]

    # Experimental rows
    exp_out = clean_df_exp[['InChIKey', "SMILES", "Y"]].copy()
    exp_out["STD"] = None
    exp_out["AUG"] = False
    exp_out = exp_out[["InChIKey", "SMILES", "Y", "STD", "AUG"]]

    exp_out = exp_out.dropna(how='all')
    grouped_pred = grouped_pred.dropna(how='all')

    # Remove predicted entries with InChIKey already in experimental;
    # experimental values always take precedence over predictions.
    known_keys = set(exp_out['InChIKey'])
    grouped_pred = grouped_pred[~grouped_pred['InChIKey'].isin(known_keys)]

    output_df = pd.concat(
        [exp_out.drop_duplicates("InChIKey"), grouped_pred.drop_duplicates("InChIKey")],
        ignore_index=True
    )

    return output_df


# Compute InChIKey from SMILES
def smiles_to_inchikey(smiles):
    """Return the InChIKey of ``smiles``, or ``None`` when it cannot be computed.

    ``None`` is returned both when the SMILES does not parse and when the
    InChIKey generation itself raises.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    try:
        return inchi.MolToInchiKey(mol)
    except Exception:
        # Best-effort: a failed key generation is reported as a missing key,
        # but KeyboardInterrupt / SystemExit are no longer swallowed.
        return None
    

def assign_set_from_inchikey(output_df, reference_df, smiles_col="SMILES", inchikey_col="InChIKey", set_col="SET"):
    """Label each molecule of ``output_df`` with the set it has in ``reference_df``.

    The InChIKey column of ``output_df`` is (re)computed from ``smiles_col``,
    then ``set_col`` is filled by looking each key up in ``reference_df``. Keys
    absent from the reference get a missing value.

    ``output_df`` is modified in place and also returned.
    """
    def _to_inchikey(smi):
        # Parse once and reuse the molecule for both the validity check and the key.
        mol = Chem.MolFromSmiles(smi)
        return Chem.inchi.MolToInchiKey(mol) if mol else None

    output_df[inchikey_col] = output_df[smiles_col].apply(_to_inchikey)

    # Lookup InChIKey -> set label; with duplicated keys the last row wins.
    inchikey_map = dict(zip(reference_df[inchikey_col], reference_df[set_col]))

    output_df[set_col] = output_df[inchikey_col].map(inchikey_map)

    return output_df


def smiles_to_morgan(smiles, radius=2, n_bits=2048):
    """Return the Morgan bit fingerprint of ``smiles`` as a NumPy array.

    An unparsable SMILES yields an all-zero vector of length ``n_bits``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(parsed, radius, nBits=n_bits)
        return np.array(bit_vect)
    return np.zeros(n_bits)

def train_and_compare_models(output_df, random_state=29121997):
    """Train a baseline and an augmented random forest and compare them on held-out data.

    ``output_df`` needs ``SMILES``, ``Y``, ``SET`` and ``AUG`` columns.

    * Baseline model: experimental rows (``AUG == False``) of sets ``A`` and ``B``.
    * Augmented model: the same rows plus every ``AUG == True`` row.
    * Test set: experimental rows whose ``SET`` is not ``A`` or ``B``.
      Augmented rows are excluded from the test set. They carry imputed ``Y``
      values and are part of the augmented model's training data, so scoring
      on them would leak training rows into the evaluation.

    Returns a dict with ``RMSE_NoAug``, ``R2_NoAug``, ``RMSE_Aug`` and ``R2_Aug``.
    """
    print("🔬 Starting model training and comparison...")

    # --- Step 1: Compute Morgan fingerprints for all molecules ---
    tqdm.pandas(desc="Generating fingerprints")
    output_df_ML = output_df.copy()
    output_df_ML["FP"] = output_df_ML["SMILES"].progress_apply(smiles_to_morgan)

    # --- Step 2: Split into training and testing sets ---
    in_train_sets = output_df_ML["SET"].isin(["A", "B"])
    is_experimental = output_df_ML["AUG"] == False
    train_df = output_df_ML[in_train_sets & is_experimental]
    aug_df = output_df_ML[output_df_ML["AUG"] == True]
    test_df = output_df_ML[~in_train_sets & is_experimental]

    # --- Step 3: Format X and y ---
    def get_X_y(df):
        X = np.stack(df["FP"].values)
        y = df["Y"].values
        return X, y

    X_train_no_aug, y_train_no_aug = get_X_y(train_df)
    X_train_aug, y_train_aug = get_X_y(pd.concat([train_df, aug_df], ignore_index=True))
    X_test, y_test = get_X_y(test_df)

    # --- Step 4: Train models ---
    model_no_aug = RandomForestRegressor(n_estimators=200, random_state=random_state, n_jobs=-1)
    model_aug = RandomForestRegressor(n_estimators=200, random_state=random_state, n_jobs=-1)

    print("🚜 Training on raw train set (no AUG)...")
    model_no_aug.fit(X_train_no_aug, y_train_no_aug)

    print("🌱 Training on train + AUG data...")
    model_aug.fit(X_train_aug, y_train_aug)

    # --- Step 5: Predict on test set ---
    y_pred_no_aug = model_no_aug.predict(X_test)
    y_pred_aug = model_aug.predict(X_test)

    # --- Step 6: Compute metrics ---
    def report_metrics(y_true, y_pred, label):
        # RMSE is derived from the MSE explicitly: the ``squared=False``
        # keyword is deprecated and absent from recent scikit-learn releases.
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)
        print(f"{label} => RMSE: {rmse:.4f}, R²: {r2:.4f}")
        return rmse, r2

    print("\n📊 Evaluation on test set:")
    rmse_no_aug, r2_no_aug = report_metrics(y_test, y_pred_no_aug, "Baseline model (no AUG)")
    rmse_aug, r2_aug = report_metrics(y_test, y_pred_aug, "Augmented model (with AUG)")

    return {
        "RMSE_NoAug": rmse_no_aug,
        "R2_NoAug": r2_no_aug,
        "RMSE_Aug": rmse_aug,
        "R2_Aug": r2_aug
    }

from rdkit import Chem
from rdkit.Chem import Draw


def show_top_aug_vs_nonaug(output_df, n=8):
    """Draw a grid with the first ``n`` experimental and first ``n`` augmented molecules.

    Rows are ordered by ``SMILES`` before the two groups are picked.
    Experimental molecules come first in the grid. Each legend shows the
    1-based position in the grid, the first eight characters of the InChIKey
    and ``Y`` rounded to two decimals.

    Returns the image produced by ``Draw.MolsToGridImage``.
    """
    # Sort by SMILES (ascending), then AUG (descending); the two groups are
    # selected separately below, so only the SMILES order matters.
    sorted_df = output_df.sort_values(by=["SMILES", "AUG"], ascending=[True, False])

    # Select top non-augmented and augmented
    top_nonaug = sorted_df[sorted_df["AUG"] == False].head(n)
    top_aug = sorted_df[sorted_df["AUG"] == True].head(n)

    combined = pd.concat([top_nonaug, top_aug])
    mols = [Chem.MolFromSmiles(s) for s in combined["SMILES"]]

    # Number by position in the grid, not by the DataFrame index label.
    legends = [
        f'{position}. {row["InChIKey"][:8]} | {row["Y"]:.2f}'
        for position, (_, row) in enumerate(combined.iterrows(), start=1)
    ]
    img = Draw.MolsToGridImage(mols, molsPerRow=4, subImgSize=(400, 400), legends=legends, useSVG=False)
    return img


In [None]:

# Compute InChIKey from SMILES
def smiles_to_inchikey(smiles):
    """Return the InChIKey of ``smiles``, or ``None`` when it cannot be computed.

    ``None`` is returned both when the SMILES does not parse and when the
    InChIKey generation itself raises.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    try:
        return inchi.MolToInchiKey(mol)
    except Exception:
        # Best-effort: a failed key generation is reported as a missing key,
        # but KeyboardInterrupt / SystemExit are no longer swallowed.
        return None
    

def assign_set_from_inchikey(output_df, reference_df, smiles_col="SMILES", inchikey_col="InChIKey", set_col="SET"):
    """Label each molecule of ``output_df`` with the set it has in ``reference_df``.

    The InChIKey column of ``output_df`` is (re)computed from ``smiles_col``,
    then ``set_col`` is filled by looking each key up in ``reference_df``. Keys
    absent from the reference get a missing value.

    ``output_df`` is modified in place and also returned.
    """
    def _to_inchikey(smi):
        # Parse once and reuse the molecule for both the validity check and the key.
        mol = Chem.MolFromSmiles(smi)
        return Chem.inchi.MolToInchiKey(mol) if mol else None

    output_df[inchikey_col] = output_df[smiles_col].apply(_to_inchikey)

    # Lookup InChIKey -> set label; with duplicated keys the last row wins.
    inchikey_map = dict(zip(reference_df[inchikey_col], reference_df[set_col]))

    output_df[set_col] = output_df[inchikey_col].map(inchikey_map)

    return output_df


def smiles_to_morgan(smiles, radius=2, n_bits=2048):
    """Return the Morgan bit fingerprint of ``smiles`` as a NumPy array.

    An unparsable SMILES yields an all-zero vector of length ``n_bits``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(parsed, radius, nBits=n_bits)
        return np.array(bit_vect)
    return np.zeros(n_bits)

# === MODEL FACTORIES ===
def get_rf(seed):
    """Create a 200-tree ``RandomForestRegressor`` seeded with ``seed`` (``n_jobs=-1``)."""
    settings = {"n_estimators": 200, "random_state": seed, "n_jobs": -1}
    return RandomForestRegressor(**settings)

def get_xgb(seed):
    """Create a silent 200-estimator ``XGBRegressor`` seeded with ``seed`` (``n_jobs=-1``)."""
    settings = {"n_estimators": 200, "random_state": seed, "n_jobs": -1, "verbosity": 0}
    return XGBRegressor(**settings)
import os
import pandas as pd
from pathlib import Path
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from tqdm import tqdm
import joblib
import subprocess
import time

# Presumably the seeds handed to the model factories for repeated runs; the
# consuming code is not visible in this section — TODO confirm.
RANDOM_STATES = [42, 1337, 29121997]

# === DESCRIPTOR FUNCTIONS ===
def smiles_to_ecfp(smiles, radius=2, nBits=2048):
    """Return the Morgan (ECFP-style) bit fingerprint of ``smiles`` as a NumPy array.

    Falls back to an all-zero vector of length ``nBits`` when the SMILES does
    not parse.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return np.zeros(nBits)
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    return np.array(fingerprint)

def smiles_to_avalon(smiles, nBits=512):
    """Return the Avalon bit fingerprint of ``smiles`` as a NumPy array.

    Both the fingerprint and the fallback for an unparsable SMILES (an
    all-zero vector) have length ``nBits``, so the results can be stacked
    into one matrix.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        # Imported here because the Avalon toolkit is a separate RDKit
        # sub-package that the rest of the file does not import.
        from rdkit.Avalon import pyAvalonTools

        return np.array(pyAvalonTools.GetAvalonFP(mol, nBits=nBits))
    return np.zeros(nBits)

from itertools import combinations
from pathlib import Path

base_dirs = ["../data/noaug", "../data/aug", "../data/test", "../data/frag"]
combinations_set = [
    "BCDEFGHIJ", "ACDEFGHIJ", "ABDEFGHIJ", "ABCEFGHIJ", "ABCDFGHIJ",
    "ABCDEGHIJ", "ABCDEFHIJ", "ABCDEFGIJ", "ABCDEFGHJ", "ABCDEFGHI",
    "ALL",
]
combinations_second = ["STL", "MTL"]

# Build the output tree: every base directory gets one folder per set
# combination. All bases except "../data/frag" additionally get an STL and an
# MTL folder inside each combination folder.
for base in base_dirs:
    Path(base).mkdir(parents=True, exist_ok=True)

    for combo in combinations_set:
        if base == "../data/frag":
            Path(f"{base}/{combo}").mkdir(parents=True, exist_ok=True)
            continue
        for combo_2 in combinations_second:
            Path(f"{base}/{combo}/{combo_2}").mkdir(parents=True, exist_ok=True)


# 1) Parameters

In [None]:
# Passed to MMPGenerator, which forwards them to indexing.py as -m / -r.
max_heavy = 20
max_ratio = 0.3
# Passed to MMPAugmentorFixed: minimum number of shared transformations for a
# series pair to be scored, then the Pearson (lower bound) and cRMSD (upper
# bound) thresholds a scored pair must meet to be used for augmentation.
min_common = 4
pearson_thresh = 0.3
crmsd_thresh = 0.8
# Passed to prepare_and_plot_prediction_vs_experiment_complete_with_output:
# predictions whose per-molecule standard deviation exceeds it are dropped.
std_threshold = 2

# 2) Augment


---------------------------------------------------------------------------
-> TODO: record the processing time of each file in a log
---------------------------------------------------------------------------



In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

def compute_metrics(y_true, y_pred):
    """Return ``(R2, RMSE)`` over the positions where both values are present.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values of equal length. Lists, NumPy arrays and pandas
        Series are accepted; they are converted to float arrays so that masking is
        positional regardless of the input container.

    Returns
    -------
    tuple
        ``(r2, rmse)``. Both are ``nan`` when no position has a value in both inputs.

    Notes
    -----
    RMSE is computed as ``sqrt(MSE)`` rather than through the ``squared=False``
    argument of ``mean_squared_error``, which is deprecated in scikit-learn 1.4 and
    removed in 1.6.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = ~(np.isnan(y_true) | np.isnan(y_pred))
    if not mask.any():
        return np.nan, np.nan
    observed, predicted = y_true[mask], y_pred[mask]
    rmse = np.sqrt(mean_squared_error(observed, predicted))
    return r2_score(observed, predicted), rmse

# Leave-one-fold-out benchmark: for every dataset and every 9-fold training subset,
# augment the training data through the MMP pipeline, persist the three splits
# (augmented train, plain train, test), then compare a random forest trained with
# and without augmentation on the held-out fold.

# Column layout shared by every parquet file written below.
OUTPUT_COLUMNS = ['InChIKey', 'SMILES', 'Y', 'STD', 'AUG', 'SET']

# One metrics record per (dataset, training-fold combination).
result_log = []
for dataset in tqdm(os.listdir('../data/exp/STL/'), desc="Processing datasets"):
    print("Dataset:", dataset)
    df = pd.read_parquet(f'../data/exp/STL/{dataset}')
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    # combinations() yields the ten 9-letter subsets of A..J in lexicographic order
    # (first without J, last without A), i.e. the same sequence as a hand-written list.
    for train_set in map(list, combinations("ABCDEFGHIJ", 9)):
        print("Train set:", train_set)
        set_ID = "".join(train_set)
        in_train = df["SET"].isin(train_set)
        # Explicit copies: columns are added to these frames further down, and
        # assigning to a filtered slice triggers pandas' SettingWithCopyWarning.
        df_train = df[in_train].copy()
        df_test = df[~in_train].copy()

        # --- Augmentation pipeline (training folds only) ---
        # NOTE(review): `dataset` keeps its original file name, so this CSV is written
        # under a ".parquet"-style name — confirm that is intended.
        output_csv = f'../data/frag/{set_ID}/{dataset}'
        MMPGenerator(df_train, output_csv=output_csv, symmetric=True,
                     max_heavy=max_heavy, max_ratio=max_ratio, verbose=True).run()
        augmentor = MMPAugmentorFixed(
            pd.read_csv(output_csv, on_bad_lines="skip"),
            min_common=min_common,
            pearson_thresh=pearson_thresh,
            crmsd_thresh=crmsd_thresh
        )
        final_df2 = augmentor.run()
        imputed_df = update_predicted_rows(final_df2)
        clean_df = prepare_and_plot_prediction_vs_experiment_complete_with_output(imputed_df, std_threshold=std_threshold)
        output_df = assign_set_from_inchikey(clean_df, df)

        # Training rows whose InChIKey is absent from the pipeline output are appended
        # back as non-augmented rows so the augmented set covers all training rows.
        missing_rows = df_train[~df_train['InChIKey'].isin(output_df['InChIKey'])].copy()
        missing_rows['STD'] = None
        missing_rows['AUG'] = False
        missing_rows = missing_rows[OUTPUT_COLUMNS]
        full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
        full_aug_df.to_parquet(f'../data/aug/{set_ID}/STL/{dataset}', index=False)

        # --- Plain train / test splits, brought to the same column layout ---
        for frame in (df_train, df_test):
            frame["STD"] = None
            frame["AUG"] = False
        df_train = df_train[OUTPUT_COLUMNS]
        df_test = df_test[OUTPUT_COLUMNS]
        df_train.to_parquet(f'../data/noaug/{set_ID}/STL/{dataset}', index=False)
        df_test.to_parquet(f'../data/test/{set_ID}/STL/{dataset}', index=False)

        # --- Featurise, fit and evaluate ---
        X_train_noaug = np.stack(df_train["SMILES"].map(smiles_to_ecfp))
        y_train_noaug = df_train["Y"].values
        X_train_aug = np.stack(full_aug_df["SMILES"].map(smiles_to_ecfp))
        y_train_aug = full_aug_df["Y"].values
        X_test = np.stack(df_test["SMILES"].map(smiles_to_ecfp))
        y_test = df_test["Y"].values

        # Identical hyper-parameters and seed for both models, so the training data
        # is the only thing that differs between them.
        model_noaug = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=42)
        model_aug = RandomForestRegressor(n_estimators=200, n_jobs=-1, random_state=42)
        model_noaug.fit(X_train_noaug, y_train_noaug)
        model_aug.fit(X_train_aug, y_train_aug)
        y_pred_noaug = model_noaug.predict(X_test)
        y_pred_aug = model_aug.predict(X_test)
        r2_noaug, rmse_noaug = compute_metrics(y_test, y_pred_noaug)
        r2_aug, rmse_aug = compute_metrics(y_test, y_pred_aug)
        result_log.append({
            "Dataset": dataset,
            "Set": set_ID,
            "N_train_noaug": len(df_train),
            "N_train_aug": len(full_aug_df),
            "N_test": len(df_test),
            "R2_noaug": r2_noaug,
            "RMSE_noaug": rmse_noaug,
            "R2_aug": r2_aug,
            "RMSE_aug": rmse_aug
        })

Processing datasets:   0%|          | 0/1186 [00:00<?, ?it/s]

Dataset: oneADMET_LR-STL---pIC$_{50}$ TGFR1 (HUMAN).parquet
Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 40621591/40621591 [03:28<00:00, 194671.50it/s]
4) Augmenting data: 100%|██████████| 503/503 [00:16<00:00, 30.80it/s]
5) Updating predicted rows: 100%|██████████| 115133/115133 [00:22<00:00, 5131.34it/s]


⚠️  Empty product sets in 0 / 115133 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 711326 (0.00%)


7) Standardizing SMILES: 100%|██████████| 711326/711326 [01:31<00:00, 7744.25it/s]
8) Generating InChIKeys: 100%|██████████| 711326/711326 [02:32<00:00, 4669.47it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 40729825/40729825 [04:06<00:00, 165228.56it/s]
4) Augmenting data: 100%|██████████| 496/496 [00:13<00:00, 37.15it/s]
5) Updating predicted rows: 100%|██████████| 108338/108338 [00:25<00:00, 4241.95it/s]


⚠️  Empty product sets in 0 / 108338 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 718028 (0.00%)


7) Standardizing SMILES: 100%|██████████| 718028/718028 [01:35<00:00, 7517.89it/s]
8) Generating InChIKeys: 100%|██████████| 718028/718028 [02:36<00:00, 4591.28it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 40639620/40639620 [03:59<00:00, 169431.83it/s]
4) Augmenting data: 100%|██████████| 563/563 [00:13<00:00, 41.86it/s]
5) Updating predicted rows: 100%|██████████| 115874/115874 [00:26<00:00, 4415.65it/s]


⚠️  Empty product sets in 0 / 115874 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 715490 (0.00%)


7) Standardizing SMILES: 100%|██████████| 715490/715490 [01:32<00:00, 7747.50it/s]
8) Generating InChIKeys: 100%|██████████| 715490/715490 [02:32<00:00, 4705.07it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 42343003/42343003 [03:52<00:00, 182079.82it/s]
4) Augmenting data: 100%|██████████| 542/542 [00:15<00:00, 36.07it/s]
5) Updating predicted rows: 100%|██████████| 119584/119584 [00:26<00:00, 4471.91it/s]


⚠️  Empty product sets in 0 / 119584 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 740352 (0.00%)


7) Standardizing SMILES: 100%|██████████| 740352/740352 [01:39<00:00, 7437.05it/s]
8) Generating InChIKeys: 100%|██████████| 740352/740352 [02:33<00:00, 4821.63it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'C', 'D', 'E', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 40756906/40756906 [04:00<00:00, 169553.37it/s]
4) Augmenting data: 100%|██████████| 505/505 [00:14<00:00, 35.01it/s]
5) Updating predicted rows: 100%|██████████| 115226/115226 [00:27<00:00, 4185.58it/s]


⚠️  Empty product sets in 0 / 115226 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 713280 (0.00%)


7) Standardizing SMILES: 100%|██████████| 713280/713280 [01:34<00:00, 7541.67it/s]
8) Generating InChIKeys: 100%|██████████| 713280/713280 [02:34<00:00, 4607.75it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'C', 'D', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 41518828/41518828 [04:16<00:00, 161831.97it/s]
4) Augmenting data: 100%|██████████| 545/545 [00:16<00:00, 33.81it/s]
5) Updating predicted rows: 100%|██████████| 124672/124672 [00:30<00:00, 4042.15it/s]


⚠️  Empty product sets in 0 / 124672 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 756000 (0.00%)


7) Standardizing SMILES: 100%|██████████| 756000/756000 [01:38<00:00, 7673.80it/s]
8) Generating InChIKeys: 100%|██████████| 756000/756000 [02:42<00:00, 4659.83it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'C', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 44015653/44015653 [04:31<00:00, 161911.97it/s]
4) Augmenting data: 100%|██████████| 556/556 [00:16<00:00, 32.86it/s]
5) Updating predicted rows: 100%|██████████| 128515/128515 [00:29<00:00, 4332.17it/s]


⚠️  Empty product sets in 0 / 128515 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 755220 (0.00%)


7) Standardizing SMILES: 100%|██████████| 755220/755220 [01:40<00:00, 7479.93it/s]
8) Generating InChIKeys: 100%|██████████| 755220/755220 [02:46<00:00, 4544.82it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 42269415/42269415 [04:13<00:00, 166972.07it/s]
4) Augmenting data: 100%|██████████| 512/512 [00:15<00:00, 33.62it/s]
5) Updating predicted rows: 100%|██████████| 117698/117698 [00:27<00:00, 4255.72it/s]


⚠️  Empty product sets in 0 / 117698 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 731176 (0.00%)


7) Standardizing SMILES: 100%|██████████| 731176/731176 [01:36<00:00, 7577.20it/s]
8) Generating InChIKeys: 100%|██████████| 731176/731176 [02:34<00:00, 4721.40it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 42693420/42693420 [04:14<00:00, 167594.61it/s]
4) Augmenting data: 100%|██████████| 470/470 [00:13<00:00, 34.18it/s]
5) Updating predicted rows: 100%|██████████| 114994/114994 [00:27<00:00, 4210.03it/s]


⚠️  Empty product sets in 0 / 114994 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 726376 (0.00%)


7) Standardizing SMILES: 100%|██████████| 726376/726376 [01:34<00:00, 7652.32it/s]
8) Generating InChIKeys: 100%|██████████| 726376/726376 [02:35<00:00, 4665.64it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 41209581/41209581 [04:05<00:00, 167932.00it/s]
4) Augmenting data: 100%|██████████| 485/485 [00:14<00:00, 34.26it/s]
5) Updating predicted rows: 100%|██████████| 109770/109770 [00:25<00:00, 4358.52it/s]


⚠️  Empty product sets in 0 / 109770 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 702540 (0.00%)


7) Standardizing SMILES: 100%|██████████| 702540/702540 [01:34<00:00, 7415.33it/s]
8) Generating InChIKeys: 100%|██████████| 702540/702540 [02:31<00:00, 4624.84it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Dataset: oneADMET_LR-STL---pK$_{i}$ CXCR3 (HUMAN).parquet
Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 1212903/1212903 [00:06<00:00, 178033.22it/s]
4) Augmenting data: 100%|██████████| 81/81 [00:00<00:00, 172.23it/s]
5) Updating predicted rows: 100%|██████████| 5400/5400 [00:02<00:00, 1852.08it/s]


⚠️  Empty product sets in 0 / 5400 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 114744 (0.00%)


7) Standardizing SMILES: 100%|██████████| 114744/114744 [00:13<00:00, 8272.49it/s]
8) Generating InChIKeys: 100%|██████████| 114744/114744 [00:22<00:00, 5095.60it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 1378630/1378630 [00:07<00:00, 173944.98it/s]
4) Augmenting data: 100%|██████████| 136/136 [00:00<00:00, 173.45it/s]
5) Updating predicted rows: 100%|██████████| 8806/8806 [00:04<00:00, 1846.66it/s]


⚠️  Empty product sets in 0 / 8806 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 131302 (0.00%)


7) Standardizing SMILES: 100%|██████████| 131302/131302 [00:15<00:00, 8278.66it/s]
8) Generating InChIKeys: 100%|██████████| 131302/131302 [00:25<00:00, 5063.78it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 1339066/1339066 [00:07<00:00, 175005.43it/s]
4) Augmenting data: 100%|██████████| 152/152 [00:00<00:00, 185.02it/s]
5) Updating predicted rows: 100%|██████████| 8657/8657 [00:05<00:00, 1632.45it/s]


⚠️  Empty product sets in 0 / 8657 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 125988 (0.00%)


7) Standardizing SMILES: 100%|██████████| 125988/125988 [00:14<00:00, 8400.88it/s]
8) Generating InChIKeys: 100%|██████████| 125988/125988 [00:24<00:00, 5195.81it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 1386945/1386945 [00:07<00:00, 174116.23it/s]
4) Augmenting data: 100%|██████████| 151/151 [00:00<00:00, 179.27it/s]
5) Updating predicted rows: 100%|██████████| 8923/8923 [00:05<00:00, 1654.59it/s]


⚠️  Empty product sets in 0 / 8923 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 137668 (0.00%)


7) Standardizing SMILES: 100%|██████████| 137668/137668 [00:16<00:00, 8454.80it/s]
8) Generating InChIKeys: 100%|██████████| 137668/137668 [00:26<00:00, 5163.95it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'C', 'D', 'E', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 1130256/1130256 [00:06<00:00, 173280.39it/s]
4) Augmenting data: 100%|██████████| 79/79 [00:00<00:00, 236.86it/s]
5) Updating predicted rows: 100%|██████████| 4471/4471 [00:03<00:00, 1456.17it/s]


⚠️  Empty product sets in 0 / 4471 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 122870 (0.00%)


7) Standardizing SMILES: 100%|██████████| 122870/122870 [00:14<00:00, 8330.15it/s]
8) Generating InChIKeys: 100%|██████████| 122870/122870 [00:23<00:00, 5237.58it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'C', 'D', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 1200475/1200475 [00:06<00:00, 180542.52it/s]
4) Augmenting data: 100%|██████████| 88/88 [00:00<00:00, 253.91it/s]
5) Updating predicted rows: 100%|██████████| 4859/4859 [00:03<00:00, 1599.26it/s]


⚠️  Empty product sets in 0 / 4859 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 118140 (0.00%)


7) Standardizing SMILES: 100%|██████████| 118140/118140 [00:14<00:00, 8333.97it/s]
8) Generating InChIKeys: 100%|██████████| 118140/118140 [00:22<00:00, 5163.36it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'C', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 1326006/1326006 [00:07<00:00, 169306.66it/s]
4) Augmenting data: 100%|██████████| 129/129 [00:00<00:00, 171.58it/s]
5) Updating predicted rows: 100%|██████████| 7756/7756 [00:04<00:00, 1721.04it/s]


⚠️  Empty product sets in 0 / 7756 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 135440 (0.00%)


7) Standardizing SMILES: 100%|██████████| 135440/135440 [00:15<00:00, 8534.67it/s]
8) Generating InChIKeys: 100%|██████████| 135440/135440 [00:25<00:00, 5211.10it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 1339066/1339066 [00:07<00:00, 175284.16it/s]
4) Augmenting data: 100%|██████████| 155/155 [00:00<00:00, 193.65it/s]
5) Updating predicted rows: 100%|██████████| 9351/9351 [00:06<00:00, 1490.64it/s]


⚠️  Empty product sets in 0 / 9351 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 131506 (0.00%)


7) Standardizing SMILES: 100%|██████████| 131506/131506 [00:15<00:00, 8584.14it/s]
8) Generating InChIKeys: 100%|██████████| 131506/131506 [00:24<00:00, 5303.71it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 1340703/1340703 [00:07<00:00, 173377.61it/s]
4) Augmenting data: 100%|██████████| 149/149 [00:00<00:00, 196.24it/s]
5) Updating predicted rows: 100%|██████████| 8136/8136 [00:04<00:00, 1766.73it/s]


⚠️  Empty product sets in 0 / 8136 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 130970 (0.00%)


7) Standardizing SMILES: 100%|██████████| 130970/130970 [00:15<00:00, 8495.15it/s]
8) Generating InChIKeys: 100%|██████████| 130970/130970 [00:24<00:00, 5407.54it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 1445850/1445850 [00:06<00:00, 210316.14it/s]
4) Augmenting data: 100%|██████████| 105/105 [00:00<00:00, 179.67it/s]
5) Updating predicted rows: 100%|██████████| 7254/7254 [00:04<00:00, 1695.51it/s]


⚠️  Empty product sets in 0 / 7254 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 130644 (0.00%)


7) Standardizing SMILES: 100%|██████████| 130644/130644 [00:17<00:00, 7609.51it/s]
8) Generating InChIKeys: 100%|██████████| 130644/130644 [00:33<00:00, 3908.93it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Dataset: oneADMET_LR-STL---pIC$_{50}$ AMPN (HUMAN).parquet
Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 2454220/2454220 [00:14<00:00, 174915.69it/s]
4) Augmenting data: 100%|██████████| 55/55 [00:00<00:00, 65.36it/s]
5) Updating predicted rows: 100%|██████████| 7488/7488 [00:05<00:00, 1345.44it/s]


⚠️  Empty product sets in 0 / 7488 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 140198 (0.00%)


7) Standardizing SMILES: 100%|██████████| 140198/140198 [00:15<00:00, 8974.93it/s]
8) Generating InChIKeys: 100%|██████████| 140198/140198 [00:25<00:00, 5541.53it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 2346861/2346861 [00:15<00:00, 154200.64it/s]
4) Augmenting data: 100%|██████████| 63/63 [00:01<00:00, 51.62it/s]
5) Updating predicted rows: 100%|██████████| 9253/9253 [00:06<00:00, 1388.23it/s]


⚠️  Empty product sets in 0 / 9253 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 165374 (0.00%)


7) Standardizing SMILES: 100%|██████████| 165374/165374 [00:11<00:00, 14371.54it/s]
8) Generating InChIKeys: 100%|██████████| 165374/165374 [00:20<00:00, 8005.23it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 2269515/2269515 [00:09<00:00, 228948.60it/s]
4) Augmenting data: 100%|██████████| 57/57 [00:01<00:00, 43.66it/s]
5) Updating predicted rows: 100%|██████████| 10165/10165 [00:04<00:00, 2454.67it/s]


⚠️  Empty product sets in 0 / 10165 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 169136 (0.00%)


7) Standardizing SMILES: 100%|██████████| 169136/169136 [00:11<00:00, 14153.27it/s]
8) Generating InChIKeys: 100%|██████████| 169136/169136 [00:21<00:00, 7762.66it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 2077741/2077741 [00:09<00:00, 209439.16it/s]
4) Augmenting data: 100%|██████████| 66/66 [00:01<00:00, 56.76it/s]
5) Updating predicted rows: 100%|██████████| 9749/9749 [00:03<00:00, 2513.94it/s]


⚠️  Empty product sets in 0 / 9749 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 151284 (0.00%)


7) Standardizing SMILES: 100%|██████████| 151284/151284 [00:11<00:00, 13729.72it/s]
8) Generating InChIKeys: 100%|██████████| 151284/151284 [00:18<00:00, 8102.33it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

Train set: ['A', 'B', 'C', 'D', 'E', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 2383836/2383836 [00:10<00:00, 230366.43it/s]
4) Augmenting data: 100%|██████████| 69/69 [00:01<00:00, 44.90it/s]
5) Updating predicted rows: 100%|██████████| 11570/11570 [00:04<00:00, 2481.39it/s]


⚠️  Empty product sets in 0 / 11570 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 175810 (0.00%)


7) Standardizing SMILES: 100%|██████████| 175810/175810 [00:12<00:00, 14533.06it/s]
8) Generating InChIKeys: 100%|██████████| 175810/175810 [00:22<00:00, 7754.61it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

Train set: ['A', 'B', 'C', 'D', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 2288730/2288730 [00:10<00:00, 228857.67it/s]
4) Augmenting data: 100%|██████████| 49/49 [00:01<00:00, 40.09it/s]
5) Updating predicted rows: 100%|██████████| 9287/9287 [00:04<00:00, 2311.88it/s]


⚠️  Empty product sets in 0 / 9287 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 155672 (0.00%)


7) Standardizing SMILES: 100%|██████████| 155672/155672 [00:10<00:00, 14161.58it/s]
8) Generating InChIKeys: 100%|██████████| 155672/155672 [00:19<00:00, 7908.50it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

Train set: ['A', 'B', 'C', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 2545896/2545896 [00:11<00:00, 226644.43it/s]
4) Augmenting data: 100%|██████████| 45/45 [00:01<00:00, 43.54it/s]
5) Updating predicted rows: 100%|██████████| 7376/7376 [00:03<00:00, 2249.77it/s]


⚠️  Empty product sets in 0 / 7376 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 164356 (0.00%)


7) Standardizing SMILES: 100%|██████████| 164356/164356 [00:11<00:00, 13883.24it/s]
8) Generating InChIKeys: 100%|██████████| 164356/164356 [00:20<00:00, 7954.55it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

Train set: ['A', 'B', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 2634660/2634660 [00:11<00:00, 221530.53it/s]
4) Augmenting data: 100%|██████████| 65/65 [00:01<00:00, 42.60it/s]
5) Updating predicted rows: 100%|██████████| 10383/

⚠️  Empty product sets in 0 / 10383 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 184636 (0.00%)


7) Standardizing SMILES: 100%|██████████| 184636/184636 [00:13<00:00, 14093.60it/s]
8) Generating InChIKeys: 100%|██████████| 184636/184636 [00:23<00:00, 7971.36it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

Train set: ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 2143485/2143485 [00:09<00:00, 229102.57it/s]
4) Augmenting data: 100%|██████████| 37/37 [00:00<00:00, 112.78it/s]
5) Updating predicted rows: 100%|██████████| 4475/4475 [00:01<00:00, 3146.12it/s]


⚠️  Empty product sets in 0 / 4475 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 134640 (0.00%)


7) Standardizing SMILES: 100%|██████████| 134640/134640 [00:09<00:00, 14138.99it/s]
8) Generating InChIKeys: 100%|██████████| 134640/134640 [00:17<00:00, 7786.42it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

Train set: ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
3) Computing pairwise correlations: 100%|██████████| 2618616/2618616 [00:11<00:00, 222903.85it/s]
4) Augmenting data: 100%|██████████| 57/57 [00:01<00:00, 37.2

⚠️  Empty product sets in 0 / 10786 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 183982 (0.00%)


7) Standardizing SMILES: 100%|██████████| 183982/183982 [00:13<00:00, 14104.14it/s]
8) Generating InChIKeys: 100%|██████████| 183982/183982 [00:23<00:00, 7782.48it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docu

Dataset: oneADMET_LR-STL---pIC$_{50}$ ANM6 (HUMAN).parquet
Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 344865/344865 [00:01<00:00, 255845.38it/s]
4) Augmenting data: 100%|██████████| 2/2 [00:00<00:00, 135.12it/s]
5) Updating predicted rows: 100%|██████████| 104/104 [00:00<00:00, 10406.46it/s]


⚠️  Empty product sets in 0 / 104 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 36896 (0.00%)


7) Standardizing SMILES: 100%|██████████| 36896/36896 [00:02<00:00, 13117.85it/s]
8) Generating InChIKeys: 100%|██████████| 36896/36896 [00:04<00:00, 7816.54it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 350703/350703 [00:01<00:00, 257444.73it/s]
4) Augmenting data: 100%|██████████| 1/1 [00:00<00:00, 328.73it/s]
5) Updating predicted rows: 100%|██████████| 52/52 [00:00<00:00, 7684.31it/s]


⚠️  Empty product sets in 0 / 52 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 37252 (0.00%)


7) Standardizing SMILES: 100%|██████████| 37252/37252 [00:02<00:00, 12987.14it/s]
8) Generating InChIKeys: 100%|██████████| 37252/37252 [00:05<00:00, 7425.82it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 290703/290703 [00:01<00:00, 237204.39it/s]
4) Augmenting data: 100%|██████████| 2/2 [00:00<00:00, 534.75it/s]
5) Updating predicted rows: 100%|██████████| 80/80 [00:00<00:00, 11044.91it/s]


⚠️  Empty product sets in 0 / 80 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 32676 (0.00%)


7) Standardizing SMILES: 100%|██████████| 32676/32676 [00:02<00:00, 12825.76it/s]
8) Generating InChIKeys: 100%|██████████| 32676/32676 [00:04<00:00, 7476.42it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 287661/287661 [00:01<00:00, 255764.58it/s]
4) Augmenting data: 100%|██████████| 2/2 [00:00<00:00, 348.02it/s]
5) Updating predicted rows: 100%|██████████| 122/122 [00:00<00:00, 10106.96it/s]


⚠️  Empty product sets in 0 / 122 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 37012 (0.00%)


7) Standardizing SMILES: 100%|██████████| 37012/37012 [00:02<00:00, 12642.33it/s]
8) Generating InChIKeys: 100%|██████████| 37012/37012 [00:05<00:00, 7397.09it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Train set: ['A', 'B', 'C', 'D', 'E', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 370230/370230 [00:01<00:00, 252277.39it/s]
4) Augmenting data: 100%|██████████| 2/2 [00:00<00:00, 359.33it/s]
5) Updating predicted rows: 100%|██████████| 104/104 [00:00<00:00, 10170.38it/s]


⚠️  Empty product sets in 0 / 104 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 37748 (0.00%)


7) Standardizing SMILES: 100%|██████████| 37748/37748 [00:02<00:00, 12939.46it/s]
8) Generating InChIKeys: 100%|██████████| 37748/37748 [00:05<00:00, 7512.13it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Train set: ['A', 'B', 'C', 'D', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 316410/316410 [00:01<00:00, 254673.37it/s]
4) Augmenting data: 100%|██████████| 1/1 [00:00<00:00, 323.11it/s]
5) Updating predicted rows: 100%|██████████| 52/52 [00:00<00:00, 5582.96it/s]


⚠️  Empty product sets in 0 / 52 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 37164 (0.00%)


7) Standardizing SMILES: 100%|██████████| 37164/37164 [00:02<00:00, 13051.35it/s]
8) Generating InChIKeys: 100%|██████████| 37164/37164 [00:04<00:00, 7504.84it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Train set: ['A', 'B', 'C', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 389403/389403 [00:01<00:00, 257310.06it/s]
4) Augmenting data: 100%|██████████| 2/2 [00:00<00:00, 335.92it/s]
5) Updating predicted rows: 100%|██████████| 104/104 [00:00<00:00, 9895.59it/s]


⚠️  Empty product sets in 0 / 104 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 39700 (0.00%)


7) Standardizing SMILES: 100%|██████████| 39700/39700 [00:03<00:00, 12910.16it/s]
8) Generating InChIKeys: 100%|██████████| 39700/39700 [00:05<00:00, 7439.05it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Train set: ['A', 'B', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 344865/344865 [00:01<00:00, 242913.28it/s]
4) Augmenting data: 100%|██████████| 3/3 [00:00<00:00, 527.63it/s]
5) Updating predicted rows: 100%|██████████| 132/132 [00:00<00:00, 10141.38it/s]


⚠️  Empty product sets in 0 / 132 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 37300 (0.00%)


7) Standardizing SMILES: 100%|██████████| 37300/37300 [00:02<00:00, 12816.25it/s]
8) Generating InChIKeys: 100%|██████████| 37300/37300 [00:05<00:00, 7381.56it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Train set: ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 370230/370230 [00:01<00:00, 248478.50it/s]
4) Augmenting data: 100%|██████████| 1/1 [00:00<00:00, 381.68it/s]
5) Updating predicted rows: 100%|██████████| 48/48 [00:00<00:00, 7319.37it/s]


⚠️  Empty product sets in 0 / 48 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 42484 (0.00%)


7) Standardizing SMILES: 100%|██████████| 42484/42484 [00:03<00:00, 12863.91it/s]
8) Generating InChIKeys: 100%|██████████| 42484/42484 [00:05<00:00, 7236.62it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Train set: ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


3) Computing pairwise correlations: 100%|██████████| 279378/279378 [00:01<00:00, 252260.68it/s]
4) Augmenting data: 100%|██████████| 3/3 [00:00<00:00, 406.94it/s]
5) Updating predicted rows: 100%|██████████| 141/141 [00:00<00:00, 11470.74it/s]


⚠️  Empty product sets in 0 / 141 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 34384 (0.00%)


7) Standardizing SMILES: 100%|██████████| 34384/34384 [00:02<00:00, 13046.82it/s]
8) Generating InChIKeys: 100%|██████████| 34384/34384 [00:04<00:00, 7713.06it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

Dataset: oneADMET_LR-STL---pIC$_{50}$ MP2K1 (HUMAN).parquet
Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
0) Fragments generation
1) Indexing
2) Canonical SMIRKS generation


  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pearsonr(y1, y2)[0]
  corr = pears

⚠️  Empty product sets in 0 / 69584 (0.00%) of transformations
6) Preparing output file
Invalid SMILES removed: 0 / 700330 (0.00%)


7) Standardizing SMILES: 100%|██████████| 700330/700330 [01:16<00:00, 9183.89it/s]
8) Generating InChIKeys: 100%|██████████| 700330/700330 [02:13<00:00, 5228.60it/s]
  output_df = pd.concat(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['STD'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_rows['AUG'] = False
  full_aug_df = pd.concat([output_df, missing_rows], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Train set: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J']
0) Fragments generation


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Collect the logged results (result_log is built earlier in the notebook)
# into a single table.
df_perf = pd.DataFrame(result_log)

# R² against training-set size: one point cloud per training regime, both
# drawn onto the same axes.
series_specs = (
    ("N_train_noaug", "R2_noaug", "No Aug"),
    ("N_train_aug", "R2_aug", "Aug"),
)
for size_col, score_col, legend_name in series_specs:
    sns.scatterplot(data=df_perf, x=size_col, y=score_col, label=legend_name)

scatter_axes = plt.gca()
scatter_axes.set_xlabel("N_train")
scatter_axes.set_ylabel("R²")
scatter_axes.set_title("Scatterplot: R² vs N_train")
scatter_axes.legend()
scatter_axes.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Relative R² change brought by augmentation, in percent of the
# non-augmented score.
# NOTE(review): when R2_noaug is negative the sign of this ratio flips, and a
# zero baseline yields +/-inf -- confirm this is acceptable for the analysis.
df_perf["R2_diff"] = 100 * (df_perf["R2_aug"] - df_perf["R2_noaug"]) / df_perf["R2_noaug"]

# Keep only rows whose relative change is above -100 %.  Take an explicit copy
# so that later cells can add or overwrite columns on df_perf_sel without
# pandas raising SettingWithCopyWarning or touching df_perf.
df_perf_sel = df_perf[df_perf["R2_diff"] > -100].copy()

# Histogram with a KDE overlay; the dashed red line marks "no change".
plt.figure(figsize=(10, 6))
sns.histplot(df_perf_sel["R2_diff"], kde=True, bins=100, color='skyblue', edgecolor='black')
plt.axvline(0, color='red', linestyle='--', linewidth=2)
plt.title("Distribution of Relative R² Difference (Aug - NoAug) / NoAug")
# The plotted values were multiplied by 100 above, so the axis is in percent.
plt.xlabel("Relative R² Difference (%)")
# histplot's default statistic is a per-bin count, not a density.
plt.ylabel("Count")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:

# Scatterplot: training-set size without augmentation vs. with augmentation.
# Each point carries both sizes, so the axes are labelled explicitly instead
# of attaching a single "No Aug" legend entry to the whole cloud.
sns.scatterplot(x="N_train_noaug", y="N_train_aug", data=df_perf)
plt.xlabel("N_train (No Aug)")
plt.ylabel("N_train (Aug)")
plt.tight_layout()
plt.show()


In [None]:
# Boxplot: R2 by Augmentation
# Stack the two R² columns into one long column and tag every value with the
# training regime it belongs to, so seaborn can draw one box per regime.
scores_noaug = df_perf["R2_noaug"].tolist()
scores_aug = df_perf["R2_aug"].tolist()
regime_tags = ["No Aug"] * len(scores_noaug) + ["Aug"] * len(scores_aug)
df_box = pd.DataFrame({"R2": scores_noaug + scores_aug, "Augmentation": regime_tags})

box_axes = sns.boxplot(data=df_box, x="Augmentation", y="R2")
box_axes.set_title("Boxplot: R² No Aug vs Aug")
box_axes.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# df_perf_sel was obtained by boolean-filtering df_perf; detach it before
# mutating so the column assignments below neither trigger pandas'
# SettingWithCopyWarning nor depend on view-vs-copy behaviour.
df_perf_sel = df_perf_sel.copy()

# Fix column names if necessary
df_perf_sel.columns = df_perf_sel.columns.str.strip()

# Relative R² difference as a fraction of the non-augmented score (this
# overwrites any R2_diff column already present on df_perf_sel).
df_perf_sel["R2_diff"] = (df_perf_sel["R2_aug"] - df_perf_sel["R2_noaug"]) / df_perf_sel["R2_noaug"]

# Dataset size used for the annotations: the non-augmented training-set size.
df_perf_sel["Size"] = df_perf_sel["N_train_noaug"]

# Prepare DataFrame for plotting
df_r2diff = df_perf_sel[["Dataset", "R2_diff", "Size"]]

# Fix the category order once (order of first appearance) and hand it to
# seaborn explicitly, so box positions and annotation positions are
# guaranteed to agree.
dataset_order = list(df_r2diff["Dataset"].unique())
xpos_by_dataset = {name: pos for pos, name in enumerate(dataset_order)}

# Plot boxplot of R² difference with size annotations
plt.figure(figsize=(14, 10))
ax = sns.boxplot(x="Dataset", y="R2_diff", data=df_r2diff, order=dataset_order)
plt.title("Boxplot: Relative R² Difference by Dataset (Aug - NoAug) / NoAug")
plt.xticks(rotation=90)
plt.axhline(0, color='red', linestyle='--', linewidth=2)
plt.grid(True)

# Annotate each box with the mean size of its dataset, just above y = 0.
grouped = df_r2diff.groupby("Dataset")["Size"].mean().reset_index()
for row in grouped.itertuples(index=False):
    ax.text(xpos_by_dataset[row.Dataset], 0.01, f"n={int(row.Size)}",
            ha='center', va='bottom', fontsize=9, color='black', rotation=90)

plt.tight_layout()
plt.show()

In [None]:
# Fix column names if necessary
df_perf.columns = df_perf.columns.str.strip()

# Total dataset size per regime: training rows plus test rows.
df_perf["Size_noaug"] = df_perf["N_train_noaug"] + df_perf["N_test"]
df_perf["Size_aug"] = df_perf["N_train_aug"] + df_perf["N_test"]

# Rename for melt (rename returns a new frame, so df_perf is not modified by
# the helper column added below).
df_rename = df_perf.rename(columns={
    "R2_noaug": "No Aug", "R2_aug": "Aug",
    "Size_noaug": "No Aug Size", "Size_aug": "Aug Size"
})

# Tag every result row with a unique key.  A dataset can appear in several
# rows, so joining the two melted frames on (Dataset, Augmentation) alone
# would be a many-to-many merge that pairs every R² with every size of the
# same dataset and multiplies the rows fed to the boxplot.
df_rename["_row"] = range(len(df_rename))
id_cols = ["_row", "Dataset"]

# Melt R² values
df_r2 = df_rename.melt(id_vars=id_cols, value_vars=["No Aug", "Aug"],
                       var_name="Augmentation", value_name="R2")

# Melt sizes
df_size = df_rename.melt(id_vars=id_cols, value_vars=["No Aug Size", "Aug Size"],
                         var_name="Augmentation", value_name="Size")
df_size["Augmentation"] = df_size["Augmentation"].map({
    "No Aug Size": "No Aug",
    "Aug Size": "Aug"
})

# Merge R² and Size row by row; validate makes pandas raise if the join is
# ever not one-to-one.
df_melt = pd.merge(df_r2, df_size, on=id_cols + ["Augmentation"],
                   validate="one_to_one")

# Fix category and hue order explicitly so the annotation positions computed
# below are guaranteed to match where seaborn draws each box.
dataset_order = list(df_melt["Dataset"].unique())
xpos_by_dataset = {name: pos for pos, name in enumerate(dataset_order)}

# Plot boxplot with R² as Y, add size annotations
plt.figure(figsize=(14, 10))
ax = sns.boxplot(x="Dataset", y="R2", hue="Augmentation", data=df_melt,
                 order=dataset_order, hue_order=["No Aug", "Aug"])
plt.title("Boxplot: R² by Dataset and Augmentation")
plt.xticks(rotation=90)
plt.grid(True)

# Annotate each box with the mean size of its (dataset, regime) group; the
# +/-0.2 offset shifts the label toward the left ("No Aug") or right ("Aug")
# box of the pair.
grouped = df_melt.groupby(["Dataset", "Augmentation"])["Size"].mean().reset_index()
for row in grouped.itertuples(index=False):
    offset = -0.2 if row.Augmentation == "No Aug" else 0.2
    ax.text(xpos_by_dataset[row.Dataset] + offset, 1.01, f"n={int(row.Size)}",
            ha='center', va='bottom', fontsize=9, color='black', rotation=90)

plt.tight_layout()
plt.show()