# 0) Modules & Functions

In [10]:
import os
import pandas as pd
from rdkit import Chem
import subprocess
import hashlib
import tempfile
import base64
import numpy as np
from itertools import combinations
from scipy.stats import pearsonr
from tqdm import tqdm
import time
from collections import defaultdict
from rdkit.Chem import AllChem
from rdkit import RDLogger
import warnings
import re
from rdkit.Chem import inchi
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from rdkit.Chem import Descriptors
from rdkit import Chem
from rdkit.Chem import Draw

# Turn off RDKit's own logging for the 'rdApp.*' log names.
RDLogger.DisableLog('rdApp.*')

# Additionally ignore these Python warning categories when they are raised
# from modules matching "rdkit".
for _warning_category in (UserWarning, FutureWarning, DeprecationWarning):
    warnings.filterwarnings("ignore", category=_warning_category, module="rdkit")


from itertools import combinations
from pathlib import Path


import os
import pandas as pd
from pathlib import Path
def merge_stl_to_mtl(base_dirs, combinations_set, output_filename="MTL_merged.parquet"):
    """Merge the per-file STL parquet tables of each dataset into one MTL table.

    For every ``{base}/{combo}/STL`` directory that exists, each ``*.parquet``
    file in it is read (columns ``InChIKey``, ``SMILES``, ``Y``, ``AUG``) and
    its ``Y`` column is renamed to the file name without extension. The
    per-file tables are outer-joined on ``InChIKey`` and ``SMILES``, sorted by
    ``InChIKey`` and written to ``{base}/{combo}/MTL/{output_filename}``.

    The ``AUG`` column of the output is True only if every row seen for that
    InChIKey, across all input files, has a truthy ``AUG`` (False dominates).

    Args:
        base_dirs: Iterable of root directories to scan.
        combinations_set: Iterable of sub-directory names looked up under
            each root.
        output_filename: Name of the parquet file written into each MTL
            directory.

    Returns:
        None. Results are written to disk and one line is printed per file
        written.
    """
    wanted_cols = ["InChIKey", "SMILES", "Y", "AUG"]
    key_cols = ["InChIKey", "SMILES"]

    for base in tqdm(base_dirs, desc="Iterating directories"):
        for combo in combinations_set:
            stl_dir = f"{base}/{combo}/STL"
            mtl_dir = f"{base}/{combo}/MTL"

            if not os.path.isdir(stl_dir):
                continue

            Path(mtl_dir).mkdir(parents=True, exist_ok=True)

            # os.listdir() returns entries in arbitrary order; sorting makes the
            # task-column order of the merged table reproducible across runs.
            files = sorted(f for f in os.listdir(stl_dir) if f.endswith(".parquet"))
            if not files:
                continue

            merged_df = None
            aug_parts = []

            for file in files:
                path = os.path.join(stl_dir, file)
                # Only load the columns that are actually used.
                df = pd.read_parquet(path, columns=wanted_cols)
                col_name = os.path.splitext(file)[0]

                # Keep the (InChIKey, AUG) pairs aside; they are combined once
                # after all files have been read.
                aug_parts.append(df[["InChIKey", "AUG"]])

                # Non-mutating calls avoid SettingWithCopyWarning on the slice.
                df = df.drop(columns=["AUG"]).rename(columns={"Y": col_name})

                if merged_df is None:
                    merged_df = df
                else:
                    merged_df = pd.merge(merged_df, df, on=key_cols, how="outer")

            # Final AUG flag per InChIKey: True only if all occurrences are
            # True (False dominates).
            aug_by_key = (
                pd.concat(aug_parts, ignore_index=True)
                .groupby("InChIKey")["AUG"]
                .all()
            )
            # Keys without an entry map to NaN; comparing against True turns
            # those into False and always yields a plain bool column.
            merged_df["AUG"] = merged_df["InChIKey"].map(aug_by_key).eq(True)

            # Reorder columns: identifiers and AUG first, task columns after.
            leading = ["InChIKey", "SMILES", "AUG"]
            task_cols = [col for col in merged_df.columns if col not in leading]
            merged_df = merged_df[leading + task_cols]

            merged_df = merged_df.sort_values("InChIKey")
            output_path = os.path.join(mtl_dir, output_filename)
            merged_df.to_parquet(output_path, index=False)
            print(f"[✓] Wrote {output_path}")



# 1) Prepare the MTL data

In [None]:

# Root directories to scan and the sub-directory names looked up under each.
base_dirs = [
    "../data/noaug",
    "../data/aug",
    "../data/test",
]
combinations_set = [
    "ABCD",
    "ABCE",
    "ABDE",
    "ACDE",
    "BCDE",
    "ALL",
]

# Build one merged MTL parquet for every root/combination whose STL directory
# exists and contains parquet files.
merge_stl_to_mtl(base_dirs=base_dirs, combinations_set=combinations_set)

Iterating directories:   0%|          | 0/3 [00:00<?, ?it/s]

[✓] Wrote ../data/noaug/AB/MTL/MTL_merged.parquet
[✓] Wrote ../data/noaug/AC/MTL/MTL_merged.parquet


Iterating directories:  33%|███▎      | 1/3 [00:00<00:00,  2.56it/s]

[✓] Wrote ../data/noaug/BC/MTL/MTL_merged.parquet
[✓] Wrote ../data/aug/AB/MTL/MTL_merged.parquet


Iterating directories:  67%|██████▋   | 2/3 [00:00<00:00,  2.29it/s]

[✓] Wrote ../data/aug/AC/MTL/MTL_merged.parquet
[✓] Wrote ../data/aug/BC/MTL/MTL_merged.parquet


Iterating directories: 100%|██████████| 3/3 [00:01<00:00,  2.76it/s]

[✓] Wrote ../data/test/AB/MTL/MTL_merged.parquet
[✓] Wrote ../data/test/AC/MTL/MTL_merged.parquet
[✓] Wrote ../data/test/BC/MTL/MTL_merged.parquet



