In [1]:
import pandas as pd
from skmultilearn.model_selection import IterativeStratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from rdkit import Chem
from tqdm import tqdm
import json
import os

GSLF

In [2]:
# Create the "gslf" output directory that split() writes into; exist_ok makes
# re-running this cell a no-op when the directory is already there.
os.makedirs("gslf", exist_ok=True)

In [3]:
def _odor_distribution(label_names, parts):
    """Build a table with one row per label and one sum column per named part.

    ``parts`` maps a column name (e.g. "train") to a DataFrame whose columns
    after the first are summed.
    """
    dist = pd.DataFrame()
    dist["odor"] = label_names
    for part_name, part in parts.items():
        dist[part_name] = part.iloc[:, 1:].sum().values
    return dist


def split(df, out_dir="gslf"):
    """Write a stratified train/test split of ``df`` plus 5 cross-validation folds.

    The "IsomericSMILES" column is handed to the splitters as the sample array
    and every column after the first is used as the multilabel target matrix,
    so ``df`` is assumed to hold the SMILES in its first column followed only
    by label columns.

    Files written (all without the index):
      * ``{out_dir}/train.csv`` and ``{out_dir}/test.csv``
      * ``{out_dir}/odor_distribution.csv`` -- per-label sums for train/test
      * ``{out_dir}/fold{1..5}/train.csv``, ``valid.csv`` and
        ``odor_distribution.csv`` -- the same for each fold of the train part

    The sizes of every split are printed, and for each fold the number of
    labels whose sum is zero in the train part and in the valid part.

    Args:
        df: DataFrame with an "IsomericSMILES" column followed by label columns.
        out_dir: Directory to write into; created if missing. Defaults to
            "gslf", the location used before this parameter existed.
    """
    os.makedirs(out_dir, exist_ok=True)

    # Train:Test = 80:20. IterativeStratification yields one (train, test)
    # index pair per fold; the *test* indices of fold 0 and fold 1 are the
    # 80% and 20% partitions requested via sample_distribution_per_fold.
    # NOTE(review): no random_state is passed here, so this split is presumably
    # not reproducible between runs -- confirm before relying on fixed splits.
    stf = IterativeStratification(
        n_splits=2, order=2, sample_distribution_per_fold=[0.8, 0.2]
    )
    partitions = list(stf.split(df["IsomericSMILES"].values, df.iloc[:, 1:].values))

    df_train = df.iloc[partitions[0][1]]
    df_test = df.iloc[partitions[1][1]]

    print(len(df_train), len(df_test))

    df_train.to_csv(os.path.join(out_dir, "train.csv"), index=False)
    df_test.to_csv(os.path.join(out_dir, "test.csv"), index=False)

    label_names = df_train.columns[1:]
    _odor_distribution(label_names, {"train": df_train, "test": df_test}).to_csv(
        os.path.join(out_dir, "odor_distribution.csv"), index=False
    )

    # 5-fold cross validation on the train part (Train:Valid = 80:20 of it).
    stf_tvt = MultilabelStratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    fold_indices = stf_tvt.split(
        df_train["IsomericSMILES"].values, df_train.iloc[:, 1:].values
    )
    for fold_no, (train, valid) in enumerate(fold_indices, start=1):
        df_train_data = df_train.iloc[train]
        df_valid_data = df_train.iloc[valid]

        fold_dir = os.path.join(out_dir, f"fold{fold_no}")
        os.makedirs(fold_dir, exist_ok=True)

        df_train_data.to_csv(os.path.join(fold_dir, "train.csv"), index=False)
        df_valid_data.to_csv(os.path.join(fold_dir, "valid.csv"), index=False)
        print(len(df_train_data), len(df_valid_data))

        df_dist = _odor_distribution(
            label_names, {"train": df_train_data, "valid": df_valid_data}
        )

        # Number of labels with no positive example in each part of the fold.
        print((df_dist["train"] == 0).sum(), (df_dist["valid"] == 0).sum())
        df_dist.to_csv(os.path.join(fold_dir, "odor_distribution.csv"), index=False)

In [4]:
# Load the curated GS-LF table without its "descriptors" column, then write the
# train/test split and CV folds. split() uses the first remaining column as the
# sample array and all later columns as labels.
df = pd.read_csv("gslf_curation/gs-lf_combined.csv").drop(columns=["descriptors"])

split(df)

3825 989
3041 784
0 0
3083 742
0 0
3067 758
0 0
3053 772
0 0
3056 769
0 0


Blend Pair

In [2]:
# Odor label vocabulary used for one-hot encoding the blend-pair descriptors.
# Alphabetical order; one line per initial letter.
labels_74 = [
    "acidic", "aldehydic", "alliaceous", "amber", "animal", "anisic", "aromatic",
    "balsamic", "berry", "bitter", "bready", "brown", "burnt", "buttery",
    "camphoreous", "caramellic", "cheesy", "chemical", "chocolate", "citrus", "clean",
    "cocoa", "coconut", "coffee", "cooling", "coumarinic", "creamy",
    "dairy",
    "earthy", "estery", "ethereal",
    "fatty", "fermented", "floral", "fresh", "fruity", "fungal", "fusel",
    "green",
    "herbal", "honey",
    "jammy",
    "licorice",
    "marine", "meaty", "medicinal", "melon", "mentholic", "mint", "mossy", "musk", "musty",
    "nutty",
    "oily", "onion", "orris",
    "phenolic", "powdery",
    "roasted", "rummy",
    "soapy", "solvent", "sour", "spicy", "sulfurous", "sweet",
    "thujonic", "tonka", "tropical",
    "vanilla", "vegetable",
    "waxy", "winey", "woody",
]

In [14]:
def isomeric2canonical(smiles):
    """Round-trip a SMILES string through RDKit.

    Returns the string produced by ``Chem.MolToSmiles(..., isomericSmiles=True)``
    for the parsed molecule, or ``None`` when ``Chem.MolFromSmiles`` returns
    ``None`` for the input.
    """
    parsed = Chem.MolFromSmiles(smiles)
    return None if parsed is None else Chem.MolToSmiles(parsed, isomericSmiles=True)


def inchi2smiles(inchi_str):
    """Convert an InChI string to SMILES with RDKit.

    Returns the string produced by ``Chem.MolToSmiles(..., isomericSmiles=True)``
    for the parsed molecule, or ``None`` when ``Chem.inchi.MolFromInchi``
    returns ``None`` for the input.
    """
    parsed = Chem.inchi.MolFromInchi(inchi_str)
    return None if parsed is None else Chem.MolToSmiles(parsed, isomericSmiles=True)


def _canonical_without_stereo(structure):
    """Return the stereo-stripped SMILES for a SMILES or InChI string, or None.

    ``None`` is returned when the structure cannot be parsed.
    """
    if structure.startswith("InChI"):
        # InChI uses "/" as its layer separator, so it has to be parsed BEFORE
        # the stereo characters are stripped; stripping first destroys the
        # identifier and the parse always fails.
        structure = inchi2smiles(structure)
        if structure is None:
            return None

    # Remove stereo markers from the SMILES text, then re-parse and re-emit it.
    for stereo_char in ("@", "/", "\\"):
        structure = structure.replace(stereo_char, "")
    return isomeric2canonical(structure)


def convert(data_list):
    """Turn blend-pair records into a two-column DataFrame.

    Each usable record contributes one row:
      * "smiles": the two stereo-stripped SMILES, sorted and joined with ";"
      * "descriptors": the record's "blend_notes" joined with ";"

    Records are skipped when they lack an "edge" or "blend_notes" key, when
    either structure fails to parse, or when both structures are identical
    after stereo removal.

    Args:
        data_list: Iterable of dicts; "edge" holds two SMILES or InChI strings
            and "blend_notes" an iterable of strings.

    Returns:
        DataFrame with columns "smiles" and "descriptors". The columns are
        present even when no record is usable, so downstream code sees a
        stable schema.
    """
    rows = []
    for item in data_list:
        if "edge" not in item or "blend_notes" not in item:
            continue

        smi1 = _canonical_without_stereo(item["edge"][0])
        smi2 = _canonical_without_stereo(item["edge"][1])
        descriptors = ";".join(item["blend_notes"])

        if smi1 and smi2 and smi1 != smi2:
            # Sort so that (A, B) and (B, A) map to the same pair key.
            smi_pair = sorted([smi1, smi2])
            rows.append({"smiles": ";".join(smi_pair), "descriptors": descriptors})
    return pd.DataFrame(rows, columns=["smiles", "descriptors"])


def one_hot_encode(row):
    """Encode a row's ";"-separated "descriptors" as a 0/1 list over ``labels_74``.

    The returned list has one entry per label in ``labels_74``, in that order:
    1 when the label is among the row's descriptors, otherwise 0.
    """
    present = set(str(row["descriptors"]).split(";"))
    return [int(odor in present) for odor in labels_74]

def apply_one_hot(df):
    """Replace the "descriptors" column by one 0/1 column per ``labels_74`` entry.

    Steps:
      1. one-hot encode every row with ``one_hot_encode``;
      2. drop rows that match none of the labels;
      3. merge rows sharing the same "smiles" value by taking the per-label
         maximum.

    Args:
        df: DataFrame with "smiles" as its first column and a "descriptors"
            column (the frame produced by ``convert``). It may be empty.

    Returns:
        DataFrame with "smiles" followed by the ``labels_74`` columns, one row
        per distinct "smiles" value.
    """
    if df.empty:
        # An empty split may arrive without any columns at all; DataFrame.apply
        # then hands back a DataFrame (no .tolist()), so short-circuit with the
        # expected output schema instead of crashing.
        return pd.DataFrame(columns=["smiles", *labels_74])

    encoded = df.apply(one_hot_encode, axis=1)
    # Reuse df's index so the column-wise concat lines rows up even if the
    # caller did not reset the index beforehand.
    df_labels = pd.DataFrame(encoded.tolist(), columns=labels_74, index=df.index)
    df_total = pd.concat([df, df_labels], axis=1).drop(columns=["descriptors"])

    # Keep only rows that carry at least one known label.
    has_label = df_total.iloc[:, 1:].sum(axis=1) > 0
    df_filtered = df_total[has_label]

    label_cols = df_filtered.columns[1:]
    return df_filtered.groupby("smiles", as_index=False)[label_cols].max()

In [15]:
# The fold JSON files are numbered from 0, the output directories from 1.
# Each pair below is (key in the JSON file, stem of the CSV written).
for i in tqdm(range(1, 6)):
    fold_dir = f"bp/fold{i}"
    os.makedirs(fold_dir, exist_ok=True)

    with open(f"bp/fold{i-1}.json", "r") as f:
        data = json.load(f)

    for json_key, csv_stem in (("train", "train"), ("validate", "valid"), ("test", "test")):
        pairs = convert(data.get(json_key, [])).dropna().reset_index(drop=True)
        apply_one_hot(pairs).to_csv(f"{fold_dir}/{csv_stem}.csv", index=False)

  0%|          | 0/5 [00:00<?, ?it/s][10:35:17] SMILES Parse Error: syntax error while parsing: (C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC
[10:35:17] SMILES Parse Error: Failed parsing SMILES '(C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC' for input: '(C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC'
[10:35:18] SMILES Parse Error: syntax error while parsing: (C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC
[10:35:18] SMILES Parse Error: Failed parsing SMILES '(C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC' for input: '(C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC'
[10:35:18] SMILES Parse Error: syntax error while parsing: (C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC
[10:35:18] SMILES Parse Error: Failed parsing SMILES '(C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC' for input: '(C)C1=CN=CC(=N1)OC.CC(C)C1=CN=C(C=N1)OC.CC(C)C1=NC=CN=C1OC'
[10:35:18] Can't kekulize mol.  Unkekulized atoms: 

Make Mixture Set

In [16]:
# Create the "mixture" output directory for the combined per-fold CSVs written
# further down; exist_ok makes re-running this cell a no-op.
os.makedirs("mixture", exist_ok=True)

In [17]:
# Load the full label vocabulary, one label per line of the file. The line
# order becomes the column order that align_labels produces.
with open("labels_152.txt", "r") as f:
    total_labels = f.read().splitlines()
    
def align_labels(df, labels=None):
    """Return ``df`` restricted to the label vocabulary, in vocabulary order.

    Labels absent from ``df`` are added as all-zero columns; columns of ``df``
    that are not in the vocabulary (e.g. "smiles") are left out of the result.
    The input frame itself is not modified.

    Args:
        df: DataFrame whose columns include some subset of the labels.
        labels: Ordered label names to align to. Defaults to the module-level
            ``total_labels``.

    Returns:
        A new DataFrame with exactly the columns ``labels``, in that order.
    """
    if labels is None:
        labels = total_labels
    # A single reindex adds every missing label at once (filled with 0) and
    # orders the columns, without inserting columns one by one into the
    # caller's frame.
    return df.reindex(columns=list(labels), fill_value=0)

In [18]:
# Combine the GS-LF single-molecule data with the blend-pair data, per fold
# and per split, into mixture/fold{i}/{train,valid,test}.csv.
for i in range(1, 6):
    # Create the fold directory once per fold rather than once per split.
    os.makedirs(f"mixture/fold{i}", exist_ok=True)

    # Named `split_name` rather than `type`: a top-level loop variable called
    # `type` would shadow the builtin for every cell run afterwards.
    for split_name in ["train", "valid", "test"]:
        df_bp = pd.read_csv(f"bp/fold{i}/{split_name}.csv")
        # GS-LF has one test file shared by all folds; only its train/valid
        # files are fold-specific.
        if split_name == "test":
            df_gslf = pd.read_csv("gslf/test.csv")
        else:
            df_gslf = pd.read_csv(f"gslf/fold{i}/{split_name}.csv")
        df_gslf = df_gslf.rename(columns={"IsomericSMILES": "smiles"})

        # Expand the blend-pair labels to the full vocabulary and put the
        # "smiles" column back in front.
        df_bp = pd.concat([df_bp["smiles"], align_labels(df_bp)], axis=1)

        # NOTE(review): df_gslf is not passed through align_labels; this
        # presumably relies on the GS-LF files already having exactly the
        # total_labels columns -- otherwise the concat fills gaps with NaN.
        df = pd.concat([df_gslf, df_bp])

        df.to_csv(f"mixture/fold{i}/{split_name}.csv", index=False)