### Random Split

In [None]:
import pickle
from rdkit import Chem
from data_prepare import *
import pandas as pd

# Build the ten random-split datasets and pickle them as one list.
#
# Steps:
#   1. Load the raw descriptor table for the Denmark data.
#   2. Build an InChI -> label lookup from the catalyst/imine/thiol CSV files.
#   3. For each of the ten "FullCV_XX" sheets, translate every reaction row
#      into a "<catalyst>_<imine>_<thiol>" label string, use the first 600
#      rows as training combinations and the remainder as test combinations,
#      and pass both to load_data.
#   4. Dump the list of ten results to ./RandomSplit/random_split.pickle.

# Load data
descriptors = ["ASO", "ESP", "NBO"]
Denmark_data = load_raw_data(descriptors, origin="Denmark")

catalysts = pd.read_csv("../ASO/Catalysts/catalysts.csv")
imines = pd.read_csv("../ASO/Imines/imines.csv")
thiols = pd.read_csv("../ASO/Thiols/thiols.csv")


def _smiles_to_inchi(smiles):
    """Return the InChI string RDKit produces for the given SMILES string."""
    return Chem.MolToInchi(Chem.MolFromSmiles(smiles))


# InChI -> label (as str) for every row of the three tables; rows whose label
# is "ref" are left out of the lookup.
d = {}
for df in [catalysts, imines, thiols]:
    d.update({
        _smiles_to_inchi(smi): str(label)
        for smi, label in zip(df["smiles"], df["label"])
        if label != "ref"
    })

datas = []
for split_idx in range(10):
    # Sheets are named FullCV_01 ... FullCV_10.
    ori_data = pd.read_excel("Denmark_data_original.xlsx", sheet_name="FullCV_{:02}".format(split_idx + 1))
    reactions = []
    # The "Output" column is iterated alongside the others but its value is
    # not needed to build the combination label.
    for cat_smi, imi_smi, thi_smi, _output in zip(
        ori_data["Catalyst"], ori_data["Imine"], ori_data["Thiol"], ori_data["Output"]
    ):
        cat = _smiles_to_inchi(cat_smi)
        imi = _smiles_to_inchi(imi_smi)
        thi = _smiles_to_inchi(thi_smi)
        # A catalyst whose InChI is not in the lookup is labelled "328_vi".
        # NOTE(review): presumably a single known catalyst that fails to match
        # catalysts.csv -- confirm. An unknown imine or thiol still raises
        # KeyError, exactly as before.
        cat_label = d.get(cat, "328_vi")
        reactions.append(f"{cat_label}_{d[imi]}_{d[thi]}")
    # First 600 rows of the sheet are the training set, the rest the test set.
    train_comb = reactions[:600]
    test_comb = reactions[600:]
    data = load_data(Denmark_data, train_comb, test_comb, feature_selection="mutual_info_regression", n_features=0.25)
    datas.append(data)
with open("./RandomSplit/random_split.pickle", "wb") as f:
    pickle.dump(datas, f)

### Combination Split

In [None]:
import os
import pickle
from data_prepare import *

# Build the combination-split datasets (one per test set returned by
# split_combinations) and pickle them together as a dict.

# Load data
print("Loading Data...")
descriptors = ["ASO", "ESP", "NBO"]
Denmark_data = load_raw_data(descriptors, origin="Denmark", save_to_file="DATA_Denmark.csv")
print("Completed!")

train_combs, test_combs = split_combinations(mode="combination")
test_cat, test_subs, test_catsubs = test_combs

# Result key -> (test combinations, progress message printed once it is done).
# Insertion order fixes the order in which the sub-datasets are built.
held_out_sets = {
    "cat_data": (test_cat, "Test Catalysts Completed!"),
    "subs_data": (test_subs, "Test Substrates Completed!"),
    "catsubs_data": (test_catsubs, "Test Cat/Subs Completed!"),
}

print("Loading sub-dataset...")
datas = {}
for key, (test_set, done_message) in held_out_sets.items():
    datas[key] = load_data(
        Denmark_data,
        train_combs,
        test_set,
        feature_selection="mutual_info_regression",
        n_features=0.25,
    )
    print(done_message)

# Keep the individual results available under their usual names.
cat_data = datas["cat_data"]
subs_data = datas["subs_data"]
catsubs_data = datas["catsubs_data"]

file = "./CombinationSplit/combination_split.pickle"
with open(file, "wb") as f:
    pickle.dump(datas, f)
print(f"Store Data to {os.path.abspath(file)}")

### JACS Case Study 1

In [None]:
from data_prepare import *
import pickle

# Build the ASO + ESPMAX datasets for the three combination-split test sets
# (f_regression feature selection, n_features=500) and pickle them as a dict.
train_combs, test_combs = split_combinations(mode="combination")
test_cat, test_subs, test_catsubs = test_combs

ASO_ESPMAX_data = load_raw_data(descriptors=["ASO", "ESPMAX"])

# Identical feature-selection settings are used for every test set.
selection_kwargs = dict(feature_selection="f_regression", n_features=500)
cat_data = load_data(ASO_ESPMAX_data, train_combs, test_cat, **selection_kwargs)
subs_data = load_data(ASO_ESPMAX_data, train_combs, test_subs, **selection_kwargs)
catsubs_data = load_data(ASO_ESPMAX_data, train_combs, test_catsubs, **selection_kwargs)

datas = {
    "cat_data": cat_data,
    "subs_data": subs_data,
    "catsubs_data": catsubs_data,
}

with open("./JACS-CaseStudy1/ASO_ESPMAX_FREG500.pickle", "wb") as f:
    pickle.dump(datas, f)

In [4]:
# Build the SIF + ESPMAX datasets for the three test sets and pickle them.
# train_combs and the test_* sets are reused from the cell that ran
# split_combinations earlier in this notebook.
SIF_ESPMAX_data = load_raw_data(descriptors=["SIF", "ESPMAX"])

datas = {
    key: load_data(
        SIF_ESPMAX_data,
        train_combs,
        test_set,
        feature_selection="f_regression",
        n_features=500,
    )
    for key, test_set in (
        ("cat_data", test_cat),
        ("subs_data", test_subs),
        ("catsubs_data", test_catsubs),
    )
}

# Expose the individual results under the same names as the other cells.
cat_data = datas["cat_data"]
subs_data = datas["subs_data"]
catsubs_data = datas["catsubs_data"]

with open("./JACS-CaseStudy1/SIF_ESPMAX_FREG500.pickle", "wb") as f:
    pickle.dump(datas, f)

  n_samples * X_means ** 2)
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)
  n_samples * X_means ** 2)
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)
  n_samples * X_means ** 2)
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)
