In [1]:
import pickle
from collections import Counter
from random import shuffle, seed

import numpy as np

In [2]:
# Seed the stdlib RNG so the shuffle below gives the same order on every run.
seed(13)

# NOTE(review): pickle.load can execute arbitrary code; only open this file
# if it comes from a trusted source.
with open("Cleaner NIST Dataset.pickle", "rb") as f:
    d = pickle.load(f)
smiles, sequences = d["smiles"], d["sequences"]
# Pair every SMILES entry with its sequence, then randomize the pair order.
dataset = [*zip(smiles, sequences)]
shuffle(dataset)

# Separate compounds that occur once from compounds that occur several times,
# so that repeats of one compound are never distributed across folds.
# Counting once up front avoids rescanning the whole SMILES list per molecule.
# Assumes the SMILES entries are hashable (e.g. strings) -- TODO confirm.
N_FOLDS = 5
smiles_counts = Counter(d["smiles"])
single_occurrence_molecules = [x for x in dataset if smiles_counts[x[0]] <= 1]
multiple_occurrence_molecules = [x for x in dataset if smiles_counts[x[0]] > 1]

# Create folds: each fold starts as an equally sized, contiguous slice of the
# single-occurrence molecules.
folds = {}
fold_size = len(single_occurrence_molecules) // N_FOLDS
for i in range(1, N_FOLDS + 1):
    folds[i] = single_occurrence_molecules[((i - 1) * fold_size):(i * fold_size)]
# Singles left over by the integer division are queued behind the repeated
# molecules so that they are still assigned to a fold.
multiple_occurrence_molecules += single_occurrence_molecules[(N_FOLDS * fold_size):]
# Not used in this cell; kept in case a later cell reads it.
mult_fold_size = len(multiple_occurrence_molecules) // N_FOLDS

# Group the queued molecules by SMILES. Dicts keep insertion order, so groups
# come out in order of first appearance and members keep their queue order.
molecules_by_smiles = {}
for molecule in multiple_occurrence_molecules:
    molecules_by_smiles.setdefault(molecule[0], []).append(molecule)

# Deal whole groups to folds 1, 2, ..., N_FOLDS, 1, ... so that every
# occurrence of a compound lands in the same fold.
current_fold = 0
for group in molecules_by_smiles.values():
    current_fold = current_fold % N_FOLDS + 1
    folds[current_fold].extend(group)
# Every queued molecule has been assigned; leave the work list empty.
multiple_occurrence_molecules.clear()

In [3]:
def normalize(s):
    """
    Scale a series so that its largest value becomes 1 and return it as a list.

    The values are only multiplied by ``1 / max(s)``; they are not shifted by
    the minimum.

    Parameters
    ----------
    s : iterable of numbers
        The series to scale.

    Returns
    -------
    list
        Every element of ``s`` multiplied by the scale factor. If the maximum
        is 0 the factor is 0, so every element becomes 0. An empty input
        yields an empty list.
    """
    values = list(s)
    # An empty series has no maximum, so there is nothing to scale.
    if not values:
        return []
    # Find the largest value in the series.
    max_val = max(values)
    # Pick the scale factor before dividing, so that a zero maximum cannot
    # raise ZeroDivisionError.
    scale = 0 if max_val == 0 else 1 / max_val
    # Apply the scale factor to every element.
    return [j * scale for j in values]

def normal_many(x):
    """
    Run ``normalize`` and then ``floor_out`` on every series in ``x``.

    Returns a list with one processed series per element of ``x``, in the
    same order.
    """
    processed = []
    for series in x:
        scaled = normalize(series)
        processed.append(floor_out(scaled))
    return processed

def floor_out(x, threshold=0.01):
    """
    Replace every value that is not strictly greater than ``threshold`` with 0.

    Parameters
    ----------
    x : iterable of numbers
        The series to filter.
    threshold : number, optional
        Cutoff; values less than or equal to it are zeroed. Defaults to 0.01.

    Returns
    -------
    list
        The values of ``x`` in order, with those at or below ``threshold``
        replaced by 0.
    """
    return [j if j > threshold else 0 for j in x]

# One (initially empty) entry per cross-validation split, keyed 1 through 5.
dataset_splits = {split_id: {} for split_id in range(1, 6)}
for i in range(1, 6):
    # Split i holds out fold i for testing and trains on the other folds.
    test = folds[i]
    train = []
    for x in range(1, 6):
        if x == i:
            continue
        train.extend(folds[x])

    # Each molecule is a (smiles, sequence) pair: the SMILES entries are kept
    # as they are, the sequences go through normal_many.
    split = dataset_splits[i]
    split["test_smiles"] = np.array([smiles_str for smiles_str, _ in test])
    split["test_y"] = np.array(normal_many([seq for _, seq in test]), dtype=float)
    split["train_smiles"] = np.array([smiles_str for smiles_str, _ in train])
    split["train_y"] = np.array(normal_many([seq for _, seq in train]), dtype=float)