In [1]:
import numpy as np
import pandas as pd

from rdkit import Chem
from tqdm import tqdm

In [2]:
raw_data = pd.read_csv("NIST Gaseous IR Dataset.csv")
smiles = np.array(raw_data["SMILES"])
sequences = np.zeros((8362, 1586))
for i in range(1586):
  current_col = raw_data[str(i)]
  for x in range(8362):
    sequences[x][i] = current_col[x]

In [3]:
def not_organic(sm):
    """
    Exclude compounds that do not contain at least one C-H bond
    """
    # 从给定的 SMILES 表达式创建分子对象
    molecule = Chem.MolFromSmiles(sm)
    # 为分子添加氢原子
    molecule = Chem.AddHs(molecule)
    # 遍历分子中的原子
    for atom in molecule.GetAtoms():
        # 如果原子的原子序数为 6（即碳元素）
        if atom.GetAtomicNum() == 6:
            # 遍历该碳原子的相邻原子
            for neb in atom.GetNeighbors():
                # 如果相邻原子的原子序数为 1（即氢元素），返回 False，表示存在 C-H 键
                if neb.GetAtomicNum() == 1:
                    return False
    # 如果遍历完所有碳原子都没有找到相邻的氢原子，返回 True，表示不存在 C-H 键
    return True

def has_charged_center(sm):
    """
    Exclude compounds with formal charges.
    """
    return True if "+" in sm or "-" in sm else False

def too_big(sm):
    """
    Exclude compounds containing more than 25 non-hydrogen atoms.
    """
    # 从给定的 SMILES 表达式创建分子对象
    molecule = Chem.MolFromSmiles(sm)
    # 判断分子中的非氢原子数量是否大于等于 25
    return True if molecule.GetNumAtoms() >= 25 else False

too_big("[H]C([H])([H])S(=O)C([H])([H])[H]")

False

In [4]:
good_indices = []
num_charged, num_too_big, num_not_organic = 0, 0, 0
for sm in tqdm(range(len(smiles))):

    if not_organic(smiles[sm]) :
        num_not_organic += 1
        continue
    
    if has_charged_center(smiles[sm]):
        num_charged += 1
        continue
        
    if too_big(smiles[sm]):
        num_too_big += 1 
        continue
    
    good_indices.append(sm)

100%|██████████| 8362/8362 [00:03<00:00, 2450.66it/s]


In [5]:
print("Started with:", len(smiles), "samples, ended with:", len(good_indices), "samples")
print(num_not_organic, "weren't organic.")
print(num_charged, "had charged centers.")
print(num_too_big, "were too large.")

Started with: 8362 samples, ended with: 7506 samples
96 weren't organic.
649 had charged centers.
111 were too large.


In [6]:
import pickle
data = {}
data["smiles"] = smiles[good_indices]
data["sequences"] = sequences[good_indices]
with open("Cleaner NIST Dataset.pickle", "wb+") as f:
    pickle.dump(data, f)