In [1]:
import pandas as pd
from rdkit import Chem

# Data cleaning: load the pickled table and, in the same step, discard every
# row whose 'isosmiles' value is missing.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
data = pd.read_pickle('igcdata/small_mol/train_72k.pkl').dropna(subset=['isosmiles'])

# Low-Frequency Atoms: 
def atom_frequencies(smiles):
    """Return the element symbol of every atom in a SMILES string.

    Despite the name, no counting is done here: the result has one entry per
    atom in the parsed molecule, with repeats.

    Args:
        smiles: SMILES string to parse with RDKit.

    Returns:
        list[str]: the symbols of the atoms in the parsed molecule, or an
        empty list when RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None rather than
        # raising; without this guard the attribute access below would fail
        # with AttributeError on the first malformed SMILES.
        return []
    return [atom.GetSymbol() for atom in mol.GetAtoms()]

# Flatten the per-molecule symbol lists into a single list, then count how
# many times each symbol occurs across the whole table.
all_atoms = [
    symbol
    for symbols in data['isosmiles'].apply(atom_frequencies)
    for symbol in symbols
]
atom_freq = pd.Series(all_atoms).value_counts()

# Symbols that occur fewer than 100 times in total are treated as
# low-frequency atoms.
low_freq_atoms = atom_freq.loc[atom_freq.lt(100)].index.tolist()

def contains_low_freq_atoms(smiles):
    """Tell whether a SMILES string contains any low-frequency atom.

    Reads the module-level ``low_freq_atoms`` list of element symbols.

    Args:
        smiles: SMILES string to parse with RDKit.

    Returns:
        bool: True when at least one atom of the parsed molecule has a symbol
        listed in ``low_freq_atoms``. False otherwise, including when RDKit
        cannot parse ``smiles`` (an unparseable string has no atoms to match).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None on a parse failure; guard here so the
        # filter does not die with AttributeError on a malformed SMILES.
        return False
    # Set membership avoids rescanning a list of symbols for every
    # low-frequency atom.
    rare_symbols = set(low_freq_atoms)
    return any(atom.GetSymbol() in rare_symbols for atom in mol.GetAtoms())

# Validation of Graph Structures
def is_valid_smiles(smiles):
    """Return True when RDKit can parse ``smiles`` into a molecule.

    Args:
        smiles: SMILES string to check.

    Returns:
        bool: False when ``Chem.MolFromSmiles`` returns None, True otherwise.
    """
    return Chem.MolFromSmiles(smiles) is not None

# Drop unparseable SMILES first, so that the atom-level filter below only
# ever sees strings RDKit can turn into a molecule.
data = data[data['isosmiles'].apply(is_valid_smiles)]

# Then remove every molecule that contains at least one low-frequency atom.
data = data[~data['isosmiles'].apply(contains_low_freq_atoms)]




In [2]:
# Show the first rows of `data` to spot-check the result of the filtering
# done in the previous cell.
data.head()

Unnamed: 0,cid,image2d,image3d,SMILES,isosmiles,summary
33261,3553946,1_632953_10034684/3516533_3669275/3553946.png,1_632953_10034684/3516533_3669275/3553946.png,c1ccc(cc1)C[NH2+]CCNCc1ccccc1,C1=CC=C(C=C1)C[NH2+]CCNCC2=CC=CC=C2,The molecule is an ammonium ion derivative. It...
72297,45360076,4_45268362_60189818/45359558_45480588/45360076...,4_45268362_60189818/45359558_45480588/45360076...,CC(=O)OC(CCCCCCCCCCCC=C)CC(CO)O,CC(=O)OC(CCCCCCCCCCCC=C)CC(CO)O,The molecule is a long-chain fatty alcohol.
71244,135410798,6_101348969_139589427/135398660_135413545/1354...,6_101348969_139589427/135398660_135413545/1354...,Cc1c(c(ccc1)N=C1NN=C(CS1)c1ccccc1F)C,CC1=C(C(=CC=C1)N=C2NN=C(CS2)C3=CC=CC=C3F)C,The molecule is a methylbenzene.
1409,4621901,1_632953_10034684/4482010_4632015/4621901.png,1_632953_10034684/4482010_4632015/4621901.png,Cc1cccn2c1nc1c(c2=O)cc(n1C)C(=O)NCCN1CCOCC1,CC1=CC=CN2C1=NC3=C(C2=O)C=C(N3C)C(=O)NCCN4CCOCC4,The molecule is a pyridopyrimidine.
21341,131755983,6_101348969_139589427/131753332_131758668/1317...,6_101348969_139589427/131753332_131758668/1317...,CCCCCCCCCCCCCCCCCC(=O)OC[C@@H](COC(=O)CCCCCCC/...,CCCCCCCCCCCCCCCCCC(=O)OC[C@@H](COC(=O)CCCCCCC/...,The molecule is a triglyceride.
