In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
import numpy as np
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from rdkit.Chem import rdMolDescriptors, DataStructs

In [12]:
input_file = 'filtered_output.csv'
output_valid = 'filtered_valid_smiles.csv'
output_invalid = 'filtered_invalid_smiles.csv'

valid = []
invalid = []

with open(input_file, 'r') as fin:
    for line in fin:
        smi = line.strip()
        # 跳过空行
        if not smi:
            continue
        mol = Chem.MolFromSmiles(smi)
        if mol is not None:
            valid.append(smi)
        else:
            invalid.append(smi)

# 写入合法SMILES
with open(output_valid, 'w') as fout:
    for smi in valid:
        fout.write(smi + '\n')

# 写入无效SMILES
with open(output_invalid, 'w') as fout:
    for smi in invalid:
        fout.write(smi + '\n')

print(f"共处理 {len(valid) + len(invalid)} 条SMILES")
print(f"合法SMILES数量: {len(valid)}")
print(f"无效SMILES数量: {len(invalid)}")
print("过滤完成！")

[16:48:45] SMILES Parse Error: syntax error while parsing: smiles
[16:48:45] SMILES Parse Error: check for mistakes around position 2:
[16:48:45] smiles
[16:48:45] ~^
[16:48:45] SMILES Parse Error: Failed parsing SMILES 'smiles' for input: 'smiles'
[16:49:07] Explicit valence for atom # 13 S, 6, is greater than permitted
[16:49:34] Explicit valence for atom # 1 S, 6, is greater than permitted
[16:50:03] Explicit valence for atom # 20 S, 6, is greater than permitted
[16:50:14] Explicit valence for atom # 16 S, 6, is greater than permitted
[16:50:31] Explicit valence for atom # 9 S, 6, is greater than permitted
[16:50:51] Explicit valence for atom # 16 S, 6, is greater than permitted
[16:51:29] Explicit valence for atom # 12 S, 6, is greater than permitted
[16:51:35] Explicit valence for atom # 16 S, 6, is greater than permitted
[16:51:42] Explicit valence for atom # 21 C, 5, is greater than permitted
[16:51:50] Explicit valence for atom # 20 S, 6, is greater than permitted
[16:52:17] Ex

共处理 3636235 条SMILES
合法SMILES数量: 3636210
无效SMILES数量: 25
过滤完成！


In [2]:
smiles_list = pd.read_csv('filtered_valid_smiles.csv', header=None)[0].tolist()

In [3]:
smiles_list

['Cc1ccc(C)c(NC(=O)c2c(O)c3ccccc3n(C)c2=O)c1',
 'COCCn1cncc1CNC(=O)c1c(C)nc2cc(C)ccn12',
 '[H]C(=C1S\\C(NC1=O)=N/c1ccccc1F)c1cccc(c1)N(=O)=O',
 'Cn1c2nc(SCCCC#N)n(Cc3c(F)cccc3Cl)c2c(=O)n(C)c1=O',
 'OC(=O)COc1ccc(Br)cc1\\C=N\\NC(=O)COc1cccc(Br)c1',
 'Ic1ccc(o1)-c1nsc(NC(=O)c2ccccc2)n1',
 'COc1cccc(c1)N1CCN(CC1)C(=O)c1nc2ccccc2c(=O)[nH]1',
 'Cn1nc(cc1C(=O)N1CCNC(=O)c2ccccc12)C1CC1',
 'OC(=O)[C@@H]1Cc2ccccc2CN1Cc1ncccc1C(F)(F)F',
 'CC(C)CCC(=O)NCC(=O)N[C@@H](CC(O)=O)c1ccc2ccccc2c1',
 'COc1ccc2CN(Cc2c1)C(=O)CCc1ccc(cc1)C(O)=O',
 'NC(=O)c1ccc(CSc2nnc(Cc3ccccc3F)o2)cc1',
 'O=C(N1CCC(CCc2noc(n2)-c2cccnc2)C1)c1cccs1',
 'CC1(CC(C1)C#N)C(=O)N1CCCC(Cn2cc(nn2)C2CC2)C1',
 'COc1ccccc1NC(=O)Cn1nc2nc(ccn2c1=O)N1CCCC1',
 'Cc1nnc(-c2cnn(C)c2N)n1Cc1ccc2OCOc2c1',
 'CC(NCc1ccc2oc(=O)n(C)c2c1)c1cnc2sccn12',
 'COc1ccc(CCNC(=O)c2c3CCCc3nc3n(ncc23)C(C)C)cc1',
 'COc1cc(cc(Cl)c1OS(=O)(=O)c1ccc(C)cc1)-c1nc(O)c2c3CCCCc3sc2n1',
 'O=C(NCc1ccco1)[C@H](C1CCCC1)N1CCN(CC1)C(=O)\\C=C\\c1ccccc1',
 'Cc1cccc(NC(=O)Cn2nc3c4c

In [4]:
def mol_weight(sm):
    m = Chem.MolFromSmiles(sm)
    if m is None:
        return None
    return Descriptors.MolWt(m)

mol_weights = [mol_weight(sm) for sm in smiles_list]
valid_idx = [i for i, mw in enumerate(mol_weights) if mw is not None]
smiles_list = [smiles_list[i] for i in valid_idx]
mol_weights = [mol_weights[i] for i in valid_idx]

In [5]:
df = pd.DataFrame({'smiles': smiles_list, 'mol_weight': mol_weights})

# 设每组5w，算分组数
group_size = 1500
num_groups = int(np.ceil(len(df) / group_size))

# 用qcut等频分箱，labels为整数编号
df['mw_group'], bins = pd.qcut(df['mol_weight'], q=num_groups, labels=False, retbins=True, duplicates='drop')

# 统计每组分子数量
group_counts = df['mw_group'].value_counts().sort_index()


In [6]:
print("分子量等频分箱后每组分子数量：")
for i, cnt in group_counts.items():
    low = bins[i]
    high = bins[i+1]
    print(f"Group {i}: 分子量区间 [{low:.1f}, {high:.1f}), 数量: {cnt}")

分子量等频分箱后每组分子数量：
Group 0: 分子量区间 [85.1, 282.3), 数量: 1500
Group 1: 分子量区间 [282.3, 300.2), 数量: 1511
Group 2: 分子量区间 [300.2, 300.3), 数量: 1531
Group 3: 分子量区间 [300.3, 300.3), 数量: 1831
Group 4: 分子量区间 [300.3, 300.3), 数量: 1203
Group 5: 分子量区间 [300.3, 300.3), 数量: 1453
Group 6: 分子量区间 [300.3, 300.4), 数量: 1911
Group 7: 分子量区间 [300.4, 300.4), 数量: 1655
Group 8: 分子量区间 [300.4, 300.4), 数量: 1412
Group 9: 分子量区间 [300.4, 300.4), 数量: 1210
Group 10: 分子量区间 [300.4, 300.4), 数量: 1317
Group 11: 分子量区间 [300.4, 300.4), 数量: 1614
Group 12: 分子量区间 [300.4, 300.4), 数量: 1372
Group 13: 分子量区间 [300.4, 300.4), 数量: 1611
Group 14: 分子量区间 [300.4, 300.4), 数量: 1403
Group 15: 分子量区间 [300.4, 300.5), 数量: 1458
Group 16: 分子量区间 [300.5, 300.8), 数量: 1509
Group 17: 分子量区间 [300.8, 301.2), 数量: 1519
Group 18: 分子量区间 [301.2, 301.3), 数量: 1504
Group 19: 分子量区间 [301.3, 301.3), 数量: 1538
Group 20: 分子量区间 [301.3, 301.3), 数量: 1455
Group 21: 分子量区间 [301.3, 301.3), 数量: 2201
Group 22: 分子量区间 [301.3, 301.3), 数量: 968
Group 23: 分子量区间 [301.3, 301.4), 数量: 1419
Group 24: 分子

In [7]:
import warnings
warnings.filterwarnings("ignore")
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')


In [8]:
# 准备保存聚类结果
df['cluster'] = -1

# 对每一组做聚类，并用tqdm显示进度
i = 0
for group in tqdm(sorted(df['mw_group'].unique()), desc="分组聚类进度"):
    sub_idx = df[df['mw_group'] == group].index
    sub_smiles = df.loc[sub_idx, 'smiles'].tolist()

    # 生成指纹，tqdm可监控指纹生成
    fps = []
    for sm in tqdm(sub_smiles, desc=f"指纹计算-组{group}", leave=False):
        mol = Chem.MolFromSmiles(sm)
        if mol is not None:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
            fps.append(np.array(fp))
        else:
            fps.append(np.zeros(2048))  # 不应该出现，但兜底

    fps = np.array(fps)

    # 聚类，5类
    if len(fps) < 5:
        labels = [0] * len(fps)
    else:
        kmeans = KMeans(n_clusters=5, random_state=0)
        labels = kmeans.fit_predict(fps)

    df.loc[sub_idx, 'cluster'] = labels + 5 * i + 1
    i += 1

# 保存结果
df.to_csv('clustered_smiles.csv', index=False)
print("全部分组聚类完成，结果已保存。")

分组聚类进度: 100%|██████████| 2425/2425 [42:23<00:00,  1.05s/it]


全部分组聚类完成，结果已保存。


In [9]:
df_sampled = df.groupby('cluster', group_keys=False).apply(lambda x: x.sample(1, random_state=42))

# 重置索引，并只保存你要的字段（比如smiles、cluster）
df_sampled = df_sampled.reset_index(drop=True)
df_sampled.to_csv('cluster_representatives.csv', index=False)

print(f"已保存每个类一个分子的代表到 cluster_representatives.csv，总数: {len(df_sampled)}")

已保存每个类一个分子的代表到 cluster_representatives.csv，总数: 12125
