In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
import numpy as np
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from rdkit.Chem import rdMolDescriptors, DataStructs

In [None]:
input_file = 'filtered_output.csv'
output_valid = 'filtered_valid_smiles.csv'
output_invalid = 'filtered_invalid_smiles.csv'

valid = []
invalid = []

# Classify every non-blank input line by whether RDKit can parse it as SMILES.
with open(input_file, 'r') as fin:
    for raw_line in fin:
        candidate = raw_line.strip()
        if candidate:  # blank lines are skipped entirely
            parsed = Chem.MolFromSmiles(candidate)
            bucket = invalid if parsed is None else valid
            bucket.append(candidate)

# Write the parseable SMILES first, then the rejected ones, one entry per line.
for target_path, entries in ((output_valid, valid), (output_invalid, invalid)):
    with open(target_path, 'w') as fout:
        fout.writelines(entry + '\n' for entry in entries)

# Summary of the filtering pass.
print(f"共处理 {len(valid) + len(invalid)} 条SMILES")
print(f"合法SMILES数量: {len(valid)}")
print(f"无效SMILES数量: {len(invalid)}")
print("过滤完成！")

In [None]:
# Read the validated SMILES file (no header row) and keep its first column as a list.
smiles_frame = pd.read_csv('filtered_valid_smiles.csv', header=None)
smiles_list = smiles_frame[0].tolist()

In [None]:
# Show only a short preview; echoing the entire list floods the notebook output.
smiles_list[:10]

In [None]:
def mol_weight(sm):
    """Return the molecular weight for a SMILES string.

    Parameters
    ----------
    sm : SMILES string handed to ``Chem.MolFromSmiles``.

    Returns
    -------
    The value of ``Descriptors.MolWt`` for the parsed molecule, or ``None``
    when RDKit cannot parse the string.
    """
    parsed = Chem.MolFromSmiles(sm)
    return Descriptors.MolWt(parsed) if parsed is not None else None

# Compute a weight per SMILES, then keep only the positions where a weight exists
# so that smiles_list and mol_weights stay aligned element by element.
mol_weights = list(map(mol_weight, smiles_list))
valid_idx = [pos for pos in range(len(mol_weights)) if mol_weights[pos] is not None]
smiles_list, mol_weights = (
    [smiles_list[pos] for pos in valid_idx],
    [mol_weights[pos] for pos in valid_idx],
)

In [None]:
df = pd.DataFrame({'smiles': smiles_list, 'mol_weight': mol_weights})

# Target number of molecules per group; the group count is the ceiling of
# len(df) / group_size (computed here with integer arithmetic).
group_size = 1500
num_groups = (len(df) + group_size - 1) // group_size

# Equal-frequency binning on molecular weight: integer bin codes instead of
# interval labels, duplicate bin edges dropped, and the edges returned in `bins`.
mw_codes, bins = pd.qcut(
    df['mol_weight'],
    q=num_groups,
    labels=False,
    retbins=True,
    duplicates='drop',
)
df['mw_group'] = mw_codes

# Number of molecules in each group, ordered by group code.
group_counts = df['mw_group'].value_counts().sort_index()


In [None]:
# Report the molecular-weight range and population of every bin.
# pd.qcut produces right-closed intervals (the first one also includes its
# lower edge), so the range is printed as (low, high] rather than [low, high).
print("分子量等频分箱后每组分子数量：")
for i, cnt in group_counts.items():
    # Cast the bin code so indexing still works if the column was stored as float.
    edge = int(i)
    low, high = bins[edge], bins[edge + 1]
    print(f"Group {i}: 分子量区间 ({low:.1f}, {high:.1f}], 数量: {cnt}")

In [None]:
# Quiet the notebook output: turn off RDKit's 'rdApp.*' log channels and
# ignore every Python warning from this point on.
# NOTE(review): the blanket "ignore" filter also hides deprecation warnings
# raised by later cells — consider narrowing it.
import warnings
from rdkit import RDLogger

RDLogger.DisableLog('rdApp.*')
warnings.filterwarnings("ignore")


In [None]:
# Prepare the column that will hold cluster ids (-1 marks "not assigned yet").
df['cluster'] = -1

n_clusters = 5   # clusters requested per molecular-weight group
n_bits = 2048    # length of each Morgan fingerprint bit vector

# Cluster every molecular-weight group on its own; the outer tqdm shows overall
# progress. enumerate() supplies the running group position `i` that offsets
# the labels so cluster ids never collide between groups.
for i, group in enumerate(tqdm(sorted(df['mw_group'].unique()), desc="分组聚类进度")):
    sub_idx = df[df['mw_group'] == group].index
    sub_smiles = df.loc[sub_idx, 'smiles'].tolist()

    # Build one fingerprint row per molecule; the inner tqdm tracks this step.
    fps = []
    for sm in tqdm(sub_smiles, desc=f"指纹计算-组{group}", leave=False):
        mol = Chem.MolFromSmiles(sm)
        if mol is not None:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
            fps.append(np.array(fp))
        else:
            # Not expected for pre-validated SMILES; keep row alignment with an
            # all-zero fallback vector.
            fps.append(np.zeros(n_bits))

    fps = np.array(fps)

    if len(fps) < n_clusters:
        # Too few molecules to form n_clusters clusters: put them all in cluster 0.
        # This must be an ndarray — a plain list would raise TypeError in the
        # arithmetic offset below.
        labels = np.zeros(len(fps), dtype=int)
    else:
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)
        labels = kmeans.fit_predict(fps)

    # Group i receives ids n_clusters*i + 1 .. n_clusters*i + n_clusters.
    df.loc[sub_idx, 'cluster'] = labels + n_clusters * i + 1

# Persist the clustered table.
df.to_csv('clustered_smiles.csv', index=False)
print("全部分组聚类完成，结果已保存。")

In [None]:
# Pick one representative row per cluster.
# Each group is sampled explicitly (seed 42 per group, as before) instead of
# going through groupby.apply: apply operating on the grouping column is
# deprecated in recent pandas and would drop `cluster` from the result.
representatives = [
    members.sample(1, random_state=42)
    for _, members in df.groupby('cluster')
]
# pd.concat rejects an empty list, so fall back to an empty frame with df's columns.
df_sampled = pd.concat(representatives) if representatives else df.iloc[0:0]

# Reset the index and save the representatives with all of df's columns.
df_sampled = df_sampled.reset_index(drop=True)
df_sampled.to_csv('cluster_representatives.csv', index=False)

print(f"已保存每个类一个分子的代表到 cluster_representatives.csv，总数: {len(df_sampled)}")