In [27]:
import pandas as pd
from rdkit.Chem import Descriptors as Des
from rdkit import Chem

import numpy as np
from rdkit.Chem import MACCSkeys, DataStructs
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

In [28]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

import matplotlib as mpl
import matplotlib.pyplot as plt

# Export fonts as type 42 in both vector backends (PDF and PostScript),
# set in one batch so the two outputs cannot drift apart.
mpl.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

# Default font family for all figure text in this notebook.
plt.rc('font',family='Times New Roman')


In [29]:
# The two held-out test files get their display label attached at load time.
Test_1 = pd.read_csv("Test_1.csv").assign(split="Test 1")
Test_2 = pd.read_csv("Test_2.csv").assign(split="Test 2")

# Train_Val.csv already carries a "split" column; map its raw values
# ('train' / 'test') onto the display names used in the figures.
Train_Val = pd.read_csv("Train_Val.csv").assign(
    split=lambda frame: frame['split'].replace({'train': 'Train', 'test': 'Validation'})
)

In [30]:
# Stack the Validation rows of Train_Val on top of Test 1; the order matters
# because later cells address molecules by position in this frame.
data = pd.concat(
    [Train_Val.loc[Train_Val["split"].eq("Validation")], Test_1]
)

In [31]:
def _exact_mol_wt(smiles):
    """Return the exact molecular weight for a SMILES string, or NaN if unusable.

    Args:
        smiles: value taken from the "smiles" column.

    Returns:
        float: ``Des.ExactMolWt`` of the parsed molecule, or NaN when the value
        is not a string or RDKit cannot parse it (``MolFromSmiles`` returns None).
    """
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    # Passing None to ExactMolWt raises; NaN instead compares False in any later
    # range filter on "MV", so bad rows drop out rather than aborting the cell.
    return Des.ExactMolWt(mol) if mol is not None else float("nan")


# "MV" holds the exact molecular weight of each molecule.
data["MV"] = data["smiles"].apply(_exact_mol_wt)

In [32]:
# Keep molecules whose exact molecular weight ("MV") lies in [300, 500], both
# bounds inclusive. The explicit .copy() makes the result an independent frame,
# so adding columns to it later does not raise pandas' SettingWithCopyWarning.
data = data[data['MV'].between(300, 500)].copy()

In [33]:

def calculate_maccs(smiles):
    """Compute the MACCS-keys fingerprint of a molecule given as SMILES.

    Args:
        smiles: SMILES string. Non-string values (e.g. the NaN float pandas
            produces for a blank CSV cell, or None) are treated as missing.

    Returns:
        The fingerprint returned by ``MACCSkeys.GenMACCSKeys``, or None when the
        input is missing or RDKit cannot parse it.
    """
    # RDKit's parser only accepts strings; guard so a missing value yields None
    # like any other unusable SMILES instead of raising inside .apply().
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return MACCSkeys.GenMACCSKeys(mol)

# Fingerprint every molecule; rows whose SMILES could not be used hold None.
data['MACCS'] = data['smiles'].apply(calculate_maccs)

# Keep fingerprints and split labels aligned by filtering both with the same
# "has a fingerprint" mask.
has_fingerprint = data['MACCS'].notnull()
maccs_fps = data['MACCS'][has_fingerprint].tolist()
split_labels = data['split'][has_fingerprint].values

def calculate_similarity_matrix(fps):
    """Build the symmetric matrix of pairwise Tanimoto similarities.

    Args:
        fps: sequence of RDKit bit-vector fingerprints.

    Returns:
        numpy.ndarray of shape (len(fps), len(fps)) where entry [i, j] is the
        Tanimoto similarity between fps[i] and fps[j]. An empty input gives a
        (0, 0) array.
    """
    fps = list(fps)
    num_fps = len(fps)
    similarity_matrix = np.zeros((num_fps, num_fps))
    for i in range(num_fps):
        # One bulk RDKit call per row instead of one Python-level call per pair.
        # Tanimoto similarity is symmetric, so only the upper triangle
        # (including the diagonal) is computed and then mirrored.
        row = DataStructs.BulkTanimotoSimilarity(fps[i], fps[i:])
        similarity_matrix[i, i:] = row
        similarity_matrix[i:, i] = row
    return similarity_matrix

# Pairwise Tanimoto similarities between all fingerprints in maccs_fps
# (square matrix, one row/column per fingerprint, in maccs_fps order).
similarity_matrix = calculate_similarity_matrix(maccs_fps)


In [34]:
# Sanity check: display the matrix dimensions (rows, columns).
similarity_matrix.shape

(1294, 1294)

测试集与验证集/训练集的相似性热图

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from itertools import groupby
from pathlib import Path
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D

# Heatmap of the pairwise MACCS similarity matrix, annotated by data split.
# Reads `similarity_matrix` and `split_labels` produced by earlier cells.

# Explicit figure/axes so every call below targets this figure rather than
# whatever pyplot currently considers "current".
fig, ax = plt.subplots(figsize=(12, 10), dpi=600)

# One colour per distinct split label.
unique_splits = np.unique(split_labels)
colors = sns.color_palette("husl", len(unique_splits))
split_color_map = dict(zip(unique_splits, colors))

# Draw the heatmap; rasterized=True is passed so the matrix cells are embedded
# as an image in the vector output.
sns.heatmap(similarity_matrix, cmap='viridis', xticklabels=False, yticklabels=False,
            cbar=True, rasterized=True, ax=ax)

# Colorbar tick label size.
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=16)

# Colour band along the top edge marking the split of each column.
# One rectangle per contiguous run of identical labels (instead of one patch
# per molecule) keeps the number of vector artists in the PDF small.
column = 0
for label, run in groupby(split_labels):
    run_length = sum(1 for _ in run)
    ax.add_patch(Rectangle((column, -0.5), run_length, 0.5,
                           color=split_color_map[label], clip_on=False))
    column += run_length

# Frame each split's diagonal block and label it at its centre.
# NOTE: the frame spans first..last occurrence of the label, so this assumes
# every split occupies one contiguous index range.
for split, color in split_color_map.items():
    indices = np.where(split_labels == split)[0]
    if len(indices) > 0:
        start = indices[0]
        end = indices[-1]
        width = end - start + 1

        # Outline of the within-split block.
        rect = Rectangle((start, start), width, width, edgecolor=color, facecolor='none', linewidth=2)
        ax.add_patch(rect)

        # Split name centred inside the block.
        center_x = start + width / 2
        center_y = start + width / 2
        ax.text(center_x, center_y, split, color='white', fontsize=20,
                ha='center', va='center', fontweight='bold',
                bbox=dict(facecolor='black', alpha=0.4, boxstyle='round,pad=0.3'))

# Title and axis labels.
ax.set_title('MACCS Fingerprint Similarity Between Validation and Test 1 Sets', fontsize=20)
ax.set_xlabel('Molecule Index', fontsize=20)
ax.set_ylabel('Molecule Index', fontsize=20)

# Optional legend (uncomment to display).
# legend_elements = [Line2D([0], [0], marker='o', color='w',
#                           markerfacecolor=color, markersize=16, label=split)
#                    for split, color in split_color_map.items()]
# ax.legend(handles=legend_elements, title='Split', bbox_to_anchor=(1.05, 1), loc='upper right')

fig.tight_layout()

# Save the figure as PDF. The destination is a hard-coded absolute path;
# create its directory first so savefig does not fail when it is missing.
output_path = Path("/home/ubuntu/FEAOF/figs/fig1_C_1_3.pdf")
output_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(output_path, format='pdf', bbox_inches='tight')
plt.show()
