In [12]:
import pickle  
import numpy as np  
from collections import Counter  
from pathlib import Path  
  
def get_edge_type_name(edge_type_tuple):
    """Translate an edge-type code tuple into its display label.

    The single ``(4, 0, 0)`` code and the nine ``(5, k, 0)`` codes
    (``k`` = 1..9) each have a fixed label. Any other tuple yields a
    fallback label that embeds the tuple itself.
    """
    # Labels for the (5, k, 0) codes, listed in order of k = 1..9.
    interaction_labels = (
        "氢键 (Hydrogen Bonds)",
        "疏水相互作用 (Hydrophobic)",
        "π-π堆积 (Pi Stacking)",
        "π-阳离子相互作用 (Pi-Cation)",
        "盐桥 (Salt Bridges)",
        "水桥 (Water Bridges)",
        "卤键 (Halogen Bonds)",
        "金属配位 (Metal Complexes)",
        "其他相互作用 (Others)",
    )
    labels_by_code = {(4, 0, 0): "空间边 (Spatial)"}
    for k, label in enumerate(interaction_labels, start=1):
        labels_by_code[(5, k, 0)] = label

    try:
        return labels_by_code[edge_type_tuple]
    except KeyError:
        return f"未知类型 {edge_type_tuple}"
  
def _edge_array(complex_dict, key):
    """Fetch ``complex_dict[key]`` as a NumPy array; a missing or ``None`` value becomes an empty array."""
    value = complex_dict.get(key)
    return np.array([]) if value is None else np.asarray(value)


def _has_type_columns(features):
    """Tell whether ``features`` is a 2-D array with at least three columns."""
    return features.ndim == 2 and features.shape[1] >= 3


def _edge_type_key(feat):
    """Truncate the first three values of one feature row to a tuple of Python ints."""
    # ``tolist()`` yields plain ints, so the tuple prints as ``(0, 0, 1)``
    # regardless of how the installed NumPy formats its scalar types.
    return tuple(feat[:3].astype(int).tolist())


def _percent(part, total):
    """Return ``part`` as a percentage of ``total``; 0.0 when ``total`` is zero."""
    return part / total * 100 if total else 0.0


def _print_type_distribution(features, heading):
    """Print ``heading`` followed by one count/percentage line per edge type, most common first."""
    edge_types = [_edge_type_key(feat) for feat in features]
    print(f"  {heading}")
    for edge_type, count in Counter(edge_types).most_common():
        type_name = get_edge_type_name(edge_type)
        print(f"    {type_name}: {count} 条边 ({_percent(count, len(edge_types)):.1f}%)")


def _print_spatial_edges(complex_dict, key_prefix, long_name, short_name):
    """Print the ``<key_prefix>_spatial_edge_index`` / ``_edge_attr`` summary for one complex."""
    spatial_index = _edge_array(complex_dict, f"{key_prefix}_spatial_edge_index")
    spatial_attr = _edge_array(complex_dict, f"{key_prefix}_spatial_edge_attr")

    if spatial_index.size == 0:
        return
    print(f"\n{long_name}内部空间边:")
    print(f"  {short_name}空间边数量: {spatial_index.shape[1] if spatial_index.ndim > 1 else 0}")

    if spatial_attr.size == 0:
        return
    print(f"  {short_name}空间边特征形状: {spatial_attr.shape}")
    if _has_type_columns(spatial_attr):
        _print_type_distribution(spatial_attr, f"{long_name}内部边类型分布:")


def analyze_edge_types_and_ratios(pkl_file_path, max_complexes=1):
    """Print node/edge statistics and edge-type distributions for complexes stored in a pickle file.

    The pickle is expected to hold a sequence of dicts. The keys read from
    each dict are ``pdbid``, ``pk``, ``smiles``, ``num_node``, ``num_edge``,
    ``edge_feat``, ``edge_index`` and the ``lig_``/``pro_`` prefixed
    ``spatial_edge_index`` / ``spatial_edge_attr`` arrays. Every key is
    optional; sections whose data is missing or empty are skipped.

    Args:
        pkl_file_path: Path of the pickle file to read.
        max_complexes: How many complexes to report, starting from the first.
            Defaults to 1 (the previous hard-coded behaviour) to keep the
            output short; pass ``None`` to report all of them.

    Returns:
        None. All results are written to stdout.

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only open pickles that come from a trusted source.
    with open(pkl_file_path, 'rb') as f:
        data = pickle.load(f)

    print(f"成功加载 {len(data)} 个复合物\n")

    for i, complex_dict in enumerate(data):
        if max_complexes is not None and i >= max_complexes:
            break

        print(f"=== 复合物 {i+1}: {complex_dict.get('pdbid', 'Unknown')} ===")

        # Basic information.
        print(f"PDB ID: {complex_dict.get('pdbid', 'N/A')}")
        print(f"结合亲和力 (pK): {complex_dict.get('pk', 'N/A')}")
        print(f"SMILES: {complex_dict.get('smiles', 'N/A')}")

        # Node and edge counts.
        num_node = complex_dict.get('num_node', [])
        num_edge = complex_dict.get('num_edge', [])

        if len(num_node) >= 2:
            lig_nodes, pro_nodes = num_node[0], num_node[1]
            print("\n节点统计:")
            print(f"  配体节点数: {lig_nodes}")
            print(f"  蛋白质节点数: {pro_nodes}")
            print(f"  总节点数: {lig_nodes + pro_nodes}")

        if len(num_edge) >= 3:
            lig_edges, pro_edges, interaction_edges = num_edge[0], num_edge[1], num_edge[2]
            total_edges = lig_edges + pro_edges + interaction_edges

            # _percent guards against a complex that has no edges at all.
            print("\n边统计:")
            print(f"  配体内部边数: {lig_edges} ({_percent(lig_edges, total_edges):.1f}%)")
            print(f"  蛋白质内部边数: {pro_edges} ({_percent(pro_edges, total_edges):.1f}%)")
            print(f"  蛋白-配体相互作用边数: {interaction_edges} ({_percent(interaction_edges, total_edges):.1f}%)")
            print(f"  总边数: {total_edges}")

        # Overall edge features; the first three columns are read as the edge type.
        edge_feat = _edge_array(complex_dict, 'edge_feat')
        edge_index = _edge_array(complex_dict, 'edge_index')

        if edge_feat.size > 0:
            print("\n整体边特征分析:")
            print(f"  边特征形状: {edge_feat.shape}")
            print(f"  边特征数据类型: {edge_feat.dtype}")
            if _has_type_columns(edge_feat):
                _print_type_distribution(edge_feat, "整体边类型分布:")

        # Ligand-internal and protein-internal spatial edges.
        _print_spatial_edges(complex_dict, 'lig', "配体", "配体")
        _print_spatial_edges(complex_dict, 'pro', "蛋白质", "蛋白")

        # Details of the first few edges.
        print("\n前5条边的详细信息:")
        if edge_index.size > 0 and edge_feat.size > 0 and edge_index.ndim == 2 and edge_feat.ndim == 2:
            # Bound by both arrays so a shorter edge_feat cannot cause an IndexError.
            for j in range(min(5, edge_index.shape[1], edge_feat.shape[0])):
                src, dst = edge_index[:, j]
                feat = edge_feat[j]
                type_name = get_edge_type_name(_edge_type_key(feat))
                print(f"  边 {j}: {src} -> {dst}, 类型: {type_name}, 特征: {feat}")

        print("-" * 80)
  
if __name__ == "__main__":
    # Path of the pickle file to analyse; point this at your own file.
    pkl_file_path = "./test.pkl"

    # Report a missing file up front; otherwise run the analysis.
    if not Path(pkl_file_path).exists():
        print(f"文件不存在: {pkl_file_path}")
    else:
        analyze_edge_types_and_ratios(pkl_file_path)

成功加载 1 个复合物\n
=== 复合物 1: 2zga_frame75 ===
PDB ID: 2zga_frame75
结合亲和力 (pK): 6.32
SMILES: CC(C)CCN(CC[NH3])S(=O)(=O)c1ccc(C(N)=O)cc1
\n节点统计:
  配体节点数: 21
  蛋白质节点数: 81
  总节点数: 102
\n边统计:
  配体内部边数: 42 (22.8%)
  蛋白质内部边数: 126 (68.5%)
  蛋白-配体相互作用边数: 16 (8.7%)
  总边数: 184
\n整体边特征分析:
  边特征形状: (1012, 4)
  边特征数据类型: float32
  整体边类型分布:
    其他相互作用 (Others): 710 条边 (70.2%)
    未知类型 (0, 0, 0): 132 条边 (13.0%)
    疏水相互作用 (Hydrophobic): 86 条边 (8.5%)
    氢键 (Hydrogen Bonds): 42 条边 (4.2%)
    未知类型 (1, 0, 0): 18 条边 (1.8%)
    未知类型 (3, 0, 1): 12 条边 (1.2%)
    盐桥 (Salt Bridges): 6 条边 (0.6%)
    未知类型 (0, 0, 1): 4 条边 (0.4%)
    未知类型 (1, 0, 1): 2 条边 (0.2%)
\n配体内部空间边:
  配体空间边数量: 196
  配体空间边特征形状: (196, 4)
  配体内部边类型分布:
    其他相互作用 (Others): 170 条边 (86.7%)
    疏水相互作用 (Hydrophobic): 22 条边 (11.2%)
    氢键 (Hydrogen Bonds): 4 条边 (2.0%)
\n蛋白质内部空间边:
  蛋白空间边数量: 632
  蛋白空间边特征形状: (632, 4)
  蛋白质内部边类型分布:
    其他相互作用 (Others): 540 条边 (85.4%)
    疏水相互作用 (Hydrophobic): 52 条边 (8.2%)
    氢键 (Hydrogen Bonds): 36 条边 (5.7%)
    盐桥 (Salt B

成功加载 1 个复合物\n
=== 复合物 1: 2zga_frame75 ===
PDB ID: 2zga_frame75
结合亲和力 (pK): 6.32
SMILES: CC(C)CCN(CC[NH3])S(=O)(=O)c1ccc(C(N)=O)cc1
\n节点统计:
  配体节点数: 21
  蛋白质节点数: 81
  总节点数: 102
\n边统计:
  配体内部边数: 42 (22.8%)
  蛋白质内部边数: 126 (68.5%)
  蛋白-配体相互作用边数: 16 (8.7%)
  总边数: 184
\n整体边特征分析:
  边特征形状: (1012, 4)
  边特征数据类型: float32
  整体边类型分布:
    其他相互作用 (Others): 710 条边 (70.2%)
    未知类型 (0, 0, 0): 132 条边 (13.0%)
    疏水相互作用 (Hydrophobic): 76 条边 (7.5%)
    氢键 (Hydrogen Bonds): 40 条边 (4.0%)
    未知类型 (1, 0, 0): 18 条边 (1.8%)
    未知类型 (3, 0, 1): 12 条边 (1.2%)
    π-π堆积 (Pi Stacking): 6 条边 (0.6%)
    π-阳离子相互作用 (Pi-Cation): 6 条边 (0.6%)
    盐桥 (Salt Bridges): 6 条边 (0.6%)
    未知类型 (0, 0, 1): 4 条边 (0.4%)
    未知类型 (1, 0, 1): 2 条边 (0.2%)
\n配体内部空间边:
  配体空间边数量: 196
  配体空间边特征形状: (196, 4)
  配体内部边类型分布:
    其他相互作用 (Others): 170 条边 (86.7%)
    疏水相互作用 (Hydrophobic): 12 条边 (6.1%)
    π-π堆积 (Pi Stacking): 6 条边 (3.1%)
    π-阳离子相互作用 (Pi-Cation): 6 条边 (3.1%)
    氢键 (Hydrogen Bonds): 2 条边 (1.0%)
\n蛋白质内部空间边:
  蛋白空间边数量: 632
  蛋白空间边特征形状: (632, 4)
  蛋白质内部边类型分布:
    其他相互作用 (Others): 540 条边 (85.4%)
    疏水相互作用 (Hydrophobic): 52 条边 (8.2%)
    氢键 (Hydrogen Bonds): 36 条边 (5.7%)
    盐桥 (Salt Bridges): 4 条边 (0.6%)
\n前5条边的详细信息:
  边 0: 2 -> 1, 类型: 未知类型 (3, 0, 1), 特征: [3.        0.        1.        1.3724158]
  边 1: 1 -> 2, 类型: 未知类型 (3, 0, 1), 特征: [3.        0.        1.        1.3724158]
  边 2: 2 -> 3, 类型: 未知类型 (0, 0, 1), 特征: [0.        0.        1.        1.5317265]
  边 3: 3 -> 2, 类型: 未知类型 (0, 0, 1), 特征: [0.        0.        1.        1.5317265]
  边 4: 3 -> 0, 类型: 未知类型 (0, 0, 1), 特征: [0.        0.        1.        1.3554884]
--------------------------------------------------------------------------------


In [None]:

import pickle  
import numpy as np  
from collections import Counter  
from pathlib import Path  
  
def get_edge_type_name(edge_type_tuple):
    """Translate an edge-type code tuple into its display label.

    The single ``(4, 0, 0)`` code and the nine ``(5, k, 0)`` codes
    (``k`` = 1..9) each have a fixed label. Any other tuple yields a
    fallback label that embeds the tuple itself.
    """
    # Labels for the (5, k, 0) codes, listed in order of k = 1..9.
    interaction_labels = (
        "氢键 (Hydrogen Bonds)",
        "疏水相互作用 (Hydrophobic)",
        "π-π堆积 (Pi Stacking)",
        "π-阳离子相互作用 (Pi-Cation)",
        "盐桥 (Salt Bridges)",
        "水桥 (Water Bridges)",
        "卤键 (Halogen Bonds)",
        "金属配位 (Metal Complexes)",
        "其他相互作用 (Others)",
    )
    labels_by_code = {(4, 0, 0): "空间边 (Spatial)"}
    for k, label in enumerate(interaction_labels, start=1):
        labels_by_code[(5, k, 0)] = label

    try:
        return labels_by_code[edge_type_tuple]
    except KeyError:
        return f"未知类型 {edge_type_tuple}"
  
def _edge_array(complex_dict, key):
    """Fetch ``complex_dict[key]`` as a NumPy array; a missing or ``None`` value becomes an empty array."""
    value = complex_dict.get(key)
    return np.array([]) if value is None else np.asarray(value)


def _has_type_columns(features):
    """Tell whether ``features`` is a 2-D array with at least three columns."""
    return features.ndim == 2 and features.shape[1] >= 3


def _edge_type_key(feat):
    """Truncate the first three values of one feature row to a tuple of Python ints."""
    # ``tolist()`` yields plain ints, so the tuple prints as ``(0, 0, 1)``
    # regardless of how the installed NumPy formats its scalar types.
    return tuple(feat[:3].astype(int).tolist())


def _percent(part, total):
    """Return ``part`` as a percentage of ``total``; 0.0 when ``total`` is zero."""
    return part / total * 100 if total else 0.0


def _print_type_distribution(features, heading):
    """Print ``heading`` followed by one count/percentage line per edge type, most common first."""
    edge_types = [_edge_type_key(feat) for feat in features]
    print(f"  {heading}")
    for edge_type, count in Counter(edge_types).most_common():
        type_name = get_edge_type_name(edge_type)
        print(f"    {type_name}: {count} 条边 ({_percent(count, len(edge_types)):.1f}%)")


def _print_spatial_edges(complex_dict, key_prefix, long_name, short_name):
    """Print the ``<key_prefix>_spatial_edge_index`` / ``_edge_attr`` summary for one complex."""
    spatial_index = _edge_array(complex_dict, f"{key_prefix}_spatial_edge_index")
    spatial_attr = _edge_array(complex_dict, f"{key_prefix}_spatial_edge_attr")

    if spatial_index.size == 0:
        return
    print(f"\n{long_name}内部空间边:")
    print(f"  {short_name}空间边数量: {spatial_index.shape[1] if spatial_index.ndim > 1 else 0}")

    if spatial_attr.size == 0:
        return
    print(f"  {short_name}空间边特征形状: {spatial_attr.shape}")
    if _has_type_columns(spatial_attr):
        _print_type_distribution(spatial_attr, f"{long_name}内部边类型分布:")


def analyze_edge_types_and_ratios(pkl_file_path, max_complexes=1):
    """Print node/edge statistics and edge-type distributions for complexes stored in a pickle file.

    The pickle is expected to hold a sequence of dicts. The keys read from
    each dict are ``pdbid``, ``pk``, ``smiles``, ``num_node``, ``num_edge``,
    ``edge_feat``, ``edge_index`` and the ``lig_``/``pro_`` prefixed
    ``spatial_edge_index`` / ``spatial_edge_attr`` arrays. Every key is
    optional; sections whose data is missing or empty are skipped.

    Args:
        pkl_file_path: Path of the pickle file to read.
        max_complexes: How many complexes to report, starting from the first.
            Defaults to 1 (the previous hard-coded behaviour) to keep the
            output short; pass ``None`` to report all of them.

    Returns:
        None. All results are written to stdout.

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only open pickles that come from a trusted source.
    with open(pkl_file_path, 'rb') as f:
        data = pickle.load(f)

    print(f"成功加载 {len(data)} 个复合物\n")

    for i, complex_dict in enumerate(data):
        if max_complexes is not None and i >= max_complexes:
            break

        print(f"=== 复合物 {i+1}: {complex_dict.get('pdbid', 'Unknown')} ===")

        # Basic information.
        print(f"PDB ID: {complex_dict.get('pdbid', 'N/A')}")
        print(f"结合亲和力 (pK): {complex_dict.get('pk', 'N/A')}")
        print(f"SMILES: {complex_dict.get('smiles', 'N/A')}")

        # Node and edge counts.
        num_node = complex_dict.get('num_node', [])
        num_edge = complex_dict.get('num_edge', [])

        if len(num_node) >= 2:
            lig_nodes, pro_nodes = num_node[0], num_node[1]
            print("\n节点统计:")
            print(f"  配体节点数: {lig_nodes}")
            print(f"  蛋白质节点数: {pro_nodes}")
            print(f"  总节点数: {lig_nodes + pro_nodes}")

        if len(num_edge) >= 3:
            lig_edges, pro_edges, interaction_edges = num_edge[0], num_edge[1], num_edge[2]
            total_edges = lig_edges + pro_edges + interaction_edges

            # _percent guards against a complex that has no edges at all.
            print("\n边统计:")
            print(f"  配体内部边数: {lig_edges} ({_percent(lig_edges, total_edges):.1f}%)")
            print(f"  蛋白质内部边数: {pro_edges} ({_percent(pro_edges, total_edges):.1f}%)")
            print(f"  蛋白-配体相互作用边数: {interaction_edges} ({_percent(interaction_edges, total_edges):.1f}%)")
            print(f"  总边数: {total_edges}")

        # Overall edge features; the first three columns are read as the edge type.
        edge_feat = _edge_array(complex_dict, 'edge_feat')
        edge_index = _edge_array(complex_dict, 'edge_index')

        if edge_feat.size > 0:
            print("\n整体边特征分析:")
            print(f"  边特征形状: {edge_feat.shape}")
            print(f"  边特征数据类型: {edge_feat.dtype}")
            if _has_type_columns(edge_feat):
                _print_type_distribution(edge_feat, "整体边类型分布:")

        # Ligand-internal and protein-internal spatial edges.
        _print_spatial_edges(complex_dict, 'lig', "配体", "配体")
        _print_spatial_edges(complex_dict, 'pro', "蛋白质", "蛋白")

        # Details of the first few edges.
        print("\n前5条边的详细信息:")
        if edge_index.size > 0 and edge_feat.size > 0 and edge_index.ndim == 2 and edge_feat.ndim == 2:
            # Bound by both arrays so a shorter edge_feat cannot cause an IndexError.
            for j in range(min(5, edge_index.shape[1], edge_feat.shape[0])):
                src, dst = edge_index[:, j]
                feat = edge_feat[j]
                type_name = get_edge_type_name(_edge_type_key(feat))
                print(f"  边 {j}: {src} -> {dst}, 类型: {type_name}, 特征: {feat}")

        print("-" * 80)
  
if __name__ == "__main__":
    # Path of the pickle file to analyse; point this at your own file.
    pkl_file_path = "./test1.pkl"

    # Report a missing file up front; otherwise run the analysis.
    if not Path(pkl_file_path).exists():
        print(f"文件不存在: {pkl_file_path}")
    else:
        analyze_edge_types_and_ratios(pkl_file_path)

成功加载 1 个复合物\n
=== 复合物 1: 2aco-VCA ===
PDB ID: 2aco-VCA
结合亲和力 (pK): 6.62
SMILES: CCCCCC/C=C\CCCCCCCCCC(=O)[O-]
\n节点统计:
  配体节点数: 20
  蛋白质节点数: 98
  总节点数: 118
\n边统计:
  配体内部边数: 38 (16.1%)
  蛋白质内部边数: 176 (74.6%)
  蛋白-配体相互作用边数: 22 (9.3%)
  总边数: 236
\n整体边特征分析:
  边特征形状: (1224, 4)
  边特征数据类型: float32
  整体边类型分布:
    其他相互作用 (Others): 678 条边 (55.4%)
    疏水相互作用 (Hydrophobic): 198 条边 (16.2%)
    未知类型 (0, 0, 0): 106 条边 (8.7%)
    π-π堆积 (Pi Stacking): 72 条边 (5.9%)
    未知类型 (3, 0, 1): 68 条边 (5.6%)
    氢键 (Hydrogen Bonds): 34 条边 (2.8%)
    π-阳离子相互作用 (Pi-Cation): 24 条边 (2.0%)
    未知类型 (1, 0, 1): 20 条边 (1.6%)
    未知类型 (0, 0, 1): 16 条边 (1.3%)
    未知类型 (1, 0, 0): 4 条边 (0.3%)
    盐桥 (Salt Bridges): 4 条边 (0.3%)
\n配体内部空间边:
  配体空间边数量: 102
  配体空间边特征形状: (102, 4)
  配体内部边类型分布:
    疏水相互作用 (Hydrophobic): 58 条边 (56.9%)
    其他相互作用 (Others): 42 条边 (41.2%)
    氢键 (Hydrogen Bonds): 2 条边 (2.0%)
\n蛋白质内部空间边:
  蛋白空间边数量: 886
  蛋白空间边特征形状: (886, 4)
  蛋白质内部边类型分布:
    其他相互作用 (Others): 636 条边 (71.8%)
    疏水相互作用 (Hydrophobic): 118 条边 

\n配体内部空间边:
  配体空间边数量: 102
  配体空间边特征形状: (102, 4)
  配体内部边类型分布:
    疏水相互作用 (Hydrophobic): 54 条边 (52.9%)
    其他相互作用 (Others): 46 条边 (45.1%)
    氢键 (Hydrogen Bonds): 2 条边 (2.0%)
    \n配体内部空间边:
  配体空间边数量: 102
  配体空间边特征形状: (102, 4)
  配体内部边类型分布:
    疏水相互作用 (Hydrophobic): 58 条边 (56.9%)
    其他相互作用 (Others): 42 条边 (41.2%)
    氢键 (Hydrogen Bonds): 2 条边 (2.0%)