In [None]:
import pandas as pd
import networkx as nx

# Load the TMalign result table (tab-separated). Only the four columns used
# below are parsed, which keeps parse time and memory down on large tables.
columns_needed = ['query', 'target', 'evalue', 'alntmscore']
tm_align_df = pd.read_csv('TMalign_results.tsv', sep='\t', usecols=columns_needed)

# Fix the column order to: query, target, evalue, alntmscore
# (usecols alone returns the columns in file order).
tm_align_df = tm_align_df[columns_needed]

# Thresholds.
# NOTE(review): the original comment stated that evalue and tmscore "agree".
# If this table was produced by a TM-align mode that writes a TM-score into the
# evalue column, then `evalue <= 1e-5` would discard almost every real hit --
# confirm what the evalue column holds before trusting this filter.
evalue_threshold = 1e-5
tm_score_threshold = 0.5

# Select structurally similar protein pairs: both thresholds must pass.
filtered_df = tm_align_df[(tm_align_df['evalue'] <= evalue_threshold) &
                          (tm_align_df['alntmscore'] >= tm_score_threshold)]

# Keep, for every (query, target) pair, the row with the highest alntmscore.
# A stable descending sort followed by drop_duplicates(keep='first') selects the
# same row as groupby(...).idxmax() (first occurrence wins on ties) without the
# per-group overhead; rows with a missing query/target are dropped, as groupby
# does by default, and the final sort restores the (query, target) ordering.
filtered_df = (
    filtered_df
    .dropna(subset=['query', 'target'])
    .sort_values('alntmscore', ascending=False, kind='mergesort')
    .drop_duplicates(subset=['query', 'target'], keep='first')
    .sort_values(['query', 'target'])
)

# Reduce to a two-column edge list (pairs are already unique at this point;
# drop_duplicates is kept as a cheap guard and to return a fresh frame).
network_df = filtered_df[['query', 'target']].drop_duplicates()

# Build an undirected graph whose edges are the retained protein pairs.
G = nx.from_pandas_edgelist(network_df, 'query', 'target')

# Proteins whose hits were all removed by the thresholds do not appear in
# network_df, so without this step they would be missing from the graph and
# never counted as singletons. Add every protein ID seen in the input table as
# a node; IDs that already have edges are unaffected.
# NOTE(review): proteins with no row at all in the input table still cannot be
# counted here -- confirm the table contains every protein of interest.
all_protein_ids = pd.concat(
    [tm_align_df['query'], tm_align_df['target']]
).dropna().unique()
G.add_nodes_from(all_protein_ids)

# Count connected components; num_clusters is the total number of components
# (singletons included), num_singletons the number of one-node components.
clusters = list(nx.connected_components(G))
num_clusters = len(clusters)
num_singletons = sum(1 for cluster in clusters if len(cluster) == 1)

# Prepare the two summary lines for the component statistics.
summary_lines = [
    f"Number of clusters: {num_clusters}",
    f"Number of singletons: {num_singletons}",
]

# Write the edge list as a tab-separated file with no header row and no index.
network_df.to_csv('protein_network.tsv', sep='\t', header=False, index=False)

# Print the statistics, one per line.
print("\n".join(summary_lines))


KeyboardInterrupt: 