# 加载数据（分割为gene和term）

In [2]:
# 从data/目录下读取all_entity.csv文件
import pandas as pd
all_entity = pd.read_csv('./data/all_entity.csv')
all_entity.head()

Unnamed: 0,entity,id,type,namespace
0,GO:0098685,0,GO_term,cellular_component
1,FRS2-beta,1,Gene,
2,GO:1905890,2,GO_term,biological_process
3,NEK11,3,Gene,
4,GO:0042694,4,GO_term,biological_process


In [3]:
import pandas as pd

# 读取 CSV 文件
csv_file_path = './data/all_entity.csv'
csv_data = pd.read_csv(csv_file_path)

# 清理列名中的空格
csv_data.columns = csv_data.columns.str.strip()

# 去除 type 列中的前后空格
csv_data['type'] = csv_data['type'].str.strip()

# 定义函数从文本文件中读取嵌入，并处理潜在的文件读取异常
def read_embeddings(embedding_file_path):
    embeddings = {}
    try:
        with open(embedding_file_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                emb_id = int(parts[0])  # ID
                emb_values = list(map(float, parts[1:]))  # 嵌入向量
                embeddings[emb_id] = emb_values
    except FileNotFoundError:
        print(f"Error: File {embedding_file_path} not found.")
    except Exception as e:
        print(f"An error occurred while reading embeddings: {e}")
    return embeddings

# 读取嵌入文件，捕捉文件相关错误
embedding_file_path = 'D:\zfyx\嵌入分析\emb\Gene_go_best_valid_mrr_64.txt'
embeddings = read_embeddings(embedding_file_path)

# 分别保存 Gene 和 GO_term 的嵌入
gene_emb = {}
go_emb = {}

for index, row in csv_data.iterrows():
    try:
        entity_id = row['id']
        entity_type = row['type']
        
        if entity_type == 'Gene':
            gene_emb[entity_id] = embeddings.get(entity_id, None)  # 保存到 gene_emb
        elif entity_type == 'GO_term':
            go_emb[entity_id] = embeddings.get(entity_id, None)  # 保存到 go_emb
        else:
            print(f"Unknown entity type '{entity_type}' at index {index}")
    except KeyError as e:
        print(f"KeyError: {e} for entity ID: {entity_id}")
    except Exception as e:
        print(f"An unexpected error occurred: {e} at index {index}")

# 添加异常捕捉机制，确保数据映射时的安全
try:
    csv_data['embedding'] = csv_data['id'].apply(lambda x: gene_emb.get(x) if x in gene_emb else go_emb.get(x))
except Exception as e:
    print(f"An error occurred while mapping embeddings: {e}")

# # 将结果显示或保存
# import ace_tools as tools
# tools.display_dataframe_to_user(name="Mapped CSV with Gene and GO Embeddings", dataframe=csv_data)


In [4]:
csv_data.head()

Unnamed: 0,entity,id,type,namespace,embedding
0,GO:0098685,0,GO_term,cellular_component,"[-0.2061376, -0.0, -0.2402187, -0.2214603, -0...."
1,FRS2-beta,1,Gene,,"[-0.0, -0.2006746, -0.2402187, -0.2214603, -0...."
2,GO:1905890,2,GO_term,biological_process,"[-0.2061376, -0.2006746, -0.0, -0.2214603, -0...."
3,NEK11,3,Gene,,"[-0.2061376, -0.2006746, -0.2402187, -0.221460..."
4,GO:0042694,4,GO_term,biological_process,"[-0.2061376, -0.0, -0.0, -0.2214603, -0.247762..."


In [5]:
# 根据type列的值，将数据分为两部分
gene_data = csv_data[csv_data['type'] == 'Gene']
go_data = csv_data[csv_data['type'] == 'GO_term']

gene_data.head()

Unnamed: 0,entity,id,type,namespace,embedding
1,FRS2-beta,1,Gene,,"[-0.0, -0.2006746, -0.2402187, -0.2214603, -0...."
3,NEK11,3,Gene,,"[-0.2061376, -0.2006746, -0.2402187, -0.221460..."
7,CDGIF,7,Gene,,"[-0.2061376, -0.2006746, -0.2402187, -0.221460..."
8,SUV39H,8,Gene,,"[-0.2061376, -0.2006746, -0.2402187, -0.0, -0...."
9,BWSCR1A,9,Gene,,"[-0.2061376, -0.2006746, -0.2402187, -0.221460..."


#   计算嵌入相似性

In [6]:
gene_data.head()

Unnamed: 0,entity,id,type,namespace,embedding
1,FRS2-beta,1,Gene,,"[-0.0, -0.2006746, -0.2402187, -0.2214603, -0...."
3,NEK11,3,Gene,,"[-0.2061376, -0.2006746, -0.2402187, -0.221460..."
7,CDGIF,7,Gene,,"[-0.2061376, -0.2006746, -0.2402187, -0.221460..."
8,SUV39H,8,Gene,,"[-0.2061376, -0.2006746, -0.2402187, -0.0, -0...."
9,BWSCR1A,9,Gene,,"[-0.2061376, -0.2006746, -0.2402187, -0.221460..."


In [9]:
import torch
import pandas as pd

# 假设我们有一个包含嵌入的数据框
# gene_data = pd.read_csv('your_data.csv') # 这里替换成你加载数据的方式

# 假设 gene_data 是你展示的数据框
# 从 'embedding' 列提取嵌入数据
embeddings = gene_data['embedding'].apply(lambda x: eval(x)).tolist()

# 将嵌入转为 PyTorch 张量
gene_embeddings = torch.tensor(embeddings)

# 计算每个嵌入的范数
norms = gene_embeddings.norm(dim=1, keepdim=True)

# 归一化嵌入向量
normalized_embeddings = gene_embeddings / norms

# 计算相似度矩阵（余弦相似度）
similarity_matrix = torch.mm(normalized_embeddings, normalized_embeddings.t())

# 如果你只需要上三角矩阵，可以进一步减少计算
upper_triangular_indices = torch.triu_indices(normalized_embeddings.size(0), normalized_embeddings.size(0), offset=1)
upper_triangular_similarities = similarity_matrix[upper_triangular_indices[0], upper_triangular_indices[1]]

# 输出相似度矩阵或上三角部分的相似度
print(similarity_matrix)
print(upper_triangular_similarities)


TypeError: eval() arg 1 must be a string, bytes or code object

In [10]:
print(gene_data['embedding'].head())


1    [-0.0, -0.2006746, -0.2402187, -0.2214603, -0....
3    [-0.2061376, -0.2006746, -0.2402187, -0.221460...
7    [-0.2061376, -0.2006746, -0.2402187, -0.221460...
8    [-0.2061376, -0.2006746, -0.2402187, -0.0, -0....
9    [-0.2061376, -0.2006746, -0.2402187, -0.221460...
Name: embedding, dtype: object


In [11]:
import torch

# 提取 'embedding' 列的嵌入
embeddings = gene_data['embedding'].tolist()

# 将嵌入转为 PyTorch 张量
gene_embeddings = torch.tensor(embeddings)

# 计算每个嵌入的范数
norms = gene_embeddings.norm(dim=1, keepdim=True)

# 归一化嵌入向量
normalized_embeddings = gene_embeddings / norms

# 计算相似度矩阵（余弦相似度）
similarity_matrix = torch.mm(normalized_embeddings, normalized_embeddings.t())

# 如果你只需要上三角矩阵，可以进一步减少计算
upper_triangular_indices = torch.triu_indices(normalized_embeddings.size(0), normalized_embeddings.size(0), offset=1)
upper_triangular_similarities = similarity_matrix[upper_triangular_indices[0], upper_triangular_indices[1]]

# 输出相似度矩阵或上三角部分的相似度
print(similarity_matrix)
print(upper_triangular_similarities)


tensor([[1.0000, 0.7215, 0.7169,  ..., 0.7459, 0.7159, 0.7251],
        [0.7215, 1.0000, 0.6604,  ..., 0.6969, 0.6854, 0.6849],
        [0.7169, 0.6604, 1.0000,  ..., 0.6917, 0.7240, 0.7121],
        ...,
        [0.7459, 0.6969, 0.6917,  ..., 1.0000, 0.7557, 0.6819],
        [0.7159, 0.6854, 0.7240,  ..., 0.7557, 1.0000, 0.7086],
        [0.7251, 0.6849, 0.7121,  ..., 0.6819, 0.7086, 1.0000]])
tensor([0.7215, 0.7169, 0.7459,  ..., 0.7557, 0.6819, 0.7086])


In [12]:
# 将 PyTorch 张量保存为 NumPy 格式文件
similarity_matrix_np = similarity_matrix.numpy()

# 使用 NumPy 保存为 .npy 文件
import numpy as np
np.save('similarity_matrix.npy', similarity_matrix_np)

print("相似度矩阵已保存为 similarity_matrix.npy")


相似度矩阵已保存为 similarity_matrix.npy


In [13]:
import torch
import numpy as np
import pandas as pd

# 1. 将对角线上的值（基因本身的相似度）设为负无穷，表示忽略这些值
similarity_matrix.fill_diagonal_(-float('inf'))

# 2. 找到每个基因相似度最高的前20个基因
top_k = 20
topk_values, topk_indices = torch.topk(similarity_matrix, top_k, dim=1)

# topk_values 是每个基因与其他基因的前20个相似度值
# topk_indices 是对应的基因索引

# 3. 将结果保存为文件
# 创建一个DataFrame保存每个基因前20个最相似的基因及其相似度
results = []

for i in range(similarity_matrix.size(0)):
    gene_id = i  # 当前基因的索引
    similar_gene_indices = topk_indices[i].tolist()  # 前20个最相似基因的索引
    similar_gene_values = topk_values[i].tolist()    # 前20个最相似基因的相似度值

    # 将当前基因的结果存储
    for idx, val in zip(similar_gene_indices, similar_gene_values):
        results.append([gene_id, idx, val])

# 将结果转为 DataFrame
results_df = pd.DataFrame(results, columns=['Gene_ID', 'Similar_Gene_Index', 'Similarity'])

# 保存为 CSV 文件
results_df.to_csv('top_20_similar_genes.csv', index=False)

print("每个基因相似度最高的前20个基因索引和相似度已保存为 top_20_similar_genes.csv")


每个基因相似度最高的前20个基因索引和相似度已保存为 top_20_similar_genes.csv


# oldtonew

In [7]:
# 假设 gene_data 是你的 DataFrame，'entity' 列包含基因名称

# 按顺序对 'entity' 列中的基因进行重新编号，从 0 开始
new_gene2id = {gene: idx for idx, gene in enumerate(gene_data['entity'])}

# 打印生成的 new_gene2id 映射
print(new_gene2id)

# 保存 new_gene2id 为 JSON 文件
import json
with open('new_gene2id.json', 'w') as f:
    json.dump(new_gene2id, f)

print("new_gene2id 映射已保存为 new_gene2id.json")


{'FRS2-beta': 0, 'NEK11': 1, 'CDGIF': 2, 'SUV39H': 3, 'BWSCR1A': 4, 'DEHMBA': 5, 'GnT-VB': 6, 'SMAP4': 7, 'Mmip-2': 8, 'SCYLP': 9, 'HMX2': 10, 'taR-2': 11, 'TMBIM4': 12, 'PSG5': 13, 'HPAST1': 14, 'SRp30b': 15, 'LNOX2': 16, 'CPSF100': 17, 'TPCR27': 18, 'MACC1': 19, 'THOC1': 20, 'NKX2.3': 21, 'B7.1': 22, 'MOT': 23, 'CPPB2': 24, 'FDX1L': 25, 'ICH': 26, 'YBX3': 27, 'CDC2L1': 28, 'RNF197': 29, 'TMBTS': 30, 'USP48': 31, 'bA9819.1': 32, 'KMHN1': 33, 'fSAP71': 34, 'H4F5': 35, 'UMAD1': 36, 'FAM198A': 37, 'KAP2.3': 38, 'SWSAP1': 39, 'FAM55C': 40, 'K6D': 41, 'PC5': 42, 'CyCAP': 43, 'SUSD5': 44, 'H3.5': 45, 'bA570F3.1': 46, 'beta4Gal-T4': 47, 'IBRDC3': 48, 'YDL201w': 49, 'UQCC2': 50, 'TSHB': 51, 'BHLHE22': 52, 'ZNF309': 53, 'IGFBP-7': 54, 'UNQ3064': 55, 'MUC19': 56, 'HIBBJ46': 57, 'TDN': 58, 'AMCASE': 59, 'PTD001': 60, 'BRPF2': 61, 'AKAP-Lbc': 62, 'P4Hbeta': 63, 'HSPC276': 64, 'TMEM59L': 65, 'DYT16': 66, 'ZMIZ4': 67, 'IFI-6-1616': 68, 'HILAP': 69, 'IFNA21': 70, 'HLC3': 71, 'C9orf142': 72, 'hKFC-B'

In [8]:
# 假设 results_df 是之前保存的 DataFrame
# 假设 new_gene2id 是已经生成的基因与ID的映射

# 1. 创建反向映射，将 new_gene2id 中的 {基因名: 索引} 反转为 {索引: 基因名}
id2gene = {idx: gene for gene, idx in new_gene2id.items()}

results_df =

# 2. 将 'Gene_ID' 和 'Similar_Gene_Index' 列中的索引替换为基因名
results_df['Gene_Name'] = results_df['Gene_ID'].map(id2gene)
results_df['Similar_Gene_Name'] = results_df['Similar_Gene_Index'].map(id2gene)

# 3. 删除原来的 'Gene_ID' 和 'Similar_Gene_Index' 列
results_df = results_df.drop(columns=['Gene_ID', 'Similar_Gene_Index'])

# 4. 将结果保存为新的 CSV 文件
results_df.to_csv('results_with_gene_names.csv', index=False)

print("results_df 中的索引已替换为基因名，并保存为 results_with_gene_names.csv")


NameError: name 'results_df' is not defined