In [1]:
import pandas as pd
import numpy as np

# Step 1: load the existing CSV file into a pandas DataFrame.
input_csv_path = r'E:\桌面\服务器\similarity_GO\basic\data\prior\GO_IDs_Namespaces_Embedding.csv'
df = pd.read_csv(input_csv_path)

# Step 2: load the replacement embedding vectors from the text file.
embeddings_txt_path = r'embeding/DisenGO_embedings.txt'
new_embeddings = []

with open(embeddings_txt_path, 'r') as f:
    # Skip the header line; the default keeps an empty file from raising StopIteration.
    next(f, None)
    for line in f:
        fields = line.split()
        if not fields:
            # Blank lines (e.g. trailing newlines) carry no vector; skipping them
            # avoids a misleading "inconsistent length" error further down.
            continue
        embedding = [float(x) for x in fields[1:]]  # drop the first column of each row
        new_embeddings.append(embedding)

# Fail with a clear message instead of an IndexError when no vector was read.
if not new_embeddings:
    raise ValueError("文本文件中没有任何嵌入向量，请检查文件内容。")

# Every vector must have the same length before the list can become a 2-D array.
embedding_length = len(new_embeddings[0])
if all(len(embedding) == embedding_length for embedding in new_embeddings):
    # Convert the list of vectors into a numpy array.
    new_embeddings = np.array(new_embeddings)
else:
    raise ValueError("文本文件中的嵌入向量长度不一致，请检查文件内容。")

# Step 3: replace the old embeddings in the DataFrame with the new ones.
# NOTE(review): rows are matched purely by position — the first column of the
# text file is discarded above — so this assumes both files list their entries
# in the same order. Confirm before relying on the output.
if len(new_embeddings) != len(df):
    raise ValueError("DataFrame 的行数与新的嵌入向量的数量不匹配，请检查文件。")

df['embedding'] = new_embeddings.tolist()

# Step 4: save the updated DataFrame to a new CSV file.
output_csv_path = r'E:\桌面\服务器\similarity_GO\basic\data\prior\GO_IDs_Namespaces_Embedding_updated.csv'
df.to_csv(output_csv_path, index=False)

print("嵌入向量已成功替换并保存到新的 CSV 文件中。")


嵌入向量已成功替换并保存到新的 CSV 文件中。


In [8]:
import os
import numpy as np
from sklearn.decomposition import PCA

# Input/output file paths and dimensionality parameters used by main() below.
input_file = r'E:\桌面\服务器\similarity_GO\basic\embed\embeding\DisenGO_embedings.txt'
output_file = r'E:\桌面\服务器\similarity_GO\basic\embed\reform_dim\DisenGO_embedings_reduced.txt'
old_dim = 128  # width of each of the three slices in the original embeddings
new_dim = 24   # target width of each slice after reduction

# Read the original embedding file.
def read_embeddings(file_path):
    """Load entity ids and embedding vectors from a whitespace-separated text file.

    The first line is a header whose first two fields are parsed as integers:
    the entity count and a dimension value. Every following non-blank line
    holds an entity id followed by the float components of its vector.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A tuple ``(entity_count, dim, entity_ids, embeddings)`` where
        ``entity_count`` is the header's first field, ``dim`` is three times
        the header's second field, ``entity_ids`` is the list of first-column
        strings, and ``embeddings`` is a numpy array built from the remaining
        columns of each row.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the header has fewer than two fields, or if a header
            field or vector component cannot be parsed as a number.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"找不到文件: {file_path}")

    with open(file_path, 'r') as f:
        # The first line carries the entity count and the dimension value.
        first_line = f.readline().split()
        if len(first_line) < 2:
            # An empty or truncated header would otherwise surface as an
            # IndexError, which callers handling ValueError would not catch.
            raise ValueError(f"文件头缺少实体数量或维度信息: {file_path}")
        entity_count = int(first_line[0])
        # The header value is multiplied by 3 to obtain the dimension reported
        # to the caller (main() compares it against 3 * old_dim).
        dim = int(first_line[1]) * 3

        # Read the embedding rows.
        embeddings = []
        entity_ids = []
        for line in f:
            parts = line.split()
            if not parts:
                # Skip blank lines (e.g. trailing newlines) instead of failing
                # with an IndexError on parts[0].
                continue
            entity_ids.append(parts[0])  # first column is the entity id
            embeddings.append([float(x) for x in parts[1:]])

    return entity_count, dim, entity_ids, np.array(embeddings)

# Save the reduced embeddings.
def save_embeddings(file_path, entity_count, new_dim, entity_ids, reduced_embeddings):
    """Write entity ids and their reduced vectors to a text file.

    The file starts with a header line ``"<entity_count> <3 * new_dim>"``;
    each following line is an entity id and the space-separated components of
    its vector. An existing file at ``file_path`` is overwritten.

    Args:
        file_path: Destination path.
        entity_count: Value written as the first header field.
        new_dim: Per-slice dimension; the header records ``3 * new_dim``.
        entity_ids: Iterable of ids, paired positionally with the vectors.
        reduced_embeddings: Iterable of vectors (each an iterable of numbers).
    """
    # NOTE(review): the header written here is 3 * new_dim, whereas
    # read_embeddings in this cell multiplies the header's second field by 3,
    # so a file written here would not read back with the same dim. Confirm
    # which convention the downstream consumer expects.
    with open(file_path, 'w') as out:
        out.write(f"{entity_count} {3 * new_dim}\n")
        # One "<id> <v1> <v2> ..." line per entity, generated lazily.
        out.writelines(
            f"{entity_id} " + " ".join(str(component) for component in vector) + "\n"
            for entity_id, vector in zip(entity_ids, reduced_embeddings)
        )

# Slice the embeddings and reduce each slice.
def reduce_dimensions(embeddings, old_dim, new_dim):
    """Reduce a three-part embedding matrix slice by slice with PCA.

    The columns of ``embeddings`` are split into three consecutive slices of
    width ``old_dim``. A separate PCA with ``new_dim`` components is fitted on
    each slice, and the three transformed slices are concatenated side by side.

    Args:
        embeddings: 2-D numpy array with exactly ``3 * old_dim`` columns.
        old_dim: Width of each input slice.
        new_dim: Number of PCA components kept per slice.

    Returns:
        The horizontally stacked PCA outputs of the three slices.

    Raises:
        AssertionError: If the column count is not ``3 * old_dim``.
    """
    # The matrix must split cleanly into three slices of width old_dim.
    assert embeddings.shape[1] == 3 * old_dim, "嵌入的维度不符合3倍old_dim的要求"

    # Fit an independent PCA per slice; slices are processed in column order.
    projected_slices = [
        PCA(n_components=new_dim).fit_transform(
            embeddings[:, part * old_dim:(part + 1) * old_dim]
        )
        for part in range(3)
    ]

    # Concatenate the three reduced slices column-wise.
    return np.hstack(projected_slices)

# Main workflow.
def main():
    """Read the embeddings, preview them, reduce each slice, and save the result.

    Uses the module-level ``input_file``, ``output_file``, ``old_dim`` and
    ``new_dim`` settings. ``FileNotFoundError`` and ``ValueError`` raised along
    the way are caught and their message is printed rather than propagated.
    """
    try:
        # Load the raw embeddings.
        entity_count, dim, entity_ids, embeddings = read_embeddings(input_file)

        # Preview at most the first five (id, vector) pairs.
        print("嵌入的前五行:")
        preview_rows = min(5, len(entity_ids), len(embeddings))
        for row in range(preview_rows):
            print(entity_ids[row], embeddings[row])

        # The dimension reported by the reader must equal three slices of old_dim.
        expected_dim = 3 * old_dim
        if dim != expected_dim:
            raise ValueError(f"文件中的维度({dim})与预期的3 * old_dim({3 * old_dim})不一致。")

        # Reduce every slice, then write the result out.
        reduced_embeddings = reduce_dimensions(embeddings, old_dim, new_dim)
        save_embeddings(output_file, entity_count, new_dim, entity_ids, reduced_embeddings)
        print(f"降维后的嵌入已保存到 {output_file}")
    except (FileNotFoundError, ValueError) as e:
        # Both error kinds are reported the same way: print the message only.
        print(e)

# Run the workflow only when this code executes as the main module.
if __name__ == "__main__":
    main()


嵌入的前五行:
0 [ 0.1028856  -0.          0.6362823   0.5109912   0.          0.2096516
  0.2131817  -0.          0.1547602   0.153346    0.1613547   0.4038821
  1.046876    0.2840714  -0.          0.2787862   0.1957743  -0.1825781
 -0.1123113   0.          0.8110542   0.2597196   0.4206546  -0.1450264
  0.00259759  0.02681614  0.          0.1107397   0.06070654  0.7842797
  0.          0.1562734   0.          0.1781882  -0.2335553   0.
  0.5371653   0.00470325 -0.2645913  -0.06026687  0.2206956  -0.3675617
  0.6018711   0.4584843   0.4595249  -0.         -0.5244626   0.3079007
  0.5645362   0.8006094  -0.1340827   0.06005013 -0.1375231   0.
  0.6138903   0.469929    0.5649364   0.2095708   0.9099756   0.00899977
  0.3238872   0.4906954   0.2743423   0.2710131  -0.00526576  0.
  0.275341    0.7994437   0.         -0.1374401   0.572575   -0.6630726
  0.6765105  -0.0426329  -0.         -0.         -0.1560699   0.8279077
  0.3031332   0.          0.7525079   0.8336183   0.4852214   0.
  0.00687