In [1]:
import pandas as pd
import numpy as np

 
# Replace the `embedding` column of the table loaded from `input_csv_path`
# with the vectors read from the embedding text file, then save a new CSV.
input_csv_path = r'.\data\prior\GO_IDs_Namespaces_Embedding.csv'
df = pd.read_csv(input_csv_path)

embeddings_txt_path = r'embeding/SODGO_embedings.txt'
new_embeddings = []

with open(embeddings_txt_path, 'r') as f:
    # The first line is skipped rather than parsed as an embedding. The
    # default argument keeps an empty file from raising StopIteration here,
    # so the explicit check below can report it with a clear message.
    next(f, None)
    for line in f:
        fields = line.split()
        if not fields:
            # A blank line carries no vector; without this guard it would
            # become an empty embedding and fail the length check below.
            continue
        # The first field is dropped; the remaining fields are the components.
        embedding = [float(x) for x in fields[1:]]
        new_embeddings.append(embedding)

if not new_embeddings:
    raise ValueError(f"No embedding vectors were found in {embeddings_txt_path}.")

# Every vector must have the same length before it can form a 2-D array.
embedding_length = len(new_embeddings[0])
if any(len(embedding) != embedding_length for embedding in new_embeddings):
    raise ValueError("The embedding vector length in the text file is inconsistent. Please check the file contents.")
new_embeddings = np.array(new_embeddings)

# Vectors are attached to rows purely by position.
# NOTE(review): this assumes the CSV rows and the text file lines are in the
# same order; only the counts are verified here.
if len(new_embeddings) != len(df):
    raise ValueError("The number of rows of the DataFrame does not match the number of new embedding vectors, check the file.")

df['embedding'] = new_embeddings.tolist()

output_csv_path = r'.\data\prior\GO_IDs_Namespaces_Embedding_updated.csv'
df.to_csv(output_csv_path, index=False)


嵌入向量已成功替换并保存到新的 CSV 文件中。


In [1]:
import os
import numpy as np
from sklearn.decomposition import PCA

 
# Dimension settings used by the functions below:
#   old_dim - width of each of the three equal blocks in an input vector
#             (reduce_dimensions requires 3 * old_dim columns).
#   new_dim - number of PCA components kept for each block.
old_dim, new_dim = 128, 24

# Embedding file to read and the file the reduced vectors are written to.
# NOTE(review): `E:.\` is a drive-relative Windows path (relative to the
# current directory on drive E:) - confirm this is intended.
input_file = r'.\embeding\SODGO_embedings.txt'
output_file = r'E:.\reform_dim\SODGO_embedings_reduced.txt'

 
def read_embeddings(file_path):
    """Load entity embeddings from a whitespace-separated text file.

    The first line is a header with at least two integer fields: the entity
    count and a per-block size. The returned ``dim`` is three times that
    second field. Every following non-blank line holds an entity id followed
    by the float components of its vector; blank lines are ignored.

    Args:
        file_path: Path of the embedding text file.

    Returns:
        A tuple ``(entity_count, dim, entity_ids, embeddings)``:
        ``entity_count`` is the count stated in the header (it is not
        compared with the number of rows actually read), ``dim`` is
        ``3 *`` the header's second field, ``entity_ids`` is a list of id
        strings and ``embeddings`` is a NumPy array built from the rows.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the header has fewer than two fields, a field that
            should be numeric is not, or the rows differ in length.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    with open(file_path, 'r') as f:
        header = f.readline().split()
        if len(header) < 2:
            # Raise ValueError (not IndexError) so callers that already
            # handle bad-content errors also cover an empty/short header.
            raise ValueError(
                f"Malformed header in {file_path}: expected two integer fields."
            )
        entity_count = int(header[0])
        dim = int(header[1]) * 3

        embeddings = []
        entity_ids = []
        for line in f:
            parts = line.split()
            if not parts:
                # Blank lines (e.g. padding at the end of the file) hold no data.
                continue
            entity_ids.append(parts[0])
            embeddings.append([float(x) for x in parts[1:]])

    # Ragged rows cannot form a rectangular array; report them explicitly
    # instead of relying on NumPy's version-dependent behaviour.
    if len({len(row) for row in embeddings}) > 1:
        raise ValueError(
            f"Embedding rows in {file_path} do not all have the same length."
        )

    return entity_count, dim, entity_ids, np.array(embeddings)

 
def save_embeddings(file_path, entity_count, new_dim, entity_ids, reduced_embeddings):
    """Write embeddings to a whitespace-separated text file.

    The file starts with a header line ``"<entity_count> <3 * new_dim>"``,
    followed by one line per entity: the id, then the vector components
    converted with ``str`` and separated by single spaces.

    Args:
        file_path: Destination path. Missing parent folders are created.
        entity_count: Count written into the header as given.
        new_dim: Per-block size; the header stores ``3 * new_dim``.
        entity_ids: Iterable of entity ids, paired positionally with
            ``reduced_embeddings``.
        reduced_embeddings: Iterable of vectors (one per entity).

    NOTE(review): ``read_embeddings`` in this file multiplies the header's
    second field by 3, whereas the header written here already contains
    ``3 * new_dim``. Confirm which convention the consumer of this file
    expects before changing either side.
    """
    # Create the destination folder on demand so a missing output directory
    # does not abort the run after the reduction has already been computed.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w') as f:
        f.write(f"{entity_count} {3 * new_dim}\n")

        for entity_id, embedding in zip(entity_ids, reduced_embeddings):
            f.write(f"{entity_id} " + " ".join(map(str, embedding)) + "\n")

 
def reduce_dimensions(embeddings, old_dim, new_dim, random_state=0):
    """Reduce each of the three column blocks of ``embeddings`` with PCA.

    The columns are split into three consecutive blocks of ``old_dim``
    columns. A separate PCA with ``new_dim`` components is fitted on each
    block, and the three reduced blocks are concatenated column-wise.

    Args:
        embeddings: 2-D array-like with exactly ``3 * old_dim`` columns.
        old_dim: Number of columns in each input block.
        new_dim: Number of PCA components kept per block.
        random_state: Seed passed to every PCA so that runs are repeatable
            when a randomized solver is selected. Pass ``None`` for
            unseeded behaviour.

    Returns:
        NumPy array with ``3 * new_dim`` columns and one row per input row.

    Raises:
        ValueError: If ``embeddings`` is not 2-D or does not have
            ``3 * old_dim`` columns. (A plain ``assert`` would be removed
            under ``python -O`` and would not be handled by callers that
            catch ``ValueError``.)
    """
    embeddings = np.asarray(embeddings)
    if embeddings.ndim != 2 or embeddings.shape[1] != 3 * old_dim:
        raise ValueError(
            "The dimension of the embedding does not meet the requirement of 3 times old_dim "
            f"(got shape {embeddings.shape}, expected {3 * old_dim} columns)"
        )

    reduced_embeddings = []
    for i in range(3):
        # Columns [i * old_dim, (i + 1) * old_dim) form the i-th block.
        slice_data = embeddings[:, i * old_dim:(i + 1) * old_dim]

        # Each block gets its own independently fitted PCA.
        pca = PCA(n_components=new_dim, random_state=random_state)
        reduced_embeddings.append(pca.fit_transform(slice_data))

    return np.hstack(reduced_embeddings)

 
def main():
    """Run the pipeline: read, preview, validate, reduce and save.

    Reads ``input_file``, prints up to five id/vector pairs, checks the
    dimension reported by the reader against ``3 * old_dim``, reduces the
    vectors and writes them to ``output_file``, then prints that path.
    ``FileNotFoundError`` and ``ValueError`` are caught and only their
    message is printed.
    """
    try:
        entity_count, dim, entity_ids, embeddings = read_embeddings(input_file)

        # Show at most the first five entities as a quick sanity check.
        preview_rows = min(5, len(entity_ids), len(embeddings))
        for row in range(preview_rows):
            print(entity_ids[row], embeddings[row])

        # Stop early when the file's reported size does not match the config.
        if dim != 3 * old_dim:
            raise ValueError(f"The dimension ({dim}) in the file is not consistent with the expected 3 * old_dim({3 * old_dim}).")

        save_embeddings(
            output_file,
            entity_count,
            new_dim,
            entity_ids,
            reduce_dimensions(embeddings, old_dim, new_dim),
        )
        print(f"{output_file}")
    except (FileNotFoundError, ValueError) as err:
        # Both error kinds are reported the same way: message only.
        print(err)

# Run the pipeline only when this code is executed directly (a notebook cell
# also runs with __name__ == "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()


找不到文件: E:\桌面\服务器\similarity_GO\basic\embed\embeding\SODGO_embedings.txt
