In [2]:
import os
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2 import robjects
import pandas as pd
import json
from scipy.sparse import csr_matrix
from scipy.sparse import csc_matrix
import numpy as np

# 启用 pandas2ri 以便将 R 数据帧自动转换为 pandas DataFrame
pandas2ri.activate()

In [None]:
file_list = '/home/gy237/palmer_scratch/scFoundationModel_data/BICCN_data/test_data/downloads/HsCB.AllTypes.Clean.SeuratObj.rds'
# 读取 .rds 文件
rds_path = file_list
readRDS = robjects.r['readRDS']
seurat_obj = readRDS(rds_path)

In [23]:
print(seurat_obj)
str_output = robjects.r['str'](seurat_obj)
print(str_output)

An object of class Seurat 
38601 features across 220107 samples within 2 assays 
Active assay: integrated (2000 features, 2000 variable features)
 2 layers present: data, scale.data
 1 other assay present: RNA
 2 dimensional reductions calculated: pca, umap

Formal class 'Seurat' [package "SeuratObject"] with 13 slots
  ..@ assays      :List of 2
  .. ..$ RNA       :Formal class 'Assay' [package "SeuratObject"] with 8 slots
  .. .. .. ..@ counts       :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
  .. .. .. .. .. ..@ i       : int [1:389561916] 51 92 208 247 254 293 339 342 343 383 ...
  .. .. .. .. .. ..@ p       : int [1:220108] 0 1815 6861 8998 9882 13514 14468 15043 15741 17454 ...
  .. .. .. .. .. ..@ Dim     : int [1:2] 36601 220107
  .. .. .. .. .. ..@ Dimnames:List of 2
  .. .. .. .. .. .. ..$ : chr [1:36601] "MIR1302-2HG" "FAM138A" "OR4F5" "AL627309.1" ...
  .. .. .. .. .. .. ..$ : chr [1:220107] "AAACCCAAGAGGCTGT-1_1" "AAACCCAAGCCGTCGT-1_1" "AAACCCAAGCTGACCC-1_1" 

In [27]:
def dataframe2list(data):
    # 2. 转换为字典格式
    data_to_save = []
    for index, row in data.iterrows():
        row_dict = row.to_dict()  # 将行转换为字典
        row_dict['row_name'] = index  # 添加行名
        data_to_save.append(row_dict)

    return data_to_save
    # # 保存为 JSON 文件
    # with open(f'/home/gy237/palmer_scratch/scFoundationModel_data/BICCN_data/test_data/HsCB.AllTypes.Clean.SeuratObj.rds/{name}.json', 'w') as json_file:
    #     json.dump(data_to_save, json_file, indent=4)

In [28]:
# 获取 metadata
metadata = seurat_obj.slots["meta.data"]
# print(metadata)
# 如果 metadata 还不是 pandas DataFrame，先将其转换为 DataFrame
metadata_df = pd.DataFrame(metadata)
metadata_list = dataframe2list(metadata_df)

In [None]:
active_ident = seurat_obj.slots["active.ident"]
# print(active_ident)
# 将 active_ident 转换为 R 向量
active_ident_vector = robjects.r['as.vector'](active_ident)

# 获取细胞 ID
cell_ids = active_ident.names

# 创建 Pandas DataFrame
active_ident_df = pd.DataFrame({
    'Cell ID': cell_ids,
    'Name': active_ident_vector
})
active_ident_list = dataframe2list(active_ident_df)

In [34]:
def merge_cell_annotation(metadata_list, annotation_list, output_path):
    print(len(metadata_list))
    print(len(annotation_list))

    cell_dic = {}
    for i in annotation_list:
        cell_dic[i['Cell ID']] = i['Name']
    print(len(cell_dic))

    output = []
    for j in metadata_list:
        j['Cell_Name'] = cell_dic[j['row_name']]
        output.append(j)
    print(len(output))

    with open(output_path, 'w') as json_file:
        json.dump(output, json_file, indent=4)
    with open('/home/gy237/palmer_scratch/scFoundationModel_data/BICCN_data/test_data/HsCB.AllTypes.Clean.SeuratObj.rds/test.json', 'w') as json_file:
        json.dump(output[:40], json_file, indent=4)

output_json_path = '/home/gy237/palmer_scratch/scFoundationModel_data/BICCN_data/test_data/HsCB.AllTypes.Clean.SeuratObj.rds/merged_metadata.json'
merge_cell_annotation(metadata_list, active_ident_list, output_json_path)

220107
220107
220107
220107


In [31]:
# get CellxGene dataframe

## get row_name: Cell_Name dictory
with open('/home/gy237/palmer_scratch/scFoundationModel_data/BICCN_data/test_data/HsCB.AllTypes.Clean.SeuratObj.rds/merged_metadata.json', 'r') as f:
    metadata = json.load(f)

cellname_dic = {}
for i in metadata:
    cellname_dic[i['row_name']] = i['Cell_Name']
print(len(cellname_dic))


# 提取 Seurat 对象中的 data 槽位
integrated_assay = seurat_obj.slots['assays'].rx2('integrated')
data_matrix = integrated_assay.slots['data']

# 提取行名和列名
gene_names = list(data_matrix.slots['Dimnames'][0])  # 基因
cell_names = list(data_matrix.slots['Dimnames'][1])  # 细胞

# 将 R 的 dgCMatrix 转换为 scipy 的 csc_matrix
sparse_matrix = csc_matrix((data_matrix.slots['x'], data_matrix.slots['i'], data_matrix.slots['p']),
                           shape=data_matrix.slots['Dim'])

# 检查矩阵的形状和名称列表的长度
print("Sparse matrix shape:", sparse_matrix.shape)
print("Number of cell names:", len(cell_names))
print("Number of gene names:", len(gene_names))

# 创建 DataFrame，行名是细胞，列名是基因
cellxgene_df = pd.DataFrame.sparse.from_spmatrix(sparse_matrix, index=gene_names, columns=cell_names)

# 打印或保存结果
# print(cellxgene_df)

# 使用 rename 方法替换行索引
cellxgene_df.rename(index=cellname_dic, inplace=True)

# 转换为常规 DataFrame
# dense_df = cellxgene_df.sparse.to_dense()
# print(dense_df)

# 查看替换后的 DataFrame
df.to_hdf('/home/gy237/palmer_scratch/scFoundationModel_data/BICCN_data/test_data/HsCB.AllTypes.Clean.SeuratObj.rds/cellxgene_df.h5', key='df', mode='w')

220107
Sparse matrix shape: (2000, 220107)
Number of cell names: 220107
Number of gene names: 2000


NameError: name 'df' is not defined