In [1]:
import os

import decoupler as dc
import pandas as pd
import scanpy as sc

In [3]:
# Restrict the atlas to genes listed in the TSS table, samples listed in the
# WGS sample sheet, and drop two cell populations.
adata = sc.read_h5ad('/media/NaturalPopulationCohort/scRNA_Data/NatualCohort_All_Annotation_Final_reUMAP.h5ad')
# Drop the layers while `adata` is still a real (non-view) object. AnnData views
# are copy-on-modify, so deleting layers after slicing would first copy the
# subset together with the layers that are about to be discarded.
del adata.layers

tss_bed = pd.read_csv('/media/scPBMC1_AnalysisDisk1/huangzhuoli/hw5_backup/gaoyue/health/Make_Gene_TSS/tss.bed',sep = ' ')
sample_with_genomics = pd.read_table('/CIMA/Data/413sample.txt')

# Genes that have an entry in the TSS table.
keep_genes = adata.var_names.isin(tss_bed['gene_id'])
# Cells from samples with WGS information, excluding HSPC_CD34 and
# unconvensional_T (spelling matches the label stored in `celltype_l1`).
keep_cells = (
    adata.obs['sample'].isin(sample_with_genomics['FID'])
    & (adata.obs['final_annotation'] != 'HSPC_CD34')
    & (adata.obs['celltype_l1'] != 'unconvensional_T')
).to_numpy()

# Slice once and materialise the result so later cells work on a real object
# rather than on a view of a view.
adata = adata[keep_cells, keep_genes].copy()

In [5]:
# Show the summary of the filtered AnnData object (shape, obs/var columns, uns keys).
adata

AnnData object with n_obs × n_vars = 5305776 × 35083
    obs: 'library', 'sample', 'doublet_scores', 'predicted_doublets', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_rb', 'pct_counts_rb', 'total_counts_ncRNA', 'pct_counts_ncRNA', 'total_counts_hb', 'pct_counts_hb', 'n_counts', 'n_genes', 'leiden_r1.5_n2', 'leiden_r2_n2', 'celltype_1st', 'celltype_2nd', 'celltype_3rd', 'celltype_l1', 'celltype_l2', 'celltype_l3', 'celltype_l4', 'SEACell_ID', 'SEACell_l3', 'SEACell_l4', 'final_annotation'
    var: 'mt', 'hb', 'rb', 'ncRNA', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'celltype_1st_colors', 'celltype_2nd_colors', 'celltype_3rd_colors', 'celltype_l1_colors', 'celltype_l3_colors', 'celltype_l4_colors', 'final_annotation_colors', 'hvg', 'leiden', 'leiden_r1.5_n2_colors', 'leiden_r2_n2_colors', 'log1p', 'myeloid_colo

In [4]:
def preprocess_RNA_data(
    adata_use,
    out_dir='/CIMA/Data/eQTL_L1',
    groupby='celltype_l1',
    sample_col='sample',
    min_cells=10,
    min_expr_frac=0.9,
    n_top=2000,
):
    """Write per-cell-type pseudobulk expression matrices to CSV.

    For every distinct value of ``adata_use.obs[groupby]`` the cells of that
    group are aggregated per sample with ``dc.get_pseudobulk(mode='mean')``.
    Genes that are non-zero in fewer than ``min_expr_frac`` of the pseudobulk
    samples are dropped. Two files are then written:

    * ``<out_dir>/pseudobulk/<celltype>.csv`` - all retained genes
    * ``<out_dir>/top<n_top>_pseudobulk/<celltype>.csv`` - the ``n_top`` genes
      with the highest coefficient of variation across samples

    No normalisation is applied here; the matrices are written as returned by
    the pseudobulk step. If no gene passes the expression filter, nothing is
    written for that cell type.

    Parameters
    ----------
    adata_use
        AnnData object whose ``obs`` contains ``groupby`` and ``sample_col``.
    out_dir
        Root directory for the two output folders (created if missing).
    groupby
        ``obs`` column that defines the cell types to iterate over.
    sample_col
        ``obs`` column identifying the sample; passed to ``get_pseudobulk``.
    min_cells
        Passed to ``get_pseudobulk`` as ``min_cells``.
    min_expr_frac
        Minimum fraction of pseudobulk samples in which a gene must be non-zero.
    n_top
        Number of highest-CV genes written to the ``top<n_top>_pseudobulk`` file.

    Returns
    -------
    None
    """
    pseudobulk_dir = os.path.join(out_dir, 'pseudobulk')
    top_dir = os.path.join(out_dir, f'top{n_top}_pseudobulk')
    for directory in (pseudobulk_dir, top_dir):
        os.makedirs(directory, exist_ok=True)

    for celltype in adata_use.obs[groupby].unique():
        print(f"正在处理 {celltype}", flush=True)
        # Subset to the current cell type.
        adata_use_celltype = adata_use[adata_use.obs[groupby] == celltype]

        # Aggregate cells into one mean-expression profile per sample.
        pdata = dc.get_pseudobulk(
            adata_use_celltype,
            sample_col=sample_col,
            groups_col=None,
            mode='mean',
            min_cells=min_cells,
            min_counts=0,
            min_prop=0,
            min_smpls=0)

        # Samples x genes matrix labelled with sample IDs and gene names.
        pseudo_matrix = pd.DataFrame(
            pdata.X, index=pdata.obs.index, columns=pdata.var_names)

        # Keep genes that are non-zero in at least `min_expr_frac` of samples.
        # The mask is applied positionally so column labels are never re-looked-up.
        non_zero_ratio = (pseudo_matrix != 0).mean()
        expressed = (non_zero_ratio >= min_expr_frac).to_numpy()
        pseudo_matrix = pseudo_matrix.loc[:, expressed]

        if pseudo_matrix.shape[1] > 0:
            # Coefficient of variation (in percent) per gene, highest first.
            cv = (pseudo_matrix.std() / pseudo_matrix.mean()).abs() * 100
            top_columns = cv.sort_values(ascending=False).index[:n_top]

            pseudo_matrix.to_csv(os.path.join(pseudobulk_dir, f'{celltype}.csv'))
            pseudo_matrix[top_columns].to_csv(os.path.join(top_dir, f'{celltype}.csv'))
        print(f"处理完成 {celltype}", flush=True)


In [7]:
# Export pseudobulk matrices for every `celltype_l1` value in the filtered atlas;
# progress messages are printed per cell type.
preprocess_RNA_data(adata_use=adata)

正在处理 CD8_T
处理完成 CD8_T
正在处理 Myeloid
处理完成 Myeloid
正在处理 CD4_T
处理完成 CD4_T
正在处理 NK&ILC
处理完成 NK&ILC
正在处理 B
处理完成 B


In [8]:
#解决R语言把“-”换成“."的问题
tss = pd.read_csv('/media/scPBMC1_AnalysisDisk1/huangzhuoli/hw5_backup/gaoyue/health/Make_Gene_TSS/tss.bed',sep = ' ')
#为了和R得到的结果相符合
tss['gene_id_map'] = tss['gene_id'].str.replace('-', '.', regex=False)
# 创建字典，映射 gene_id_map 到 gene_id，只包含 B 数据框中出现的列
gene_map_dict = pd.Series(tss.gene_id.values, index=tss.gene_id_map).to_dict()

In [9]:
# Build one phenotype BED file per cell type from the normalised expression CSVs.
# NOTE(review): the files under normal_dis/ are not written by any code visible
# in this notebook (and use 'NK' where the pseudobulk export uses 'NK&ILC') -
# confirm which step produces them.
normal_dir = '/CIMA/Data/eQTL_L1/normal_dis'
bed_dir = '/CIMA/Data/eQTL_L1/bed_file'
os.makedirs(bed_dir, exist_ok=True)

for cell in ['CD4_T','CD8_T','NK','Myeloid','B']:
    print(f'processing_{cell}')
    # Expression matrix with genes as columns; the first CSV column is the row index.
    norm_expr = pd.read_csv(f'{normal_dir}/{cell}.csv', index_col=0)

    # Column names carry "." where the gene ID has "-"; map them back.
    # Fail with the full picture instead of only the first missing key.
    unmapped = [col for col in norm_expr.columns if col not in gene_map_dict]
    if unmapped:
        raise KeyError(
            f"{cell}: {len(unmapped)} expression columns are missing from "
            f"gene_map_dict, e.g. {unmapped[:5]}")
    norm_expr = norm_expr.rename(columns=gene_map_dict)

    # Transpose to gene x sample and expose the gene ID as the 'index' column.
    norm_expr = norm_expr.T
    norm_expr = norm_expr.reset_index()

    # Attach the TSS columns to each gene.
    # NOTE(review): `tss` still carries the helper column `gene_id_map`, so it is
    # written into the BED file between `gene_id` and the first sample column.
    # Confirm the downstream QTL tool tolerates it, or drop it here.
    pheno_file = pd.merge(tss, norm_expr, right_on='index', left_on='gene_id', how='inner')
    pheno_file = pheno_file.drop(columns=['index']).rename(columns={'chr':'#chr'})
    # Strip the literal "chr" prefix from chromosome names.
    pheno_file['#chr'] = pheno_file['#chr'].str.replace('chr', '', regex=False)

    # Every expressed gene must have matched exactly one TSS row.
    if norm_expr.shape[0] != pheno_file.shape[0]:
        raise ValueError("The number of rows in the two dataframes are not equal.")
    pheno_file.to_csv(f'{bed_dir}/{cell}.bed', sep='\t', index=False)

processing_CD4_T
processing_CD8_T
processing_NK
processing_Myeloid
processing_B


In [13]:
# Spot-check a single gene. `pheno_file` here is the frame left behind by the
# last iteration of the export loop (cell == 'B'), not a freshly loaded file.
pheno_file[pheno_file['gene_id'] == 'CD3D']

Unnamed: 0,#chr,start,end,gene_id,gene_id_map,E-B21100458292,E-B21103279967,E-B21105477143,E-B21106792844,E-B21107799305,...,E-B21947952445,E-B21948759165,E-B21949041549,E-B21949919782,E-B21951228975,E-B21951946398,E-B21954745404,E-B21955109293,E-B21955692773,E-B21998005446
7903,11,118342743,118342744,CD3D,CD3D,-1.904923,0.828465,-0.275781,0.167894,0.45462,...,0.461494,0.055732,0.256544,0.111637,0.400363,-0.987379,0.703922,-0.56025,0.373681,-0.567506
