In [1]:
import scanpy as sc
import pandas as pd
import decoupler as dc
from sklearn.preprocessing import QuantileTransformer
from multiprocessing import Pool, cpu_count
from functools import partial
#from sklearn.preprocessing import QuantileTransformer, StandardScaler
#quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=0)
#scaler = StandardScaler()

In [None]:
# Restrict the atlas to (a) genes listed in the TSS table, (b) samples listed in
# the genotyped-sample table and (c) cell types listed in the cell-type table.
# NOTE(review): per the original notes, tss.bed excludes genes on "other"
# chromosomes, 413sample.txt lists samples with WGS data, and the cell-type
# table lists types with enough pseudobulk samples -- confirm against the files.
tss_bed = pd.read_csv('/CIMA/Make_Gene_TSS/tss.bed', sep=' ')
sample_with_genomics = pd.read_table('/CIMA/Data/413sample.txt')
celltype_over_70 = pd.read_csv('/CIMA/Data/20241230_xQTL_cell_sample_number/20241230_celltype_70_scRNA.csv')

adata = sc.read_h5ad('/CIMA/scRNA_Data/NatualCohort_All_Annotation_Final_reUMAP.h5ad')

# Build both axis masks on the freshly loaded object, then subset once.
gene_mask = adata.var_names.isin(tss_bed['gene_id'])
cell_mask = (
    adata.obs['sample'].isin(sample_with_genomics['FID'])
    & adata.obs['final_annotation'].isin(celltype_over_70['final_annotation'])
)
adata = adata[cell_mask, gene_mask]

# Layers are not used downstream in this notebook; drop them.
del adata.layers

In [None]:
def preprocess_RNA_data(adata_use, out_dir='/CIMA/Data/eQTL', min_cells=10,
                        min_expr_ratio=0.9, n_top=2000):
    """Export per-cell-type pseudobulk expression matrices as CSV files.

    For every non-missing label in ``adata_use.obs['final_annotation']`` the
    cells of that type are aggregated per sample with
    ``dc.get_pseudobulk(mode='mean')``. Features that are non-zero in fewer than
    ``min_expr_ratio`` of the resulting pseudobulk samples are dropped, and two
    files are written (rows = pseudobulk samples, columns = features):

    * ``<out_dir>/pseudobulk/<celltype>.csv`` -- every retained feature
    * ``<out_dir>/top<n_top>_pseudobulk/<celltype>.csv`` -- the ``n_top``
      retained features with the largest coefficient of variation (CV)

    The values are written exactly as returned by the aggregation; no
    normalisation or scaling is applied here. A cell type for which no feature
    passes the expression filter produces no files.

    Parameters
    ----------
    adata_use
        AnnData-like object with ``obs['final_annotation']`` and
        ``obs['sample']`` columns.
    out_dir : str
        Root directory for the two output folders; they are created if absent.
    min_cells : int
        Forwarded to ``dc.get_pseudobulk`` as ``min_cells``.
    min_expr_ratio : float
        Minimum fraction of pseudobulk samples in which a feature must be
        non-zero to be kept.
    n_top : int
        Number of highest-CV features written to the "top" file. It is also
        part of that folder's name, so the default keeps ``top2000_pseudobulk``.

    Returns
    -------
    None
    """
    import os  # local import: the notebook's import cell does not provide it

    full_dir = os.path.join(out_dir, 'pseudobulk')
    top_dir = os.path.join(out_dir, f'top{n_top}_pseudobulk')
    # to_csv does not create parent directories, so make sure they exist.
    os.makedirs(full_dir, exist_ok=True)
    os.makedirs(top_dir, exist_ok=True)

    # A missing label never matches the `==` filter below, so skip it up front.
    for celltype in adata_use.obs['final_annotation'].dropna().unique():
        print(f"正在处理 {celltype}", flush=True)
        # Subset to the current cell type.
        adata_use_celltype = adata_use[adata_use.obs['final_annotation'] == celltype]

        # Aggregate cells into one profile per sample.
        pdata = dc.get_pseudobulk(
            adata_use_celltype,
            sample_col='sample',
            groups_col=None,
            mode='mean',
            min_cells=min_cells,
            min_counts=0,
            min_prop=0,
            min_smpls=0)

        # Samples x features table.
        pseudo_matrix = pd.DataFrame(
            pdata.X, index=pdata.obs.index, columns=pdata.var_names)

        # Keep features expressed (non-zero) in enough samples. A positional
        # boolean mask is used instead of label lookup so that duplicated
        # feature names cannot multiply columns.
        expressed_ratio = (pseudo_matrix != 0).mean()
        keep_mask = (expressed_ratio >= min_expr_ratio).to_numpy()
        pseudo_matrix = pseudo_matrix.loc[:, keep_mask]

        if pseudo_matrix.shape[1] > 0:
            # Coefficient of variation (in percent) of each retained feature.
            cv = (pseudo_matrix.std() / pseudo_matrix.mean()).abs() * 100
            # Rank by position (NaN CVs sort last); slicing already copes with
            # fewer than n_top features.
            ranked_positions = (
                cv.reset_index(drop=True)
                .sort_values(ascending=False)
                .index[:n_top]
                .tolist()
            )
            pseudo_matrix.to_csv(os.path.join(full_dir, f'{celltype}.csv'))
            pseudo_matrix.iloc[:, ranked_positions].to_csv(
                os.path.join(top_dir, f'{celltype}.csv'))
        else:
            # Make the skip visible in the log instead of failing silently.
            print(f"跳过 {celltype}: 没有特征通过表达比例过滤", flush=True)
        print(f"处理完成 {celltype}", flush=True)


In [None]:
# Disabled code kept as a bare string literal: a multiprocessing variant
# (`process_celltype` + `preprocess_xQTL_data`) of the per-cell-type export.
# Nothing inside the quotes is executed.
# NOTE(review): before re-enabling, note that the quoted code uses
# `quantile_transformer` and `StandardScaler`, whose import/definition lines are
# commented out in the import cell, and that it calls
# `StandardScaler(normalized_pseudo_matrix)` rather than fitting a scaler.
'''
def process_celltype(celltype,adata_use):
    print(f"正在处理 {celltype}", flush=True)
    #分小类
    adata_use_celltype = adata_use[adata_use.obs['final_annotation'] == celltype]

    #算伪Bulk
    pdata = dc.get_pseudobulk(
        adata_use_celltype,
        sample_col='sample',
        groups_col=None,
        mode='mean',
        min_cells=10,
        min_counts=0,
        min_prop=0,
        min_smpls=0)
        
    #生成伪bulk矩阵
    pseudo_matrix = pd.DataFrame(pdata.X)
    pseudo_matrix.columns = pdata.var_names
    pseudo_matrix.index = pdata.obs.index

    #只选取在90%的样本中都表达的特征
    non_zero_ratio = (pseudo_matrix != 0).mean()
    columns_to_keep = non_zero_ratio[non_zero_ratio >= 0.9].index
    pseudo_matrix = pseudo_matrix[columns_to_keep]

    # 计算每列的均值和标准差
    means = pseudo_matrix.mean()
    stds = pseudo_matrix.std()
    # 计算变异系数
    cv = (stds / means).abs() * 100
    # 按变异系数从大到小排序
    sorted_columns = cv.sort_values(ascending=False).index
    # 选取变异系数最高的前2000列
    top_2000_columns = sorted_columns[:min(2000,pseudo_matrix.shape[1])]

    if pseudo_matrix.shape[1] > 0:
        normalized_pseudo_matrix = quantile_transformer.fit_transform(pseudo_matrix)
        normalized_pseudo_matrix = StandardScaler(normalized_pseudo_matrix)
        normalized_pseudo_matrix = pd.DataFrame(normalized_pseudo_matrix, columns=pseudo_matrix.columns,index=pseudo_matrix.index)
        normalized_pseudo_matrix.to_csv(f'/CIMA/Data/eQTL/normal_dis/{celltype}.csv')
        normalized_pseudo_matrix[top_2000_columns].to_csv(f'/media/AnalysisDisk1/Huangzhuoli/caQTL_pseudobulk_matrix/normal_dis_for_peer/{celltype}.csv')
    
    print(f"处理完成 {celltype}", flush=True)

def preprocess_xQTL_data(adata_all):
    # Get the unique cell types
    celltypes = list(adata_all.obs['final_annotation'].unique())
    # Create a pool of processes
    with Pool(30) as pool:
        # Map the cell types to the processing function
        process_func = partial(process_celltype, adata_use=adata_all)
        pool.map(process_func, celltypes)
'''

In [12]:
# Run the export on the filtered atlas; one "正在处理"/"处理完成" line pair is
# printed per cell type and CSVs are written under /CIMA/Data/eQTL/.
preprocess_RNA_data(adata)

正在处理 CD8_Tem_CCR7neg
处理完成 CD8_Tem_CCR7neg
正在处理 cMono_CD14
处理完成 cMono_CD14
正在处理 cMono_IL1B
处理完成 cMono_IL1B
正在处理 CD4_Tn_CCR7
处理完成 CD4_Tn_CCR7
正在处理 CD4_Tem_CCR7neg
处理完成 CD4_Tem_CCR7neg
正在处理 Mature_NK_dim_FCGR3A
处理完成 Mature_NK_dim_FCGR3A
正在处理 CD4_Tcm_IFI44L
处理完成 CD4_Tcm_IFI44L
正在处理 Atypical_Bm_ITGAX
处理完成 Atypical_Bm_ITGAX
正在处理 CD4_Th1-like_GZMK
处理完成 CD4_Th1-like_GZMK
正在处理 NK_bright_XCL1
处理完成 NK_bright_XCL1
正在处理 CD4_Th17-like_RORC
处理完成 CD4_Th17-like_RORC
正在处理 cMono_CXCL10
处理完成 cMono_CXCL10
正在处理 CD8_CTL_GZMB
处理完成 CD8_CTL_GZMB
正在处理 Terminal_NK_dim_CD160neg
处理完成 Terminal_NK_dim_CD160neg
正在处理 Switched_Bm_IGHDneg
处理完成 Switched_Bm_IGHDneg
正在处理 cMono_IFI44L
处理完成 cMono_IFI44L
正在处理 CD4_Tcm_CXCR5
处理完成 CD4_Tcm_CXCR5
正在处理 NKT_NCR1
处理完成 NKT_NCR1
正在处理 CD4_CTL_GZMH
处理完成 CD4_CTL_GZMH
正在处理 ncMono_C1QA
处理完成 ncMono_C1QA
正在处理 ncMono_FCGR3A
处理完成 ncMono_FCGR3A
正在处理 gdT2_IL12RB2
处理完成 gdT2_IL12RB2
正在处理 intMono_GFRA2
处理完成 intMono_GFRA2
正在处理 CD8_Tn_CCR7
处理完成 CD8_Tn_CCR7
正在处理 CD4_Th_LMNA
处理完成 CD4_Th_LMNA
正在处理 Switche

In [None]:
# Disabled scratch code kept as a bare string literal: pseudobulk + expression
# filter for the single cell type 'CD4_CTL_GZMH', dumped to a test CSV.
# Nothing inside the quotes is executed (and, as written, its indentation would
# not parse as top-level code).
'''
test
        adata_use_celltype = adata[adata.obs['final_annotation'] == 'CD4_CTL_GZMH']

        #算伪Bulk
        pdata = dc.get_pseudobulk(
            adata_use_celltype,
            sample_col='sample',
            groups_col=None,
            mode='mean',
            min_cells=10,
            min_counts=0,
            min_prop=0,
            min_smpls=0)
        
        #生成伪bulk矩阵
        pseudo_matrix = pd.DataFrame(pdata.X)
        pseudo_matrix.columns = pdata.var_names
        pseudo_matrix.index = pdata.obs.index

        #只选取在90%的样本中都表达的特征
        non_zero_ratio = (pseudo_matrix != 0).mean()
        columns_to_keep = non_zero_ratio[non_zero_ratio >= 0.9].index
        pseudo_matrix = pseudo_matrix[columns_to_keep]

pseudo_matrix.to_csv('/CIMA/Data/test_file/CD4_CTL_GZMH_test.csv')
'''