In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import multiprocessing

In [2]:
#这个值是每个celltype中的显著eGene/caPeak在另外一个细胞类型中被检测的比例（交集数/显著数量）
# 定义计算基因交集比例的函数
def calculate_proportion(sig_genes, detected_genes):
    intersection = len(set(sig_genes).intersection(set(detected_genes)))
    return intersection / len(sig_genes)

#洗牌函数
def shuffle_ranks(xqtl_df, cell_type, num_shuffles=10000):
    # 选择特定细胞类型的数据
    use_xqtl_df = xqtl_df[xqtl_df['celltype'] == cell_type].copy()

    # 创建一个以phenotype_id为索引的rank_series
    rank_series = pd.Series(use_xqtl_df['rank_in_celltype'].values, index=use_xqtl_df['phenotype_id'].values)

    # 用于保存每次shuffle的rank数据
    rank_data = []

    # 执行num_shuffles次随机打乱
    for i in range(num_shuffles):
        shuffled_series = rank_series.sample(frac=1, random_state=i)  # 随机打乱
        ranks = shuffled_series.values  # 获取打乱后的rank
        rank_data.append(ranks)

    # 将每次shuffle的rank数据存储到DataFrame
    rank_df = pd.DataFrame(rank_data, columns=rank_series.index)

    return cell_type, rank_df

#并发洗牌函数
def parallel_shuffle_ranks(xqtl_df, num_shuffles=10000):
    # 获取所有独特的celltype
    celltypes = xqtl_df['celltype'].unique()

    # 创建进程池
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        results = pool.starmap(shuffle_ranks, [(xqtl_df, cell_type, num_shuffles) for cell_type in celltypes])

    # 将结果转化为字典，key是celltype，value是对应的rank_df
    result_dict = {cell_type: rank_df for cell_type, rank_df in results}

    return result_dict

# onek1k数据处理

In [31]:
cis_df_all_eQTL = pd.read_csv('/CIMA/Result/eQTL_L1_downstream/20250219_cis_eQTL_leal_all.csv',index_col=0)
#各个细胞类型的eGene按照p_value从小到大排名
cis_df_all_eQTL['rank_in_celltype'] = cis_df_all_eQTL.groupby('celltype')['pval_nominal'].rank(method='average', ascending=True).astype('int')
cis_df_all_eQTL_sig = cis_df_all_eQTL[cis_df_all_eQTL['study_wise_qval'] < 0.05].copy()
#每个细胞类型的study-wide显著eGene
sig_list_own = cis_df_all_eQTL_sig.groupby('celltype')['phenotype_id'].unique()

In [32]:
#每个细胞类型检测的eGene
detected_list = cis_df_all_eQTL.groupby('celltype')['phenotype_id'].unique()

In [61]:
onek1k = pd.read_csv('/CIMA/Data/public_eQTL/20250218_file_38_for_comparasion/20250218_onek1k_leap_snp.csv',index_col=0)
onek1k = onek1k[onek1k['GENE'].isin(eQTL_own['phenotype_id'])]
#每个细胞类型的study-wide显著eGene
sig_list = onek1k.groupby('celltype_map')['GENE'].unique()

In [22]:
# 假设xqtl_df是已经加载的数据框
result_dict = parallel_shuffle_ranks(xqtl_df = cis_df_all_eQTL, num_shuffles=10000)

In [40]:
# 创建一个空的数据框，用于存储每个显著基因在不同细胞类型中的数量
number_df = pd.DataFrame(index=sig_list.index, columns=detected_list.index)
# 创建一个空的数据框，用于存储每个显著基因在不同细胞类型中的富集p值
enrich_p_df = pd.DataFrame(index=sig_list.index, columns=detected_list.index)

# 遍历所有被检测的细胞类型
for detected_index in detected_list.index:
    # 筛选出当前细胞类型的数据
    cis_df_all_eQTL_use = cis_df_all_eQTL[cis_df_all_eQTL['celltype'] == detected_index].copy()
    
    # 创建一个以phenotype_id为索引的rank_series
    rank_series = pd.Series(cis_df_all_eQTL_use['rank_in_celltype'].values, index=cis_df_all_eQTL_use['phenotype_id'].values)
    
    # 遍历所有显著基因集合
    for sig_index in sig_list.index:
        # 计算显著基因集合与当前细胞类型中检测到的基因集合的交集
        intersection = set(sig_list[sig_index]).intersection(set(detected_list[detected_index]))
        # 移除当前细胞类型中已经显著的基因
        intersection = list(intersection - set(sig_list_own[detected_index]))
        
        # 将交集的基因数目记录在number_df中
        number_df.loc[sig_index, detected_index] = len(intersection)
        
        np.random.seed(42)
        # 随机选择与交集大小相同数量的基因
        selected_genes = np.random.choice(detected_list[detected_index], size=len(intersection), replace=False)
        
        # 获取这些随机选择的基因在该细胞类型中的rank数据
        rank_df_use = result_dict[detected_index][selected_genes]
        
        # 进行t检验，比较显著基因在该细胞类型中的rank与随机选择基因的rank均值
        t, p = stats.ttest_ind(rank_df_use.mean().values, rank_series[intersection].values,alternative='greater')
        
        # 将t检验的p值存储在enrich_p_df中
        enrich_p_df.loc[sig_index, detected_index] = p

In [41]:
#没有显著结果
enrich_p_df

celltype,B,CD4_T,CD8_T,Myeloid,NK
celltype_map,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
B,1.0,1.0,1.0,1.0,1.0
CD4_T,1.0,1.0,1.0,1.0,1.0
CD8_T,1.0,1.0,1.0,1.0,1.0
Myeloid,1.0,1.0,1.0,1.0,1.0
NK,1.0,1.0,1.0,1.0,1.0


# immuenexut_数据处理

In [42]:
cis_df_all_eQTL = pd.read_csv('/CIMA/Result/eQTL_L1_downstream/20250219_cis_eQTL_leal_all.csv',index_col=0)
#各个细胞类型的eGene按照p_value从小到大排名
cis_df_all_eQTL['rank_in_celltype'] = cis_df_all_eQTL.groupby('celltype')['pval_nominal'].rank(method='average', ascending=True).astype('int')
cis_df_all_eQTL_sig = cis_df_all_eQTL[cis_df_all_eQTL['study_wise_qval'] < 0.05].copy()
#每个细胞类型的study-wide显著eGene
sig_list_own = cis_df_all_eQTL_sig.groupby('celltype')['phenotype_id'].unique()

In [43]:
#每个细胞类型检测的eGene
detected_list = cis_df_all_eQTL.groupby('celltype')['phenotype_id'].unique()

In [46]:
immuenexut = pd.read_csv('/CIMA/Data/public_eQTL/20250218_file_38_for_comparasion/20250219_immuenexut_all_lead_snp.csv',index_col=0)
immuenexut = immuenexut[immuenexut['Gene_name'].isin(eQTL_own['phenotype_id'])]
#每个细胞类型的study-wide显著eGene
sig_list = immuenexut.groupby('celltype_map')['Gene_name'].unique()

In [60]:
len(sig_list['B'])

9955

In [56]:
# 创建一个空的数据框，用于存储每个显著基因在不同细胞类型中的数量
number_df = pd.DataFrame(index=sig_list.index, columns=detected_list.index)
# 创建一个空的数据框，用于存储每个显著基因在不同细胞类型中的富集p值
enrich_p_df = pd.DataFrame(index=sig_list.index, columns=detected_list.index)

# 遍历所有被检测的细胞类型
for detected_index in detected_list.index:
    # 筛选出当前细胞类型的数据
    cis_df_all_eQTL_use = cis_df_all_eQTL[cis_df_all_eQTL['celltype'] == detected_index].copy()
    
    # 创建一个以phenotype_id为索引的rank_series
    rank_series = pd.Series(cis_df_all_eQTL_use['rank_in_celltype'].values, index=cis_df_all_eQTL_use['phenotype_id'].values)
    
    # 遍历所有显著基因集合
    for sig_index in sig_list.index:
        # 计算显著基因集合与当前细胞类型中检测到的基因集合的交集
        intersection = set(sig_list[sig_index]).intersection(set(detected_list[detected_index]))
        # 移除当前细胞类型中已经显著的基因
        intersection = list(intersection - set(sig_list_own[detected_index]))
        
        # 将交集的基因数目记录在number_df中
        number_df.loc[sig_index, detected_index] = len(intersection)
        
        np.random.seed(42)
        # 随机选择与交集大小相同数量的基因
        selected_genes = np.random.choice(detected_list[detected_index], size=len(intersection), replace=False)
        
        # 获取这些随机选择的基因在该细胞类型中的rank数据
        rank_df_use = result_dict[detected_index][selected_genes]
        
        # 进行t检验，比较显著基因在该细胞类型中的rank与随机选择基因的rank均值
        t, p = stats.ttest_ind(rank_df_use.mean().values, rank_series[intersection].values,alternative='greater')
        
        # 将t检验的p值存储在enrich_p_df中
        enrich_p_df.loc[sig_index, detected_index] = p

In [57]:
#得到的结果是不显著的
enrich_p_df

celltype,B,CD4_T,CD8_T,Myeloid,NK
celltype_map,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
B,1.0,1.0,1.0,1.0,1.0
CD4_T,1.0,1.0,1.0,1.0,1.0
CD8_T,1.0,1.0,1.0,1.0,1.0
Myeloid,1.0,1.0,1.0,1.0,1.0
NK,1.0,1.0,1.0,1.0,1.0
