In [1]:
# 20250206
# Author: Zhuoli Huang
# Analysis Objective: Evaluate the enrichment of eGenes/caPeaks from celltype A in celltype B
# Algorithm Description:
# First, sort the eGenes of each celltype by p_nominal in ascending order, and then shuffle 10,000 times.
# In celltype A, select the significant eGenes and identify those that are detected (expressed) in celltype B, but not significant (A_sig_in_B_notsig).
# Count the number of eGenes in the A_sig_in_B_notsig group, denoted as n.
# Randomly select n genes from the shuffled ranks and compute, for each of these n genes, its mean rank across all shuffles (giving n mean ranks).
# Perform a t-test comparing the rank of A_sig_in_B_notsig in celltype B with the n shuffled mean ranks.
# If the test is significant, it suggests that eGenes significantly associated with celltype A may have effects in celltype B as well,
# but the lack of significance in celltype B may be due to factors such as sample size or other reasons.

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import multiprocessing

In [2]:
# Proportion of one cell type's significant eGenes/caPeaks that are also
# detected in another cell type (intersection size / number of significant IDs).
def calculate_proportion(sig_genes, detected_genes):
    """Return the fraction of ``sig_genes`` that also occur in ``detected_genes``.

    Both arguments are treated as sets, so repeated IDs are counted once in the
    numerator *and* in the denominator.

    Parameters
    ----------
    sig_genes : iterable of hashable
        IDs that are significant in the source cell type.
    detected_genes : iterable of hashable
        IDs that were detected in the target cell type.

    Returns
    -------
    float
        ``|sig ∩ detected| / |sig|``; ``nan`` when ``sig_genes`` is empty,
        because the proportion is undefined in that case.
    """
    sig_set = set(sig_genes)
    if not sig_set:
        # Avoid ZeroDivisionError for a cell type without significant IDs.
        return float('nan')
    shared = sig_set.intersection(set(detected_genes))
    return len(shared) / len(sig_set)

# Shuffle helper
def shuffle_ranks(xqtl_df, cell_type, num_shuffles=10000):
    """Permute the within-cell-type ranks of one cell type repeatedly.

    Rows of ``xqtl_df`` whose ``celltype`` equals ``cell_type`` are selected and
    their ``rank_in_celltype`` values are permuted ``num_shuffles`` times.
    Shuffle number ``i`` is drawn with ``random_state=i``, so repeated calls
    give the same permutations.

    Parameters
    ----------
    xqtl_df : pandas.DataFrame
        Must provide the columns ``celltype``, ``phenotype_id`` and
        ``rank_in_celltype``.
    cell_type : object
        Value of ``celltype`` to select.
    num_shuffles : int, default 10000
        Number of permutations to generate.

    Returns
    -------
    tuple
        ``(cell_type, rank_df)``. ``rank_df`` has one row per shuffle and one
        column per ``phenotype_id`` (columns keep the row order of the selected
        data); each row holds the permuted rank values.
    """
    selected = xqtl_df.loc[xqtl_df['celltype'] == cell_type]

    # Ranks labelled by phenotype_id; the labels become the output columns.
    ranks_by_phenotype = pd.Series(
        selected['rank_in_celltype'].values,
        index=selected['phenotype_id'].values,
    )

    # One permuted rank vector per seed 0 .. num_shuffles - 1.
    permuted_rows = [
        ranks_by_phenotype.sample(frac=1, random_state=seed).values
        for seed in range(num_shuffles)
    ]

    return cell_type, pd.DataFrame(permuted_rows, columns=ranks_by_phenotype.index)

# Parallel shuffle helper
def parallel_shuffle_ranks(xqtl_df, num_shuffles=10000):
    """Run ``shuffle_ranks`` for every cell type in a process pool.

    Each worker receives only the rows of its own cell type and only the three
    columns ``shuffle_ranks`` reads, instead of the complete table, so far less
    data has to be pickled and sent to the pool.

    Parameters
    ----------
    xqtl_df : pandas.DataFrame
        Must provide the columns ``celltype``, ``phenotype_id`` and
        ``rank_in_celltype``.
    num_shuffles : int, default 10000
        Number of permutations per cell type.

    Returns
    -------
    dict
        Maps each distinct ``celltype`` value (in order of first appearance)
        to the shuffled-rank DataFrame returned by ``shuffle_ranks``.

    Notes
    -----
    NOTE(review): the worker function is defined in the notebook itself; with
    the "spawn" start method child processes may be unable to import it -
    confirm the platform uses "fork" or move the function into a module.
    """
    needed_columns = ['celltype', 'phenotype_id', 'rank_in_celltype']

    # One task per cell type, carrying just that cell type's slice.
    tasks = [
        (xqtl_df.loc[xqtl_df['celltype'] == cell_type, needed_columns], cell_type, num_shuffles)
        for cell_type in xqtl_df['celltype'].unique()
    ]
    if not tasks:
        # Nothing to shuffle; do not start a pool.
        return {}

    # Never start more workers than there are tasks.
    n_workers = min(multiprocessing.cpu_count(), len(tasks))
    with multiprocessing.Pool(processes=n_workers) as pool:
        results = pool.starmap(shuffle_ranks, tasks)

    # key: cell type, value: its shuffled-rank DataFrame
    return {cell_type: rank_df for cell_type, rank_df in results}

# eQTL 数据处理

In [None]:
# Load the cis-eQTL results of all cell types.
cis_df_all_eQTL = pd.read_csv(
    '/CIMA/Result/20250108_cis_eQTL_all.csv',
    index_col=0,
)

# eGenes detected in each cell type (Series indexed by cell type).
detected_list = cis_df_all_eQTL.groupby('celltype')['phenotype_id'].unique()

# Rank the eGenes inside each cell type by nominal p-value, smallest first.
# Tied p-values receive their average rank, which the int cast then truncates.
cis_df_all_eQTL['rank_in_celltype'] = (
    cis_df_all_eQTL
    .groupby('celltype')['pval_nominal']
    .rank(method='average', ascending=True)
    .astype('int')
)

# Keep the rows that pass the study-wide q-value threshold ...
cis_df_all_eQTL_sig = cis_df_all_eQTL.loc[cis_df_all_eQTL['study_wise_qval'] < 0.05].copy()
# ... and collect the study-wide significant eGenes of each cell type.
sig_list = cis_df_all_eQTL_sig.groupby('celltype')['phenotype_id'].unique()

In [4]:
# Detection-proportion matrix as floats.
# Rows: cell type that supplies the significant eGenes.
# Columns: cell type in which detection of those eGenes is checked.
proportion_df = pd.DataFrame(
    [
        [
            calculate_proportion(sig_list[sig_index], detected_list[detected_index])
            for detected_index in detected_list.index
        ]
        for sig_index in sig_list.index
    ],
    index=sig_list.index,
    columns=detected_list.index,
    dtype=float,
)

In [5]:
# Build the shuffled-rank table of every cell type from the already loaded eQTL
# frame (10,000 shuffles per cell type); the returned dict is keyed by cell type.
result_dict = parallel_shuffle_ranks(xqtl_df = cis_df_all_eQTL, num_shuffles=10000)

In [46]:
# Enrichment of eGenes that are significant in cell type A (row) but detected and
# NOT significant in cell type B (column).
# number_df   : size n of that gene set for every (A, B) pair.
# enrich_p_df : one-sided t-test p-value for every (A, B) pair.
number_df = pd.DataFrame(index=sig_list.index, columns=detected_list.index)
enrich_p_df = pd.DataFrame(index=sig_list.index, columns=detected_list.index)

# Outer loop: the cell type B in which the genes are looked up.
for detected_index in detected_list.index:
    # Observed ranks of cell type B, labelled by phenotype_id.
    cis_df_all_eQTL_use = cis_df_all_eQTL[cis_df_all_eQTL['celltype'] == detected_index]
    rank_series = pd.Series(
        cis_df_all_eQTL_use['rank_in_celltype'].values,
        index=cis_df_all_eQTL_use['phenotype_id'].values,
    )

    # Genes detected in B that are not significant in B. A cell type without any
    # significant gene is missing from sig_list (it is built from the filtered
    # frame), so fall back to an empty set instead of raising KeyError.
    if detected_index in sig_list.index:
        sig_in_detected = set(sig_list[detected_index])
    else:
        sig_in_detected = set()
    detected_not_sig = set(detected_list[detected_index]) - sig_in_detected

    # Inner loop: the cell type A that supplies the significant genes.
    for sig_index in sig_list.index:
        # A_sig_in_B_notsig: significant in A, detected but not significant in B.
        intersection = list(set(sig_list[sig_index]) & detected_not_sig)
        number_df.loc[sig_index, detected_index] = len(intersection)

        # A two-sample t-test is undefined with fewer than two observations per
        # group (this includes the diagonal, where the set is empty): store NaN.
        if len(intersection) < 2:
            enrich_p_df.loc[sig_index, detected_index] = np.nan
            continue

        # Re-seed for every pair so each p-value is reproducible on its own.
        np.random.seed(42)
        # Draw as many genes as the set holds from all genes detected in B.
        selected_genes = np.random.choice(detected_list[detected_index], size=len(intersection), replace=False)

        # Shuffled ranks of the drawn genes: one row per shuffle, one column per gene.
        rank_df_use = result_dict[detected_index][selected_genes]

        # Per-gene mean shuffled rank (column-wise mean) versus the observed ranks.
        # alternative='greater' asks whether the shuffled means exceed the observed
        # ranks, i.e. whether the observed genes rank closer to the top.
        p = stats.ttest_ind(
            rank_df_use.mean().values,
            rank_series[intersection].values,
            alternative='greater',
        ).pvalue
        enrich_p_df.loc[sig_index, detected_index] = p

# Every cell has been filled; store the results with numeric dtypes.
number_df = number_df.astype(int)
enrich_p_df = enrich_p_df.astype(float)

# Rows are the cell types supplying the significant genes; columns are the cell types in which they are tested.

In [None]:
# Write the three eGene summary tables (detection proportion, set size, p-value) to CSV.
for summary_table, csv_path in (
    (proportion_df, '/CIMA/Result/summary/20250206_eGene_enrich_a_sig_in_B_detected_percentage.csv'),
    (number_df, '/CIMA/Result/summary/20250206_eGene_enrich_number.csv'),
    (enrich_p_df, '/CIMA/Result/summary/20250206_eGene_enrich_p_df.csv'),
):
    summary_table.to_csv(csv_path)

# caQTL数据处理

In [None]:
# Load the cis-caQTL results of all cell types.
cis_df_all_caQTL = pd.read_csv(
    '/CIMA/Result/20250108_cis_caQTL_all.csv',
    index_col=0,
)

# caPeaks detected in each cell type (Series indexed by cell type).
detected_list = cis_df_all_caQTL.groupby('celltype')['phenotype_id'].unique()

# Rank the caPeaks inside each cell type by nominal p-value, smallest first.
# Tied p-values receive their average rank, which the int cast then truncates.
cis_df_all_caQTL['rank_in_celltype'] = (
    cis_df_all_caQTL
    .groupby('celltype')['pval_nominal']
    .rank(method='average', ascending=True)
    .astype('int')
)

# Keep the rows that pass the study-wide q-value threshold ...
cis_df_all_caQTL_sig = cis_df_all_caQTL.loc[cis_df_all_caQTL['study_wise_qval'] < 0.05].copy()
# ... and collect the study-wide significant caPeaks of each cell type.
sig_list = cis_df_all_caQTL_sig.groupby('celltype')['phenotype_id'].unique()

In [10]:
# Detection-proportion matrix as floats.
# Rows: cell type that supplies the significant caPeaks.
# Columns: cell type in which detection of those caPeaks is checked.
proportion_df = pd.DataFrame(
    [
        [
            calculate_proportion(sig_list[sig_index], detected_list[detected_index])
            for detected_index in detected_list.index
        ]
        for sig_index in sig_list.index
    ],
    index=sig_list.index,
    columns=detected_list.index,
    dtype=float,
)

In [11]:
# Build the shuffled-rank table of every cell type from the already loaded caQTL
# frame (10,000 shuffles per cell type); the returned dict is keyed by cell type.
result_dict = parallel_shuffle_ranks(xqtl_df = cis_df_all_caQTL, num_shuffles=10000)

In [12]:
# Enrichment of caPeaks that are significant in cell type A (row) but detected and
# NOT significant in cell type B (column).
# number_df   : size n of that peak set for every (A, B) pair.
# enrich_p_df : one-sided t-test p-value for every (A, B) pair.
number_df = pd.DataFrame(index=sig_list.index, columns=detected_list.index)
enrich_p_df = pd.DataFrame(index=sig_list.index, columns=detected_list.index)

# Outer loop: the cell type B in which the peaks are looked up.
for detected_index in detected_list.index:
    # Observed ranks of cell type B, labelled by phenotype_id.
    cis_df_all_caQTL_use = cis_df_all_caQTL[cis_df_all_caQTL['celltype'] == detected_index]
    rank_series = pd.Series(
        cis_df_all_caQTL_use['rank_in_celltype'].values,
        index=cis_df_all_caQTL_use['phenotype_id'].values,
    )

    # Peaks detected in B that are not significant in B. A cell type without any
    # significant peak is missing from sig_list (it is built from the filtered
    # frame), so fall back to an empty set instead of raising KeyError.
    if detected_index in sig_list.index:
        sig_in_detected = set(sig_list[detected_index])
    else:
        sig_in_detected = set()
    detected_not_sig = set(detected_list[detected_index]) - sig_in_detected

    # Inner loop: the cell type A that supplies the significant peaks.
    for sig_index in sig_list.index:
        # A_sig_in_B_notsig: significant in A, detected but not significant in B.
        intersection = list(set(sig_list[sig_index]) & detected_not_sig)
        number_df.loc[sig_index, detected_index] = len(intersection)

        # A two-sample t-test is undefined with fewer than two observations per
        # group (this includes the diagonal, where the set is empty): store NaN.
        if len(intersection) < 2:
            enrich_p_df.loc[sig_index, detected_index] = np.nan
            continue

        # Re-seed for every pair so each p-value is reproducible on its own.
        np.random.seed(42)
        # Draw as many peaks as the set holds from all peaks detected in B.
        selected_genes = np.random.choice(detected_list[detected_index], size=len(intersection), replace=False)

        # Shuffled ranks of the drawn peaks: one row per shuffle, one column per peak.
        rank_df_use = result_dict[detected_index][selected_genes]

        # Per-peak mean shuffled rank (column-wise mean) versus the observed ranks.
        # alternative='greater' asks whether the shuffled means exceed the observed
        # ranks, i.e. whether the observed peaks rank closer to the top.
        p = stats.ttest_ind(
            rank_df_use.mean().values,
            rank_series[intersection].values,
            alternative='greater',
        ).pvalue
        enrich_p_df.loc[sig_index, detected_index] = p

# Every cell has been filled; store the results with numeric dtypes.
number_df = number_df.astype(int)
enrich_p_df = enrich_p_df.astype(float)

In [None]:
# Write the three caPeak summary tables (detection proportion, set size, p-value) to CSV.
for summary_table, csv_path in (
    (proportion_df, '/CIMA/Result/summary/20250206_caPeak_enrich_a_sig_in_B_detected_percentage.csv'),
    (number_df, '/CIMA/Result/summary/20250206_caPeak_enrich_number.csv'),
    (enrich_p_df, '/CIMA/Result/summary/20250206_caPeak_enrich_p_df.csv'),
):
    summary_table.to_csv(csv_path)