In [70]:
import pandas as pd
import numpy as np
import os

In [5]:
# Root directory of the per-cell-type eQTL results; process_celltype() lists the
# .parquet files found under '<eQTL_dir><celltype>'.
eQTL_dir = '/CIMA/Result/eQTL_tensorqtl_L1/'
# Directory into which process_celltype() writes one merged '<celltype>.parquet'.
output_dir = '/CIMA/Result/eQTL_tensorqtl_mergenominal_L1/'

In [6]:
import concurrent.futures

def _chunk_order_key(filename):
    """Return the sort key for one per-chunk parquet file name.

    The chunk label is the third dot-separated field of the file name
    (index 2). An optional ``chr`` prefix is ignored. Integer labels sort
    numerically and come first; any other label (e.g. ``X``) sorts after
    them, alphabetically.

    Raises:
        ValueError: if the name has no chunk field in front of the extension.
    """
    fields = filename.split('.')
    # At least '<a>.<b>.<chunk>.parquet' is required so that index 2 is never
    # the file extension itself.
    if len(fields) < 4:
        raise ValueError(f'unexpected parquet file name (no chunk field): {filename!r}')
    label = fields[2]
    if label.lower().startswith('chr'):
        label = label[3:]
    try:
        return (0, int(label), '')
    except ValueError:
        return (1, 0, label)


def process_celltype(celltype, eQTL_dir, output_dir):
    """Concatenate all parquet chunks of one cell type into a single parquet file.

    Args:
        celltype: name of the sub-directory of ``eQTL_dir`` to read, and stem
            of the output file.
        eQTL_dir: directory containing one sub-directory per cell type.
        output_dir: directory receiving ``<celltype>.parquet``; created if it
            does not exist.

    Raises:
        ValueError: if the cell-type directory holds no ``.parquet`` file, or
            a file name does not carry a chunk field.
    """
    print(f'processing_{celltype}')
    # os.path.join works whether or not eQTL_dir ends with a path separator.
    celltype_dir = os.path.join(eQTL_dir, celltype)
    parquet_files = sorted(
        (name for name in os.listdir(celltype_dir) if name.endswith('.parquet')),
        key=_chunk_order_key,
    )
    if not parquet_files:
        # Fail with a message that names the directory instead of letting
        # pd.concat complain about an empty list.
        raise ValueError(f'no .parquet files found in {celltype_dir}')

    eQTL_full = pd.concat(
        [pd.read_parquet(os.path.join(celltype_dir, name)) for name in parquet_files],
        ignore_index=True,
    )

    os.makedirs(output_dir, exist_ok=True)
    eQTL_full.to_parquet(os.path.join(output_dir, f'{celltype}.parquet'))

def process_all_celltypes(eQTL_dir, output_dir, celltypes, max_workers=10):
    """Run process_celltype() for every cell type on a thread pool.

    Args:
        eQTL_dir: passed through to process_celltype().
        output_dir: passed through to process_celltype().
        celltypes: iterable of cell-type names, one task per name.
        max_workers: maximum number of worker threads (default 10).

    Raises:
        Exception: the first exception (in submission order) raised by any
            process_celltype() call is re-raised here.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_celltype, celltype, eQTL_dir, output_dir)
            for celltype in celltypes
        ]
        # A worker exception is only surfaced when its result is retrieved;
        # without this loop a failed cell type would go unnoticed.
        for future in futures:
            future.result()

In [8]:
# L1 cell-type labels handed to process_all_celltypes(); each one names a
# sub-directory of eQTL_dir and the stem of a merged parquet file.
celltypes_L1 = [
    "B",
    "CD4_T",
    "CD8_T",
    "Myeloid",
    "NK",
]

In [9]:
# Merge the parquet chunks of every L1 cell type into one file per cell type.
process_all_celltypes(
    eQTL_dir=eQTL_dir,
    output_dir=output_dir,
    celltypes=celltypes_L1,
)

processing_B
processing_CD4_T
processing_CD8_T
processing_Myeloid
processing_NK


# 合并ONEK1K位点

In [58]:
# Read the tab-separated, header-less .bim variant table, label its six columns
# and keep only the variant ID plus the two allele columns.
# NOTE(review): in PLINK's .bim layout the third column (labelled 'unknown'
# here) is the genetic position; it is dropped either way. Labelling allele
# column 5 as the effect allele is an assumption of this notebook — confirm.
bim_file = (
    pd.read_csv('/CIMA/genetics/qc/10.maf01.bim', sep='\t', header=None)
    .set_axis(['chr', 'variant_id', 'unknown', 'pos', 'eff_own', 'other_own'], axis=1)
    .loc[:, ['variant_id', 'eff_own', 'other_own']]
)

In [59]:
# Load the OneK1K lead-SNP table and reshape it to the comparison schema:
#   pair          '<GENE>_<CHR>_<POS>' join key
#   celltype_A    public cell-type label (source column 'celltype_map')
#   eff_public    source column 'A2'
#   other_public  source column 'A1'
#   beta_public   source column 'SPEARMANS_RHO' (a rank correlation kept under
#                 the 'beta' name)
# NOTE(review): A2 is treated as the effect allele — confirm against the
# OneK1K data description.
onek1k_lead_snp = (
    pd.read_csv(
        '/CIMA/Data/public_eQTL/20250218_file_38_for_comparasion/20250218_onek1k_leap_snp.csv',
        index_col=0,
    )
    .rename(columns={'celltype_map': 'celltype_A'})
    .assign(pair=lambda tbl: tbl['GENE'] + '_' + tbl['CHR'] + '_' + tbl['POS'].astype(str))
    .loc[:, ['pair', 'celltype_A', 'A2', 'A1', 'SPEARMANS_RHO']]
    .set_axis(['pair', 'celltype_A', 'eff_public', 'other_public', 'beta_public'], axis=1)
)

In [61]:
# For every L1 cell type, join our nominal eQTL statistics with the OneK1K
# lead SNPs on the '<phenotype_id>_<variant_id>' pair key.
df_list = []
for celltype in ["B","CD4_T","CD8_T","Myeloid","NK"]:
    print(f'processing_{celltype}')
    nominal_P = pd.read_parquet(f'/CIMA/Result/eQTL_tensorqtl_mergenominal_L1/{celltype}.parquet')
    # Drop rows whose pair is absent from the public table *before* the merges:
    # the final inner merge would discard them anyway, and the allele merge
    # then runs on a small frame instead of the full nominal table.
    pair_key = nominal_P['phenotype_id'] + '_' + nominal_P['variant_id']
    nominal_P = nominal_P[pair_key.isin(onek1k_lead_snp['pair'])]
    # Attach our own effect/other alleles (inner: variants missing from the
    # .bim table are dropped).
    nominal_P = pd.merge(nominal_P, bim_file, on='variant_id', how='inner')
    nominal_P['celltype_B'] = f'{celltype}_own'
    # 'pair' is (re)built here, after the allele merge, so the output column
    # order stays: nominal columns, alleles, celltype_B, pair, public columns.
    nominal_P['pair'] = nominal_P['phenotype_id'] + '_' + nominal_P['variant_id']
    # Inner merge: keep only the pairs present in both tables.
    nominal_merged_df = pd.merge(nominal_P, onek1k_lead_snp, on='pair', how='inner')
    df_list.append(nominal_merged_df)

processing_B
processing_CD4_T
processing_CD8_T
processing_Myeloid
processing_NK


In [78]:
# Stack the per-cell-type merge results and label every row with its
# '<public celltype>_xxx_<own celltype>' combination.
final_df = pd.concat(df_list, ignore_index=True).assign(
    celltype_pair=lambda merged: merged['celltype_A'] + '_xxx_' + merged['celltype_B']
)
# Sanity checks: size of the smallest combination, and number of combinations.
print(min(final_df.groupby('celltype_pair').size()))
print(final_df['celltype_pair'].nunique())

460
25


In [80]:
# Save the OneK1K-vs-own comparison table built above as CSV.
final_df.to_csv('/CIMA/Result/eQTL_L1_downstream/20250219_onek1k_vs_own.csv')

# 合并ImmuNexUT位点

In [81]:
# Read the tab-separated, header-less .bim variant table, label its six columns
# and keep only the variant ID plus the two allele columns.
# NOTE(review): in PLINK's .bim layout the third column (labelled 'unknown'
# here) is the genetic position; it is dropped either way. Labelling allele
# column 5 as the effect allele is an assumption of this notebook — confirm.
bim_file = (
    pd.read_csv('/CIMA/genetics/qc/10.maf01.bim', sep='\t', header=None)
    .set_axis(['chr', 'variant_id', 'unknown', 'pos', 'eff_own', 'other_own'], axis=1)
    .loc[:, ['variant_id', 'eff_own', 'other_own']]
)

In [97]:
# Load the ImmuNexUT lead-SNP table and reshape it to the comparison schema:
#   pair          '<Gene_name>_<CHR>_<Variant_position_start>' join key
#   celltype_A    public cell-type label (source column 'celltype_map')
#   eff_public    source column 'ALT'
#   other_public  source column 'REF'
#   beta_public   source column 'slope(ALT)'
immuenexut_lead_snp = (
    pd.read_csv(
        '/CIMA/Data/public_eQTL/20250218_file_38_for_comparasion/20250219_immuenexut_all_lead_snp.csv',
        index_col=0,
    )
    .rename(columns={'celltype_map': 'celltype_A'})
    .assign(
        pair=lambda tbl: tbl['Gene_name'] + '_' + tbl['CHR'] + '_'
        + tbl['Variant_position_start'].astype(str)
    )
    .loc[:, ['pair', 'celltype_A', 'ALT', 'REF', 'slope(ALT)']]
    .set_axis(['pair', 'celltype_A', 'eff_public', 'other_public', 'beta_public'], axis=1)
)

In [98]:
# For every L1 cell type, join our nominal eQTL statistics with the ImmuNexUT
# lead SNPs on the '<phenotype_id>_<variant_id>' pair key.
df_list = []
for celltype in ["B","CD4_T","CD8_T","Myeloid","NK"]:
    print(f'processing_{celltype}')
    nominal_P = pd.read_parquet(f'/CIMA/Result/eQTL_tensorqtl_mergenominal_L1/{celltype}.parquet')
    # Drop rows whose pair is absent from the public table *before* the merges:
    # the final inner merge would discard them anyway, and the allele merge
    # then runs on a small frame instead of the full nominal table.
    pair_key = nominal_P['phenotype_id'] + '_' + nominal_P['variant_id']
    nominal_P = nominal_P[pair_key.isin(immuenexut_lead_snp['pair'])]
    # Attach our own effect/other alleles (inner: variants missing from the
    # .bim table are dropped).
    nominal_P = pd.merge(nominal_P, bim_file, on='variant_id', how='inner')
    nominal_P['celltype_B'] = f'{celltype}_own'
    # 'pair' is (re)built here, after the allele merge, so the output column
    # order stays: nominal columns, alleles, celltype_B, pair, public columns.
    nominal_P['pair'] = nominal_P['phenotype_id'] + '_' + nominal_P['variant_id']
    # Inner merge: keep only the pairs present in both tables.
    nominal_merged_df = pd.merge(nominal_P, immuenexut_lead_snp, on='pair', how='inner')
    df_list.append(nominal_merged_df)

processing_B
processing_CD4_T
processing_CD8_T
processing_Myeloid
processing_NK


In [107]:
# Stack the per-cell-type merge results and label every row with its
# '<public celltype>_xxx_<own celltype>' combination.
final_df = pd.concat(df_list, ignore_index=True).assign(
    celltype_pair=lambda merged: merged['celltype_A'] + '_xxx_' + merged['celltype_B']
)
# Sanity checks: size of the smallest combination, and number of combinations.
print(min(final_df.groupby('celltype_pair').size()))
print(final_df['celltype_pair'].nunique())

3677
25


In [108]:
# Save the ImmuNexUT-vs-own comparison table built above as CSV.
final_df.to_csv('/CIMA/Result/eQTL_L1_downstream/20250219_immuenexut_vs_own.csv')