In [1]:
gwas_file = "/home/xutingfeng/GIFT/data/GWAS/T1Mapping_Cortex_20240129.csv_firstorder_Median_all_2023_GRCh38_unionKidneys.tsv.gz"
save_dir = "/home/xutingfeng/GIFT/data/analysis/DNAJC16/coloc"
from pathlib import Path

Path(save_dir).mkdir(parents=True, exist_ok=True)

In [2]:
from finemap_tools.reader.gwas import load_GWASFormated_file
from finemap_tools.utils import add_ID
from finemap_tools.snpfilter import filter_pipline


topLoci = (1, 15583355, 15583355)  # (chr, start, end)
locus_range = 1000  # 100kb
locus_range = locus_range * 1000
locus_range_tuple = (topLoci[0], topLoci[1] - locus_range, topLoci[2] + locus_range)

locus_region = f"{locus_range_tuple[0]}:{locus_range_tuple[1]}-{locus_range_tuple[2]}"
print(f"locus_region: {locus_region}")


sumstats = load_GWASFormated_file(gwas_file, region=locus_region)  # load

locus_region: 1:14583355-16583355
Will load region from file.


In [3]:
sumstats

Unnamed: 0,chrom,pos,ref,alt,pval,af,beta,sebeta
0,1,14583587,G,A,0.826078,0.158770,0.005861,0.026674
1,1,14583907,T,C,0.131199,0.000677,-0.556895,0.368953
2,1,14583992,G,A,0.292274,0.000678,0.388817,0.369195
3,1,14584191,G,A,0.125667,0.001244,-0.423208,0.276351
4,1,14584347,C,T,0.161867,0.005443,0.183348,0.131073
...,...,...,...,...,...,...,...,...
16201,1,16582952,G,T,0.158365,0.060092,0.074419,0.052757
16202,1,16583062,G,A,0.302697,0.000915,0.331578,0.321712
16203,1,16583105,A,C,0.986135,0.011341,-0.001656,0.095266
16204,1,16583157,G,A,0.591985,0.002474,-0.106121,0.198001


In [4]:
default_coloc_map = {
    "snp": "SNP",
    "chr": "chrom",
    "position": "pos",
    "beta": "beta",
    "varbeta": None,
}


def sumstats2coloc(sumstats, map_dict=None):
    sumstats = sumstats.copy()
    polyfun_cols = list(default_coloc_map.keys())
    used_polyfun_map_dict = default_coloc_map.copy()
    if map_dict:
        used_polyfun_map_dict.update(map_dict)

    for col in polyfun_cols:
        current_check_col = used_polyfun_map_dict.get(col, None)

        if (current_check_col not in sumstats.columns) | (current_check_col is None):
            if col == "varbeta":
                # assert beta and se in map_dict
                assert (
                    "sebeta" in used_polyfun_map_dict.keys()
                ), "sebeta not in map_dict"
                sumstats["varbeta"] = sumstats[used_polyfun_map_dict["sebeta"]] ** 2

                used_polyfun_map_dict["varbeta"] = "varbeta"
            else:
                raise ValueError(f"{current_check_col} not in sumstats")

    rename_dict = {v: k for k, v in used_polyfun_map_dict.items() if v is not None}
    sumstats.rename(columns=rename_dict, inplace=True)
    return sumstats[
        list(rename_dict.values())
        + [col for col in sumstats.columns if col not in rename_dict.values()]
    ]


necessary_check_cols = ["beta", "varbeta"]

In [5]:
sumstats["SNP"] = add_ID(
    sumstats,
    col_list=["chrom", "pos", "ref", "alt"],
)
sumstats_coloc = sumstats2coloc(sumstats, map_dict={"sebeta": "sebeta"})
is_na_nums_sumstats = sumstats_coloc[necessary_check_cols].isna().sum()
print(f"NA nums in sumstats: {is_na_nums_sumstats} at cols: {necessary_check_cols}")
sumstats_coloc = sumstats_coloc.dropna(subset=necessary_check_cols)
if (dup_nums := sumstats_coloc.duplicated(subset=["snp"]).sum()) > 0:
    print(f"duplicated SNPs in sumstats {dup_nums}")
    sumstats_coloc = sumstats_coloc.drop_duplicates(subset=["SNP"])

sumstats_coloc

NA nums in sumstats: beta       0
varbeta    0
dtype: int64 at cols: ['beta', 'varbeta']


Unnamed: 0,snp,chr,position,beta,varbeta,sebeta,ref,alt,pval,af
0,1:14583587:G:A,1,14583587,0.005861,0.000711,0.026674,G,A,0.826078,0.158770
1,1:14583907:T:C,1,14583907,-0.556895,0.136126,0.368953,T,C,0.131199,0.000677
2,1:14583992:G:A,1,14583992,0.388817,0.136305,0.369195,G,A,0.292274,0.000678
3,1:14584191:G:A,1,14584191,-0.423208,0.076370,0.276351,G,A,0.125667,0.001244
4,1:14584347:C:T,1,14584347,0.183348,0.017180,0.131073,C,T,0.161867,0.005443
...,...,...,...,...,...,...,...,...,...,...
16201,1:16582952:G:T,1,16582952,0.074419,0.002783,0.052757,G,T,0.158365,0.060092
16202,1:16583062:G:A,1,16583062,0.331578,0.103499,0.321712,G,A,0.302697,0.000915
16203,1:16583105:A:C,1,16583105,-0.001656,0.009076,0.095266,A,C,0.986135,0.011341
16204,1:16583157:G:A,1,16583157,-0.106121,0.039204,0.198001,G,A,0.591985,0.002474


In [6]:
from finemap_tools.reader.eQTL import GTEx_tabix_reader


eQTL_path = "/home/xutingfeng/GIFT/GTEx_V8/ge/Kidney_Cortex.tsv.gz"

eQTL_df = GTEx_tabix_reader(eQTL_path, region=locus_region)


eQTL_df["snp"] = add_ID(eQTL_df, ["chromosome", "position", "ref", "alt"])

GTEx_coloc_map_dict = {
    "snp": "snp",
    "chr": "chromosome",
    "position": "position",
    "beta": "beta",
    "sebeta": "se",
}
eQTL_df = sumstats2coloc(eQTL_df, map_dict=GTEx_coloc_map_dict)
is_na_num_eqtl = eQTL_df[necessary_check_cols].isna().any(axis=1).sum()
print(f"eQTL: {is_na_num_eqtl} rows with NA in {necessary_check_cols}")
eQTL_df = eQTL_df.dropna(subset=necessary_check_cols)

if (
    dup_nums := eQTL_df.duplicated(
        subset=[
            "snp",
            "gene_id",
        ]
    ).sum()
) > 0:
    print(f"duplicated SNPs in eQTL {dup_nums}")
    eQTL_df = eQTL_df.drop_duplicates(
        subset=[
            "snp",
            "gene_id",
        ]
    )

eQTL_df
# eQTL_df

Will load region from file.
eQTL: 10677 rows with NA in ['beta', 'varbeta']
duplicated SNPs in eQTL 310


Unnamed: 0,snp,chr,position,beta,varbeta,sebeta,variant,r2,pvalue,molecular_trait_object_id,molecular_trait_id,maf,gene_id,median_tpm,an,ac,ref,alt,type,rsid
0,1:14583587:G:A,1,14583587,-0.523009,0.038039,0.195036,chr1_14583587_G_A,,0.010016,ENSG00000215704,ENSG00000215704,0.191781,ENSG00000215704,0.109959,146.0,28,G,A,SNP,rs34982023
1,1:14583587:G:A,1,14583587,-0.494978,0.034707,0.186299,chr1_14583587_G_A,,0.010676,ENSG00000162438,ENSG00000162438,0.191781,ENSG00000162438,0.193525,146.0,28,G,A,SNP,rs34982023
2,1:14583587:G:A,1,14583587,-0.231571,0.014802,0.121662,chr1_14583587_G_A,,0.062993,ENSG00000142634,ENSG00000142634,0.191781,ENSG00000142634,18.635300,146.0,28,G,A,SNP,rs34982023
3,1:14583587:G:A,1,14583587,-0.275230,0.022320,0.149399,chr1_14583587_G_A,,0.071621,ENSG00000175147,ENSG00000175147,0.191781,ENSG00000175147,3.083490,146.0,28,G,A,SNP,rs34982023
4,1:14583587:G:A,1,14583587,-0.141635,0.013845,0.117663,chr1_14583587_G_A,,0.234596,ENSG00000171729,ENSG00000171729,0.191781,ENSG00000171729,30.311600,146.0,28,G,A,SNP,rs34982023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386642,1:16583157:G:A,1,16583157,-0.123758,0.745492,0.863419,chr1_16583157_G_A,,0.886626,ENSG00000224459,ENSG00000224459,0.013699,ENSG00000224459,0.121452,146.0,2,G,A,SNP,rs558467297
386643,1:16583157:G:A,1,16583157,0.095035,0.488745,0.699103,chr1_16583157_G_A,,0.892439,ENSG00000233954,ENSG00000233954,0.013699,ENSG00000233954,4.323420,146.0,2,G,A,SNP,rs558467297
386644,1:16583157:G:A,1,16583157,0.040898,0.191090,0.437138,chr1_16583157_G_A,,0.925850,ENSG00000162458,ENSG00000162458,0.013699,ENSG00000162458,17.314400,146.0,2,G,A,SNP,rs558467297
386645,1:16583157:G:A,1,16583157,0.028620,0.267724,0.517421,chr1_16583157_G_A,,0.956119,ENSG00000183888,ENSG00000183888,0.013699,ENSG00000183888,0.725811,146.0,2,G,A,SNP,rs558467297


In [7]:
intersection = set(sumstats_coloc["snp"]).intersection(set(eQTL_df["snp"]))
print(f"intersection: {len(intersection)}")
eQTL_df_gene = eQTL_df[eQTL_df["snp"].isin(intersection)]
sumstats_coloc = sumstats_coloc[sumstats_coloc["snp"].isin(intersection)]
sumstats_coloc

intersection: 8132


Unnamed: 0,snp,chr,position,beta,varbeta,sebeta,ref,alt,pval,af
0,1:14583587:G:A,1,14583587,0.005861,0.000711,0.026674,G,A,0.826078,0.158770
6,1:14584424:G:A,1,14584424,-0.027372,0.000395,0.019872,G,A,0.168390,0.398367
7,1:14584509:T:C,1,14584509,-0.224028,0.046741,0.216197,T,C,0.300099,0.002376
10,1:14584737:T:C,1,14584737,-0.005055,0.000680,0.026071,T,C,0.846272,0.833258
12,1:14585405:C:T,1,14585405,0.004201,0.000699,0.026432,C,T,0.873712,0.163672
...,...,...,...,...,...,...,...,...,...,...
16171,1:16577626:T:G,1,16577626,0.048317,0.001663,0.040774,T,G,0.236014,0.659827
16181,1:16579584:T:A,1,16579584,-0.174521,0.035891,0.189450,T,A,0.356948,0.002712
16185,1:16580288:T:C,1,16580288,-0.159916,0.037436,0.193484,T,C,0.408515,0.002593
16201,1:16582952:G:T,1,16582952,0.074419,0.002783,0.052757,G,T,0.158365,0.060092


In [8]:
eQTL_df

Unnamed: 0,snp,chr,position,beta,varbeta,sebeta,variant,r2,pvalue,molecular_trait_object_id,molecular_trait_id,maf,gene_id,median_tpm,an,ac,ref,alt,type,rsid
0,1:14583587:G:A,1,14583587,-0.523009,0.038039,0.195036,chr1_14583587_G_A,,0.010016,ENSG00000215704,ENSG00000215704,0.191781,ENSG00000215704,0.109959,146.0,28,G,A,SNP,rs34982023
1,1:14583587:G:A,1,14583587,-0.494978,0.034707,0.186299,chr1_14583587_G_A,,0.010676,ENSG00000162438,ENSG00000162438,0.191781,ENSG00000162438,0.193525,146.0,28,G,A,SNP,rs34982023
2,1:14583587:G:A,1,14583587,-0.231571,0.014802,0.121662,chr1_14583587_G_A,,0.062993,ENSG00000142634,ENSG00000142634,0.191781,ENSG00000142634,18.635300,146.0,28,G,A,SNP,rs34982023
3,1:14583587:G:A,1,14583587,-0.275230,0.022320,0.149399,chr1_14583587_G_A,,0.071621,ENSG00000175147,ENSG00000175147,0.191781,ENSG00000175147,3.083490,146.0,28,G,A,SNP,rs34982023
4,1:14583587:G:A,1,14583587,-0.141635,0.013845,0.117663,chr1_14583587_G_A,,0.234596,ENSG00000171729,ENSG00000171729,0.191781,ENSG00000171729,30.311600,146.0,28,G,A,SNP,rs34982023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386642,1:16583157:G:A,1,16583157,-0.123758,0.745492,0.863419,chr1_16583157_G_A,,0.886626,ENSG00000224459,ENSG00000224459,0.013699,ENSG00000224459,0.121452,146.0,2,G,A,SNP,rs558467297
386643,1:16583157:G:A,1,16583157,0.095035,0.488745,0.699103,chr1_16583157_G_A,,0.892439,ENSG00000233954,ENSG00000233954,0.013699,ENSG00000233954,4.323420,146.0,2,G,A,SNP,rs558467297
386644,1:16583157:G:A,1,16583157,0.040898,0.191090,0.437138,chr1_16583157_G_A,,0.925850,ENSG00000162458,ENSG00000162458,0.013699,ENSG00000162458,17.314400,146.0,2,G,A,SNP,rs558467297
386645,1:16583157:G:A,1,16583157,0.028620,0.267724,0.517421,chr1_16583157_G_A,,0.956119,ENSG00000183888,ENSG00000183888,0.013699,ENSG00000183888,0.725811,146.0,2,G,A,SNP,rs558467297


In [9]:
sumstats_coloc.reset_index().to_feather(f"{save_dir}/sumstats_coloc.feather")
eQTL_df.reset_index().to_feather(f"{save_dir}/eQTL_coloc.feather")

测试rpy2

In [10]:
from rpy2 import robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

pandas2ri.activate()

In [11]:
r = robjects.r
r.source("2.2_coloc_analysis.R")  # load script

R[write to console]: This is coloc version 5.2.3

R[write to console]: 
Attaching package: ‘dplyr’


R[write to console]: The following objects are masked from ‘package:stats’:

    filter, lag


R[write to console]: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


R[write to console]: 

R[write to console]: Registered S3 methods overwritten by 'treeio':
  method              from    
  MRCA.phylo          tidytree
  MRCA.treedata       tidytree
  Nnode.treedata      tidytree
  Ntip.treedata       tidytree
  ancestor.phylo      tidytree
  ancestor.treedata   tidytree
  child.phylo         tidytree
  child.treedata      tidytree
  full_join.phylo     tidytree
  full_join.treedata  tidytree
  groupClade.phylo    tidytree
  groupClade.treedata tidytree
  groupOTU.phylo      tidytree
  groupOTU.treedata   tidytree
  is.rooted.treedata  tidytree
  nodeid.phylo        tidytree
  nodeid.treedata     tidytree
  nodelab.phylo       tidytree
  node

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

<rpy2.robjects.vectors.ListVector object at 0x7f77b5b967c0> [RTYPES.VECSXP]
R classes: ('list',)
[SexpClosure, BoolSexpVector]
  value: <class 'rpy2.rinterface.SexpClosure'>
  <rpy2.rinterface.SexpClosure object at 0x7f77b5af17c0> [RTYPES.CLOSXP]
  visible: <class 'rpy2.rinterface.BoolSexpVector'>
  <rpy2.rinterface.BoolSexpVector object at 0x7f77b5afba40> [RTYPES.LGLSXP]

In [12]:
eQTL_df["gene_id"].unique()

array(['ENSG00000215704', 'ENSG00000162438', 'ENSG00000142634',
       'ENSG00000175147', 'ENSG00000171729', 'ENSG00000278480',
       'ENSG00000116138', 'ENSG00000116731', 'ENSG00000189337',
       'ENSG00000132906', 'ENSG00000142621', 'ENSG00000233485',
       'ENSG00000272510', 'ENSG00000142615', 'ENSG00000116771',
       'ENSG00000235084', 'ENSG00000237301', 'ENSG00000197312',
       'ENSG00000116786', 'ENSG00000162461', 'ENSG00000237938',
       'ENSG00000162460', 'ENSG00000224459', 'ENSG00000162458',
       'ENSG00000224321', 'ENSG00000233954', 'ENSG00000178715',
       'ENSG00000179743', 'ENSG00000065526', 'ENSG00000116809',
       'ENSG00000183888', 'ENSG00000186510', 'ENSG00000173641',
       'ENSG00000232456', 'ENSG00000184908', 'ENSG00000185519',
       'ENSG00000227959', 'ENSG00000142627', 'ENSG00000224621',
       'ENSG00000142632', 'ENSG00000237276', 'ENSG00000132881',
       'ENSG00000233929', 'ENSG00000037637', 'ENSG00000055070',
       'ENSG00000157191', 'ENSG000002260

In [13]:
test_df = eQTL_df[eQTL_df["gene_id"].isin(["ENSG00000142623", "ENSG00000159339"])]

In [31]:
res = r.runColocAnalysis(
    data_gwas=sumstats_coloc,
    data_eQTLs=test_df,
    output_folder="./test",
    n_gwas=4869,
    n_eQTL=73,
    gwas_sdY=1,
    gwas_type="quant",
)

[1] "Gene ID: ENSG00000142623"
[1] "Number of SNPs in common: 1135"
PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf 
   0.7850    0.1020    0.0907    0.0118    0.0102 
[1] "PP abf for shared variant: 1.02%"


R[write to console]: 'select()' returned 1:1 mapping between keys and columns



Joining with `by = join_by(snp)`
[1] "Gene: PADI1 PP.H4.abf 0.010165684705165"
[1] "Gene ID: ENSG00000159339"
[1] "Number of SNPs in common: 790"
PP.H0.abf PP.H1.abf PP.H2.abf PP.H3.abf PP.H4.abf 
   0.8410    0.0782    0.0668    0.0062    0.0078 
[1] "PP abf for shared variant: 0.78%"


R[write to console]: 'select()' returned 1:1 mapping between keys and columns



Joining with `by = join_by(snp)`
[1] "Gene: PADI4 PP.H4.abf 0.00779744751547604"


In [32]:
pandas2ri.ri2py_vector(res[1][0])

snp,beta_eQTL,varbeta_eQTL,...,varbeta_gwas,pvalue_gwas,PADI1_PP.H4
...,...,...,...,...,...,...


In [58]:
from rpy2.robjects.conversion import localconverter
import rpy2.robjects as ro

with localconverter(ro.default_converter + pandas2ri.converter):
    res_0 = res[0]
    res_1 = []
    for k in res[1].keys():
        pp_h4 = res[1][k].columns[-1]
        symbol_name = pp_h4.split("_")[0]
        current_df = res[1][k][
            ["snp", "beta_eQTL", "varbeta_eQTL", "pvalue_eQTL", pp_h4]
        ].rename(
            columns={
                "beta_eQTL": f"beta_eQTL_{symbol_name}",
                "varbeta_eQTL": f"varbeta_eQTL_{symbol_name}",
                "pvalue_eQTL": f"pvalue_eQTL_{symbol_name}",
            }
        )
        res_1.append(current_df)

In [59]:
from functools import reduce


tes = reduce(lambda x, y: x.merge(y, on=["snp"]), res_1)
tes

Unnamed: 0,snp,beta_eQTL_PADI1,varbeta_eQTL_PADI1,pvalue_eQTL_PADI1,PADI1_PP.H4,beta_eQTL_PADI4,varbeta_eQTL_PADI4,pvalue_eQTL_PADI4,PADI4_PP.H4
0,1:16308428:C:T,-0.140821,0.034713,0.453445,0.000103,-0.317598,0.040203,0.119765,0.000203
1,1:16308692:A:G,-1.309590,0.657946,0.112972,0.000832,1.071350,0.811516,0.240183,0.001142
2,1:16309214:C:T,0.004877,0.454699,0.994260,0.000712,-0.191289,0.546856,0.796992,0.000996
3,1:16309548:A:T,0.118970,0.029544,0.492176,0.000094,0.094605,0.035750,0.619112,0.000126
4,1:16309919:A:G,0.106196,0.026472,0.517063,0.000091,0.095910,0.031973,0.594171,0.000124
...,...,...,...,...,...,...,...,...,...
785,1:16577626:T:G,-0.260081,0.023916,0.099109,0.000543,0.248346,0.029215,0.152743,0.000647
786,1:16579584:T:A,-0.163994,0.675490,0.842687,0.000704,-0.637943,0.805722,0.480706,0.000989
787,1:16580288:T:C,-0.163994,0.675490,0.842687,0.000685,-0.637943,0.805722,0.480706,0.000962
788,1:16582952:G:T,-0.283698,0.050032,0.210796,0.000660,-0.172939,0.061652,0.489477,0.000793


In [61]:
sumstats_coloc.rename(columns={"beta": "beta_GWAS", "varbeta": "varbeta_GWAS"}).merge(
    tes, how="inner"
)

Unnamed: 0,snp,chr,position,beta_GWAS,varbeta_GWAS,sebeta,ref,alt,pval,af,beta_eQTL_PADI1,varbeta_eQTL_PADI1,pvalue_eQTL_PADI1,PADI1_PP.H4,beta_eQTL_PADI4,varbeta_eQTL_PADI4,pvalue_eQTL_PADI4,PADI4_PP.H4
0,1:16308428:C:T,1,16308428,-0.004002,0.000503,0.022424,C,T,0.858357,0.241968,-0.140821,0.034713,0.453445,0.000103,-0.317598,0.040203,0.119765,0.000203
1,1:16308692:A:G,1,16308692,0.138096,0.010390,0.101932,A,G,0.175486,0.009110,-1.309590,0.657946,0.112972,0.000832,1.071350,0.811516,0.240183,0.001142
2,1:16309214:C:T,1,16309214,0.186176,0.127144,0.356572,C,T,0.601582,0.000790,0.004877,0.454699,0.994260,0.000712,-0.191289,0.546856,0.796992,0.000996
3,1:16309548:A:T,1,16309548,0.007775,0.000399,0.019973,A,T,0.697084,0.368201,0.118970,0.029544,0.492176,0.000094,0.094605,0.035750,0.619112,0.000126
4,1:16309919:A:G,1,16309919,0.007188,0.000400,0.020010,A,G,0.719427,0.367149,0.106196,0.026472,0.517063,0.000091,0.095910,0.031973,0.594171,0.000124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
785,1:16577626:T:G,1,16577626,0.048317,0.001663,0.040774,T,G,0.236014,0.659827,-0.260081,0.023916,0.099109,0.000543,0.248346,0.029215,0.152743,0.000647
786,1:16579584:T:A,1,16579584,-0.174521,0.035891,0.189450,T,A,0.356948,0.002712,-0.163994,0.675490,0.842687,0.000704,-0.637943,0.805722,0.480706,0.000989
787,1:16580288:T:C,1,16580288,-0.159916,0.037436,0.193484,T,C,0.408515,0.002593,-0.163994,0.675490,0.842687,0.000685,-0.637943,0.805722,0.480706,0.000962
788,1:16582952:G:T,1,16582952,0.074419,0.002783,0.052757,G,T,0.158365,0.060092,-0.283698,0.050032,0.210796,0.000660,-0.172939,0.061652,0.489477,0.000793
