In [1]:
# GWAS summary-statistics file that is loaded for the locus further down.
gwas_file = "/home/xutingfeng/GIFT/data/GWAS/T1Mapping_Cortex_20240129.csv_firstorder_Median_all_2023_GRCh38_unionKidneys.tsv.gz"

# Common prefix of the genotype files. Alternative location kept for reference:
#   /home/xutingfeng/GIFT/data/bgen/DNAJC16
bfile_path = "/mnt/d/桌面/work/GIFT/data/pgen/DNAJC16_GRCh38"

# Genotype, sample and index files all share the prefix above.
bgen_path = bfile_path + ".bgen"
sample_path = bfile_path + ".sample"
bgi_path = bgen_path + ".bgi"

In [2]:
from finemap_tools.reader.gwas import load_GWASFormated_file


# Lead variant of the locus, given as (chromosome, start, end) in base pairs.
topLoci = (1, 15583355, 15583355)  # (chr, start, end)

# Flank added on each side of the lead variant. The size is kept in kb under its
# own name so the unit of ``locus_range`` (base pairs) is never ambiguous.
locus_range_kb = 250  # 250 kb on each side
locus_range = locus_range_kb * 1000  # same flank expressed in base pairs

# Clamp the left edge at 1 so a locus close to the chromosome start still gives
# a valid region string.
locus_range_tuple = (
    topLoci[0],
    max(1, topLoci[1] - locus_range),
    topLoci[2] + locus_range,
)

# Region string in "chr:start-end" form, passed to the GWAS loader later on.
locus_region = f"{locus_range_tuple[0]}:{locus_range_tuple[1]}-{locus_range_tuple[2]}"
print(f"locus_region: {locus_region}")

locus_region: 1:15333355-15833355


In [3]:
from pathlib import Path

# Directory that receives every file written by this notebook; it is created
# together with any missing parents, and an existing directory is left alone.
save_dir = "/home/xutingfeng/GIFT/data/analysis/DNAJC16/FINEMAP"
Path(save_dir).mkdir(parents=True, exist_ok=True)

In [4]:
import pandas as pd
from finemap_tools.utils import add_ID
import numpy as np

# Maps each z-file column name (key) to the column of the GWAS summary-statistics
# frame that supplies it (value). ``None`` means there is no direct source column
# and ``to_zfile`` derives the value itself.
zfile_gwasformated_map = {
    "rsid": None,
    "chromosome": "chrom",
    "position": "pos",
    "allele1": "ref",  # allele1 is the first allele of bgen, and mostly is the ref allele in UKB or others, but plz check.
    "allele2": "alt",  # allele2 is the second allele of bgen, and mostly is the alt allele in UKB or others, but plz check.
    "maf": None,
    "beta": "beta",
    "se": "sebeta",
}


def to_zfile(sumstats: pd.DataFrame, cols_map: dict = None):
    """Return ``sumstats`` reshaped to the z-file column layout.

    The result has exactly the columns ``rsid, chromosome, position, allele1,
    allele2, maf, beta, se`` in that order. The input frame is never modified.

    Parameters
    ----------
    sumstats : pd.DataFrame
        Summary statistics holding the source columns named in the mapping.
    cols_map : dict, optional
        Overrides for ``zfile_gwasformated_map`` (z-file name -> source column).
        The extra key ``"freq_col"`` names the allele-frequency column used to
        derive ``maf`` when no ``maf`` source is mapped (default ``"af"``).

    Returns
    -------
    pd.DataFrame
        A new frame with the z-file columns.

    Raises
    ------
    ValueError
        If ``maf`` has to be derived but the frequency column is missing.
    KeyError
        If a mapped source column is not present in ``sumstats``.
    """
    used_col_map = zfile_gwasformated_map.copy()
    if cols_map is not None:
        used_col_map.update(cols_map)

    # Work on a copy so derived columns never leak into the caller's frame.
    work = sumstats.copy()

    if used_col_map.get("rsid", None) is None:
        # No identifier column was mapped: build one from chrom/pos/alleles.
        work["rsid"] = add_ID(
            work,
            [
                used_col_map["chromosome"],
                used_col_map["position"],
                used_col_map["allele1"],
                used_col_map["allele2"],
            ],
            new_col="rsid",
        )
        used_col_map["rsid"] = "rsid"

    if used_col_map.get("maf", None) is None:
        freq_col = used_col_map.get("freq_col", "af")  # default freq_col is "af"
        if freq_col not in work.columns:
            raise ValueError(
                f"freq_col: {freq_col} not in gwas columns , freq_col or maf must be passed "
            )
        # Fold the allele frequency onto [0, 0.5] without rewriting the source column.
        freq = work[freq_col].astype(float)
        work["maf"] = np.where(freq < 0.5, freq, 1.0 - freq)
        used_col_map["maf"] = "maf"

    # Select the source columns first and rename afterwards: an unrelated input
    # column that already carries a target name (e.g. "se") can then never end
    # up duplicated in the output.
    source_to_target = {used_col_map[key]: key for key in zfile_gwasformated_map}
    return work[list(source_to_target)].rename(columns=source_to_target)


from finemap_tools.snpfilter import filter_pipline

In [14]:
# Build the z-file table for the locus. The statements are order dependent:
# each one consumes the frame produced by the previous one.
sumstats = load_GWASFormated_file(gwas_file, region=locus_region)  # load only the rows inside locus_region
sumstats["variant_id"] = add_ID(sumstats, ["chrom", "pos", "ref", "alt"])  # identifier built from chrom/pos/ref/alt
sumstats = filter_pipline(sumstats=sumstats, id_col="variant_id")  # project SNP filters, keyed on variant_id
zfile = to_zfile(sumstats, cols_map={"rsid": "variant_id"})  # to zfile format, reusing variant_id as rsid

tabix have a header, so will take the first line as header and remove it.
drop 463 ambiguous alleles
drop 10 biallelic snps


In [17]:
# Keep only the z-file variants that are also present in the BGEN file.
import hail as hl

# ``idempotent=True`` turns a repeated initialisation into a no-op, so the cell
# can be re-run without hiding genuine start-up failures behind a bare except.
hl.init(idempotent=True)

# TODO: contig_recoding may need to update
try:
    data = hl.import_bgen(bgen_path, entry_fields=[], sample_file=sample_path)
except Exception as err:
    # Presumably the Hail index for the BGEN file is missing: report the error,
    # build the index and retry once. A second failure is raised to the user.
    print(f"import_bgen failed ({err!r}); indexing {bgen_path} and retrying")
    hl.index_bgen(bgen_path, contig_recoding={"1": "chr1"}, reference_genome="GRCh38")
    data = hl.import_bgen(bgen_path, entry_fields=[], sample_file=sample_path)

# Row (variant) table of the BGEN file as a pandas frame.
bgen_var = data.rows().to_pandas()

# Restrict the z-file to identifiers found in the BGEN rows and report the loss.
before_merge_nums = len(zfile)
zfile = zfile[zfile["rsid"].isin(bgen_var["rsid"])]
after_merge_nums = len(zfile)
print(
    f"before merge nums: {before_merge_nums}, after merge nums: {after_merge_nums}, droped nums: {before_merge_nums - after_merge_nums}"
)

In [20]:
import os
from finemap_tools.utils import iter_count

# Sample count: number of lines in the .sample file minus 2
# (presumably its two header lines — confirm against the file).
n_sample = iter_count(sample_path) - 2

# Destination paths inside save_dir for the genotype links and the z-file.
tgt_bgen, tgt_sample, tgt_bgi, tgt_z = (
    os.path.join(save_dir, fname)
    for fname in ("data.bgen", "data.sample", "data.bgen.bgi", "data.z")
)


# Link the genotype files into save_dir. Each link is attempted on its own, so
# one link that already exists no longer prevents the remaining ones from
# being created.
for link_src, link_dst in (
    (bgen_path, tgt_bgen),
    (sample_path, tgt_sample),
    (bgi_path, tgt_bgi),
):
    try:
        os.symlink(link_src, link_dst)
    except FileExistsError:
        # Left over from an earlier run: keep the existing entry.
        pass
    except OSError as err:
        # Still best effort, but say what went wrong instead of hiding it.
        print(f"could not link {link_src} -> {link_dst}: {err}")

# Write the z-file: space separated, no index column, missing values as "NA".
zfile.to_csv(tgt_z, sep=" ", index=False, na_rep="NA")


# Master file content as (header field, value) pairs; all file names are given
# relative to save_dir and n_samples carries the sample count.
master_fields = [
    ("z", "data.z"),
    ("bgen", "data.bgen"),
    ("bgi", "data.bgen.bgi"),
    ("sample", "data.sample"),
    ("bcor", "data.bcor"),
    ("bdose", "data.bdose"),
    ("ld", "data.ld"),
    ("n_samples", n_sample),
    ("snp", "data.snp"),
    ("config", "data.config"),
    ("cred", "data.cred"),
    ("log", "data.log"),
]
header_list = [field for field, _ in master_fields]
file_list = [value for _, value in master_fields]


# Two ";"-joined lines: the header line followed by the value line.
with open(os.path.join(save_dir, "data"), "w") as f:
    f.writelines(
        ";".join(map(str, row)) + "\n" for row in (header_list, file_list)
    )

487409

In [28]:
# Second copy of the z-file, named dataset.z, in the same output directory.
# The path is derived from save_dir instead of repeating the absolute path.
zfile.to_csv(
    os.path.join(save_dir, "dataset.z"),
    sep=" ",
    index=False,
    na_rep="NA",
)

In [8]:
from bgen import BgenReader, BgenWriter

# Open the BGEN file with BgenReader and collect the rsids it reports.
# NOTE(review): the reader is never closed explicitly in this notebook.
bfile = BgenReader(bgen_path)
rsids = bfile.rsids()

In [16]:
# Pick the variant at index 10 of the open reader and display its genotype
# probabilities as the cell output.
var = bfile[10]

var.probabilities

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]], dtype=float32)

In [30]:
# Shape of the probability array of the current ``var``; the recorded output is
# (487409, 3), presumably one row per sample — confirm.
var.probabilities.shape

(487409, 3)

In [None]:
# Usage examples for the bgen reader/writer, wired to this notebook's own
# variables. The context-manager targets use their own names so the reader
# ``bfile`` opened earlier stays open and usable.

# select a variant by indexing
var = bfile[1000]

# pull out genotype probabilities
probs = var.probabilities  # returns 2D numpy array
dosage = var.minor_allele_dosage  # returns 1D numpy array for biallelic variant

# iterate through every variant in the file
with BgenReader(bgen_path, delay_parsing=True) as reader:
    for var in reader:
        dosage = var.minor_allele_dosage

# get all variants in a genomic region
# NOTE(review): assumes the BGEN contigs are named like "1" (not "chr1") — confirm.
variants = bfile.fetch(
    str(locus_range_tuple[0]), locus_range_tuple[1], locus_range_tuple[2]
)

# or for writing bgen files
import numpy as np
from bgen import BgenWriter

geno = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]).astype(np.float64)
# Write the toy example to its own file so no input genotype file is touched.
demo_bgen_path = os.path.join(save_dir, "bgen_writer_demo.bgen")
with BgenWriter(demo_bgen_path, n_samples=3) as writer:
    writer.add_variant(
        varid="var1",
        rsid="rs1",
        chrom="chr1",
        pos=1,
        alleles=["A", "G"],
        genotypes=geno,
    )

In [21]:
import hail as hl

# hl.init()
# NOTE(review): ``show`` and ``pprint`` are imported but not used in the visible cells.
from hail.plot import show
from pprint import pprint

# Presumably configures Hail plots to render inline in the notebook — confirm.
hl.plot.output_notebook()

Initializing Hail with default parameters...
your 131072x1 screen size is bogus. expect trouble


24/03/13 20:28:19 WARN Utils: Your hostname, DESKTOP-JUVLFQO resolves to a loopback address: 127.0.1.1; using 172.18.33.194 instead (on interface eth0)
24/03/13 20:28:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/03/13 20:28:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.3.4
SparkUI available at http://172.18.33.194:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.128-eead8100a1c1
LOGGING: writing to /home/xutingfeng/github_code/mine/finemap_tools/pipline/finemap/hail-20240313-2028-0.2.128-eead8100a1c1.log
2024-03-13 20:28:23.300 Hail: INFO: downloading 1KG VCF ...
  Source: https://storage.googleapis.com/hail-tutorial/1kg.vcf.bgz
2024-03-13 20:28:27.456 Hail: INFO: importing VCF and writing to matrix table...
2024-03-13 20:28:28.386 Hail: INFO: scanning VCF for sortedness...
2024-03-13 20:28:31.289 Hail: INFO: Coerced sorted VCF - no additional import work to do
2024-03-13 20:28:33.212 Hail: INFO: wrote matrix table with 10879 rows and 284 columns in 16 partitions to data/1kg.mt
2024-03-13 20:28:33.250 Hail: INFO: downloading 1KG