# Lead and novel variants

In [1]:
import gwaslab as gl

In [2]:
# Log the GWASLab and Python versions so the outputs below can be tied to a specific environment.
gl.show_version()

2025/12/26 11:36:20 GWASLab v4.0.0 https://cloufield.github.io/gwaslab/
2025/12/26 11:36:20 (C) 2022-2025, Yunye He, Kamatani Lab, GPL-3.0 license, gwaslab@gmail.com
2025/12/26 11:36:20 Python version: 3.12.0 | packaged by conda-forge | (main, Oct  3 2023, 08:43:22) [GCC 12.3.0]


## Load sample data

Only the first 1,000,000 variants are loaded, to keep the example fast.

In [3]:
# Load the sample summary statistics into a gl.Sumstats object.
# The keyword arguments name the columns of the input file (SNP, CHR, POS, ALT,
# REF, BETA, SE, P); they show up as SNPID, CHR, POS, EA, NEA, BETA, SE and P
# in the tables below.
# build="19" declares the genome build (later logs report GRCh37/hg19), and
# nrows=1000000 limits how many rows are read from the file.
mysumstats = gl.Sumstats("../0_sample_data/t2d_bbj.txt.gz",
             snpid="SNP",
             chrom="CHR",
             pos="POS",
             ea="ALT",
             nea="REF",            
             beta="BETA",
             se="SE",
             p="P", 
             build="19",
             verbose=False, 
             nrows=1000000)
# Run GWASLab's basic checks on the loaded data; verbose=False keeps the log quiet.
mysumstats.basic_check(verbose=False)

Unnamed: 0,SNPID,CHR,POS,EA,NEA,STATUS,BETA,SE,P
0,1:725932_G_A,1,725932,G,A,1960099,-0.0737,0.1394,0.59700
1,1:725933_A_G,1,725933,G,A,1960099,0.0737,0.1394,0.59730
2,1:737801_T_C,1,737801,C,T,1960099,0.0490,0.1231,0.69080
3,1:749963_T_TAA,1,749963,TAA,T,1960399,0.0213,0.0199,0.28460
4,1:751343_T_A,1,751343,T,A,1960099,0.0172,0.0156,0.27050
...,...,...,...,...,...,...,...,...,...
999995,2:6347639_C_A,2,6347639,C,A,1960099,-0.0159,0.0111,0.15100
999996,2:6347694_G_C,2,6347694,G,C,1960099,-0.0151,0.0111,0.17250
999997,2:6348478_G_A,2,6348478,G,A,1960099,-0.0152,0.0111,0.17160
999998,2:6348490_G_C,2,6348490,G,C,1960099,-0.3180,0.2032,0.11750


## Get lead variants

By default, GWASLab uses the MLOG10P column first. If MLOG10P is not available, it falls back to the P column.

In [4]:
# Extract lead variants with the default settings; the log below reports a
# significance threshold of 5e-08 and a 500 kb sliding window.
mysumstats.get_lead()

2025/12/26 11:36:24  -Genomic coordinates are based on GRCh37/hg19...
2025/12/26 11:36:24 Start to extract lead variants ...(v4.0.0)
2025/12/26 11:36:24  -Processing 1000000 variants...
2025/12/26 11:36:24  -Significance threshold : 5e-08
2025/12/26 11:36:24  -Sliding window size: 500  kb
2025/12/26 11:36:24  -Using P for extracting lead variants...
2025/12/26 11:36:24  -Found 543 significant variants in total...
2025/12/26 11:36:24  -Identified 4 lead variants!
2025/12/26 11:36:24 Finished extracting lead variants.


Unnamed: 0,SNPID,CHR,POS,EA,NEA,STATUS,BETA,SE,P
96739,1:22068326_A_G,1,22068326,G,A,1960099,0.0621,0.0103,1.629e-09
213860,1:51103268_T_C,1,51103268,C,T,1960099,-0.0802,0.012,2.519e-11
534095,1:154309595_TA_T,1,154309595,TA,T,1960399,-0.0915,0.0166,3.289e-08
969974,2:640986_CACAT_C,2,640986,C,CACAT,1960399,-0.0946,0.015,2.665e-10


## Get lead variants with gene name annotation

In [5]:
# anno=True additionally annotates each lead variant with the nearest gene name;
# the result gains the LOCATION and GENE columns shown below.
mysumstats.get_lead(anno=True)

2025/12/26 11:36:24  -Genomic coordinates are based on GRCh37/hg19...
2025/12/26 11:36:24 Start to extract lead variants ...(v4.0.0)
2025/12/26 11:36:24  -Processing 1000000 variants...
2025/12/26 11:36:24  -Significance threshold : 5e-08
2025/12/26 11:36:24  -Sliding window size: 500  kb
2025/12/26 11:36:24  -Using P for extracting lead variants...
2025/12/26 11:36:24  -Found 543 significant variants in total...
2025/12/26 11:36:24  -Identified 4 lead variants!
2025/12/26 11:36:24  -Annotating variants using references:ensembl
2025/12/26 11:36:24  -Annotating variants using references based on genome build:19
2025/12/26 11:36:24  -Genomic coordinates are based on GRCh37/hg19...
2025/12/26 11:36:24 Start to annotate variants with nearest gene name(s) ...(v4.0.0)
2025/12/26 11:36:24  -Current Dataframe shape : 4 x 10 ; Memory usage: 0.64 MB
2025/12/26 11:36:24  -Genomic coordinates are based on GRCh37/hg19...
2025/12/26 11:36:24  -Assigning Gene name using ensembl_hg19_gtf for protein c

Unnamed: 0,SNPID,CHR,POS,EA,NEA,STATUS,BETA,SE,P,LOCATION,GENE
96739,1:22068326_A_G,1,22068326,G,A,1960099,0.0621,0.0103,1.629e-09,0,USP48
213860,1:51103268_T_C,1,51103268,C,T,1960099,-0.0802,0.012,2.519e-11,0,FAF1
534095,1:154309595_TA_T,1,154309595,TA,T,1960399,-0.0915,0.0166,3.289e-08,0,ATP8B2
969974,2:640986_CACAT_C,2,640986,C,CACAT,1960399,-0.0946,0.015,2.665e-10,-26349,TMEM18


## Different window sizes

In [6]:
# Widen the sliding window from the default 500 kb to 1000 kb.
mysumstats.get_lead(windowsizekb=1000)

2025/12/26 11:36:28  -Genomic coordinates are based on GRCh37/hg19...
2025/12/26 11:36:28 Start to extract lead variants ...(v4.0.0)
2025/12/26 11:36:28  -Current Dataframe shape : 1000000 x 9 ; Memory usage: 60.72 MB
2025/12/26 11:36:28  -Processing 1000000 variants...
2025/12/26 11:36:28  -Significance threshold : 5e-08
2025/12/26 11:36:28  -Sliding window size: 1000  kb
2025/12/26 11:36:29  -Using P for extracting lead variants...
2025/12/26 11:36:29  -Found 543 significant variants in total...
2025/12/26 11:36:29  -Identified 4 lead variants!
2025/12/26 11:36:29 Finished extracting lead variants.


Unnamed: 0,SNPID,CHR,POS,EA,NEA,STATUS,BETA,SE,P
96739,1:22068326_A_G,1,22068326,G,A,1960099,0.0621,0.0103,1.629e-09
213860,1:51103268_T_C,1,51103268,C,T,1960099,-0.0802,0.012,2.519e-11
534095,1:154309595_TA_T,1,154309595,TA,T,1960399,-0.0915,0.0166,3.289e-08
969974,2:640986_CACAT_C,2,640986,C,CACAT,1960399,-0.0946,0.015,2.665e-10


## Different thresholds

In [7]:
# Use a stricter significance threshold (1e-10) instead of the default 5e-08.
mysumstats.get_lead(sig_level=1e-10)

2025/12/26 11:36:30  -Genomic coordinates are based on GRCh37/hg19...
2025/12/26 11:36:30 Start to extract lead variants ...(v4.0.0)
2025/12/26 11:36:30  -Processing 1000000 variants...
2025/12/26 11:36:30  -Significance threshold : 1e-10
2025/12/26 11:36:30  -Sliding window size: 500  kb
2025/12/26 11:36:30  -Using P for extracting lead variants...
2025/12/26 11:36:30  -Found 1 significant variants in total...
2025/12/26 11:36:30  -Identified 1 lead variants!
2025/12/26 11:36:30 Finished extracting lead variants.


Unnamed: 0,SNPID,CHR,POS,EA,NEA,STATUS,BETA,SE,P
213860,1:51103268_T_C,1,51103268,C,T,1960099,-0.0802,0.012,2.519e-11


## Check if novel against a tabular file

In [8]:
# Check each lead variant against the known loci listed in a tabular file and
# keep the resulting table in `novel` for inspection.
novel = mysumstats.get_novel(known="../0_sample_data/toy_data/known_loci.txt")

2025/12/26 11:36:36  -Genomic coordinates are based on GRCh37/hg19...
2025/12/26 11:36:36 Start to check if lead variants are known ...(v4.0.0)
2025/12/26 11:36:36  -Genomic coordinates are based on GRCh37/hg19...
2025/12/26 11:36:36 Start to extract lead variants ...(v4.0.0)
2025/12/26 11:36:36  -Processing 1000000 variants...
2025/12/26 11:36:36  -Significance threshold : 5e-08
2025/12/26 11:36:36  -Sliding window size: 500  kb
2025/12/26 11:36:36  -Using P for extracting lead variants...
2025/12/26 11:36:36  -Found 543 significant variants in total...
2025/12/26 11:36:36  -Identified 4 lead variants!
2025/12/26 11:36:36 Finished extracting lead variants.
2025/12/26 11:36:36  -Lead variants in known loci: 2
2025/12/26 11:36:36  -Checking the minimum distance between identified lead variants and provided known variants...
2025/12/26 11:36:36  -Identified  2  known vairants in current sumstats...
2025/12/26 11:36:36  -Identified  2  novel vairants in current sumstats...
2025/12/26 11:3

In [9]:
# Display the get_novel() result: one row per lead variant, with the
# DISTANCE_TO_KNOWN, KNOWN_ID, NOVEL and LOCATION_OF_KNOWN columns appended.
novel

Unnamed: 0,SNPID,CHR,POS,EA,NEA,STATUS,BETA,SE,P,DISTANCE_TO_KNOWN,KNOWN_ID,NOVEL,LOCATION_OF_KNOWN
0,1:22068326_A_G,1,22068326,G,A,1960099,0.0621,0.0103,1.629e-09,29034942.0,1:51103268,True,Upstream
1,1:51103268_T_C,1,51103268,C,T,1960099,-0.0802,0.012,2.519e-11,0.0,1:51103268,False,Same
2,1:154309595_TA_T,1,154309595,TA,T,1960399,-0.0915,0.0166,3.289e-08,0.0,1:154309595,False,Same
3,2:640986_CACAT_C,2,640986,C,CACAT,1960399,-0.0946,0.015,2.665e-10,,,True,NoneOnThisChr


## Check against GWAS Catalog using EFO ID

In [11]:
# Lift the coordinates over from build 19 to build 38 before querying the GWAS Catalog
# (presumably because the catalog reports positions on GRCh38 - confirm).
# NOTE: this updates mysumstats in place; the following get_novel() log reports
# GRCh38/hg38 coordinates and 999381 variants instead of 1000000.
mysumstats.liftover(to_build="38")

2025/12/26 11:37:18 Start to perform liftover ...(v4.0.0)
2025/12/26 11:37:18  -Using built-in chain file: /home/yunye/anaconda3/envs/py312/lib/python3.12/site-packages/gwaslab/data/chains/hg19ToHg38.over.chain.gz
2025/12/26 11:37:18  -Converting variants with status code xxx0xxx: 1,000,000
2025/12/26 11:37:18  -Target build: 38
2025/12/26 11:37:18  -Input positions are 1-based
2025/12/26 11:37:18  -Output positions will be 1-based
2025/12/26 11:37:19  -Chromosome mismatches detected: 52 variants (treated as unmapped)
2025/12/26 11:37:19  -Examples of chromosome mismatches:
2025/12/26 11:37:19    SNPID=1:13127868_C_A | CHR=1 | POS=13127868 | CHR_LIFT=1_KI270766v1_alt | POS_LIFT=5939 | STATUS=1960099
2025/12/26 11:37:19    SNPID=1:146705000_G_A | CHR=1 | POS=146705000 | CHR_LIFT=15 | POS_LIFT=39894340 | STATUS=1960099
2025/12/26 11:37:19    SNPID=1:147758522_T_C | CHR=1 | POS=147758522 | CHR_LIFT=8 | POS_LIFT=38661844 | STATUS=1960099
2025/12/26 11:37:19    SNPID=1:223725692_T_C | CHR=1

In [12]:
# The trait ID passed as `efo` (here MONDO_0005148) can be looked up on the GWAS Catalog website.
# Known variants for the trait are retrieved from the GWAS Catalog API (see the log below),
# so this cell needs network access.
mysumstats.get_novel(efo="MONDO_0005148")

2025/12/26 11:37:25  -Genomic coordinates are based on GRCh38/hg38...
2025/12/26 11:37:25 Start to check if lead variants are known ...(v4.0.0)
2025/12/26 11:37:25  -Genomic coordinates are based on GRCh38/hg38...
2025/12/26 11:37:25 Start to extract lead variants ...(v4.0.0)
2025/12/26 11:37:25  -Processing 999381 variants...
2025/12/26 11:37:25  -Significance threshold : 5e-08
2025/12/26 11:37:25  -Sliding window size: 500  kb
2025/12/26 11:37:25  -Using P for extracting lead variants...
2025/12/26 11:37:25  -Found 543 significant variants in total...
2025/12/26 11:37:25  -Identified 4 lead variants!
2025/12/26 11:37:25 Finished extracting lead variants.
2025/12/26 11:37:25  -Genomic coordinates are based on GRCh38/hg38...
2025/12/26 11:37:25  -Genomic coordinates are based on GRCh38/hg38...
2025/12/26 11:37:25  -Sumstats build matches target build
2025/12/26 11:37:25 Start to retrieve data using EFO: MONDO_0005148...
2025/12/26 11:37:25  -Querying GWAS Catalog API v2 for trait: MOND

Unnamed: 0,SNPID,CHR,POS,EA,NEA,STATUS,BETA,SE,P,DISTANCE_TO_KNOWN,KNOWN_ID,KNOWN_PUBMED_ID,KNOWN_AUTHOR,NOVEL,LOCATION_OF_KNOWN
0,1:22068326_A_G,1,21741833,G,A,3860099,0.0621,0.0103,1.629e-09,0,rs1825307,30718926,Suzuki K,False,Same
1,1:51103268_T_C,1,50637596,C,T,3860099,-0.0802,0.012,2.519e-11,0,rs12031188,30718926,Suzuki K,False,Same
2,1:154309595_TA_T,1,154337119,TA,T,3860399,-0.0915,0.0166,3.289e-08,1,rs68062313,30718926,Suzuki K,False,Upstream
3,2:640986_CACAT_C,2,640986,C,CACAT,3860399,-0.0946,0.015,2.665e-10,-1931,rs7564708,34594039,Sakaue S,False,Downstream
