# Formatting and saving

In [None]:
# Download the input file and save it as t2d_bbj.txt.gz (the file loaded by gl.Sumstats in the loading cell).
!wget -O t2d_bbj.txt.gz http://jenger.riken.jp/14/

In [1]:
import sys
# NOTE(review): this prepends a machine-specific source checkout to the import
# path so that `import gwaslab` resolves to it first. Adjust or remove this
# line when running elsewhere (e.g. with an installed gwaslab package).
sys.path.insert(0,"/home/he/work/gwaslab/src")
import gwaslab as gl

## Loading data

In [2]:
# Load t2d_bbj.txt.gz into a gwaslab Sumstats object. Each keyword names the
# raw column header in the file that should be used for that field.
mysumstats = gl.Sumstats(
    "t2d_bbj.txt.gz",
    snpid="SNP",
    chrom="CHR",
    pos="POS",
    ea="ALT",
    nea="REF",
    neaf="Frq",  # supplied as NEAF; the load log reports it is converted to EAF
    beta="BETA",
    se="SE",
    p="P",
    direction="Dir",
    n="N",
)

Mon Dec 26 00:40:05 2022 Start to initiate from file :t2d_bbj.txt.gz
Mon Dec 26 00:40:25 2022  -Reading columns          : POS,Dir,SNP,P,Frq,REF,CHR,BETA,N,ALT,SE
Mon Dec 26 00:40:25 2022  -Renaming columns to      : POS,DIRECTION,SNPID,P,EAF,NEA,CHR,BETA,N,EA,SE
Mon Dec 26 00:40:25 2022  -Current Dataframe shape : 12557761  x  11
Mon Dec 26 00:40:26 2022  -Initiating a status column: STATUS ...
Mon Dec 26 00:40:30 2022  -NEAF is specified...
Mon Dec 26 00:40:30 2022  -Checking if 0<= NEAF <=1 ...
Mon Dec 26 00:40:31 2022  -Converted NEAF to EAF.
Mon Dec 26 00:40:31 2022  -Removed 0 variants with bad NEAF.
Mon Dec 26 00:40:31 2022 Start to reorder the columns...
Mon Dec 26 00:40:31 2022  -Current Dataframe shape : 12557761  x  12
Mon Dec 26 00:40:31 2022  -Reordering columns to    : SNPID,CHR,POS,EA,NEA,EAF,BETA,SE,P,N,DIRECTION,STATUS
Mon Dec 26 00:40:31 2022 Finished sorting columns successfully!
Mon Dec 26 00:40:31 2022 Finished loading data successfully!


## Check available formats

In [3]:
# Log the names of the output formats gwaslab knows about.
gl.list_formats()

Mon Dec 26 00:29:00 2022 Available formats: bolt_lmm,fastgwa,gwascatalog,gwascatalog_hm,gwaslab,ldsc,metal,mrmega,pgscatalog,pgscatalog_hm,pheweb,plink,plink2,regenie,saige,ssf,template,vcf


In [4]:
# Show the metadata and the column-name mapping registered for the "saige" format.
gl.check_format("saige")

Mon Dec 26 00:29:00 2022 Available formats:Mon Dec 26 00:29:00 2022 meta_dataMon Dec 26 00:29:00 2022 format_dictMon Dec 26 00:29:00 2022 
Mon Dec 26 00:29:00 2022 {'format_name': 'saige', 'format_source': 'https://saigegit.github.io//SAIGE-doc/docs/single_step2.html', 'format_version': 'v1.1.3', 'last_check_date': 20220806}Mon Dec 26 00:29:00 2022 {'SNPID': 'SNPID', 'CHR': 'CHR', 'POS': 'POS', 'Allele1': 'NEA', 'Allele2': 'EA', 'AF_Allele2': 'EAF', 'N': 'N', 'BETA': 'BETA', 'SE': 'SE', 'p.value': 'P', 'imputationInfo': 'INFO'}

## Filter for output

In [5]:
# Keep only the variants on chromosome 6. inplace=True modifies mysumstats
# itself, so cells run after this one operate on the filtered subset.
mysumstats.filter_value('CHR == "6"', inplace=True)

# Run gwaslab's basic checks on the filtered data (the log shows ID,
# chromosome-notation and base-pair-position steps).
mysumstats.basic_check()

Mon Dec 26 00:29:01 2022 Start filtering values by condition: CHR == "6"
Mon Dec 26 00:29:01 2022  -Removing 11724339 variants not meeting the conditions: CHR == "6"
Mon Dec 26 00:29:01 2022 Finished filtering values.
Mon Dec 26 00:29:01 2022 Start to check IDs...
Mon Dec 26 00:29:01 2022  -Current Dataframe shape : 833422  x  12
Mon Dec 26 00:29:01 2022  -Checking if SNPID is chr:pos:ref:alt...(separator: - ,: , _)
Mon Dec 26 00:29:03 2022 Finished checking IDs successfully!
Mon Dec 26 00:29:03 2022 Start to fix chromosome notation...
Mon Dec 26 00:29:03 2022  -Current Dataframe shape : 833422  x  12
Mon Dec 26 00:29:04 2022  -Vairants with standardized chromosome notation: 833422
Mon Dec 26 00:29:06 2022  -All CHR are already fixed...
Mon Dec 26 00:29:07 2022 Finished fixing chromosome notation successfully!
Mon Dec 26 00:29:07 2022 Start to fix basepair positions...
Mon Dec 26 00:29:07 2022  -Current Dataframe shape : 833422  x  12
Mon Dec 26 00:29:07 2022  -Converting to Int64 data

## Formatting and saving

### Get ready for submission to the GWAS Catalog

In [6]:
# Export in the "gwascatalog" format using the "./t2d" path prefix, and
# request an md5sum for the output (inspected in the next code cell).
mysumstats.to_format(
    "./t2d",
    fmt="gwascatalog",
    md5sum=True,
)

Mon Dec 26 00:29:13 2022 Start to format the output sumstats in:  gwascatalog  format
Mon Dec 26 00:29:13 2022  -Formatting statistics ...
Mon Dec 26 00:29:15 2022  - Float statistics formats:
Mon Dec 26 00:29:15 2022   - Columns: ['EAF', 'BETA', 'SE', 'P']
Mon Dec 26 00:29:15 2022   - Output formats: ['{:.4g}', '{:.4f}', '{:.4f}', '{:.4e}']
Mon Dec 26 00:29:15 2022  - Start outputting sumstats in gwascatalog format...
Mon Dec 26 00:29:15 2022  -gwascatalog format will be loaded...
Mon Dec 26 00:29:15 2022  -gwascatalog format meta info:
Mon Dec 26 00:29:15 2022   - format_name  :  gwascatalog
Mon Dec 26 00:29:15 2022   - format_source  :  https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics
Mon Dec 26 00:29:15 2022   - format_version  :  20220726
Mon Dec 26 00:29:15 2022  -gwaslab to gwascatalog format dictionary:
Mon Dec 26 00:29:15 2022   - gwaslab keys: SNPID,CHR,POS,NEA,EA,BETA,EAF,SE,P,OR,OR_95L,OR_95U
Mon Dec 26 00:29:15 2022   - gwascatalog values: variant_id,chromosome,b

### calculate md5sum for checking the integrity when sharing

In [7]:
# Print the checksum file written next to the gwascatalog output.
!head t2d.gwascatalog.tsv.gz.md5sum

3b73c8d14e60b125c79a7cfc24d47622


### GWAS-ssf format

In [8]:
# Export in the "ssf" (GWAS-SSF) format using the "./mysumstats" path prefix.
mysumstats.to_format(
    "./mysumstats",
    fmt="ssf",
)

Mon Dec 26 00:29:27 2022 Start to format the output sumstats in:  ssf  format
Mon Dec 26 00:29:27 2022  -Formatting statistics ...
Mon Dec 26 00:29:28 2022  - Float statistics formats:
Mon Dec 26 00:29:28 2022   - Columns: ['EAF', 'BETA', 'SE', 'P']
Mon Dec 26 00:29:28 2022   - Output formats: ['{:.4g}', '{:.4f}', '{:.4f}', '{:.4e}']
Mon Dec 26 00:29:28 2022  - Start outputting sumstats in ssf format...
Mon Dec 26 00:29:29 2022  -ssf format will be loaded...
Mon Dec 26 00:29:29 2022  -ssf format meta info:
Mon Dec 26 00:29:29 2022   - format_name  :  ssf
Mon Dec 26 00:29:29 2022   - format_source  :  https://www.biorxiv.org/content/10.1101/2022.07.15.500230v1.full
Mon Dec 26 00:29:29 2022   - format_version  :  20220726
Mon Dec 26 00:29:29 2022  -gwaslab to ssf format dictionary:
Mon Dec 26 00:29:29 2022   - gwaslab keys: SNPID,rsID,CHR,POS,NEA,EA,N,BETA,SE,P,INFO,OR,OR_95L,OR_95U
Mon Dec 26 00:29:29 2022   - ssf values: variant_id,rsid,chromosome,bas_pair_location,other_allele,effect_

### ldsc default format

In [9]:
# Export in the "ldsc" format. Per the log, exclude_hla=True drops variants in
# the HLA region and hapmap3=True keeps only variants found in the Hapmap3 set.
mysumstats.to_format(
    "./mysumstats",
    fmt="ldsc",
    hapmap3=True,
    exclude_hla=True,
)

Mon Dec 26 00:29:38 2022 Start to format the output sumstats in:  ldsc  format
Mon Dec 26 00:29:38 2022  -Excluding variants in HLA region ...
Mon Dec 26 00:29:39 2022  -Exclude 75022 variants in HLA region.
Mon Dec 26 00:29:39 2022  -Processing 758400 raw variants...
Mon Dec 26 00:29:39 2022  -Loading Hapmap3 variants data...
Mon Dec 26 00:29:40 2022  -Since rsID not in sumstats, chr:pos( build 19) will be used for matching...
Mon Dec 26 00:29:43 2022  -Raw input contains 66674 hapmaps variants based on chr:pos...
Mon Dec 26 00:29:43 2022  -Extract 66674 variants in Hapmap3 datasets for build 19.
Mon Dec 26 00:29:43 2022  -Formatting statistics ...
Mon Dec 26 00:29:43 2022  - Float statistics formats:
Mon Dec 26 00:29:43 2022   - Columns: ['EAF', 'BETA', 'SE', 'P']
Mon Dec 26 00:29:43 2022   - Output formats: ['{:.4g}', '{:.4f}', '{:.4f}', '{:.4e}']
Mon Dec 26 00:29:43 2022  - Start outputting sumstats in ldsc format...
Mon Dec 26 00:29:43 2022  -ldsc format will be loaded...
Mon Dec 

### vcf

In [11]:
# Export in the "vcf" format using the "./mysumstats" path prefix.
# NOTE(review): bgzip/tabix presumably request compression and indexing of the
# written VCF; confirm against the gwaslab documentation.
mysumstats.to_format(
    "./mysumstats",
    fmt="vcf",
    bgzip=True,
    tabix=True,
)

Mon Dec 26 00:30:15 2022 Start to format the output sumstats in:  vcf  format
Mon Dec 26 00:30:15 2022  -Formatting statistics ...
Mon Dec 26 00:30:16 2022  - Float statistics formats:
Mon Dec 26 00:30:16 2022   - Columns: ['EAF', 'BETA', 'SE', 'P']
Mon Dec 26 00:30:16 2022   - Output formats: ['{:.4g}', '{:.4f}', '{:.4f}', '{:.4e}']
Mon Dec 26 00:30:16 2022  - Start outputting sumstats in vcf format...
Mon Dec 26 00:30:16 2022  -vcf format will be loaded...
Mon Dec 26 00:30:16 2022  -vcf format meta info:
Mon Dec 26 00:30:16 2022   - format_name  :  vcf
Mon Dec 26 00:30:16 2022   - format_source  :  https://github.com/MRCIEU/gwas-vcf-specification/tree/1.0.0
Mon Dec 26 00:30:16 2022   - format_version  :  20220923
Mon Dec 26 00:30:16 2022   - format_citation  :  Lyon, M.S., Andrews, S.J., Elsworth, B. et al. The variant call format provides efficient and robust storage of GWAS summary statistics. Genome Biol 22, 32 (2021). https://doi.org/10.1186/s13059-020-02248-0
Mon Dec 26 00:30:16

## For annotation

### convert to bed format

In [3]:
# Export a 0-based bed-like file (the log reports ./mysumstats.bed).
# NOTE(review): the saved log for this cell counts about 12.5M variants, i.e.
# it was last run on the unfiltered data; re-run the notebook top to bottom to
# refresh the output.
mysumstats.to_format(
    "./mysumstats",
    fmt="bed",
)

Mon Dec 26 00:40:31 2022 Start to format the output sumstats in:  bed  format
Mon Dec 26 00:40:32 2022  -Formatting statistics ...
Mon Dec 26 00:40:52 2022  - Float statistics formats:
Mon Dec 26 00:40:52 2022   - Columns: ['EAF', 'BETA', 'SE', 'P']
Mon Dec 26 00:40:52 2022   - Output formats: ['{:.4g}', '{:.4f}', '{:.4f}', '{:.4e}']
Mon Dec 26 00:40:52 2022  - Start outputting sumstats in bed format...
Mon Dec 26 00:40:55 2022  -Number of SNPs : 11421217
Mon Dec 26 00:40:56 2022  -Number of Insertions : 781996
Mon Dec 26 00:40:57 2022  -Number of Deletions : 354536
Mon Dec 26 00:40:57 2022  -formatting to 0-based bed-like file...
Mon Dec 26 00:41:07 2022  -Output columns: Index(['CHR', 'START', 'END', 'NEA/EA', 'STRAND', 'SNPID'], dtype='object')
Mon Dec 26 00:41:07 2022  -Output path: ./mysumstats.bed
Mon Dec 26 00:41:37 2022  -Saving log file: ./mysumstats.bed.log
Mon Dec 26 00:41:37 2022 Finished outputting successfully!


### convert to vep default format

In [4]:
# Export a 1-based bed-like file for VEP (the log reports ./mysumstats.vep.gz).
mysumstats.to_format(
    "./mysumstats",
    fmt="vep",
)

Mon Dec 26 00:41:38 2022 Start to format the output sumstats in:  vep  format
Mon Dec 26 00:41:39 2022  -Formatting statistics ...
Mon Dec 26 00:41:58 2022  - Float statistics formats:
Mon Dec 26 00:41:58 2022   - Columns: ['EAF', 'BETA', 'SE', 'P']
Mon Dec 26 00:41:58 2022   - Output formats: ['{:.4g}', '{:.4f}', '{:.4f}', '{:.4e}']
Mon Dec 26 00:41:58 2022  - Start outputting sumstats in vep format...
Mon Dec 26 00:42:01 2022  -Number of SNPs : 11421217
Mon Dec 26 00:42:02 2022  -Number of Insertions : 781996
Mon Dec 26 00:42:03 2022  -Number of Deletions : 354536
Mon Dec 26 00:42:03 2022  -formatting to 1-based bed-like file (for vep)...
Mon Dec 26 00:42:13 2022  -Output columns: Index(['CHR', 'START', 'END', 'NEA/EA', 'STRAND', 'SNPID'], dtype='object')
Mon Dec 26 00:42:13 2022  -Output path: ./mysumstats.vep.gz
Mon Dec 26 00:44:26 2022  -Saving log file: ./mysumstats.vep.log
Mon Dec 26 00:44:26 2022 Finished outputting successfully!


### convert to annovar default input format

In [5]:
# Export a 1-based bed-like file as ANNOVAR input (the log reports
# ./mysumstats.annovar).
mysumstats.to_format(
    "./mysumstats",
    fmt="annovar",
)

Mon Dec 26 00:44:27 2022 Start to format the output sumstats in:  annovar  format
Mon Dec 26 00:44:27 2022  -Formatting statistics ...
Mon Dec 26 00:44:47 2022  - Float statistics formats:
Mon Dec 26 00:44:47 2022   - Columns: ['EAF', 'BETA', 'SE', 'P']
Mon Dec 26 00:44:47 2022   - Output formats: ['{:.4g}', '{:.4f}', '{:.4f}', '{:.4e}']
Mon Dec 26 00:44:47 2022  - Start outputting sumstats in annovar format...
Mon Dec 26 00:44:50 2022  -Number of SNPs : 11421217
Mon Dec 26 00:44:51 2022  -Number of Insertions : 781996
Mon Dec 26 00:44:52 2022  -Number of Deletions : 354536
Mon Dec 26 00:44:52 2022  -formatting to 1-based bed-like file...
Mon Dec 26 00:44:58 2022  -Output columns: Index(['CHR', 'START', 'END', 'NEA_out', 'EA_out', 'SNPID'], dtype='object')
Mon Dec 26 00:44:58 2022  -Output path: ./mysumstats.annovar
Mon Dec 26 00:45:27 2022  -Saving log file: ./mysumstats.annovar.log
Mon Dec 26 00:45:27 2022 Finished outputting successfully!
