# Formatting and saving

In [None]:
# Download the input file and save it as t2d_bbj.txt.gz, the file name that is
# later passed to gl.Sumstats.
!wget -O t2d_bbj.txt.gz http://jenger.riken.jp/14/

In [1]:
import os
import sys

# Optional local development checkout of gwaslab. The location can be overridden
# with the GWASLAB_SRC environment variable; the default is the original author's
# path. It is only placed on sys.path when the directory exists, so on any other
# machine the import below resolves to the installed gwaslab package instead of
# silently depending on a path that is not there.
GWASLAB_SRC = os.environ.get("GWASLAB_SRC", "/Users/he/work/gwaslab/src")
if os.path.isdir(GWASLAB_SRC):
    sys.path.insert(0, GWASLAB_SRC)

import gwaslab as gl

## Loading data

In [2]:
# Header names used in the raw file, keyed by the gl.Sumstats argument each one fills.
# "Frq" is supplied as neaf; the load log reports that NEAF is converted to EAF.
column_map = {
    "snpid": "SNP",
    "chrom": "CHR",
    "pos": "POS",
    "ea": "ALT",
    "nea": "REF",
    "neaf": "Frq",
    "beta": "BETA",
    "se": "SE",
    "p": "P",
    "direction": "Dir",
    "n": "N",
}

# Load the file into a Sumstats object using the mapping above.
mysumstats = gl.Sumstats("t2d_bbj.txt.gz", **column_map)

Mon Nov 28 13:41:21 2022 Start to initiate from file :t2d_bbj.txt.gz
Mon Nov 28 13:41:49 2022  -Reading columns          : REF,ALT,BETA,POS,P,Dir,N,SE,Frq,CHR,SNP
Mon Nov 28 13:41:49 2022  -Renaming columns to      : NEA,EA,BETA,POS,P,DIRECTION,N,SE,EAF,CHR,SNPID
Mon Nov 28 13:41:49 2022  -Current Dataframe shape : 12557761  x  11
Mon Nov 28 13:41:52 2022  -Initiating a status column: STATUS ...
Mon Nov 28 13:41:57 2022  -NEAF is specified...
Mon Nov 28 13:41:57 2022  -Checking if 0<= NEAF <=1 ...
Mon Nov 28 13:41:59 2022  -Converted NEAF to EAF.
Mon Nov 28 13:41:59 2022  -Removed 0 variants with bad NEAF.
Mon Nov 28 13:42:00 2022 Start to reorder the columns...
Mon Nov 28 13:42:00 2022  -Current Dataframe shape : 12557761  x  12
Mon Nov 28 13:42:00 2022  -Reordering columns to    : SNPID,CHR,POS,EA,NEA,EAF,BETA,SE,P,N,DIRECTION,STATUS
Mon Nov 28 13:42:00 2022 Finished sorting columns successfully!
Mon Nov 28 13:42:01 2022 Finished loading data successfully!


## Check available formats

In [3]:
# Log the names of the formats gwaslab reports as available.
gl.list_formats()

Mon Nov 28 13:42:01 2022 Available formats: fastgwa,gwascatalog,gwascatalog_hm,gwaslab,ldsc,metal,mrmega,pgscatalog,pgscatalog_hm,pheweb,plink,plink2,regenie,saige,ssf,template,vcf


In [4]:
# Log the metadata and the column-name dictionary registered for the "saige" format.
gl.check_format("saige")

Mon Nov 28 13:42:01 2022 Available formats:Mon Nov 28 13:42:01 2022 meta_dataMon Nov 28 13:42:01 2022 format_dictMon Nov 28 13:42:01 2022 
Mon Nov 28 13:42:01 2022 {'format_name': 'saige', 'format_source': 'https://saigegit.github.io//SAIGE-doc/docs/single_step2.html', 'format_version': 'v1.1.3', 'last_check_date': 20220806}Mon Nov 28 13:42:01 2022 {'SNPID': 'SNPID', 'CHR': 'CHR', 'POS': 'POS', 'Allele1': 'NEA', 'Allele2': 'EA', 'AF_Allele2': 'EAF', 'N': 'N', 'BETA': 'BETA', 'SE': 'SE', 'p.value': 'P', 'imputationInfo': 'INFO'}

## Filter for output

In [5]:
# Keep only the variants whose CHR equals "6". With inplace=True the filter is
# applied to mysumstats itself, so every later cell works on this subset.
# NOTE(review): the condition compares CHR with the string "6"; if basic_check()
# changes the CHR dtype, re-running this cell may behave differently - confirm.
mysumstats.filter_value('CHR == "6"',inplace=True)
# Run gwaslab's basic checks; the captured log shows the ID, chromosome-notation
# and base-pair position steps.
mysumstats.basic_check()

Mon Nov 28 13:42:01 2022 Start filtering values by condition: CHR == "6"
Mon Nov 28 13:42:02 2022  -Removing 11724339 variants not meeting the conditions: CHR == "6"
Mon Nov 28 13:42:02 2022 Finished filtering values.
Mon Nov 28 13:42:02 2022 Start to check IDs...
Mon Nov 28 13:42:02 2022  -Current Dataframe shape : 833422  x  12
Mon Nov 28 13:42:02 2022  -Checking if SNPID is chr:pos:ref:alt...(separator: - ,: , _)
Mon Nov 28 13:42:04 2022 Finished checking IDs successfully!
Mon Nov 28 13:42:04 2022 Start to fix chromosome notation...
Mon Nov 28 13:42:04 2022  -Current Dataframe shape : 833422  x  12
Mon Nov 28 13:42:08 2022  -Vairants with standardized chromosome notation: 833422
Mon Nov 28 13:42:10 2022  -All CHR are already fixed...
Mon Nov 28 13:42:11 2022 Finished fixing chromosome notation successfully!
Mon Nov 28 13:42:11 2022 Start to fix basepair positions...
Mon Nov 28 13:42:11 2022  -Current Dataframe shape : 833422  x  12
Mon Nov 28 13:42:11 2022  -Converting to Int64 data

## Formatting and saving

### Get ready for submission to the GWAS Catalog

In [7]:
# Export the sumstats in gwascatalog format using the "./t2d" path prefix.
# md5sum=True requests an MD5 checksum; presumably this is what produces the
# t2d.gwascatalog.tsv.gz.md5sum file inspected in a later cell - confirm.
mysumstats.to_format("./t2d",fmt="gwascatalog",md5sum=True)

Mon Nov 28 13:44:17 2022 Start to format the output sumstats in:  gwascatalog  format
Mon Nov 28 13:44:17 2022  -Formatting statistics ...
Mon Nov 28 13:44:18 2022  - Float statistics formats:
Mon Nov 28 13:44:18 2022   - Columns: ['EAF', 'BETA', 'SE', 'P']
Mon Nov 28 13:44:18 2022   - Output formats: ['{:.4g}', '{:.4f}', '{:.4f}', '{:.4e}']
Mon Nov 28 13:44:18 2022  - Start outputting sumstats in gwascatalog format...
Mon Nov 28 13:44:18 2022  -gwascatalog format will be loaded...
Mon Nov 28 13:44:18 2022  -gwascatalog format meta info:
Mon Nov 28 13:44:18 2022   - format_name  :  gwascatalog
Mon Nov 28 13:44:18 2022   - format_source  :  https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics
Mon Nov 28 13:44:18 2022   - format_version  :  20220726
Mon Nov 28 13:44:18 2022  -gwaslab to gwascatalog format dictionary:
Mon Nov 28 13:44:18 2022   - gwaslab keys: SNPID,CHR,POS,NEA,EA,BETA,EAF,SE,P,OR,OR_95L,OR_95U
Mon Nov 28 13:44:18 2022   - gwascatalog values: variant_id,chromosome,b

### calculate md5sum for checking the integrity when sharing

In [9]:
# Print the contents of the md5sum file written next to the gwascatalog export.
!head t2d.gwascatalog.tsv.gz.md5sum

d6d5b9ab6596b93fdafc2cc5aba0bcba


### Convert the variant info into ANNOVAR default input format

In [4]:
# Export the variants in annovar format; the captured log reports a 1-based
# bed-like layout written to ./mysumstats.annovar.
mysumstats.to_format("./mysumstats",fmt="annovar")

Thu Sep 22 23:34:47 2022 Start to format the output sumstats in:  annovar  format
Thu Sep 22 23:34:47 2022  -Formatting statistics ...
Thu Sep 22 23:34:48 2022  - Float statistics formats:
Thu Sep 22 23:34:48 2022   - Columns: ['EAF', 'BETA', 'SE', 'P']
Thu Sep 22 23:34:48 2022   - Output formats: ['{:.4g}', '{:.4f}', '{:.4f}', '{:.4e}']
Thu Sep 22 23:34:48 2022  - Start outputting sumstats in annovar format...
Thu Sep 22 23:34:48 2022  -Number of SNPs : 757588
Thu Sep 22 23:34:48 2022  -Number of Insertions : 52621
Thu Sep 22 23:34:48 2022  -Number of Deletions : 23213
Thu Sep 22 23:34:48 2022  -formatting to 1-based bed-like file...
Thu Sep 22 23:34:48 2022  -Output columns: Index(['CHR', 'START', 'END', 'NEA_out', 'EA_out', 'SNPID'], dtype='object')
Thu Sep 22 23:34:48 2022  -Output path: ./mysumstats.annovar
Thu Sep 22 23:34:50 2022  -Saving log file: ./mysumstats.annovar.log
Thu Sep 22 23:34:50 2022 Finished outputting successfully!


### GWAS-ssf format

In [5]:
# Export the sumstats in ssf format using the "./mysumstats" path prefix.
mysumstats.to_format("./mysumstats",fmt="ssf")

Thu Sep 22 23:34:50 2022 Start to format the output sumstats in:  ssf  format
Thu Sep 22 23:34:50 2022  -Formatting statistics ...
Thu Sep 22 23:34:51 2022  - Float statistics formats:
Thu Sep 22 23:34:51 2022   - Columns: ['EAF', 'BETA', 'SE', 'P']
Thu Sep 22 23:34:51 2022   - Output formats: ['{:.4g}', '{:.4f}', '{:.4f}', '{:.4e}']
Thu Sep 22 23:34:51 2022  - Start outputting sumstats in ssf format...
Thu Sep 22 23:34:51 2022  -ssf format will be loaded...
Thu Sep 22 23:34:51 2022  -ssf format meta info:
Thu Sep 22 23:34:51 2022   - format_name  :  ssf
Thu Sep 22 23:34:51 2022   - format_source  :  https://www.biorxiv.org/content/10.1101/2022.07.15.500230v1.full
Thu Sep 22 23:34:51 2022   - format_version  :  20220726
Thu Sep 22 23:34:51 2022  -gwaslab to ssf format dictionary:
Thu Sep 22 23:34:51 2022   - gwaslab keys: ['SNPID', 'rsID', 'CHR', 'POS', 'NEA', 'EA', 'N', 'BETA', 'SE', 'P', 'INFO', 'OR', 'OR_95L', 'OR_95U']
Thu Sep 22 23:34:51 2022   - ssf values: ['variant_id', 'rsid',

### ldsc default format

In [6]:
# Export in ldsc format. Per the captured log, exclude_hla=True drops variants in
# the HLA region and hapmap3=True keeps only variants found in the Hapmap3 data
# (matched by chr:pos because the sumstats have no rsID column).
mysumstats.to_format("./mysumstats",fmt="ldsc",hapmap3=True,exclude_hla=True)

Thu Sep 22 23:34:58 2022 Start to format the output sumstats in:  ldsc  format
Thu Sep 22 23:34:58 2022  -Excluding variants in HLA region ...
Thu Sep 22 23:34:58 2022  -Exclude 75022 variants in HLA region.
Thu Sep 22 23:34:58 2022  -Processing 758400 raw variants...
Thu Sep 22 23:34:58 2022  -Loading Hapmap3 variants data...
Thu Sep 22 23:34:59 2022  -Since rsID not in sumstats, chr:pos( build 19) will be used for matching...
Thu Sep 22 23:35:01 2022  -Raw input contains 66674 hapmaps variants based on chr:pos...
Thu Sep 22 23:35:01 2022  -Extract 66674 variants in Hapmap3 datasets for build 19.
Thu Sep 22 23:35:01 2022  -Formatting statistics ...
Thu Sep 22 23:35:01 2022  - Float statistics formats:
Thu Sep 22 23:35:01 2022   - Columns: ['EAF', 'BETA', 'SE', 'P']
Thu Sep 22 23:35:01 2022   - Output formats: ['{:.4g}', '{:.4f}', '{:.4f}', '{:.4e}']
Thu Sep 22 23:35:01 2022  - Start outputting sumstats in ldsc format...
Thu Sep 22 23:35:01 2022  -ldsc format will be loaded...
Thu Sep 

In [7]:
# Export in ldsc format, excluding the HLA region and keeping Hapmap3 variants only.
# NOTE(review): this call is identical to the one in the preceding cell (same path
# prefix and options), so it presumably rewrites the same output - consider
# removing one of the two cells.
mysumstats.to_format("./mysumstats",fmt="ldsc",hapmap3=True,exclude_hla=True)

Thu Sep 22 23:35:02 2022 Start to format the output sumstats in:  ldsc  format
Thu Sep 22 23:35:02 2022  -Excluding variants in HLA region ...
Thu Sep 22 23:35:02 2022  -Exclude 75022 variants in HLA region.
Thu Sep 22 23:35:02 2022  -Processing 758400 raw variants...
Thu Sep 22 23:35:02 2022  -Loading Hapmap3 variants data...
Thu Sep 22 23:35:03 2022  -Since rsID not in sumstats, chr:pos( build 19) will be used for matching...
Thu Sep 22 23:35:05 2022  -Raw input contains 66674 hapmaps variants based on chr:pos...
Thu Sep 22 23:35:05 2022  -Extract 66674 variants in Hapmap3 datasets for build 19.
Thu Sep 22 23:35:05 2022  -Formatting statistics ...
Thu Sep 22 23:35:05 2022  - Float statistics formats:
Thu Sep 22 23:35:05 2022   - Columns: ['EAF', 'BETA', 'SE', 'P']
Thu Sep 22 23:35:05 2022   - Output formats: ['{:.4g}', '{:.4f}', '{:.4f}', '{:.4e}']
Thu Sep 22 23:35:05 2022  - Start outputting sumstats in ldsc format...
Thu Sep 22 23:35:05 2022  -ldsc format will be loaded...
Thu Sep 

### convert to bed format

In [10]:
# Export the variants in bed format; the captured log reports a 0-based bed-like
# layout written to ./mysumstats.bed.
mysumstats.to_format("./mysumstats",fmt="bed")

Thu Sep 22 23:55:15 2022 Start to format the output sumstats in:  bed  format
Thu Sep 22 23:55:15 2022  -Formatting statistics ...
Thu Sep 22 23:55:16 2022  - Float statistics formats:
Thu Sep 22 23:55:16 2022   - Columns: ['EAF', 'BETA', 'SE', 'P']
Thu Sep 22 23:55:16 2022   - Output formats: ['{:.4g}', '{:.4f}', '{:.4f}', '{:.4e}']
Thu Sep 22 23:55:16 2022  - Start outputting sumstats in bed format...
Thu Sep 22 23:55:16 2022  -Number of SNPs : 757588
Thu Sep 22 23:55:16 2022  -Number of Insertions : 52621
Thu Sep 22 23:55:16 2022  -Number of Deletions : 23213
Thu Sep 22 23:55:16 2022  -formatting to 0-based bed-like file...
Thu Sep 22 23:55:17 2022  -Output columns: Index(['CHR', 'START', 'END', 'NEA/EA', 'STRAND', 'SNPID'], dtype='object')
Thu Sep 22 23:55:17 2022  -Output path: ./mysumstats.bed
Thu Sep 22 23:55:18 2022  -Saving log file: ./mysumstats.bed.log
Thu Sep 22 23:55:18 2022 Finished outputting successfully!


### convert to vep default format

In [11]:
# Export the variants in vep format; the captured log reports a 1-based bed-like
# layout written to ./mysumstats.vep.gz.
mysumstats.to_format("./mysumstats",fmt="vep")

Thu Sep 22 23:55:18 2022 Start to format the output sumstats in:  vep  format
Thu Sep 22 23:55:18 2022  -Formatting statistics ...
Thu Sep 22 23:55:19 2022  - Float statistics formats:
Thu Sep 22 23:55:19 2022   - Columns: ['EAF', 'BETA', 'SE', 'P']
Thu Sep 22 23:55:19 2022   - Output formats: ['{:.4g}', '{:.4f}', '{:.4f}', '{:.4e}']
Thu Sep 22 23:55:19 2022  - Start outputting sumstats in vep format...
Thu Sep 22 23:55:19 2022  -Number of SNPs : 757588
Thu Sep 22 23:55:19 2022  -Number of Insertions : 52621
Thu Sep 22 23:55:19 2022  -Number of Deletions : 23213
Thu Sep 22 23:55:19 2022  -formatting to 1-based bed-like file (for vep)...
Thu Sep 22 23:55:20 2022  -Output columns: Index(['CHR', 'START', 'END', 'NEA/EA', 'STRAND', 'SNPID'], dtype='object')
Thu Sep 22 23:55:20 2022  -Output path: ./mysumstats.vep.gz
Thu Sep 22 23:55:26 2022  -Saving log file: ./mysumstats.vep.log
Thu Sep 22 23:55:26 2022 Finished outputting successfully!
