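# Data conversion

This notebook shows how `Sumstats.fill_data()` derives missing statistics columns (OR with OR_95L/OR_95U, BETA/SE, Z, P and MLOG10P) from the columns already present, using the first five rows of a sample summary-statistics file.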

In [1]:
import gwaslab as gl

## Loading sample data

In [2]:
# Load the sample summary statistics, mapping the file's column names onto
# the gwaslab headers (SNPID, CHR, POS, EA, NEA, BETA, SE).
# nrows=5 keeps the demo small: only the first five rows are read.
mysumstats = gl.Sumstats(
    "t2d_bbj.txt.gz",
    snpid="SNP",
    chrom="CHR",
    pos="POS",
    ea="ALT",
    nea="REF",
    beta="BETA",
    se="SE",
    nrows=5,
)

Sat Oct 29 14:30:03 2022 Start to initiate from file :t2d_bbj.txt.gz
Sat Oct 29 14:30:03 2022  -Reading columns          : SE,SNP,POS,REF,BETA,ALT,CHR
Sat Oct 29 14:30:03 2022  -Renaming columns to      : SE,SNPID,POS,NEA,BETA,EA,CHR
Sat Oct 29 14:30:03 2022  -Current dataframe shape  : Rows  5  x  7  Columns
Sat Oct 29 14:30:03 2022  -Initiating a status column ...
Sat Oct 29 14:30:03 2022  -Reordering columns to    : SNPID,CHR,POS,EA,NEA,BETA,SE,STATUS
Sat Oct 29 14:30:03 2022 Finished loading data successfully!


In [3]:
# Show the loaded rows with the renamed columns and the STATUS column added during loading.
mysumstats.data

Unnamed: 0,SNPID,CHR,POS,EA,NEA,BETA,SE,STATUS
0,1:725932_G_A,1,725932,G,A,-0.0737,0.1394,9999999
1,1:725933_A_G,1,725933,G,A,0.0737,0.1394,9999999
2,1:737801_T_C,1,737801,C,T,0.049,0.1231,9999999
3,1:749963_T_TAA,1,749963,TAA,T,0.0213,0.0199,9999999
4,1:751343_T_A,1,751343,T,A,0.0172,0.0156,9999999


In [4]:
# Run the sanity check on the statistics; the log below shows it checks abs(BETA)<10, SE>0 and STATUS.
mysumstats.check_sanity()

Sat Oct 29 14:30:03 2022 Start sanity check for statistics ...
Sat Oct 29 14:30:03 2022  -Current Dataframe shape : 5  x  8
Sat Oct 29 14:30:03 2022  -Checking if abs(BETA)<10 ...
Sat Oct 29 14:30:03 2022  -Removed 0 variants with bad BETA.
Sat Oct 29 14:30:03 2022  -Checking if SE >0 ...
Sat Oct 29 14:30:03 2022  -Removed 0 variants with bad SE.
Sat Oct 29 14:30:03 2022  -Checking STATUS...
Sat Oct 29 14:30:04 2022  -Coverting STAUTUS to category.
Sat Oct 29 14:30:04 2022  -Removed 0 variants with bad statistics in total.
Sat Oct 29 14:30:04 2022 Finished sanity check successfully!


## BETA -> OR, BETA/SE -> OR/OR_95L/OR_95U

In [5]:
# Derive OR from BETA; per the log below, OR_95L/OR_95U are also filled from BETA/SE.
mysumstats.fill_data(to_fill=["OR"])

Sat Oct 29 14:30:04 2022 Start filling data using existing columns...
Sat Oct 29 14:30:04 2022  -Raw input columns:  ['SNPID', 'CHR', 'POS', 'EA', 'NEA', 'BETA', 'SE', 'STATUS']
Sat Oct 29 14:30:04 2022  -Overwrite mode:  False
Sat Oct 29 14:30:04 2022   - Skipping columns:  []
Sat Oct 29 14:30:04 2022 Filling columns:  ['OR']
Sat Oct 29 14:30:04 2022   - Filling OR using BETA column...
Sat Oct 29 14:30:04 2022   - Filling OR_95L/OR_95U using BETA/SE columns...
Sat Oct 29 14:30:04 2022 Finished filling data using existing columns.


In [6]:
# OR, OR_95L and OR_95U now appear alongside BETA and SE.
mysumstats.data

Unnamed: 0,SNPID,CHR,POS,EA,NEA,BETA,SE,OR,OR_95L,OR_95U,STATUS
0,1:725932_G_A,1,725932,G,A,-0.0737,0.1394,0.92895,0.706863,1.220815,9999999
1,1:725933_A_G,1,725933,G,A,0.0737,0.1394,1.076484,0.819125,1.414702,9999999
2,1:737801_T_C,1,737801,C,T,0.049,0.1231,1.05022,0.825083,1.33679,9999999
3,1:749963_T_TAA,1,749963,TAA,T,0.0213,0.0199,1.021528,0.982453,1.062159,9999999
4,1:751343_T_A,1,751343,T,A,0.0172,0.0156,1.017349,0.986714,1.048935,9999999


In [7]:
# Remove BETA and SE so the next section can demonstrate rebuilding them
# from the OR/OR_95L/OR_95U columns.
# Assigning the result of drop() instead of using inplace=True, together with
# errors="ignore", keeps this cell re-runnable: a second run is a no-op
# rather than a KeyError on the already-missing columns.
mysumstats.data = mysumstats.data.drop(columns=["BETA", "SE"], errors="ignore")

In [8]:
# Confirm BETA and SE are gone; OR, OR_95L and OR_95U are the remaining effect-size columns.
mysumstats.data

Unnamed: 0,SNPID,CHR,POS,EA,NEA,OR,OR_95L,OR_95U,STATUS
0,1:725932_G_A,1,725932,G,A,0.92895,0.706863,1.220815,9999999
1,1:725933_A_G,1,725933,G,A,1.076484,0.819125,1.414702,9999999
2,1:737801_T_C,1,737801,C,T,1.05022,0.825083,1.33679,9999999
3,1:749963_T_TAA,1,749963,TAA,T,1.021528,0.982453,1.062159,9999999
4,1:751343_T_A,1,751343,T,A,1.017349,0.986714,1.048935,9999999


## OR -> BETA, OR/OR_95L/OR_95U -> BETA/SE

In [9]:
# Rebuild BETA and SE from the odds-ratio columns; the log below reports OR as the
# source for BETA and OR/OR_95U as the source for SE.
mysumstats.fill_data(to_fill=["BETA","SE"])

Sat Oct 29 14:30:04 2022 Start filling data using existing columns...
Sat Oct 29 14:30:04 2022  -Raw input columns:  ['SNPID', 'CHR', 'POS', 'EA', 'NEA', 'OR', 'OR_95L', 'OR_95U', 'STATUS']
Sat Oct 29 14:30:04 2022  -Overwrite mode:  False
Sat Oct 29 14:30:04 2022   - Skipping columns:  []
Sat Oct 29 14:30:04 2022 Filling columns:  ['BETA', 'SE']
Sat Oct 29 14:30:04 2022   - Filling BETA value using OR column...
Sat Oct 29 14:30:04 2022   - Filling SE value using OR/OR_95U column...
Sat Oct 29 14:30:04 2022 Finished filling data using existing columns.


In [10]:
# BETA and SE are back; the displayed values match those shown right after loading.
mysumstats.data

Unnamed: 0,SNPID,CHR,POS,EA,NEA,BETA,SE,OR,OR_95L,OR_95U,STATUS
0,1:725932_G_A,1,725932,G,A,-0.0737,0.1394,0.92895,0.706863,1.220815,9999999
1,1:725933_A_G,1,725933,G,A,0.0737,0.1394,1.076484,0.819125,1.414702,9999999
2,1:737801_T_C,1,737801,C,T,0.049,0.1231,1.05022,0.825083,1.33679,9999999
3,1:749963_T_TAA,1,749963,TAA,T,0.0213,0.0199,1.021528,0.982453,1.062159,9999999
4,1:751343_T_A,1,751343,T,A,0.0172,0.0156,1.017349,0.986714,1.048935,9999999


## BETA/SE -> Z

In [11]:
# Compute Z from the BETA and SE columns (see the log below).
mysumstats.fill_data(to_fill=["Z"])

Sat Oct 29 14:30:05 2022 Start filling data using existing columns...
Sat Oct 29 14:30:05 2022  -Raw input columns:  ['SNPID', 'CHR', 'POS', 'EA', 'NEA', 'BETA', 'SE', 'OR', 'OR_95L', 'OR_95U', 'STATUS']
Sat Oct 29 14:30:05 2022  -Overwrite mode:  False
Sat Oct 29 14:30:05 2022   - Skipping columns:  []
Sat Oct 29 14:30:05 2022 Filling columns:  ['Z']
Sat Oct 29 14:30:05 2022   - Filling Z using BETA/SE column...
Sat Oct 29 14:30:05 2022 Finished filling data using existing columns.


In [12]:
# The new Z column appears right after SE.
mysumstats.data

Unnamed: 0,SNPID,CHR,POS,EA,NEA,BETA,SE,Z,OR,OR_95L,OR_95U,STATUS
0,1:725932_G_A,1,725932,G,A,-0.0737,0.1394,-0.528694,0.92895,0.706863,1.220815,9999999
1,1:725933_A_G,1,725933,G,A,0.0737,0.1394,0.528694,1.076484,0.819125,1.414702,9999999
2,1:737801_T_C,1,737801,C,T,0.049,0.1231,0.398051,1.05022,0.825083,1.33679,9999999
3,1:749963_T_TAA,1,749963,TAA,T,0.0213,0.0199,1.070354,1.021528,0.982453,1.062159,9999999
4,1:751343_T_A,1,751343,T,A,0.0172,0.0156,1.102576,1.017349,0.986714,1.048935,9999999


## P -> MLOG10P (here P is first derived from Z)

In [13]:
# There is no P column at this point, so (per the log below) fill_data first
# derives P from Z and then MLOG10P from P.
mysumstats.fill_data(to_fill=["MLOG10P"])

Sat Oct 29 14:30:05 2022 Start filling data using existing columns...
Sat Oct 29 14:30:05 2022  -Raw input columns:  ['SNPID', 'CHR', 'POS', 'EA', 'NEA', 'BETA', 'SE', 'Z', 'OR', 'OR_95L', 'OR_95U', 'STATUS']
Sat Oct 29 14:30:05 2022  -Overwrite mode:  False
Sat Oct 29 14:30:05 2022   - Skipping columns:  []
Sat Oct 29 14:30:05 2022 Filling columns:  ['MLOG10P']
Sat Oct 29 14:30:05 2022   - Filling P value using Z column...
Sat Oct 29 14:30:05 2022   - Filling MLOG10P using P column...
Sat Oct 29 14:30:06 2022 Finished filling data using existing columns.


## MLOG10P -> P 

In [14]:
# Fill P from the existing MLOG10P column (see the log below).
mysumstats.fill_data(to_fill=["P"])

Sat Oct 29 14:30:06 2022 Start filling data using existing columns...
Sat Oct 29 14:30:06 2022  -Raw input columns:  ['SNPID', 'CHR', 'POS', 'EA', 'NEA', 'BETA', 'SE', 'Z', 'MLOG10P', 'OR', 'OR_95L', 'OR_95U', 'STATUS']
Sat Oct 29 14:30:06 2022  -Overwrite mode:  False
Sat Oct 29 14:30:06 2022   - Skipping columns:  []
Sat Oct 29 14:30:06 2022 Filling columns:  ['P']
Sat Oct 29 14:30:06 2022   - Filling P value using MLOG10P column...
Sat Oct 29 14:30:06 2022 Finished filling data using existing columns.


In [15]:
# Final table: Z, P and MLOG10P now sit alongside BETA/SE and the OR columns.
mysumstats.data

Unnamed: 0,SNPID,CHR,POS,EA,NEA,BETA,SE,Z,P,MLOG10P,OR,OR_95L,OR_95U,STATUS
0,1:725932_G_A,1,725932,G,A,-0.0737,0.1394,-0.528694,0.597018,0.224013,0.92895,0.706863,1.220815,9999999
1,1:725933_A_G,1,725933,G,A,0.0737,0.1394,0.528694,0.597018,0.224013,1.076484,0.819125,1.414702,9999999
2,1:737801_T_C,1,737801,C,T,0.049,0.1231,0.398051,0.690593,0.160778,1.05022,0.825083,1.33679,9999999
3,1:749963_T_TAA,1,749963,TAA,T,0.0213,0.0199,1.070354,0.28446,0.545979,1.021528,0.982453,1.062159,9999999
4,1:751343_T_A,1,751343,T,A,0.0172,0.0156,1.102576,0.270211,0.568297,1.017349,0.986714,1.048935,9999999
