# Data conversion

In [1]:
import gwaslab as gl

## Loading sample data

In [2]:
# Load the first 5 rows of the sample file, mapping its column headers
# (SNP, CHR, POS, ALT, REF, BETA, SE) onto the corresponding Sumstats arguments.
mysumstats = gl.Sumstats(
    "t2d_bbj.txt.gz",
    snpid="SNP",
    chrom="CHR",
    pos="POS",
    ea="ALT",
    nea="REF",
    beta="BETA",
    se="SE",
    nrows=5,
)

Fri Nov  4 16:15:44 2022 Start to initiate from file :t2d_bbj.txt.gz
Fri Nov  4 16:15:44 2022  -Reading columns          : CHR,REF,POS,BETA,ALT,SE,SNP
Fri Nov  4 16:15:44 2022  -Renaming columns to      : CHR,NEA,POS,BETA,EA,SE,SNPID
Fri Nov  4 16:15:44 2022  -Current dataframe shape  : Rows  5  x  7  Columns
Fri Nov  4 16:15:44 2022  -Initiating a status column ...
Fri Nov  4 16:15:45 2022 Start to reorder the columns...
Fri Nov  4 16:15:45 2022  -Current Dataframe shape : 5  x  8
Fri Nov  4 16:15:45 2022  -Reordering columns to    : SNPID,CHR,POS,EA,NEA,BETA,SE,STATUS
Fri Nov  4 16:15:45 2022 Finished sorting columns successfully!
Fri Nov  4 16:15:45 2022 Finished loading data successfully!


Before converting, run a sanity check on the data.

In [3]:
# Run the statistics sanity check; per the log below it checks the BETA and SE
# ranges and reports how many variants were removed.
mysumstats.check_sanity()

Fri Nov  4 16:15:45 2022 Start sanity check for statistics ...
Fri Nov  4 16:15:45 2022  -Current Dataframe shape : 5  x  8
Fri Nov  4 16:15:45 2022  -Checking if  -10 <BETA)< 10  ...
Fri Nov  4 16:15:45 2022  -Removed 0 variants with bad BETA.
Fri Nov  4 16:15:45 2022  -Checking if  0 <SE< inf  ...
Fri Nov  4 16:15:45 2022  -Removed 0 variants with bad SE.
Fri Nov  4 16:15:45 2022  -Checking STATUS...
Fri Nov  4 16:15:46 2022  -Coverting STAUTUS to category.
Fri Nov  4 16:15:46 2022  -Removed 0 variants with bad statistics in total.
Fri Nov  4 16:15:46 2022 Finished sanity check successfully!


## BETA -> OR, BETA/SE -> OR/OR_95L/OR_95U

In [4]:
# Requesting OR also fills OR_95L and OR_95U from BETA/SE (see the log below).
mysumstats.fill_data(to_fill=["OR"])

Fri Nov  4 16:15:46 2022 Start filling data using existing columns...
Fri Nov  4 16:15:46 2022  -Raw input columns:  ['SNPID', 'CHR', 'POS', 'EA', 'NEA', 'BETA', 'SE', 'STATUS']
Fri Nov  4 16:15:46 2022  -Overwrite mode:  False
Fri Nov  4 16:15:46 2022   -Skipping columns:  []
Fri Nov  4 16:15:46 2022  -Filling columns:  ['OR']
Fri Nov  4 16:15:46 2022   - Filling OR using BETA column...
Fri Nov  4 16:15:46 2022   - Filling OR_95L/OR_95U using BETA/SE columns...
Fri Nov  4 16:15:46 2022 Start to reorder the columns...
Fri Nov  4 16:15:46 2022  -Current Dataframe shape : 5  x  11
Fri Nov  4 16:15:46 2022  -Reordering columns to    : SNPID,CHR,POS,EA,NEA,BETA,SE,OR,OR_95L,OR_95U,STATUS
Fri Nov  4 16:15:46 2022 Finished sorting columns successfully!
Fri Nov  4 16:15:46 2022 Finished filling data using existing columns.


In [5]:
# Display the table: OR, OR_95L and OR_95U now appear alongside BETA and SE.
mysumstats.data

Unnamed: 0,SNPID,CHR,POS,EA,NEA,BETA,SE,OR,OR_95L,OR_95U,STATUS
0,1:725932_G_A,1,725932,G,A,-0.0737,0.1394,0.92895,0.706863,1.220815,9999999
1,1:725933_A_G,1,725933,G,A,0.0737,0.1394,1.076484,0.819125,1.414702,9999999
2,1:737801_T_C,1,737801,C,T,0.049,0.1231,1.05022,0.825083,1.33679,9999999
3,1:749963_T_TAA,1,749963,TAA,T,0.0213,0.0199,1.021528,0.982453,1.062159,9999999
4,1:751343_T_A,1,751343,T,A,0.0172,0.0156,1.017349,0.986714,1.048935,9999999


In [6]:
# Remove BETA and SE so the next section can show them being recomputed from the
# odds-ratio columns. The frame is modified in place.
mysumstats.data.drop(columns=["BETA", "SE"], inplace=True)

## OR -> BETA, OR/OR_95L/OR_95U -> BETA/SE

In [7]:
# Recompute BETA from OR and SE from OR/OR_95U (see the log below).
mysumstats.fill_data(to_fill=["BETA","SE"])

Fri Nov  4 16:15:46 2022 Start filling data using existing columns...
Fri Nov  4 16:15:46 2022  -Raw input columns:  ['SNPID', 'CHR', 'POS', 'EA', 'NEA', 'OR', 'OR_95L', 'OR_95U', 'STATUS']
Fri Nov  4 16:15:46 2022  -Overwrite mode:  False
Fri Nov  4 16:15:46 2022   -Skipping columns:  []
Fri Nov  4 16:15:46 2022  -Filling columns:  ['BETA', 'SE']
Fri Nov  4 16:15:46 2022   - Filling BETA value using OR column...
Fri Nov  4 16:15:46 2022   - Filling SE value using OR/OR_95U column...
Fri Nov  4 16:15:46 2022 Start to reorder the columns...
Fri Nov  4 16:15:46 2022  -Current Dataframe shape : 5  x  11
Fri Nov  4 16:15:46 2022  -Reordering columns to    : SNPID,CHR,POS,EA,NEA,BETA,SE,OR,OR_95L,OR_95U,STATUS
Fri Nov  4 16:15:46 2022 Finished sorting columns successfully!
Fri Nov  4 16:15:46 2022 Finished filling data using existing columns.


In [8]:
# Display the table: the BETA and SE columns are present again.
mysumstats.data

Unnamed: 0,SNPID,CHR,POS,EA,NEA,BETA,SE,OR,OR_95L,OR_95U,STATUS
0,1:725932_G_A,1,725932,G,A,-0.0737,0.1394,0.92895,0.706863,1.220815,9999999
1,1:725933_A_G,1,725933,G,A,0.0737,0.1394,1.076484,0.819125,1.414702,9999999
2,1:737801_T_C,1,737801,C,T,0.049,0.1231,1.05022,0.825083,1.33679,9999999
3,1:749963_T_TAA,1,749963,TAA,T,0.0213,0.0199,1.021528,0.982453,1.062159,9999999
4,1:751343_T_A,1,751343,T,A,0.0172,0.0156,1.017349,0.986714,1.048935,9999999


## BETA/SE -> Z

In [9]:
# Derive the Z column from the BETA and SE columns.
mysumstats.fill_data(to_fill=["Z"])

Fri Nov  4 16:15:46 2022 Start filling data using existing columns...
Fri Nov  4 16:15:46 2022  -Raw input columns:  ['SNPID', 'CHR', 'POS', 'EA', 'NEA', 'BETA', 'SE', 'OR', 'OR_95L', 'OR_95U', 'STATUS']
Fri Nov  4 16:15:46 2022  -Overwrite mode:  False
Fri Nov  4 16:15:46 2022   -Skipping columns:  []
Fri Nov  4 16:15:46 2022  -Filling columns:  ['Z']
Fri Nov  4 16:15:46 2022   - Filling Z using BETA/SE column...
Fri Nov  4 16:15:46 2022 Start to reorder the columns...
Fri Nov  4 16:15:46 2022  -Current Dataframe shape : 5  x  12
Fri Nov  4 16:15:46 2022  -Reordering columns to    : SNPID,CHR,POS,EA,NEA,BETA,SE,Z,OR,OR_95L,OR_95U,STATUS
Fri Nov  4 16:15:46 2022 Finished sorting columns successfully!
Fri Nov  4 16:15:46 2022 Finished filling data using existing columns.


In [10]:
# Display the table: the new Z column sits after SE.
mysumstats.data

Unnamed: 0,SNPID,CHR,POS,EA,NEA,BETA,SE,Z,OR,OR_95L,OR_95U,STATUS
0,1:725932_G_A,1,725932,G,A,-0.0737,0.1394,-0.528694,0.92895,0.706863,1.220815,9999999
1,1:725933_A_G,1,725933,G,A,0.0737,0.1394,0.528694,1.076484,0.819125,1.414702,9999999
2,1:737801_T_C,1,737801,C,T,0.049,0.1231,0.398051,1.05022,0.825083,1.33679,9999999
3,1:749963_T_TAA,1,749963,TAA,T,0.0213,0.0199,1.070354,1.021528,0.982453,1.062159,9999999
4,1:751343_T_A,1,751343,T,A,0.0172,0.0156,1.102576,1.017349,0.986714,1.048935,9999999


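## P -> MLOG10P

The sample data has no P column at this point, so `fill_data` first derives P from Z and then converts it to MLOG10P (see the log below). Only MLOG10P is kept in the table.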

In [11]:
# Request the MLOG10P column (-log10 of the P value).
mysumstats.fill_data(to_fill=["MLOG10P"])

Fri Nov  4 16:15:46 2022 Start filling data using existing columns...
Fri Nov  4 16:15:46 2022  -Raw input columns:  ['SNPID', 'CHR', 'POS', 'EA', 'NEA', 'BETA', 'SE', 'Z', 'OR', 'OR_95L', 'OR_95U', 'STATUS']
Fri Nov  4 16:15:46 2022  -Overwrite mode:  False
Fri Nov  4 16:15:46 2022   -Skipping columns:  []
Fri Nov  4 16:15:46 2022  -Filling columns:  ['MLOG10P']
Fri Nov  4 16:15:46 2022   - Filling P value using Z column...
Fri Nov  4 16:15:46 2022   - Filling MLOG10P using P column...
Fri Nov  4 16:15:46 2022 Start to reorder the columns...
Fri Nov  4 16:15:46 2022  -Current Dataframe shape : 5  x  13
Fri Nov  4 16:15:46 2022  -Reordering columns to    : SNPID,CHR,POS,EA,NEA,BETA,SE,Z,MLOG10P,OR,OR_95L,OR_95U,STATUS
Fri Nov  4 16:15:46 2022 Finished sorting columns successfully!
Fri Nov  4 16:15:46 2022 Finished filling data using existing columns.


## MLOG10P -> P 

In [12]:
# Request the P column; per the log below it is filled from MLOG10P.
mysumstats.fill_data(to_fill=["P"])

Fri Nov  4 16:15:46 2022 Start filling data using existing columns...
Fri Nov  4 16:15:46 2022  -Raw input columns:  ['SNPID', 'CHR', 'POS', 'EA', 'NEA', 'BETA', 'SE', 'Z', 'MLOG10P', 'OR', 'OR_95L', 'OR_95U', 'STATUS']
Fri Nov  4 16:15:46 2022  -Overwrite mode:  False
Fri Nov  4 16:15:46 2022   -Skipping columns:  []
Fri Nov  4 16:15:46 2022  -Filling columns:  ['P']
Fri Nov  4 16:15:46 2022   - Filling P value using MLOG10P column...
Fri Nov  4 16:15:46 2022 Start to reorder the columns...
Fri Nov  4 16:15:46 2022  -Current Dataframe shape : 5  x  14
Fri Nov  4 16:15:46 2022  -Reordering columns to    : SNPID,CHR,POS,EA,NEA,BETA,SE,Z,P,MLOG10P,OR,OR_95L,OR_95U,STATUS
Fri Nov  4 16:15:46 2022 Finished sorting columns successfully!
Fri Nov  4 16:15:46 2022 Finished filling data using existing columns.


In [13]:
# Display the final table, now including both P and MLOG10P.
mysumstats.data

Unnamed: 0,SNPID,CHR,POS,EA,NEA,BETA,SE,Z,P,MLOG10P,OR,OR_95L,OR_95U,STATUS
0,1:725932_G_A,1,725932,G,A,-0.0737,0.1394,-0.528694,0.597018,0.224013,0.92895,0.706863,1.220815,9999999
1,1:725933_A_G,1,725933,G,A,0.0737,0.1394,0.528694,0.597018,0.224013,1.076484,0.819125,1.414702,9999999
2,1:737801_T_C,1,737801,C,T,0.049,0.1231,0.398051,0.690593,0.160778,1.05022,0.825083,1.33679,9999999
3,1:749963_T_TAA,1,749963,TAA,T,0.0213,0.0199,1.070354,0.28446,0.545979,1.021528,0.982453,1.062159,9999999
4,1:751343_T_A,1,751343,T,A,0.0172,0.0156,1.102576,0.270211,0.568297,1.017349,0.986714,1.048935,9999999
