In [1]:
import numpy as np
import h5py
import glob
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import logging
import argparse

import sys
import os
import sib_ldsc_z as ld

In [8]:
ldscpath = "/disk/genetics/ukb/alextisyoung/haplotypes/relatives/bedfiles/ldscores/*[0-9].l2.ldscore.gz"
filenames = "/disk/genetics/ukb/alextisyoung/haplotypes/relatives/traits/9/chr_*.hdf5"
ldcolnames = ["CHR", "SNP", "BP", "L2"]

In [6]:
ldscores= ld.read_ldscores(ldscpath, ldcolnames)

In [7]:
ldscores.head()

Unnamed: 0,CHR,SNP,BP,L2
0,1,rs3131972,752721,3.691
1,1,rs12184325,754105,2.113
2,1,rs3131962,756604,3.793
3,1,rs114525117,759036,2.103
4,1,rs115991721,767096,1.103


In [13]:
metadata

array([[b'6', b'rs11757325', b'0.269301865963754', b'203397', b'C', b'T'],
       [b'6', b'rs80014302', b'0.269663095780137', b'203722', b'T', b'G'],
       [b'6', b'rs80324517', b'0.270023385241025', b'204031', b'G', b'A'],
       ...,
       [b'6', b'rs17860838', b'198.250630831501', b'170889829', b'C',
        b'T'],
       [b'6', b'rs12206696', b'198.251798897992', b'170899075', b'T',
        b'C'],
       [b'6', b'rs7775031', b'198.254831357579', b'170907734', b'G',
        b'A']], dtype='|S17')

In [14]:
files = glob.glob(filenames)

file = files[0]
print("Reading in file: ", file)
hf = h5py.File(file, 'r')
metadata = hf.get("bim")[()]
chromosome = metadata[:, 0]
snp = metadata[:, 1]
bp = metadata[:, 3]
theta  = hf.get('estimate')[()]
se  = hf.get('estimate_ses')[()]
N = hf.get('N_L')[()]
S = hf.get('estimate_covariance')[()]
f = hf.get('freqs')[()]

# normalizing S
sigma2 = hf.get('sigma2')[()]
tau = hf.get('tau')[()]
phvar = sigma2+sigma2/tau

if len(files) > 1:
    for file in files[1:]:
        print("Reading in file: ", file)
        hf = h5py.File(file, 'r')
        metadata = hf.get("bim")[()]
        chromosome_file = metadata[:, 0]  
        snp_file = metadata[:, 1]
        bp_file = metadata[:, 3]
        theta_file  = hf.get('estimate')[()]
        se_file  = hf.get('estimate_ses')[()]
        S_file = hf.get('estimate_covariance')[()]
        f_file = hf.get('freqs')[()]
        N_file = hf.get('N_L')[()]

        # normalizing S
        sigma2 = hf.get('sigma2')[()]
        tau = hf.get('tau')[()]

        chromosome = np.append(chromosome, chromosome_file, axis = 0)
        snp = np.append(snp, snp_file, axis = 0)
        bp = np.append(bp, bp_file, axis = 0)
        theta = np.append(theta, theta_file, axis = 0)
        se = np.append(se, se_file, axis = 0)
        S = np.append(S, S_file, axis = 0)
        f = np.append(f, f_file, axis = 0)
        N = np.append(N, N_file, axis = 0)

# Constructing dataframe of data
zdata = pd.DataFrame({'CHR' : chromosome,
                    'SNP' : snp,
                    'BP' : bp,
                    'N' : N,
                    "f" : f,
                    'theta' : theta.tolist(),
                    'se' : se.tolist(),
                    "S" : S.tolist()})

Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/relatives/traits/9/chr_8.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/relatives/traits/9/chr_16.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/relatives/traits/9/chr_12.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/relatives/traits/9/chr_4.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/relatives/traits/9/chr_1.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/relatives/traits/9/chr_5.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/relatives/traits/9/chr_9.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/relatives/traits/9/chr_17.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/relatives/traits/9/chr_13.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/relatives/traits/9/chr_22.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/relatives/traits/9/chr_3

In [11]:
zdata.head()

Unnamed: 0,CHR,SNP,N,f,theta,se,S
0,b'8',b'rs11780869',43342,0.075331,"[-0.012398325838148594, 0.044778984040021896, ...","[0.02189680002629757, 0.039695702493190765, 0....","[[0.00047946989070624113, -0.00037018390139564..."
1,b'8',b'rs2003497',43342,0.621533,"[0.0010558661306276917, 0.0021548783406615257,...","[0.011948716826736927, 0.02137044072151184, 0....","[[0.000142771823448129, -0.0001093009341275319..."
2,b'8',b'rs56080987',43342,0.980838,"[0.043102409690618515, -0.06967756897211075, -...","[0.042830925434827805, 0.07517721503973007, 0....","[[0.0018344882410019636, -0.001305687241256237..."
3,b'8',b'rs116844674',43342,0.962588,"[0.04455234110355377, -0.06375543773174286, 0....","[0.03009759448468685, 0.05230757221579552, 0.0...","[[0.000905865163076669, -0.0006721090176142752..."
4,b'8',b'rs10488368',43342,0.906026,"[0.004531058948487043, -0.018941061571240425, ...","[0.01970919780433178, 0.03549977391958237, 0.0...","[[0.0003884524921886623, -0.000290763855446130..."


In [12]:
zdata.merge(ldscores, how = "inner", on = ["CHR", "BP"])

Unnamed: 0,CHR,SNP,N,f,theta,se,S,BP,L2
