In [1]:
import numpy as np
import h5py
import glob
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import logging


import sys
import os
sys.path.append(os.getcwd() + '/..')
import sib_ldsc_z as ld

In [22]:
# Discover the trait directories and look up the human-readable name of the
# trait analysed in this notebook.
trait_dirs = glob.glob("/disk/genetics/ukb/alextisyoung/GS20k_sumstats/traits/*/")

# Parse the trait number from the directory name itself rather than from a
# fixed-width slice of the path: slicing the last two characters truncates
# codes with three or more digits (".../123/" would become 23).
# Sub-directories whose name is not purely numeric are skipped.
trait_dir_names = [os.path.basename(os.path.normpath(d)) for d in trait_dirs]
traitnos = [int(name) for name in trait_dir_names if name.isdigit()]

# Space-separated lookup table with two unnamed columns: code and trait name.
traitcodes = pd.read_csv("/disk/genetics/ukb/alextisyoung/GS20k_sumstats/traits/traits.txt",
            names = ['traitcode', 'trait'],
            sep = ' ')

# Trait code whose name is printed below.
trait_code = 4
trait_matches = traitcodes.loc[traitcodes['traitcode'] == trait_code, 'trait']
if trait_matches.empty:
    # Fail with a readable message instead of an IndexError from `.values[0]`.
    raise KeyError(f"trait code {trait_code} not found in traits.txt")
traitname = trait_matches.values[0]
print(traitname)

FEV1


In [6]:
startTime = datetime.datetime.now()
# NOTE(review): no logging configuration is visible in this notebook, so this
# INFO record is presumably dropped by the default WARNING threshold - confirm.
logging.info(f"Start time:  {startTime}")

# == Reading in Data == #
print("=====================================")
print("Reading in Data")
print("=====================================")

# One HDF5 file of summary statistics per chromosome.
files = glob.glob("/disk/genetics/ukb/alextisyoung/GS20k_sumstats/traits/4/chr_*.hdf5")
if not files:
    # Fail early with a clear message rather than an IndexError further down.
    raise FileNotFoundError(
        "No chr_*.hdf5 files found in /disk/genetics/ukb/alextisyoung/GS20k_sumstats/traits/4/"
    )

# Collect the per-file arrays and concatenate once after the loop; appending
# inside the loop would re-copy everything accumulated so far on every file.
chromosome_parts = []
snp_parts = []
theta_parts = []
se_parts = []
S_parts = []
f_parts = []
N_parts = []

for file_idx, file in enumerate(files):
    print("Reading in file: ", file)
    # The context manager closes the HDF5 handle as soon as the datasets have
    # been copied into memory (`[()]` reads the full dataset).
    with h5py.File(file, 'r') as hf:
        metadata = hf['bim'][()]
        chromosome_parts.append(metadata[:, 0])  # column 0 -> 'CHR'
        snp_parts.append(metadata[:, 3])         # column 3 -> stored as 'BP'
        theta_parts.append(hf['estimate'][()])
        se_parts.append(hf['estimate_ses'][()])
        S_parts.append(hf['estimate_covariance'][()])
        f_parts.append(hf['freqs'][()])
        N_parts.append(hf['N_L'][()])

        # normalizing S
        sigma2 = hf['sigma2'][()]
        tau = hf['tau'][()]

    if file_idx == 0:
        # Computed from the first file only and not applied to S anywhere in
        # this cell; kept so the variable stays available to later cells.
        phvar = sigma2+sigma2/tau

chromosome = np.concatenate(chromosome_parts, axis = 0)
snp = np.concatenate(snp_parts, axis = 0)
theta = np.concatenate(theta_parts, axis = 0)
se = np.concatenate(se_parts, axis = 0)
S = np.concatenate(S_parts, axis = 0)
f = np.concatenate(f_parts, axis = 0)
N = np.concatenate(N_parts, axis = 0)

# Constructing dataframe of data
# theta, se and S are stored as one nested Python list per row.
zdata = pd.DataFrame({'CHR' : chromosome,
                    'BP' : snp,
                    'N' : N,
                    "f" : f,
                    'theta' : theta.tolist(),
                    'se' : se.tolist(),
                    "S" : S.tolist()})


zdata['CHR'] = zdata['CHR'].astype(int)
# Strips the leading "b'" and the trailing character from the string form of
# each entry - presumably the entries are bytes objects read from HDF5, whose
# str() looks like "b'12345'"; verify against the file contents.
zdata['BP'] = zdata['BP'].astype(str).str.replace("b'", "").str[:-1]

Reading in Data
Reading in file:  /disk/genetics/ukb/alextisyoung/GS20k_sumstats/traits/4/chr_5.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/GS20k_sumstats/traits/4/chr_17.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/GS20k_sumstats/traits/4/chr_1.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/GS20k_sumstats/traits/4/chr_13.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/GS20k_sumstats/traits/4/chr_22.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/GS20k_sumstats/traits/4/chr_9.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/GS20k_sumstats/traits/4/chr_8.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/GS20k_sumstats/traits/4/chr_4.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/GS20k_sumstats/traits/4/chr_16.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/GS20k_sumstats/traits/4/chr_12.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/GS20k_sumstats/traits/4/chr_18.hdf5
Reading in file:  /disk/genetics/ukb/alexti

In [7]:
# == Reading in LD Scores == #

# Directory searched for both the LD score tables and the M files.
ldscore_path = "/disk/genetics/ukb/alextisyoung/GS20k_sumstats/ldscores/"

# Glob pattern and column names for the LD score files.
ldfiles = "*[0-9].l2.ldscore.gz"
ldcolnames = ["CHR", "SNP", "BP", "L2"]

# Glob pattern and column names for the M files.
Mfiles = "*[0-9].l2.M_5_50"
Mcolnames = ["M", "CHR"]

# The project helper returns two objects: the LD scores and the M table.
ldscores, mfile = ld.read_ldscores(
    ldscore_path, ldfiles, ldcolnames, Mfiles, Mcolnames
)

In [9]:
# Attach LD scores to the summary statistics.
# The position key is cast to int on both sides first so the join compares
# like with like (LD scores first, then the summary-statistics frame).
for frame in (ldscores, zdata):
    frame['BP'] = frame['BP'].astype('int')

# Inner join on chromosome and position, then drop every row that still
# contains a missing value.
main_df = pd.merge(zdata, ldscores, how = "inner", on = ["CHR", "BP"]).dropna()

In [11]:
# Pull the model inputs out of the merged frame as NumPy arrays.

effect_estimated = "direct_plus_population"

# Columns holding nested lists are stacked back into arrays.
S = np.array(main_df["S"].tolist())
theta = np.array(main_df["theta"].tolist())
f = np.array(main_df["f"].tolist())

# Both l and u are taken from the L2 column, as independent arrays.
l = np.array(main_df["L2"].tolist())
u = np.array(main_df["L2"].tolist())

# M is the number of rows kept after the merge.
# Alternative left disabled in the notebook: M = mfile['M'].sum()
M = S.shape[0]

# Project helper that re-expresses S and theta for the chosen effect type.
S, theta = ld.transform_estimates(effect_estimated, S, theta)

In [113]:
# Convert the effect estimates to z-values using their covariance S and M.
zval = ld.theta2z(theta, S, M = M)

# == Initializing model == #
# All inputs are passed by keyword to the project's sibreg class.
model = ld.sibreg(
    S = S, z = zval,
    l = l, u = u,
    f = f, M = M,
)

In [114]:
# Fit the model; solve() returns two values, unpacked here.
# NOTE(review): the recorded output below shows source-line fragments
# (`log_ll += ...`, `std_err_mat = np.sqrt(invH)`) that look like the tail of
# runtime warnings raised during the fit - confirm the standard errors are
# finite before relying on them.
output_matrix, result = model.solve() 

No initial guess provided.
Making Method of Moments Guess
Initial estimate: [22.74811853 75.56236502  0.58662413]


  log_ll += (1/ui) * _log_ll(V, zi, Si, li, M)
  log_ll += (1/ui) * _log_ll(V, zi, Si, li, M)
  std_err_mat = np.sqrt(invH)
