In [15]:
import numpy as np
import h5py
import glob
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import logging


import sys
import os
sys.path.append(os.getcwd() + '/..')
import sib_ldsc_z as ld

In [41]:
# == Reading in data == #
print("=====================================")
print("Reading in Data")
print("=====================================")

# Glob pattern matching the summary-statistics HDF5 file(s) to load.
filenames = "/disk/genetics/ukb/alextisyoung/haplotypes/simulated_pops/from_chr1_to_chr23_start0_end50_run0_p0-0_ab_corr1-0_vb0-25_length2/gen_0_gen_1_phenotype.hdf5"


def _read_sumstats(path):
    """Read the datasets this notebook uses from one HDF5 file.

    The file is opened read-only and closed again before returning, so every
    value in the returned dict has already been read into memory.

    Parameters
    ----------
    path : str
        Path of the HDF5 file to read.

    Returns
    -------
    dict
        Keys ``chromosome`` and ``snp`` (columns 0 and 1 of the ``bim``
        dataset), ``theta`` (``estimate``), ``se`` (``estimate_ses``),
        ``N`` (``N_L``), ``S`` (``estimate_covariance``), ``f`` (``freqs``),
        ``sigma2`` and ``tau``.

    Raises
    ------
    KeyError
        If any of the datasets listed above is missing from the file.
    """
    print("Reading in file: ", path)
    # Context manager guarantees the handle is released even if a read fails.
    with h5py.File(path, 'r') as hf:
        bim = hf["bim"][()]
        return {
            "chromosome": bim[:, 0],
            "snp": bim[:, 1],
            "theta": hf["estimate"][()],
            "se": hf["estimate_ses"][()],
            "N": hf["N_L"][()],
            "S": hf["estimate_covariance"][()],
            "f": hf["freqs"][()],
            "sigma2": hf["sigma2"][()],
            "tau": hf["tau"][()],
        }


# Sorted so the concatenation order does not depend on directory listing order.
files = sorted(glob.glob(filenames))
if not files:
    raise FileNotFoundError(f"No files match pattern: {filenames}")

records = [_read_sumstats(path) for path in files]


def _stack(key):
    """Join one field across all files along axis 0.

    With a single file the array is returned exactly as it was read, so the
    one-file case behaves the same as reading that file directly.
    """
    parts = [record[key] for record in records]
    return parts[0] if len(parts) == 1 else np.concatenate(parts, axis=0)


chromosome = _stack("chromosome")
snp = _stack("snp")
theta = _stack("theta")
se = _stack("se")
N = _stack("N")
S = _stack("S")
f = _stack("f")

# phvar is derived from the first file's sigma2 and tau. It is only computed
# here; S is not rescaled by it in this cell.
phvar = records[0]["sigma2"] + records[0]["sigma2"] / records[0]["tau"]

# sigma2 and tau keep the values of the last file read.
sigma2 = records[-1]["sigma2"]
tau = records[-1]["tau"]

Reading in Data
Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/simulated_pops/from_chr1_to_chr23_start0_end50_run0_p0-0_ab_corr1-0_vb0-25_length2/gen_0_gen_1_phenotype.hdf5


In [17]:
# Constructing dataframe of data
# theta, se and S go through .tolist() so that each row of a multi-dimensional
# array is stored as a single (possibly nested) list in one dataframe cell.
zdata = pd.DataFrame({'CHR' : chromosome,
                    'SNP' : snp,
                    'N' : N,
                    "f" : f,
                    'theta' : theta.tolist(),
                    'se' : se.tolist(),
                    "S" : S.tolist()})


zdata['CHR'] = zdata['CHR'].astype(int)

# Decode byte-string SNP ids directly instead of trimming the "b'...'" repr:
# trimming the repr drops the last character of ids that are already str and
# removes any "b'" that happens to occur inside an id.
zdata['SNP'] = zdata['SNP'].map(
    lambda snp_id: snp_id.decode() if isinstance(snp_id, bytes) else str(snp_id)
)

In [46]:
# == Reading in LD Scores == #
# Column names handed to the readers, followed by the glob patterns they scan.
ldcolnames = ["CHR", "SNP", "BP", "L2"]
Mcolnames = ["M", "CHR"]
ldscore_path = "/disk/genetics/ukb/alextisyoung/haplotypes/simulated_pops/from_chr1_to_chr23_start0_end50_run0_p0-0_ab_corr1-0_vb0-25_length2/ldscores/*[0-9].l2.ldscore.gz"
Mfiles = "/disk/genetics/ukb/alextisyoung/haplotypes/simulated_pops/from_chr1_to_chr23_start0_end50_run0_p0-0_ab_corr1-0_vb0-25_length2/ldscores/*[0-9].l2.M_5_50"

ldscores = ld.read_ldscores(ldscore_path, ldcolnames)
nloci = ld.read_mfiles(Mfiles, Mcolnames)

# Attach LD scores to the summary statistics: inner join on (CHR, SNP),
# then discard every row that still contains a missing value.
main_df = (
    zdata
    .merge(ldscores, on=["CHR", "SNP"], how="inner")
    .dropna()
)

In [47]:
# Change in row count between zdata and the merged, NA-dropped main_df.
len(main_df) - len(zdata)

0

In [49]:
# transforming inputs
effect_estimated = "direct_plus_population"


def _column_as_array(name):
    """Return the values of main_df[name] as a fresh numpy array."""
    return np.array(main_df[name].tolist())


S = _column_as_array("S")
theta = _column_as_array("theta")
f = _column_as_array("f")
# l and u are both filled from the same "L2" column, as separate arrays.
l = _column_as_array("L2")
u = _column_as_array("L2")

# M is the number of rows kept in main_df, not a total summed from the M files.
M = len(S)

S, theta = ld.transform_estimates(effect_estimated, S, theta)

# making z value
zval = ld.theta2z(theta, S, M=M)

# == Initializing model == #
model = ld.sibreg(S=S, z=zval, l=l, f=f, u=u, M=M)

output_matrix, result = model.solve()

No initial guess provided.
Making Method of Moments Guess
Initial estimate: [22.89196202 48.93274186  0.99863346]


  log_ll += (1/ui) * _log_ll(V, zi, Si, li, M)
  log_ll += (1/ui) * _log_ll(V, zi, Si, li, M)


In [52]:
    # Collect the fitted parameters; the solver's 'r' entry is exposed as 'v3'.
    estimates = {
        label: output_matrix[key]
        for label, key in (('v1', 'v1'), ('v2', 'v2'), ('v3', 'r'))
    }

    # NOTE(review): despite the name, this is np.diag of 'std_err_mat' only,
    # presumably the per-parameter diagonal of a 2-D matrix; confirm.
    varcovar_mat = np.diag(output_matrix['std_err_mat'])

In [53]:
# Display the result of np.diag(output_matrix['std_err_mat']).
varcovar_mat

array([0.05996191, 0.11832532, 0.0013139 ])