In [4]:
import numpy as np
import sib_ldsc_z as ld
from scipy.optimize import minimize
from scipy.special import comb
from scipy.misc import derivative
import scipy.stats
from importlib import reload
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import datetime
import multiprocessing
# import numdifftools as nd
# Re-import the local sib_ldsc_z module so that edits made to it on disk are
# picked up without restarting the kernel.
reload(ld)

<module 'sib_ldsc_z' from '/disk/homedirs/nber/harij/gitrepos/SNIPar/ldsc_reg/sib_ldsc_z.py'>

In [2]:
# Seed NumPy's global generator before anything else in this cell runs.
np.random.seed(123)

# Number of simulated observations.
N = 10_000

# Stack N copies of the same 2x2 matrix and scale every entry by 1/N.
S = np.array([[[1., -0.5], [-0.5, 1.]] for _ in range(N)]) / N

# 2x2 matrix handed (scaled by 1/N) to the simulator below.
V = np.array([[0.5, 0.25],
              [0.25, 0.5]])

# Build the model from S alone, then simulate data (and LD scores, simld=True).
model = ld.sibreg(S=S)
model.simdata(V / N, N, simld=True)

No value for U given. Generating a vector of ones (all SNPs weighted equally)
No value for LD Scores given. Generating a vector of ones for l
No value for effective number of loci is given. Using total number of loci instead
Simulated LD scores!


In [3]:
# Fit the model on the simulated data and time the call with %time.
# No initial guess is passed; the recorded output below shows the solver
# falling back to a method-of-moments starting point.
# `output` is the dict of estimates (v1, v2, r, std_err_mat) shown in the next
# cell; `result` looks like the optimiser's result object -- confirm against
# sib_ldsc_z.
%time output, result = model.solve()
print(result)

No initial guess provided.
Making Method of Moments Guess
Initial estimate: [1.55430175 1.47910632 0.53409088]
Wall time: 12.4 s
      fun: 37189.05234762325
 hess_inv: <3x3 LbfgsInvHessProduct with dtype=float64>
      jac: array([-0.02739723, -0.02879562, -0.00672324])
  message: b'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 11
      nit: 8
   status: 0
  success: True
        x: array([0.51801607, 0.49264674, 0.53215414])


  std_err_mat = np.sqrt(invH)


In [4]:
# Display the dict of estimates returned by model.solve() above.
output

{'v1': 0.5180160703428223,
 'v2': 0.4926467370660001,
 'r': 0.5321541425848505,
 'std_err_mat': array([[0.01221355, 0.00124924,        nan],
        [0.00124924, 0.01186258,        nan],
        [       nan,        nan, 0.01696686]])}

# Exploring how data is sorted

In [34]:

import h5py
import pandas as pd

def _read_sumstats_file(path):
    '''
    Read one summary-statistics HDF5 file fully into memory.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the datasets have been copied into NumPy arrays.

    Parameters
    ----------
    path : str
        Path of the HDF5 file to read.

    Returns
    -------
    dict
        Arrays keyed by 'chromosome', 'snp', 'bp' (columns 0, 1 and 3 of the
        "bim" dataset), 'theta', 'se', 'N', 'S', 'f', 'sigma2' and 'tau'.
    '''
    print("Reading in file: ", path)
    with h5py.File(path, 'r') as hf:
        metadata = hf.get("bim")[()]
        return {
            'chromosome': metadata[:, 0],
            'snp': metadata[:, 1],
            'bp': metadata[:, 3],
            'theta': hf.get('estimate')[()],
            'se': hf.get('estimate_ses')[()],
            'N': hf.get('N_L')[()],
            'S': hf.get('estimate_covariance')[()],
            'f': hf.get('freqs')[()],
            'sigma2': hf.get('sigma2')[()],
            'tau': hf.get('tau')[()],
        }


filenames = "/disk/genetics/ukb/alextisyoung/haplotypes/simulated_pops_large/from_chr1_to_chr23_start0_endNone_run0_p0-0_ab_corr0-5_vb0-25_length2/phenotype_dir_par_corr_0.5/1/chr_*.hdf5"
files = glob.glob(filenames)
if not files:
    # Fail with a clear message instead of an IndexError further down.
    raise FileNotFoundError(f"No files matched the pattern: {filenames}")

# Read every file once, then concatenate each field in a single call
# (repeated np.append re-copies the accumulated array on every iteration).
per_file = [_read_sumstats_file(path) for path in files]

chromosome = np.concatenate([d['chromosome'] for d in per_file], axis = 0)
snp = np.concatenate([d['snp'] for d in per_file], axis = 0)
bp = np.concatenate([d['bp'] for d in per_file], axis = 0)
theta = np.concatenate([d['theta'] for d in per_file], axis = 0)
se = np.concatenate([d['se'] for d in per_file], axis = 0)
S = np.concatenate([d['S'] for d in per_file], axis = 0)
f = np.concatenate([d['f'] for d in per_file], axis = 0)
N = np.concatenate([d['N'] for d in per_file], axis = 0)

# NOTE(review): the original cell labelled these lines "normalizing S", but S
# is never rescaled anywhere in this cell. The variables are kept with the
# same final values as before (phvar from the first file read, sigma2/tau from
# the last one) in case later cells rely on them.
phvar = per_file[0]['sigma2'] + per_file[0]['sigma2'] / per_file[0]['tau']
sigma2 = per_file[-1]['sigma2']
tau = per_file[-1]['tau']

# Constructing dataframe of data
zdata = pd.DataFrame({'CHR' : chromosome,
                    'SNP' : snp,
                    'BP' : bp,
                    'N' : N,
                    "f" : f,
                    'theta' : theta.tolist(),
                    'se' : se.tolist(),
                    "S" : S.tolist()})


# cleaning up a bit
# SNP and BP are stringified and the b'...' wrapper is stripped, i.e. the
# "bim" fields are presumably byte strings -- confirm against the files.
zdata['CHR'] = zdata['CHR'].astype(int)
zdata['SNP'] = zdata['SNP'].astype(str).str.replace("b'", "").str[:-1]
zdata['BP'] = zdata['BP'].astype(str).str.replace("b'", "").str[:-1]
zdata['BP'] = zdata['BP'].astype('int')

# sorting by chromosome
# A stable sort keeps rows of the same chromosome in the order they were read
# from the file; the default quicksort gives no such guarantee, which would
# make the 'ordering' column below arbitrary within a chromosome.
zdata = zdata.sort_values(by = ['CHR'], kind = 'mergesort').reset_index(drop = True)
zdata['ordering'] = zdata.index


zdata_n_message = f"Number of Observations before merging LD-Scores, before removing low MAF SNPs: {zdata.shape[0]}"

print(zdata_n_message)

# dropping obs based on MAF
# The filter is disabled in this notebook (there is no `args` object here), so
# the count printed below equals the one printed above.
# zdata = zdata[zdata['f'] >= args.maf/100.0]

zdata_n_message = f"Number of Observations before merging LD-Scores, after removing low MAF SNPs: {zdata.shape[0]}"

print(zdata_n_message)

# == Reading in LD Scores == #
ldscore_path = "/disk/genetics/ukb/alextisyoung/haplotypes/simulated_pops_large/from_chr1_to_chr23_start0_endNone_run0_p0-0_ab_corr0-5_vb0-25_length2/ldscores/*[0-9].l2.ldscore.gz"
ldcolnames = ["CHR", "SNP", "BP", "L2"]
ldscores= ld.read_ldscores(ldscore_path, ldcolnames)
# ldscores['BP'] = ldscores['BP'].astype('int')

# Merging LD scores with main Data Frame
# Only rows whose (CHR, SNP) pair appears in both frames are kept; the result
# is then put back into the order recorded in 'ordering'.
main_df = zdata.merge(ldscores, how = "inner", on = ["CHR", "SNP"])
main_df = main_df.sort_values(by = ['ordering'])

# dropping NAs
main_df = main_df.dropna()

Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/simulated_pops_large/from_chr1_to_chr23_start0_endNone_run0_p0-0_ab_corr0-5_vb0-25_length2/phenotype_dir_par_corr_0.5/1/chr_17.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/simulated_pops_large/from_chr1_to_chr23_start0_endNone_run0_p0-0_ab_corr0-5_vb0-25_length2/phenotype_dir_par_corr_0.5/1/chr_5.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/simulated_pops_large/from_chr1_to_chr23_start0_endNone_run0_p0-0_ab_corr0-5_vb0-25_length2/phenotype_dir_par_corr_0.5/1/chr_1.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/simulated_pops_large/from_chr1_to_chr23_start0_endNone_run0_p0-0_ab_corr0-5_vb0-25_length2/phenotype_dir_par_corr_0.5/1/chr_13.hdf5
Reading in file:  /disk/genetics/ukb/alextisyoung/haplotypes/simulated_pops_large/from_chr1_to_chr23_start0_endNone_run0_p0-0_ab_corr0-5_vb0-25_length2/phenotype_dir_par_corr_0.5/1/chr_22.hdf5
Reading in file:  /disk/genetics/ukb/

# Exploring JKSE

In [27]:
from functools import partial
import multiprocessing as mp

# Number of consecutive observations deleted in each jackknife replicate.
blocksize = 100
indices = list(range(model.z.shape[0]))
nblocks = int(np.ceil(len(indices)/blocksize))
# Consecutive chunks of `blocksize` indices; the last chunk may be shorter.
index_blocks = [indices[i * blocksize:(i + 1) * blocksize] for i in range(nblocks)]

# Full-sample estimates from the earlier model.solve() call.
full_est = np.array([output['v1'], output['v2'], output['r']])

# Bind everything except the index block so the worker takes one argument.
jkse_toparallelize = partial(ld.jkse_core, model = model, full_est = full_est, rbounds = True)

num_procs = 4
# The context manager shuts the worker processes down once map() has
# returned, instead of leaving an open pool behind in the kernel.
with mp.Pool(num_procs) as pool:
    estimates_jk = pool.map(jkse_toparallelize, index_blocks)
estimates_jk = np.array(estimates_jk)


Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.53215414]
Initial estimate: [0.51801607 0.49264674 0.532

In [112]:
# Library jackknife routine, called with blocksize 100 and 4 processes.
# The displayed array presumably holds the standard errors of (v1, v2, r) --
# confirm against ld.jkse.
ld.jkse(model, output, blocksize = 100, num_procs=4)

array([0.01234442, 0.01156068, 0.01806745])

# Alternate JKSE

In [5]:
def _check_shape(x, y):
    '''Check that x and y have the correct shapes (for regression jackknives).'''
    if len(x.shape) != 2 or len(y.shape) != 2:
        raise ValueError('x and y must be 2D arrays.')
    if x.shape[0] != y.shape[0]:
        raise ValueError(
            'Number of datapoints in x != number of datapoints in y.')
    if y.shape[1] != 1:
        raise ValueError('y must have shape (n_snp, 1)')
    n, p = x.shape
    if p > n:
        raise ValueError('More dimensions than datapoints.')

    return (n, p)

class Jackknife():

    '''
    Base class for block-jackknife objects.

    The constructor only needs the number of data points, which it reads from
    the shape of ``model.z``, and a definition of the blocks (either a block
    count or explicit separators). The pseudovalue and variance formulas are
    class methods, so they can be used without building an instance.

    Parameters
    ----------
    model : object with a 2D array attribute ``z``
        Only ``model.z.shape`` is read.
    n_blocks : int, optional
        Number of jackknife blocks. Used when ``separators`` is not given.
    separators : sequence of ints, optional
        Block boundaries. The smallest must be 0 and the largest must equal
        the number of data points. Takes precedence over ``n_blocks``.

    Attributes
    ----------
    n_blocks : int
        Number of jackknife blocks.
    p : int
        Number of columns of ``model.z``.
    N : int
        Number of data points (equal to ``model.z.shape[0]``).
    separators : sequence of ints
        Sorted block boundaries.

    Methods
    -------
    jknife(pseudovalues):
        Computes jackknife estimate and variance from the jackknife pseudovalues.
    delete_values_to_pseudovalues(delete_values, est):
        Converts delete values and the whole-data estimate to pseudovalues.
    get_separators(N, n_blocks):
        Returns (approximately) evenly-spaced jackknife block boundaries.

    Raises
    ------
    ValueError :
        If neither ``n_blocks`` nor ``separators`` is given, if the separators
        do not span [0, N], or if there are more blocks than data points.
    '''

    def __init__(self, model, n_blocks=None, separators=None):
        self.N, self.p = model.z.shape
        if separators is not None:
            if max(separators) != self.N:
                raise ValueError(
                    'Max(separators) must be equal to number of data points.')
            if min(separators) != 0:
                # Fixed: this message used to say "Max(separators)".
                raise ValueError('Min(separators) must be equal to 0.')
            self.separators = sorted(separators)
            self.n_blocks = len(separators) - 1
        elif n_blocks is not None:
            self.n_blocks = n_blocks
            self.separators = self.get_separators(self.N, self.n_blocks)
        else:
            raise ValueError('Must specify either n_blocks or separators.')

        if self.n_blocks > self.N:
            raise ValueError('More blocks than data points.')

    @classmethod
    def jknife(cls, pseudovalues):
        '''
        Converts pseudovalues to jackknife estimate and variance.

        Parameters
        ----------
        pseudovalues : array of floats with shape (n_blocks, p)

        Returns
        -------
        jknife_est : array with shape (1, p)
            Jackknifed estimate (mean of the pseudovalues).
        jknife_var : array with shape (1, p)
            Variance of jackknifed estimate (diagonal of jknife_cov).
        jknife_se : array with shape (1, p)
            Standard error of jackknifed estimate, equal to sqrt(jknife_var).
        jknife_cov : array with shape (p, p)
            Covariance matrix of jackknifed estimate.
        '''
        n_blocks = pseudovalues.shape[0]
        # Sample covariance of the pseudovalues, divided by the block count.
        jknife_cov = np.atleast_2d(np.cov(pseudovalues.T, ddof=1) / n_blocks)
        jknife_var = np.atleast_2d(np.diag(jknife_cov))
        jknife_se = np.atleast_2d(np.sqrt(jknife_var))
        jknife_est = np.atleast_2d(np.mean(pseudovalues, axis=0))
        return (jknife_est, jknife_var, jknife_se, jknife_cov)

    @classmethod
    def delete_values_to_pseudovalues(cls, delete_values, est):
        '''
        Converts whole-data estimate and delete values to pseudovalues.

        Parameters
        ----------
        delete_values : array with shape (n_blocks, p)
            Delete values.
        est : array with shape (1, p)
            Whole-data estimate.

        Returns
        -------
        pseudovalues : array with shape (n_blocks, p)
            Pseudovalues, n_blocks * est - (n_blocks - 1) * delete_values.

        Raises
        ------
        ValueError :
            If est.shape != (1, delete_values.shape[1])
        '''
        n_blocks, p = delete_values.shape
        if est.shape != (1, p):
            raise ValueError(
                'Different number of parameters in delete_values than in est.')

        return n_blocks * est - (n_blocks - 1) * delete_values

    @classmethod
    def get_separators(cls, N, n_blocks):
        '''Define evenly-spaced block boundaries: n_blocks + 1 ints from 0 to N.'''
        return np.floor(np.linspace(0, N, n_blocks + 1)).astype(int)

In [106]:
class LstsqJackknifeSlow():

    '''
    Slow block jackknife for a fitted model.

    Delete values are computed directly: for every block the model is
    re-solved on the data with that block removed. Block bookkeeping and the
    pseudovalue/variance formulas are delegated to a Jackknife object (this
    class uses Jackknife by composition; it does not inherit from it).

    Parameters
    ----------
    model : fitted model object
        Must provide the arrays ``z``, ``S``, ``l``, ``u``, the attributes
        ``f`` and ``M``, an ``output`` dict with keys 'v1', 'v2' and 'r', and a
        ``solve`` method whose first return value is a dict with those keys.
    rbounds : bool
        Forwarded unchanged to ``model.solve``.
    n_blocks : int, optional
        Number of jackknife blocks.
    separators : sequence of ints, optional
        Explicit block boundaries (see Jackknife).

    Attributes
    ----------
    est : array with shape (1, 3)
        Whole-data estimate (v1, v2, r) taken from ``model.output``.
    s : sequence of ints
        Block separators.
    delete_values : array with shape (n_blocks, 3)
        Jackknife delete values. NOTE: once __init__ has run, this instance
        attribute shadows the method of the same name on the instance.
    pseudovalues : array with shape (n_blocks, 3)
        Jackknife pseudovalues.
    jknife_est : array with shape (1, 3)
        Jackknifed estimate.
    jknife_var : array with shape (1, 3)
        Variance of jackknifed estimate.
    jknife_se : array with shape (1, 3)
        Standard error of jackknifed estimate, equal to sqrt(jknife_var).
    jknife_cov : array with shape (3, 3)
        Covariance matrix of jackknifed estimate.

    Methods
    -------
    delete_values(model, func, s):
        Compute delete values by re-solving with each block defined by s removed.
    '''

    def __init__(self, model, rbounds=True, n_blocks=None, separators=None):

        # Validates the block definition and supplies the separators.
        mdl = Jackknife(model, n_blocks, separators)

        self.rbounds = rbounds

        func = model.solve
        self.est = np.atleast_2d(np.array([model.output['v1'], model.output['v2'],
                            model.output['r']]))
        self.s = mdl.separators
        # Kept as in the original: the result is stored under the method's own
        # name, so callers read the array from `instance.delete_values`.
        self.delete_values = self.delete_values(model, func, mdl.separators)
        self.pseudovalues = mdl.delete_values_to_pseudovalues(
            self.delete_values, self.est)
        (self.jknife_est, self.jknife_var, self.jknife_se, self.jknife_cov) =\
            mdl.jknife(self.pseudovalues)


    def delete_values(self, model, func, s):
        '''
        Compute delete values by deleting one block at a time.

        Parameters
        ----------
        model : fitted model object
            Source of the arrays z, S, l, u and of f and M.
        func : callable
            Solver to call on each reduced data set (``model.solve`` when
            called from __init__). Its first return value must be a dict with
            keys 'v1', 'v2' and 'r'.
        s : sequence of ints
            Block separators; block i covers rows s[i] to s[i + 1] - 1.

        Returns
        -------
        delete_values : array with shape (n_blocks, 3)
            Estimates (v1, v2, r) obtained with each block left out in turn.
        '''

        # Start every re-fit from the whole-data estimate, passed as a 1-D
        # vector. That is the form the other jackknife cell in this notebook
        # uses, and scipy.optimize.minimize documents x0 as shape (n,)
        # (presumably where est_init ends up -- confirm in sib_ldsc_z).
        est_init = np.ravel(self.est)

        d = []
        for i in range(len(s) - 1):
            # NOTE(review): z, S, l and u have block i removed, but f is passed
            # whole. If f holds one value per row it should be subset the same
            # way -- confirm against sib_ldsc_z.
            d_in = func(z = np.vstack([model.z[0:s[i], ...], model.z[s[i + 1]:, ...]]),
                      S = np.vstack([model.S[0:s[i], ...], model.S[s[i + 1]:, ...]]),
                      l = np.hstack([model.l[0:s[i], ...], model.l[s[i + 1]:, ...]]),
                      u = np.hstack([model.u[0:s[i], ...], model.u[s[i + 1]:, ...]]),
                      f = model.f,
                      M = model.M,
                    printout = False,
                    est_init = est_init,
                    rbounds = self.rbounds)[0]
            dmat = np.array([d_in['v1'], d_in['v2'], d_in['r']])

            d.append(dmat)

        return np.array(d)

In [109]:
# Run the slow delete-one-block jackknife on the fitted model with 100 blocks.
# The model is re-solved once per block, which produces the repeated
# "Initial estimate" lines in the output below.
ldsc_jk = LstsqJackknifeSlow(model, n_blocks=100)

Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial estimate: [[0.52373377 0.48447394 0.54189711]]
Initial es

The LDSC implementation and our implementation of the block jackknife seem to yield the same answers. There are very small differences because each run's estimate is slightly different due to randomization; you can check this by running `model.solve` on the same data twice, which won't give exactly the same answer.

So, as a sanity check, our block jackknife implementation is fine for now.