In [1]:
import helperfuncs as hp
import numpy as np
from scipy.optimize import minimize
from scipy.special import comb
from scipy.misc import derivative
import scipy.stats
from numba import jit, njit, vectorize

# Defining the PDF and the Log Likelihoods

The likelihood for a SNP $i$ is:

$$
l_i = -\frac{d}{2} log (2 \pi) - \frac{1}{2} log ( |I + r_i S_i^{-1/2} V S_i^{-1/2}| ) - \frac{1}{2} z_i^T (I + r_i S_i^{-1/2} V S_i^{-1/2}) ^{-1} z_i
$$

And its derivative:

$$
\frac{dl}{dV} = r_i S^{-1/2} \Sigma_i^{-1} (\Sigma - z_i z_i^T) \Sigma_i^{-1} S^{-1/2}
$$

In [107]:
class sibreg():
    
    def __init__(self, S, z = None, u = None, r = None, f = None):
        
        if S.ndim > 1:
            for s in S:
                n, m = s.shape
                assert n == m

        if z is None:
            print("Warning there is no value for z. Maybe consider simulating it")
        if u is None:
            print("No value for U given. Generating a vector of ones (all SNPs weighted equally)")
            u = np.ones(S.shape[0])
        if r is None:
            print("No value for r given. Generating a vector of ones for r")
            r = np.ones(S.shape[0])
        if f is None:
            print("Warning: No value given for allele frequencies. Some parameters won't be noramlized.")
        
        self.z = None if z is None else z[~np.any(np.isnan(z), axis = 1)]
        self.S = S[~np.any(np.isnan(S), axis = (1, 2))]
        self.u = u[~np.isnan(u)]
        self.r = r[~np.isnan(r)]
        self.f = None if f is None else f[~np.isnan(f)]
    

    def simdata(self, V,  N, simr = False):
        """
        Simulates data for z scores.
        
        Inputs:
        V = varcov matrix of true effects
        N = Number of obs/SNPs to generate
        simr = boolean indicating if we want
                to simulate ldscores
                
        
        Outputs:
        None
        
        - It creates an object within the class
        called z
        """
        
        
        S = self.S
        
        if simr:
            self.r = np.random.uniform(low=1, high=5, size=N)
            print("Simulated LD scores!")
        
        r = self.r

        zhat_vec = np.empty((N, V.shape[1]))
        for i in range(N):
            
            Si = S[i]
            ri = r[i]
            
            V = np.array(V)
            Si = np.array(Si)
            S_inv = np.linalg.inv(np.sqrt(Si))
  
            # get shape of V
            d = V.shape[0]
            zeromat = np.zeros(d)

            # generate true effect vector
            if d > 1:
                sim = np.random.multivariate_normal(zeromat, np.eye(d) + ri * S_inv @ V @ S_inv)
            else:
                sim = np.random.normal(zeromat, np.eye(d) + ri * S_inv @ V @ S_inv)
            
            # Append to vector of effects
            zhat_vec[i, :] = sim
        

        print("Effect Vectors Simulated!")
        
        self.snp = np.arange(1, N+1, 1)
        self.pos = np.arange(1, N+1, 1)
        self.z = zhat_vec
        
    def _log_ll(self, V, z, S, u, r, f):
        
        """
        Returns the log likelihood matrix for a given SNP i as formulated by:
        
        l_i = -\frac{d}{2} log (2 \pi) - \frac{1}{2} log ( |I + r_i S_i^{-1/2} V S_i^{-1/2}| ) -
                \frac{1}{2} z_i^T (I + r_i S_i^{-1/2} V S_i^{-1/2}) ^{-1} z_i
                
        Inputs:
        V = dxd numpy matrix
        z = dx1 numpy matrix
        S = dxd numpy matrix
        u = 1 numpy matrix
        r = 1 numpy matrix
        f = 1 numpy matrix
        
        Outputs:
        logll = 1 dimensional matrix 
        """
        
        S_inv_root = hp.calc_inv_root(S)
        Sigma = np.identity(S.shape[0])+r*np.dot(S_inv_root.dot(V),S_inv_root)
        logdet = np.linalg.slogdet(Sigma)
        Sigma_inv = np.linalg.inv(Sigma)
        z = z.reshape(z.shape[0],1)
        L = logdet[0]*logdet[1]+np.dot(z.T,Sigma_inv.dot(z))
        return L
    
    def _grad_ll_v(self, V, z, S, u, r, f):
        
        """
        Returns the gradient of the log likelihood wrt V for a given SNP i as formulated by:
        
        \frac{dl}{dV} = S^{-1/2} \Sigma_i^{-1} (\Sigma - z_i z_i^T) \Sigma_i^{-1} S^{-1/2}
                
        Inputs:
        V = dxd numpy matrix
        z = dx1 numpy matrix
        S = dxd numpy matrix
        u = 1 numpy matrix
        r = 1 numpy matrix
        f = 1 numpy matrix
        
        Outputs:
        grad_ll_v = dxd matrix 
        """
        
        S_inv_root = hp.calc_inv_root(S)
        Sigma = np.identity(S.shape[0])+r*np.dot(S_inv_root.dot(V),S_inv_root)
        Sigma_inv = np.linalg.inv(Sigma)
        z = z.reshape(z.shape[0],1)
        SSigma_inv = S_inv_root.dot(Sigma_inv)
        g = r * SSigma_inv.dot(np.dot(Sigma-z.dot(z.T),SSigma_inv.T))
        return g
    
    def _num_grad_V(self, V, z, S, u, r, f):
        """
        Returns numerical gradient vector of self._log_ll
        Mostly meant to check if self._grad_ll_v is working
        properly
        
        Inputs:
        V = dxd numpy matrix
        z = dx1 numpy matrix
        S = dxd numpy matrix
        u = 1 numpy matrix
        r = 1 numpy matrix
        f = 1 numpy matrix
        
        Outputs:
        g = dxd matrix 
        """
        
        g = np.zeros(V.shape)
        for i in range(0,V.shape[0]):
            for j in range(0,V.shape[1]):
                dV = np.zeros((V.shape))
                dV[i,j] = 10 ** (-6)
                V_upper = V+dV
                V_lower = V-dV
                g[i,j] = (self._log_ll(V_upper, z, S, u, r, f) - \
                          self._log_ll(V_lower, z, S, u, r, f)) / (2 * 10 ** (-6))
        return g


    def neg_logll_grad(self, V, 
                       z = None, S = None, 
                       u = None, r = None, 
                       f = None,
                       logllfunc = None,
                       gradfunc = None):
        
        """
        Returns the loglikelihood and its gradient wrt V for a given SNP i as formulated by:
        
        l_i = -\frac{d}{2} log (2 \pi) - \frac{1}{2} log ( |I + r_i S_i^{-1/2} V S_i^{-1/2}| ) -
                \frac{1}{2} z_i^T (I + r_i S_i^{-1/2} V S_i^{-1/2}) ^{-1} z_i
        
        and
        
        \frac{dl}{dV} = S^{-1/2} \Sigma_i^{-1} (\Sigma - z_i z_i^T) \Sigma_i^{-1} S^{-1/2}
                
        Inputs:
        V = dxd numpy matrix
        z = dxN numpy matrix
        S = dxd numpy matrix
        u = 1 numpy matrix
        r = 1 numpy matrix
        f = 1 numpy matrix
        logllfunc = function which calculates logll
                    (uses self._log_ll by default)
        gradfunc = function which calculated grad of logll
                    (uses self._grad_ll_v by default)
        
        Outputs:
        -log_ll = 1x1 scalar
        -Gvec = dxd numpy matrix
        """
        
        z = self.z if z is None else z
        S = self.S if S is None else S
        u = self.u if u is None else u
        r = self.r if r is None else r
        f = self.f if f is None else f
        logllfunc = self._log_ll if logllfunc is None else logllfunc
        gradfunc = self._grad_ll_v if gradfunc is None else grandfunc

        # Unflatten V into a matrix
        d = S[0].shape[0]
        V = hp.return_to_symmetric(V, d)
        Gvec = np.zeros((d, d))
        
        N = len(S)
        log_ll = 0
        
        # Normalizing variables
        V_norm = V/N
        for i in range(N):
            
            Si = S[i]
            zi = z[i, :].reshape((d, 1))
            ui = u[i]
            ri = r[i]
            
            
            fi = f[i]  if f is not None else None

            d, ddash = Si.shape
            assert d == ddash # Each S has to be a square matrix
            
            log_ll += logllfunc(V_norm, zi, Si, ui, ri, fi)
            Gvec += gradfunc(V_norm, zi, Si, ui, ri, fi)

        Gvec = hp.extract_upper_triangle(Gvec)
        return -log_ll , -Gvec


    def solve(self,
              z = None, 
              S = None,
              u = None,
              r = None,
              f = None,
              neg_logll_grad = None,
              est_init = None,
              printout = True):
        
        """
        Solves the ldsc problem of infering the V matrix
                
        Inputs:
        z = Nx1 numpy matrix
        S = dxd numpy matrix
        u = 1 numpy matrix
        r = 1 numpy matrix
        f = 1 numpy matrix
        
        Outputs:
        output_matrix = dxd numpy matrix
        result = result of scipy solver 
        """
        
        # inherit parameters from the class if they aren't defined
        z = self.z if z is None else z
        S = self.S if S is None else S
        u = self.u if u is None else u
        r = self.r if r is None else r
        f = self.f if f is None else f
        neg_logll_grad = self.neg_logll_grad if neg_logll_grad is None else neg_logll_grad

        # == Solves our MLE problem == #
        n, m = z.shape
        
        if est_init is not None:
            # Shape of initial varcov guess
            rowstrue = est_init.shape[0] == m
            colstrue = est_init.shape[1] == m

            if rowstrue & colstrue:
                pass
            else:
                if printout == True:
                    print("Warning: Initial Estimate given is not of the proper dimension")
                    print("Making 'optimal' matrix")
                    print("=================================================")
                
                est_init = np.zeros((m, m))
        else:
            if printout == True:
                print("No initial guess provided.")
                print("Making 'optimal' matrix")
                print("=================================================")
            
            est_init = np.zeros((m, m))
            
        
        # exporting for potential later reference
        self.est_init = est_init

        # extract array from est init
        est_init_array = hp.extract_upper_triangle(est_init) 
        
        bounds = hp.extract_bounds(m)     

        result = minimize(
            neg_logll_grad, 
            est_init_array,
            jac = True,
            args = (z, S, u, r, f),
            bounds = bounds,
            method = 'L-BFGS-B'
        )
        
        output_matrix = hp.return_to_symmetric(result.x, m)
        
        # re-normnalizing output matrix
#         output_matrix = output_matrix / n
        
        self.output_matrix = output_matrix
        
        return output_matrix, result 

    def jackknife_se(self,
                  theta  = None, S = None,
                  r = None, u = None,
                  blocksize = 1):

        # Simple jackknife estimator for SE
        # Ref: https://github.com/bulik/ldsc/blob/aa33296abac9569a6422ee6ba7eb4b902422cc74/ldscore/jackknife.py#L231
        # Default value of blocksize = 1 is the normal jackknife

        theta = self.theta if (theta is None) else theta
        S = self.S if (S is None) else S
        r = self.r if (r is None) else r
        u = self.u if (u is None) else u

        
        assert theta.shape[0] == S.shape[0]

        nobs = theta.shape[0]
        
        estimates_jk = []
        
        start_idx = 0
        while True:
            
            end_idx = start_idx + blocksize
            end_idx_cond = end_idx <= theta.shape[0]
            
            # remove blocks of observations

            vars_jk = []

            for var in [theta, S, r, u]:

                var_jk = delete_obs_jk(var, start_idx, end_idx,
                                       end_idx_cond)
                vars_jk.append(var_jk)
            
            if start_idx < theta.shape[0]:
                # Get our estimate
                output_matrix, _ = self.solve(theta = vars_jk[0],
                                              S = vars_jk[1],
                                              r = vars_jk[2],
                                              u = vars_jk[3],
                                              printout = False,
                                              est_init = self.est_init)

                estimates_jk.append(output_matrix)

                start_idx += blocksize
            else:
                break
            
        estimates_jk = np.array(estimates_jk)
        full_est = self.output_matrix
        
        # calculate pseudo-values
        n_blocks = int(nobs/blocksize)
        pseudovalues = n_blocks * full_est - (n_blocks - 1) * estimates_jk
        
        # calculate jackknife se
        pseudovalues = pseudovalues.reshape((n_blocks, theta.shape[1] * theta.shape[1]))
        jknife_cov = np.cov(pseudovalues.T, ddof=1) / n_blocks
        jknife_var = np.diag(jknife_cov)
        jknife_se = np.sqrt(jknife_var)
    
        jknife_se  = jknife_se.reshape((theta.shape[1], theta.shape[1]))
        
        return jknife_se  


In [108]:
N = 100
S_size = int(N/2)
S = np.array([np.array([[.5, 0], [0, .8]]),
    np.array([[0.5, 0], [0, 0.8]])] * S_size )/N
V = np.identity(2) * 0.5

In [109]:
model = sibreg(S = S)
model.simdata(V, N, simr = True)

No value for U given. Generating a vector of ones (all SNPs weighted equally)
No value for r given. Generating a vector of ones for r
Simulated LD scores!
Effect Vectors Simulated!


In [110]:
Vin = hp.extract_upper_triangle(V)
model.neg_logll_grad(Vin * N)

(array([[-1281.7558863]]), array([-6.82758698, 11.53012504,  3.5616072 ]))

In [113]:
# Testing derivatives
aderiv = model._grad_ll_v(V, model.z[0, :], model.S[0], 
                 model.u[0], model.r[0], 
                 model.f)

nderiv = model._num_grad_V(V, model.z[0, :], model.S[0], 
                 model.u[0], model.r[0], 
                 model.f)

np.allclose(aderiv, nderiv)

True

In [114]:
# solving
output, result = model.solve(est_init = V*N)
print(result)

      fun: array([[-82059.8819275]])
 hess_inv: <3x3 LbfgsInvHessProduct with dtype=float64>
      jac: array([1.14884943e+08, 1.72073068e+08, 2.57728481e+08])
  message: b'ABNORMAL_TERMINATION_IN_LNSRCH'
     nfev: 53
      nit: 3
   status: 2
  success: False
        x: array([ 58.56173418, -39.16041368,  25.99068745])


In [45]:
# palying

n=100
scale = -3
S = np.array([[10**scale,0.5*10**scale],[0.5*10**scale,10**scale]])
V = np.array([[10**(scale-1),0.5*10**(scale-1)],[0.5*10**(scale-1),10**(scale-1)]])
z = np.random.multivariate_normal(np.zeros((2)),S+V,size=n)

In [46]:
S = np.array([S.tolist()] * n)

In [47]:
def calc_inv_root(S):
    S_eig = np.linalg.eig(S)
    l = np.zeros(S.shape)
    np.fill_diagonal(l,np.power(S_eig[0],-0.5))
    S_inv_root = S_eig[1].dot(np.dot(l,S_eig[1].T)) 
    return S_inv_root

def likelihood(z,S,V):
    S_inv_root = calc_inv_root(S)
    Sigma = np.identity(S.shape[0])+np.dot(S_inv_root.dot(V),S_inv_root)
    logdet = np.linalg.slogdet(Sigma)
    Sigma_inv = np.linalg.inv(Sigma)
    z = z.reshape(z.shape[0],1)
    L = logdet[0]*logdet[1]+np.dot(z.T,Sigma_inv.dot(z))
    return L

In [48]:
likelihood(z[0], S[0], V)

array([[0.1917612]])