In [2]:
import importlib
import ldsc_72 as ld
import numpy as np
import pandas as pd
import h5py
from numba import jit, njit, prange, vectorize
from helperfuncs import *
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

# reloading modules
importlib.reload(ld)

<module 'ldsc_72' from 'C:\\Users\\Hariharan\\Documents\\git_repos\\SNIPar\\ldsc_reg\\infertheta\\ldsc_72.py'>

# Simulations

In [3]:
np.random.seed(123)

N = int(100)
S_size = int(N/2)
S = np.array([np.array([[.5, 0], [0, .8]]),
    np.array([[0.5, 0], [0, 0.8]])] * S_size )
V = np.identity(2) * 0.5
f = np.random.uniform(0, 1, N)



model = ld.sibreg(S = S, f = f)
model.simdata(V, N, simr = True)

No value for U given. Generating a vector of ones (all SNPs weighted equally)
No value for r given. Generating a vector of ones for r
Simulated LD scores!
Effect Vectors Simulated!


In [4]:
output_matrix, result = model.solve()
print(output_matrix)
print(result)

No initial guess provided.
Making 'optimal' matrix
[[ 0.26164207 -4.67047071]
 [-4.67047071 42.54352214]]
      fun: -529.4063339249765
 hess_inv: <3x3 LbfgsInvHessProduct with dtype=float64>
      jac: array([-14592485.50491885,  -1587275.35175443,   -172652.98886016])
  message: b'ABNORMAL_TERMINATION_IN_LNSRCH'
     nfev: 78
      nit: 4
   status: 2
  success: False
        x: array([ 0.26164207, -4.67047071, 42.54352214])


A curious point here. If I simulate the data with ```r```, I HAVE to put the initial estimate as a zero matrix for it to converge. With the optimal matrix it doesn't converge. This is true only if the true ```V``` vector is large (diagonal elements are 10.0 for example).

This is not true if the diagonal elements are small.

In [151]:
jkse = model.jackknife_se()
print(jkse)

[[30.02415084  9.95263919]
 [ 9.95263919  0.7399902 ]]


# Running LD Score Regression

In [152]:
# Chi sqaure stat for the direct effect
chisq = N * model.theta[:, 0] ** 2

In [153]:
cons = np.ones(N)
reg1 = sm.OLS(endog=chisq, exog=np.array([cons, model.r]).T, 
              missing='drop',
              hasconst = False)

In [154]:
results = reg1.fit()
print(results.summary())

OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.395
Model:                            OLS   Adj. R-squared (uncentered):              0.383
Method:                 Least Squares   F-statistic:                              32.05
Date:                Mon, 24 Aug 2020   Prob (F-statistic):                    1.95e-11
Time:                        21:43:05   Log-Likelihood:                         -780.96
No. Observations:                 100   AIC:                                      1566.
Df Residuals:                      98   BIC:                                      1571.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------

In [178]:
# generating LD score data which LDSC can read

# main ldscore data
ldscores = pd.DataFrame({'CHR' : np.array([22] * N),
                        'SNP' : model.snp,
                        'BP' : model.pos,
                        'L2' : model.r})
ldscores.to_csv("ldscores/22.l2.ldscore.gz",
                compression = 'gzip',
                sep = " ")

# N data
with open("ldscores/22.l2.M", "w") as f:
    f.write(str(N))

with open("ldscores/22.l2.M_5_50", "w") as f:
    f.write(str(N))


# Reading in Data simulated from Actual SNPS

In [133]:
# reading in causal data
file = "C:/Users/Hariharan/Documents/genoecon_work/snipardata/causal.hdf5"

hf = h5py.File(file, 'r')
theta  = hf.get('estimate')[()]
S = hf.get('estimate_covariance')[()]
f = hf.get('freqs')[()]

model = ld2.sibreg(S = S, f= f, theta = theta)

No value for U given. Generating a vector of ones (all SNPs weighted equally)
No value for r given. Generating a vector of ones for r


In [135]:
model.solve(est_init = np.zeros((3, 3)))

(array([[1.001001e-10, 0.000000e+00, 0.000000e+00],
        [0.000000e+00, 1.001001e-10, 0.000000e+00],
        [0.000000e+00, 0.000000e+00, 1.001001e-10]]),
       fun: -89275.12272475495
  hess_inv: <6x6 LbfgsInvHessProduct with dtype=float64>
       jac: array([-2.13456477e+08, -3.98672166e+07, -3.85922911e+07, -3.79255614e+08,
        -3.12722864e+08, -3.26042514e+08])
   message: b'ABNORMAL_TERMINATION_IN_LNSRCH'
      nfev: 21
       nit: 0
    status: 2
   success: False
         x: array([1.e-06, 0.e+00, 0.e+00, 1.e-06, 0.e+00, 1.e-06]))

# Experimental

In [93]:
def core_logll_loop(V, N, S, theta, u, r):
    
    Gvec = np.zeros_like(V)
    log_ll = 0
        
    for i in prange(N):


        Si = S[i]
        thetai = theta[i, :]
        ui = u[i]
        ri = r[i]
        

        d, ddash = Si.shape
        assert d == ddash # Each S has to be a square matrix

        # calculate log likelihood
        log_ll += -(d/2) * np.log(2 * np.pi)
        dit_sv = np.linalg.det(Si + ri * V)
        log_ll += -(1/2) * np.log(dit_sv)
        log_ll += -(1/2) * np.trace(np.outer(thetai, thetai) @ np.linalg.inv(Si + ri * V))
        log_ll *= 1/ui

        # calculate gradient
        SV_inv = np.linalg.inv(Si + ri * V)
        G = -(1 / 2) * SV_inv
        G += (1 / 2) * np.dot(SV_inv,np.dot(np.outer(thetai, thetai),SV_inv))
        G *= 1/ui

        Gvec += G
        
    return log_ll, Gvec

In [192]:
@njit
def numba_core_logll_loop(V, N, S, theta, u, r):
    
    Gvec = np.zeros_like(V)
    log_ll = 0
        
    for i in prange(N):


        Si = S[i]
        thetai = theta[i, :]
        ui = u[i]
        ri = r[i]
        

        d, ddash = Si.shape
        assert d == ddash # Each S has to be a square matrix

        # calculate log likelihood
        log_ll += -(d/2) * np.log(2 * np.pi)
        dit_sv = np.linalg.det(Si + ri * V)
        log_ll += -(1/2) * np.log(dit_sv)
        log_ll += -(1/2) * np.trace(np.outer(thetai, thetai) @ np.linalg.inv(Si + ri * V))
        log_ll *= 1/ui

        # calculate gradient
        SV_inv = np.linalg.inv(Si + ri * V)
        G = -(1 / 2) * SV_inv
        G += (1 / 2) * np.dot(SV_inv,np.dot(np.outer(thetai, thetai),SV_inv))
        G *= 1/ui

        Gvec += G
        
    return log_ll, Gvec

In [193]:
def outer_neg_logll_grad(V, theta, S, u, r, loopfunc):
        
    # ============================================ #
    # returns negative log likelihood and negative
    # of the gradient
    # ============================================ #
    
    # Unflatten V into a matrix
    d = S[0].shape[0]
    V = return_to_symmetric(V, d)
    
    N = len(S)

    log_ll, Gvec = loopfunc(V, N, S, theta, u, r)


    Gvec = extract_upper_triangle(Gvec)

    return -log_ll, -Gvec

In [96]:
Vin = extract_upper_triangle(V)

In [105]:
%%timeit 
outer_neg_logll_grad(Vin, model.theta, model.S, model.u, model.r, 
                     loopfunc = core_logll_loop)

13.1 ms ± 188 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [195]:
%%timeit 
outer_neg_logll_grad(Vin, model.theta, model.S, model.u, model.r,
                     loopfunc = numba_core_logll_loop)

801 µs ± 35.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
