In [87]:
import msprime
import numpy as np
import statistics
import math
import allel
import pandas as pd
import statsmodels.api as sm
from scipy import (stats,ndimage)


In [88]:
# total number of demes in the model
d = 36

# side length of the square grid of demes
dim = int(np.sqrt(d))

# 2D grid in which each cell holds the index of the deme at that position
pmat = np.arange(d).reshape((dim, dim))

In [89]:
#define function to generate adjacency matrix
#arguments:
#m = migration rate in one direction
#nd = number of demes
def step_mig_mat(m,nd):
    """Build the migration matrix for a 2D stepping-stone model on a square grid.

    Demes are numbered row by row on a sqrt(nd) x sqrt(nd) grid. Every deme
    exchanges migrants at rate ``m`` with the demes directly above, below,
    left and right of it (no diagonal neighbours, no wrap-around at edges).

    Parameters
    ----------
    m : migration rate in one direction, applied symmetrically
    nd : number of demes; must be a squared number

    Returns
    -------
    (nd, nd) numpy array with ``m`` between adjacent demes and 0 elsewhere,
    including the diagonal.

    Raises
    ------
    ValueError if ``nd`` is not a squared number.
    """
    side_len = math.sqrt(nd)
    if not side_len.is_integer():
        raise ValueError("nd must be a squared number (e.g. 4, 9, 16 ...) for the 2D model")
    side = int(side_len)

    # migration matrix to be filled in; this is the output
    mmat = np.zeros(shape=[nd, nd])

    # (row, column) steps to the top, bottom, left and right neighbours
    offsets = ((-1, 0), (1, 0), (0, -1), (0, 1))

    for deme in range(nd):
        # position of this deme on the row-major grid
        row, col = divmod(deme, side)

        for row_step, col_step in offsets:
            # clamp to the grid: at an edge the "neighbour" is the deme itself
            nb_row = min(max(row + row_step, 0), side - 1)
            nb_col = min(max(col + col_step, 0), side - 1)
            neighbour = nb_row * side + nb_col

            mmat[deme, neighbour] = m
            mmat[neighbour, deme] = m

        # undo any self-migration written by the clamping above
        mmat[deme, deme] = 0

    return mmat


In [90]:
# diploid sample size within each deme
ss = 250

# number of haplotypes per deme (two per diploid individual)
nhaps = 2 * ss

# migration matrix for the d-deme grid, with a one-directional migration rate of 0.05
mig_mat = step_mig_mat(m=0.05, nd=d)

##### define function to simulate genotypes under a stepping stone migration model
def step_geno(ss_each=ss*2,tmove=100,rho=0,random_seed=None):
    """Simulate a tree sequence under the stepping-stone migration model.

    Uses the notebook-level globals ``d`` (number of demes) and ``mig_mat``
    (the d x d migration matrix). The simulation is run with Ne=1e4, a
    mutation rate of 1e-08 and a sequence length of 39000.

    Parameters
    ----------
    ss_each : haploid sample size drawn from each deme.
    tmove : number of generations in the past at which all lineages are moved
        into a single deme (deme ``d-1``) and all migration rates are set to 0.
        This reduces computational time when the number of lineages is much
        smaller than the number of demes, and mimics the migration of an
        ancient population after which structure is established. Defaults to
        100 generations. Pass -9 to skip these events and keep the
        stepping-stone structure throughout.
    rho : recombination rate passed to ``msprime.simulate``.
    random_seed : optional seed forwarded to ``msprime.simulate`` so that a run
        can be reproduced. When None (the default) no seed is passed.

    Returns
    -------
    The tree sequence returned by ``msprime.simulate``.
    """
    # every deme contributes the same number of sampled haplotypes
    population_configurations = [
        msprime.PopulationConfiguration(sample_size=ss_each)
        for _ in range(d)
    ]

    # arguments shared by both the structured-only and the "tmove" scenarios
    sim_kwargs = dict(
        Ne=1e4,
        population_configurations=population_configurations,
        migration_matrix=mig_mat,
        mutation_rate=1e-08,
        recombination_rate=rho,
        length=39000,
    )

    # only forward the seed when one was requested, so the default call is unchanged
    if random_seed is not None:
        sim_kwargs["random_seed"] = random_seed

    if tmove != -9:
        # move all lineages to one population (deme d-1) after tmove generations
        demog = [
            msprime.MassMigration(
                time=tmove,
                source=i,
                destination=d-1,
                proportion=1.0)
            for i in range(d-1)
        ]

        # change migration rate among demes to be 0 from tmove onwards
        demog.append(
            msprime.MigrationRateChange(
                time=tmove,
                rate=0))

        sim_kwargs["demographic_events"] = demog

    return msprime.simulate(**sim_kwargs)

In [91]:
print("simulating genealogies")
# run the stepping-stone simulation without recombination
ts = step_geno(ss_each=2 * ss, tmove=100, rho=0)

print("calculating burden for each gene")
# (start, end) coordinates describing the exon structure of the genes;
# the introns and UTRs are not needed here as they were in SLIM
gene_ranges = [
    (201, 345),
    (7291, 7435),
    (10836, 10980),
    (14381, 14525),
    (17926, 18070),
    (21471, 21615),
    (25016, 25160),
    (28561, 28705),
]


simulating genealogies
calculating burden for each gene


In [92]:
#define vectors, which will be used to select the odd and even haplotypes of an individual
# even and odd haplotype indices across all 2*ss*d sampled haplotypes,
# used to pick out the two haplotypes of each individual
evens = range(0, 2 * ss * d, 2)
odds = range(1, 2 * ss * d, 2)

In [93]:
def calculate_burden(tree_sequence,gene_ranges2,max_daf=0.001):
    """Compute the per-individual burden of rare variants inside the given ranges.

    A variant is counted when its site position lies within any (lower, upper)
    pair of ``gene_ranges2`` (both ends inclusive) and the mean of its genotype
    vector is below ``max_daf``. Consecutive haplotypes (0 and 1, 2 and 3, ...)
    are paired into one diploid individual and their genotypes are added to
    give a dosage; the burden is the sum of dosages over all counted variants.

    Parameters
    ----------
    tree_sequence : object exposing ``variants()`` and ``num_samples``
        (e.g. the tree sequence returned by ``msprime.simulate``).
    gene_ranges2 : iterable of (lower, upper) position pairs.
    max_daf : frequency cut-off below which a variant is treated as rare.
        The mean genotype equals the derived allele frequency only for
        0/1-coded biallelic sites -- assumed here, not checked.

    Returns
    -------
    1D numpy array with one entry per individual (num_samples // 2). When no
    variant qualifies the array is filled with NaN so callers can deal with
    the missing burden later.
    """
    dosage = []
    for variant in tree_sequence.variants():
        position = variant.site.position
        if not any(lower <= position <= upper for (lower, upper) in gene_ranges2):
            continue

        genotypes = variant.genotypes
        if np.mean(genotypes) < max_daf:
            # pair even- and odd-indexed haplotypes directly from the data so the
            # function does not depend on notebook-level index vectors
            dosage.append(genotypes[0::2] + genotypes[1::2])

    # there can be no rare variants at all in the ranges (seen with rho=1e-08);
    # signal that with NaN rather than zeros
    if len(dosage) == 0:
        return np.repeat(np.nan, tree_sequence.num_samples // 2)

    # aggregate across all such variants to get the burden for each individual
    return np.sum(dosage, axis=0)

In [94]:
# burden aggregated over all ranges at once
test = calculate_burden(tree_sequence=ts, gene_ranges2=gene_ranges)

In [95]:
# one row per diploid individual (ss individuals in each of the d demes),
# one column per cumulative set of ranges
burdens = np.empty((ss * d, len(gene_ranges)))
for i in range(len(gene_ranges)):
    # column i aggregates the first i+1 ranges; slicing with [0:i] passed an
    # empty list for i == 0 (an all-NaN column) and never included the last range
    burdens[:, i] = calculate_burden(ts, gene_ranges[:i + 1])

In [96]:
# convert the burden matrix to strings (rebinding the same name) before it is saved as text
burdens=burdens.astype(str)

In [97]:
# write the burden matrix as comma-separated text
# NOTE: the destination is an absolute, machine-specific path
np.savetxt(
    "/Users/Azaidi/gwas_bias2/burden_msprime/burden_msprime_t100_rho0_clustering_1gene.txt",
    burdens,
    fmt="%s",
    delimiter=",",
)

In [98]:
print("simulating genealogies with recombination")
# same model as before, now with a recombination rate of 1e-08
# (this rebinds `ts`, replacing the tree sequence simulated with rho=0)
ts = step_geno(ss_each=2 * ss, tmove=100, rho=1e-08)

simulating genealogies with recombination


In [99]:
# burden aggregated over all ranges at once
test = calculate_burden(ts, gene_ranges)

# one row per diploid individual (ss individuals in each of the d demes),
# one column per cumulative set of ranges
burdens = np.empty((ss * d, len(gene_ranges)))
for i in range(len(gene_ranges)):
    # column i aggregates the first i+1 ranges; slicing with [0:i] passed an
    # empty list for i == 0 (an all-NaN column) and never included the last range
    burdens[:, i] = calculate_burden(ts, gene_ranges[:i + 1])

# convert to strings and write as comma-separated text
# NOTE: the destination is an absolute, machine-specific path
burdens = burdens.astype(str)
np.savetxt("/Users/Azaidi/gwas_bias2/burden_msprime/burden_msprime_t100_rho1e-08_clustering_1gene.txt",burdens,fmt="%s",delimiter=",")