This notebook contains code to measure the burden of deleterious variation from STRs

In [8]:
%pylab inline

import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
from matplotlib import pyplot as plt
import sys
import pandas as pd

sys.path.append("/storage/BonnieH/selection_project/helper_functions")
from Simulation_functions import *

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


# New mutations - based on models

In [9]:
# For each class of STRs, we want:
# - motif/opt allele
# - number of STRs
# - mutation rate
# - mean s per mutation
# Then will sum across all categories

# Read in joint results
joint_results = {}
sistr2_scores_file = open('/storage/BonnieH/selection_project/analysis/SISTR2_scores_by_motif_corrected_num_loci.txt', 'r')
sistr2_scores_file.readline()
for line in sistr2_scores_file:
    info = line.strip().split('\t')
    period = int(info[0])
    optimal_ru = int(info[1])
    motif = info[2]
    num_loci = int(info[3])
    est_a = float(info[4])
    est_b = float(info[5])
    joint_results[(period,optimal_ru,motif,num_loci)] = (est_a,est_b)
sistr2_scores_file.close()

# List contents: mu, beta, p, l, optimal ru for the mu value
# Mutation model parameters for different motifs
motif_info = {}
motif_info['eurodem_prior2_dinuc_e_1kg_euro'] = [10**-5, 0.3, 0.6, 0.15, 6]
motif_info['eurodem_prior2_dinuc_d_1kg_euro'] = [10**-4.6, 0.3, 0.6, 0.135, 6]
motif_info['eurodem_prior2_trinuc_e_1kg_euro'] = [10**-6, 0.3, 0.9, 0.3, 5] 
motif_info['eurodem_prior2_tetranuc_b_1kg_euro'] = [10**-5, 0.3, 0.9, 0.25, 3]
motif_info['eurodem_prior2_tetranuc_c_1kg_euro'] = [10**-5.5, 0.3, 0.9, 0.42, 3]
motif_info['eurodem_prior2_tetranuc_d_1kg_euro'] = [10**-6, 0.3, 0.9, 0.45, 3]

# Mutation model settings for different motifs (inferred by SISTR2)
trinuc_motifs = ['ACC','AGG','AGC','CCG','AAG','AAC','ATC','AAT']
dinuc_motifs  = ['AC','AG','AT'] 
tetranuc_motifs = ['AAGG','AAAT','ATCC','AATC','AAAC','AATG','ACAT','AGAT','AAAG'] 
mut_setting = {}
mut_setting['AC'] = 'eurodem_prior2_dinuc_e_1kg_euro'
mut_setting['AG'] = 'eurodem_prior2_dinuc_e_1kg_euro'
mut_setting['AT'] = 'eurodem_prior2_dinuc_d_1kg_euro'
for mot in trinuc_motifs:
    mut_setting[mot] = 'eurodem_prior2_trinuc_e_1kg_euro'
for mot in ['AAAG', 'AAGG', 'AGAT']:
    mut_setting[mot] = 'eurodem_prior2_tetranuc_b_1kg_euro'
for mot in ['ACAT', 'AAAT']:
    mut_setting[mot] = 'eurodem_prior2_tetranuc_c_1kg_euro'
for mot in ['AATC', 'AATG', 'ATCC', 'AAAC']:
    mut_setting[mot] = 'eurodem_prior2_tetranuc_d_1kg_euro'

    
motifs_ = []
opts_ = []
numloci_ = []
means_ = []
muts_ = []
# Get burden for each category
for elem in joint_results:
    # Get params for this class
    setting_info = mut_setting[elem[2]]
    mut_info = motif_info[setting_info]
    num_loci = elem[3] # target size
    
    # Get mutation rate   
    log_mu_prime = np.log10(mut_info[0])+mut_info[3]*(elem[1] - mut_info[4])
    mu_prime = 10**log_mu_prime
    
    if mu_prime < 10**-8: mu_prime = 10**-8 
    if mu_prime > 10**-3: mu_prime = 10**-3

    # Get mean s value of a new mutation
    # Sum over all possible step sizes
    mean_s = 0
    for i in range(-20, 20):
        if elem[1] + i <= 0: continue # can't have negative repeat copy nmber
        prob_step = GetStepSizeProb(0, i, mut_info[1], mut_info[2])
        sval = joint_results[elem][0]*joint_results[elem][1]*abs(i) # a*b*step
        mean_s += prob_step*sval
    
    motifs_.append(elem[2])
    opts_.append(elem[1])
    numloci_.append(num_loci)
    means_.append(mean_s)
    muts_.append(mu_prime)

data = pd.DataFrame({"motif": motifs_,
             "opt": opts_,
             "numloci": numloci_,
             "mean.s": means_,
             "mu": muts_})

In [14]:
# Get burden

data["burden"] = data.apply(lambda x: 2*x["mu"]*x["mean.s"]*x["numloci"], 1)
data.to_csv("figure4-burden-numbers.csv")
print(np.sum(data["burden"]))

# Summarize overall
num_strs = np.sum(data["numloci"])
mean_s = np.sum(data.apply(lambda x: x["numloci"]*x["mean.s"], 1))/np.sum(data["numloci"])
mean_mu = np.sum(data.apply(lambda x: x["numloci"]*x["mu"], 1))/np.sum(data["numloci"])
burden = 2*mean_mu*num_strs*mean_s
print("Num STRs: %s"%(num_strs))
print("Mean s/mutation: %s"%(mean_s))
print("Mean mu: %s"%(mean_mu))
print("Overall burden: %s"%burden)

0.051146734481617315
Num STRs: 74119
Mean s/mutation: 0.0006540766876531336
Mean mu: 0.00019606154782613472
Overall burden: 0.01900993554167439


# New mutations - based on observed de novos

In [12]:
joint_results2 = {} # motif, opt -> s
for elem in joint_results:
    joint_results2[(elem[2], elem[1])] = joint_results[elem][0]*joint_results[elem][1]

In [13]:
NUMBS = 100

# Load SISTR scores
def GetSEst(x):
    if x["LRT_p_value"]>1:
        return 0
    else: return x["ABC_s_median"]
    
sistr_scores_file = "/storage/BonnieH/selection_project/1000genomes/euro_allele_freqs/SISTR_per_locus_scores.txt"
sistr = pd.read_csv(sistr_scores_file, sep="\t")
sistr["chrom"] = sistr["chrom"].apply(lambda x: "chr"+str(x))
sistr["s.est"] = sistr.apply(GetSEst,1)
sistr["sistr2"] = sistr.apply(lambda x: joint_results2.get((x["motif"], x["optimal_ru"]), None), 1)

# Load all de novos + filter
denovos_file = "/storage/hziaeija/variant_trio/trio_variants/monstr_all_filters/CEU_all_filters.csv"
denovo = pd.read_csv(denovos_file)
denovo = denovo[(denovo["encl_mother"]==0) & (denovo["encl_father"]==0) & (denovo["encl_child"]>=5)]
denovo["start"] = denovo["pos"]

def BootstrapSVals(all_svals, all_svals_2):
    bs_svals = []
    bs_svals_2 = []
    for i in range(len(all_svals)):
        ind = random.randint(0, len(all_svals))
        bs_svals.append(all_svals[ind])
        bs_svals_2.append(all_svals_2[ind])
    return bs_svals, bs_svals_2

def GetLowerBS(vals):
    return np.percentile(vals, [2.5])[0] #np.min(vals)

def GetUpperBS(vals):
    return np.percentile(vals, [97.5])[0] #np.max(vals)

for sample in ["NA12864", "NA10865","NA10845" ]:
    denovo_ = denovo[denovo["child"]==sample].copy()
    ddata = pd.merge(denovo_[["chrom","start","newallele","mutsize"]], sistr, on=["chrom","start"])
    ddata = ddata[~np.isnan(ddata["ABC_s_median"]) & ~np.isnan(ddata["sistr2"])]

    ddata["s.new"] = ddata.apply(lambda x: (x["newallele"]-x["optimal_ru"])*x["s.est"], 1)
    ddata["s.old"] = ddata.apply(lambda x: (x["newallele"]-x["mutsize"]-x["optimal_ru"])*x["s.est"], 1)
    ddata["s.delta"] = ddata["s.new"]-ddata["s.old"]

    ddata["s2.new"] = ddata.apply(lambda x: (x["newallele"]-x["optimal_ru"])*x["sistr2"], 1)
    ddata["s2.old"] = ddata.apply(lambda x: (x["newallele"]-x["mutsize"]-x["optimal_ru"])*x["sistr2"], 1)
    ddata["s2.delta"] = ddata["s2.new"]-ddata["s2.old"]

    print ("##### %s #####"%sample)
    print("Num mutations: %s"%ddata.shape[0])
    print("Num mutations neg - perlocus: %s"%ddata[ddata["s.delta"]>0].shape[0])
    print("Num mutations neg - sistr2: %s"%ddata[ddata["s2.delta"]>0].shape[0])

    # Get bootstrap CIs for all mutations, only negative - TODO
    all_svals = list(ddata["s.delta"])
    all_svals_2 = list(ddata["s2.delta"])
    
    burden_bs = []
    burden_bs_2 = []
    burden_neg_bs = []
    burden_neg_bs_2 = []
    
    # Bootstrap sample mutations
    for i in range(NUMBS):
        bs_svals, bs_svals_2 = BootstrapSVals(all_svals, all_svals_2)
        burden_bs.append(np.sum(bs_svals))
        burden_bs_2.append(np.sum(bs_svals_2))
        burden_neg_bs.append(np.sum([item for item in bs_svals if item > 0]))
        burden_neg_bs_2.append(np.sum([item for item in bs_svals_2 if item > 0]))
    
    print("Total burden (perlocus): %s (%s-%s)"%(np.sum(ddata["s.delta"]), GetLowerBS(burden_bs), GetUpperBS(burden_bs)))
    print("Total burden (sistr2): %s (%s-%s)"%(np.sum(ddata["s2.delta"]), GetLowerBS(burden_bs_2), GetUpperBS(burden_bs_2)))
    print("Total burden only neg (perlocus): %s (%s-%s)"%(np.sum(ddata[ddata["s.delta"]>0]["s.delta"]), GetLowerBS(burden_neg_bs), GetUpperBS(burden_neg_bs_2)))
    print("Total burden only neg (sistr2): %s (%s-%s)"%(np.sum(ddata[ddata["s2.delta"]>0]["s2.delta"]), GetLowerBS(burden_neg_bs_2), GetUpperBS(burden_neg_bs_2)))

    #ddata[["chrom","start","period","motif","newallele","mutsize","ABC_s_median","s.est","optimal_ru","s.old","s.new","s.delta","LRT_p_value"]].sort_values("s.delta", ascending=False)

##### NA12864 #####
Num mutations: 71
Num mutations neg - perlocus: 28
Num mutations neg - sistr2: 44
Total burden (perlocus): 0.10155 (0.011636000000000002-0.2417424999999997)
Total burden (sistr2): 0.04259394431339999 (0.0008853290558367549-0.07949122466458347)
Total burden only neg (perlocus): 0.14090000000000005 (0.05164875000000001-0.10839943973711944)
Total burden only neg (sistr2): 0.07377828493375 (0.044411613476670504-0.10839943973711944)
##### NA10865 #####
Num mutations: 84
Num mutations neg - perlocus: 34
Num mutations neg - sistr2: 55
Total burden (perlocus): -0.004800000000000003 (-0.05077700000000002-0.031197999999999938)
Total burden (sistr2): 0.01597992023192 (-0.012781489923731248-0.04794682284641523)
Total burden only neg (perlocus): 0.035710000000000006 (0.023088499999999998-0.06398801371731348)
Total burden only neg (sistr2): 0.042243841245500006 (0.026944061604484246-0.06398801371731348)
##### NA10845 #####
Num mutations: 22
Num mutations neg - perlocus: 9
Num mut