In [1]:
import argparse
import logging
import os
import subprocess
import sys

import momi
# Route INFO-and-above log records to a log file.
# NOTE(review): "inferecen.log" looks like a typo for "inference.log"; the name is
# kept unchanged so anything that already reads this file keeps working.
logging.basicConfig(level=logging.INFO,
                    filename="inferecen.log")
# The data/ and output/ paths used by later cells are relative, so switch to the
# project root first. ADMIXTURE_GRAPH_DIR optionally overrides the original
# machine-specific default, so the notebook can run elsewhere without edits.
os.chdir(os.environ.get('ADMIXTURE_GRAPH_DIR',
                        '/Users/chichun/Desktop/workspace/AdmixtreGraph2020'))

In [2]:
# Number of sampled alleles for each population. The number of sampled
# individuals per population is the allele count divided by the ploidy.
ploidy = 2
sampled_n_dict = {
    "Outgroup": 4,
    "S1": 8,
    "S2": 8,
    "Adm": 8,
}

In [4]:
# Sample -> population lookup. Sample names have the form
# "<population>_<index>", e.g. Outgroup_0, S1_0, Adm_3, with one entry per
# individual (alleles / ploidy) in each population.
ind2pop = {
    "{}_{}".format(pop_name, idx): pop_name
    for pop_name, n_alleles in sampled_n_dict.items()
    for idx in range(int(n_alleles / ploidy))
}

# Persist the lookup as one tab-separated "sample<TAB>population" line per sample.
with open("data/ind2pop.txt", "w") as f:
    f.writelines("{}\t{}\n".format(sample, population)
                 for sample, population in ind2pop.items())

!cat data/ind2pop.txt

Outgroup_0	Outgroup
Outgroup_1	Outgroup
S1_0	S1
S1_1	S1
S1_2	S1
S1_3	S1
S2_0	S2
S2_1	S2
S2_2	S2
S2_3	S2
Adm_0	Adm
Adm_1	Adm
Adm_2	Adm
Adm_3	Adm


In [29]:
# Replicate identifier and file-name prefix shared by the cells below.
rep = '1'
prefix = "Adm_bottleneck.rep"
# Run momi.read_vcf on this replicate's VCF, passing the sample -> population
# file and a BED file, and writing data/<prefix><rep>.snpAlleleCounts.gz.
# sys.executable is used so the subprocess runs under the kernel's own
# interpreter rather than whichever `python` happens to be first on PATH, and
# an argument list avoids shell parsing of the file names.
read_vcf_args = [
    sys.executable, "-m", "momi.read_vcf",
    "data/{0}{1}.vcf.gz".format(prefix, rep),
    "data/ind2pop.txt",
    "data/{0}{1}.snpAlleleCounts.gz".format(prefix, rep),
    "--bed", "data/{0}{1}.bed".format(prefix, rep),
]
# check=True raises CalledProcessError on a non-zero exit status, so a failed
# conversion stops the notebook instead of being silently ignored. The return
# code is still the cell's displayed value, as before.
subprocess.run(read_vcf_args, check=True).returncode

0

In [30]:
# Run momi.extract_sfs on the allele counts to produce data/sfs_<prefix><rep>.gz.
# NOTE(review): the positional "100" is presumably the number of blocks the SFS
# is split into for resampling — confirm against the momi CLI documentation.
# sys.executable keeps the subprocess on the kernel's interpreter, and
# check=True makes a non-zero exit status raise instead of passing unnoticed.
extract_sfs_args = [
    sys.executable, "-m", "momi.extract_sfs",
    "data/sfs_{0}{1}.gz".format(prefix, rep),
    "100",
    "data/{0}{1}.snpAlleleCounts.gz".format(prefix, rep),
]
# The return code remains the cell's displayed value.
subprocess.run(extract_sfs_args, check=True).returncode

0

In [34]:
# Load the site frequency spectrum file for this prefix/replicate.
sfs_path = "data/sfs_{0}{1}.gz".format(prefix, rep)
sfs = momi.Sfs.load(sfs_path)

In [35]:
# Baseline demographic model without an admixture pulse:
# Adm joins S2 at t_Adm_S2, S2 joins S1 at t_anc, and S1 joins Outgroup at t_out.
# NOTE(review): N_e, gen_time and muts_per_gen are hard-coded; presumably they
# match the settings used to generate the input data — confirm.
no_pulse_model = momi.DemographicModel(N_e=5e4, gen_time=29, muts_per_gen=1.25e-8)
# Attach the observed SFS that the model is fitted against.
no_pulse_model.set_data(sfs)
# random initial value; user-specified lower bound
no_pulse_model.add_time_param("t_Adm_S2", lower=1e4)
no_pulse_model.add_leaf("Adm")
no_pulse_model.add_leaf("S2")
# Merge Adm into S2 at time t_Adm_S2.
no_pulse_model.move_lineages("Adm", "S2", t="t_Adm_S2")
no_pulse_model.add_leaf("S1")
# t_anc has an explicit lower bound and is additionally constrained from below by t_Adm_S2.
no_pulse_model.add_time_param("t_anc", lower=5e4, lower_constraints=["t_Adm_S2"])
# Merge S2 into S1 at time t_anc.
no_pulse_model.move_lineages("S2", "S1", t="t_anc")
no_pulse_model.add_leaf("Outgroup")
# t_out is constrained from below by t_anc.
no_pulse_model.add_time_param("t_out", lower_constraints=["t_anc"])
# Merge S1 into Outgroup at time t_out.
no_pulse_model.move_lineages("S1", "Outgroup", t="t_out")
# Admixture model: copy the baseline model and add a pulse between Adm and an
# unsampled "GhostS1" lineage that later joins S1.
# An earlier variant with a fixed pulse time (t=4.5e4) used to be built first,
# but the name was rebound to a fresh copy before that variant was ever used,
# so the dead construction has been dropped.
add_pulse_model = no_pulse_model.copy()
# Pulse proportion parameter.
add_pulse_model.add_pulse_param("p_pulse")
# Pulse time, constrained from above by the Adm/S2 merge time.
add_pulse_model.add_time_param(
    "t_pulse", upper_constraints=["t_Adm_S2"])
# At t_pulse, move a p_pulse share of Adm lineages to GhostS1.
add_pulse_model.move_lineages(
    "Adm", "GhostS1", t="t_pulse", p="p_pulse")
# GhostS1's merge time is bracketed by t_pulse (below) and t_anc (above).
add_pulse_model.add_time_param(
    "t_ghost", lower_constraints=["t_pulse"],
    upper_constraints=["t_anc"])
# Merge GhostS1 into S1 at t_ghost.
add_pulse_model.move_lineages(
    "GhostS1", "S1", t="t_ghost")

In [36]:
# Copy the baseline model's current parameter values into the pulse model,
# then fit the pulse model with the optimizer capped at 200 iterations.
# NOTE(review): no no_pulse_model.optimize() call is visible in this section,
# so these may be the baseline's initial (unfitted) values — confirm intended.
baseline_params = no_pulse_model.get_params()
add_pulse_model.set_params(baseline_params)
fit_options = {"maxiter": 200}
result = add_pulse_model.optimize(options=fit_options)
result

            fun: 0.5328197700857247
            jac: array([-2.38705715e-13,  3.38538631e-15,  2.50626266e-14, -2.79122987e-07,
       -8.33529821e-08,  1.64160848e-08])
  kl_divergence: 0.5328197700857247
 log_likelihood: -83873.80482999113
        message: 'Converged (|f_n-f_(n-1)| ~= 0)'
           nfev: 68
            nit: 14
     parameters: ParamsDict({'t_Adm_S2': 374624.574536008, 't_anc': 815118.6985259142, 't_out': 2629984.6699497555, 'p_pulse': 0.9999510638449295, 't_pulse': 360606.81782460405, 't_ghost': 665349.7302985078})
         status: 1
        success: True
              x: array([3.64624575e+05, 4.40494124e+05, 1.81486597e+06, 9.92494513e+00,
       3.24746335e+00, 7.10374616e-01])

In [47]:
# Write the fitted parameters to output/<prefix><rep>.txt as tab-separated
# "name<TAB>value" lines, with values formatted to four decimal places.
pars = result.parameters
# Create the output directory if it does not exist yet so open() cannot fail on it.
os.makedirs('output', exist_ok=True)
# Mode 'w' truncates any previous file, which replaces the earlier
# remove-then-append sequence with a single step.
with open(f'output/{prefix}{rep}.txt', 'w') as output:
    for p in pars:
        # Values come from `pars`; the previous `parameters[p]` referenced an
        # undefined name and raised NameError.
        output.write('{}\t{:.4f}\n'.format(p, pars[p]))