In [None]:
import os
import subprocess
import sys

import momi
# Run from the parent directory so that the relative paths used throughout
# this notebook (logs/, data/, output/) resolve against it.
# NOTE: this is not idempotent -- every re-execution of this cell moves one
# directory further up, so restart the kernel before running it again.
os.chdir('..')
import logging
# logging.basicConfig opens the log file right away and fails when its parent
# directory does not exist, so make sure logs/ is there first.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(level=logging.INFO,
                    filename="logs/likelihood_ratio_momi.log")

# Create three-population models
Create four three-population models (populations A, B and C) that differ in their split times

In [10]:
# Base model: shared parameters plus the three leaf populations A, B and C.
model_1 = momi.DemographicModel(N_e=1e4, gen_time=29, muts_per_gen=1.25e-8)
for leaf_name in ("A", "B", "C"):
    model_1.add_leaf(leaf_name)

# Copy the base model before any lineage-move event is attached to it,
# so every model starts from the same three leaves.
model_2, model_3, model_4 = (model_1.copy() for _ in range(3))

# One row per model: (model, time of the A -> B move, time of the B -> C move).
split_schedule = (
    (model_1, 2e4, 3e5),
    (model_2, 2e4, 3e4),
    (model_3, 2e5, 3e5),
    (model_4, 0.999e5, 1e5),  # the two moves are almost simultaneous
)
for demo, t_a_to_b, t_b_to_c in split_schedule:
    demo.move_lineages("A", "B", t=t_a_to_b)
    demo.move_lineages("B", "C", t=t_b_to_c)

In [11]:
def generate_simulated_files(model, vcf_name, n_chrom,
                             recoms_per_gen=1.25e-8,
                             bases_per_locus=int(5e6),
                             ploidy=2,
                             sampled_n_dict=None):
    '''
    Simulate replicate VCF datasets from a demographic model.

    For each replicate ``rep`` in ``1..n_chrom`` this calls
    ``model.simulate_vcf`` with the output prefix
    ``data/{vcf_name}.rep{rep}``. Every replicate is labelled with
    chromosome name "1", and ``force=True`` is always passed.

    model: object providing ``simulate_vcf`` (here a momi demographic model)
    vcf_name (str): base name of the output files inside ``data/``
    n_chrom (int): total number of chromosomes (replicates) to simulate;
        nothing is simulated when it is 0 or negative
    recoms_per_gen (float): forwarded to ``simulate_vcf`` unchanged
    bases_per_locus (int): forwarded to ``simulate_vcf`` as ``length``
    ploidy (int): forwarded to ``simulate_vcf`` unchanged
    sampled_n_dict (dict or None): population name -> sample count,
        forwarded to ``simulate_vcf``; defaults to 6 for each of A, B and C

    Returns None.
    '''
    # Build the default inside the function so no mutable default is shared
    # between calls.
    if sampled_n_dict is None:
        sampled_n_dict = {"A": 6, "B": 6, "C": 6}

    for rep in range(1, n_chrom + 1):
        model.simulate_vcf(
            f"data/{vcf_name}.rep{rep}",
            recoms_per_gen=recoms_per_gen,
            length=bases_per_locus,
            chrom_name="1",
            ploidy=ploidy,
            sampled_n_dict=sampled_n_dict,
            force=True)

In [12]:
# Simulation settings, aligned by position:
# demographic model, output VCF base name, number of replicates.
models = [model_1, model_2, model_3, model_4]
vcfs = ['model_1', 'model_2', 'model_3', 'trifurcation']
n_chrom = [100, 100, 100, 10]

# Only the 'trifurcation' configuration is simulated by this cell;
# every other entry is skipped.
for demo, vcf_name, n_reps in zip(models, vcfs, n_chrom):
    if vcf_name != 'trifurcation':
        continue
    generate_simulated_files(demo, vcf_name, n_reps)

# Read in Data

In [13]:
# Per-population sample counts; each count is divided by the ploidy to get
# the number of individuals listed for that population.
sampled_n_dict = {"A": 6,"B": 6, "C": 6}
ploidy = 2

# a dict mapping samples ("<population>_<index>") to populations
ind2pop = {
    "{}_{}".format(pop, idx): pop
    for pop, n_sampled in sampled_n_dict.items()
    for idx in range(int(n_sampled / ploidy))
}

# Write one tab-separated "<sample>\t<population>" row per sample.
with open("data/3pops_ind2pop.txt", "w") as f:
    f.writelines(f"{ind}\t{pop}\n" for ind, pop in ind2pop.items())

In [14]:
# Dataset to analyse: the cells below build file names as data/<prefix><rep>.*
prefix, rep = 'trifurcation.rep', '1'

In [15]:
# reading in sfs from data
#
# Both momi command-line steps are run with the kernel's own interpreter
# (sys.executable) and check=True, so a failing step raises
# subprocess.CalledProcessError right here instead of being silently ignored
# and surfacing later as a confusing (or stale) SFS load.
dataset = "data/{0}{1}".format(prefix, rep)
sfs_path = "data/sfs_{0}{1}.gz".format(prefix, rep)

# Step 1: VCF + sample-to-population table -> per-SNP allele counts.
subprocess.run(
    [sys.executable, "-m", "momi.read_vcf",
     dataset + ".vcf.gz",
     "data/3pops_ind2pop.txt",
     dataset + ".snpAlleleCounts.gz",
     "--bed", dataset + ".bed"],
    check=True)

# Step 2: allele counts -> SFS file. "100" is the second positional argument
# (presumably the number of blocks -- confirm against the momi documentation).
subprocess.run(
    [sys.executable, "-m", "momi.extract_sfs",
     sfs_path,
     "100",
     dataset + ".snpAlleleCounts.gz"],
    check=True)

sfs = momi.Sfs.load(sfs_path)

# Inference

In [75]:
# Candidate history ((A, B), C): lineages move A -> B at t_A_B,
# then B -> C at t_B_C. Both times are fitted to the observed SFS.
model = momi.DemographicModel(
    N_e=1e4,
    gen_time=29,
    muts_per_gen=1.25e-8,
)
model.set_data(sfs)

for leaf_name in ("A", "B", "C"):
    model.add_leaf(leaf_name)

# Declare both time parameters first (t_A_B is given as a lower constraint
# for t_B_C), then attach the lineage moves that use them.
model.add_time_param("t_A_B")
model.add_time_param("t_B_C", lower_constraints=["t_A_B"])
for source, target, time_param in (("A", "B", "t_A_B"), ("B", "C", "t_B_C")):
    model.move_lineages(source, target, t=time_param)

model.optimize()

# Report the fitted log-likelihood followed by the fitted parameter values.
print(model.log_likelihood(), model.get_params(), sep="\n")

-46665.09665053484
ParamsDict({'t_A_B': 6207.5881465552375, 't_B_C': 292606.3846007623})


In [77]:
# Candidate history ((C, B), A): lineages move C -> B at t_C_B,
# then B -> A at t_B_A. Both times are fitted to the observed SFS.
model = momi.DemographicModel(
    N_e=1e4,
    gen_time=29,
    muts_per_gen=1.25e-8,
)
model.set_data(sfs)

for leaf_name in ("A", "B", "C"):
    model.add_leaf(leaf_name)

# Declare both time parameters first (t_C_B is given as a lower constraint
# for t_B_A), then attach the lineage moves that use them.
model.add_time_param("t_C_B")
model.add_time_param("t_B_A", lower_constraints=["t_C_B"])
for source, target, time_param in (("C", "B", "t_C_B"), ("B", "A", "t_B_A")):
    model.move_lineages(source, target, t=time_param)

model.optimize()

# Report the fitted log-likelihood followed by the fitted parameter values.
print(model.log_likelihood(), model.get_params(), sep="\n")

-50996.763215757004
ParamsDict({'t_C_B': 162241.83168780257, 't_B_A': 162241.83168780257})


In [82]:
# Dump the fitted parameters and the log-likelihood as tab-separated rows.
# NOTE: `model` is reassigned by every inference cell, so this logs whichever
# of them was executed most recently.
with open(f'output/{prefix}{rep}.log.txt', 'w') as logfile:
    param_dict = model.get_params()
    rows = [
        f'parameter\t{name}\t{param_dict[name]:.4f}\n'
        for name in param_dict
    ]
    rows.append(f'log_likelihood\tlog_likelihood\t{model.log_likelihood():.4f}\n')
    logfile.writelines(rows)

## Actually the same model as the previous fit (ignore)

In [78]:
# Candidate history ((C, A), B): lineages move C -> A at t_C_A,
# then A -> B at t_A_B. Both times are fitted to the observed SFS.
model = momi.DemographicModel(
    N_e=1e4,
    gen_time=29,
    muts_per_gen=1.25e-8,
)
model.set_data(sfs)

for leaf_name in ("A", "B", "C"):
    model.add_leaf(leaf_name)

# Declare both time parameters first (t_C_A is given as a lower constraint
# for t_A_B), then attach the lineage moves that use them.
model.add_time_param("t_C_A")
model.add_time_param("t_A_B", lower_constraints=["t_C_A"])
for source, target, time_param in (("C", "A", "t_C_A"), ("A", "B", "t_A_B")):
    model.move_lineages(source, target, t=time_param)

model.optimize()

# Report the fitted log-likelihood followed by the fitted parameter values.
print(model.log_likelihood(), model.get_params(), sep="\n")

-50996.763215757004
ParamsDict({'t_C_A': 162241.8316973647, 't_A_B': 162241.8316973647})
