In [1]:
import msprime, tskit
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import Fst_ts_thread_interval as F

intro

In [2]:
sample_size = 4 #number of samples per deme

mig = 3.8866e-7
seqLength = 32e3 
recr = 1.84675e-8
Ne0 = 2.3241e6
Ne1 = 9.8922e5 
splitT = 4.8580e6
mu = 1.9e-9

population_configurations = [
    msprime.PopulationConfiguration(sample_size=sample_size, initial_size=Ne0),
    msprime.PopulationConfiguration(sample_size=sample_size, initial_size=Ne1),
    ]
    
#demographic events: specify in the order they occur backwards in time
demographic_events = [
    msprime.PopulationParametersChange(time=splitT, initial_size=Ne0, population_id=0),
    msprime.MassMigration(time=splitT, source=1, destination=0, proportion=1.0),
    ]

In [None]:
replicates = msprime.simulate(
        num_replicates = 1,
        length = seqLength, 
        recombination_rate = recr,
        population_configurations = population_configurations,
        demographic_events = demographic_events,
        migration_matrix = [[0,0],
                            [mig,0]],
        mutation_rate = mu)

In [None]:
for index, ts in enumerate(replicates):
    msprime.mutate(ts, rate=mu, keep=True)
    with open('sim{}.vcf'.format(str(index)), 'w') as vcf_file:
        ts.write_vcf(vcf_file, ploidy=2)
    ts.dump('sim{}.trees'.format(str(index)))

or run all this in in script that can be run on multiple cores
change parametes within sim_chuncks.py as desired

In [None]:
%run sim_chuncks.py

run tskit Fst scan and
analyse output using script as provided in the Fst_istogram notebook

checking the impact of recombination rate variation

hapmap is a dataframe with recombination rate estimates (cM/Mb) for our case study
We can draw a random sample of those values, run the msprime simulation again and compare this to the outcome we obtained without recombination rate variation

In [5]:
hapmap = pd.read_csv('hapmap.csv')
hapmap.head()

Unnamed: 0,Start,cM/Mb,cM,Mb,relative length,recombination
0,1,0.0,0.0,1.352503,0.080487,0.0
1,1352504,11.418321,0.401,0.035119,0.00209,3e-06
2,1387623,84.486971,0.415,0.004912,0.000292,0.000172
3,1392535,17.491938,0.857,0.048994,0.002916,4e-06
4,1441529,67.515636,0.896,0.013271,0.00079,5.1e-05


In [None]:
#simulate replicates with different recombination rates cfr. windows 
#specify a distribution to draw values from
#choice can be weighted by the length of each interval, not necessary when windowwise estimates are given

recomb = np.random.choice(hapmap['recombination'],size=20, p=hapmap['relative length'])

replicates = [msprime.simulate(
        length = seqLength, 
        recombination_rate = rate,
        population_configurations = population_configurations,
        demographic_events = demographic_events,
        migration_matrix = [[0,0],
                            [mig,0]],
        mutation_rate = mu) 
for rate in recomb]

In [None]:
#results for when half the chromosome is simulated in its entirety 

building a secondary contact model

In [14]:
splitT = 4.8580e6
secT = 2e3 #moment of secondary contact
proportion =0.1 #proportion of ancestral population mass migrating into the population that split of at time T

population_configurations = [
    msprime.PopulationConfiguration(sample_size=sample_size, initial_size=Ne0),
    msprime.PopulationConfiguration(sample_size=sample_size, initial_size=Ne1),
    ]
    
#demographic events: specify in the order they occur backwards in time
demographic_events = [
    msprime.MassMigration(time=secT, source=1, destination=0, proportion=proportion),
    msprime.PopulationParametersChange(time=splitT, initial_size=Ne0, population_id=0),
    msprime.MassMigration(time=splitT, source=1, destination=0, proportion=1.0)
    ]

reps=20
replicates = msprime.simulate(
        num_replicates = 20,
        length = seqLength, 
        recombination_rate = recr,
        population_configurations = population_configurations,
        demographic_events = demographic_events)

In [15]:
#this step uses the Fst_script, running 20 replicates takes 20 minutes, clearly we need to hope that Jerome comes 
# up with a more efficient implementation (time cost not due to windowmaking here, but because algorithm does not use)
#the fact that trees along the ts are correlated.
Fst = np.zeros(reps)
for index, ts in enumerate(replicates):
    Fst[index] = F.calculate_Fst(ts, windowsize=32e3)[0][0]

determining windows: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]
calculating tree-wise Fst: 100%|██████████| 13969/13969 [01:00<00:00, 230.38it/s]
recombining tree-wise into window-wise Fst values: 100%|██████████| 1/1 [00:00<00:00, 256.34it/s]
determining windows: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s]
calculating tree-wise Fst: 100%|██████████| 14138/14138 [01:00<00:00, 233.00it/s]
recombining tree-wise into window-wise Fst values: 100%|██████████| 1/1 [00:00<00:00, 245.17it/s]
determining windows: 100%|██████████| 1/1 [00:00<00:00,  1.15it/s]
calculating tree-wise Fst: 100%|█████████▉| 14137/14140 [01:00<00:00, 102.84it/s]
recombining tree-wise into window-wise Fst values: 100%|██████████| 1/1 [00:00<00:00, 240.14it/s]
determining windows: 100%|██████████| 1/1 [00:00<00:00,  1.10it/s]
calculating tree-wise Fst: 100%|██████████| 14670/14670 [01:06<00:00, 220.48it/s]
recombining tree-wise into window-wise Fst values: 100%|██████████| 1/1 [00:00<00:00, 255.81it/s]
determining 