In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats

In [4]:
# Setup; data loading and cleaning
data_file = "/home/davidb/DESTv2_data_paper/15.demographic_inference/output/moments_output_jackknife.tsv"
max_num_params = 8
est_cols = ['est' + str(i) for i in range(max_num_params)]

# Read in data
df = pd.read_csv(data_file, sep='\t', header=None,
                     names=['model', 'pop_of_interest'] + \
                           ['init' + str(i) for i in range(max_num_params)] + \
                           est_cols + \
                           ['upper_bound' + str(i) for i in range(max_num_params)] + \
                           ['log_likelihood', 'collapsed_pop_ll', 'func_calls', 'grad_calls',
                            'maxiter', 'hour_limit',
                            'jackknife_id', 'region'])

# Group by model, as applied to each jackknife replicate of each region
df.groupby(['region', 'model', 'pop_of_interest', 'jackknife_id'],
           dropna=False).mean()

# Get the best fit for each jackknife replicate
df = df.loc[df.groupby(['region', 'model', 'pop_of_interest', 'jackknife_id'],
                       dropna=False)\
            ['log_likelihood'].idxmax()]
df = df[['region', 'model', 'pop_of_interest', 'jackknife_id', 'log_likelihood', 'collapsed_pop_ll'] + \
        est_cols]

def calculate_ci(x):
    return (list(x.nsmallest(2))[0], 
            list(x.nlargest(2))[0])


# df = df.reset_index()
# df.rename(columns={'level_3': 'ci_bound'}, inplace=True)
df = df.replace({'pop_of_interest': {0.0: '0', 
                                     1.0 : '1',
                                     2.0 : '2',
                                     3.0 : '3',
                                     np.nan: 'NA'},
                 'ci_bound': {0: 'lower', 1: 'upper'}})

# Merge model and pop of interest, which do not need to be distinguished now that
# we're only considering models within each region, and not calling general model
# functions.
pop_of_interest_suffices = ['_' + poi if poi != 'NA' else '' for poi in df['pop_of_interest']]
df['model'] = df['model'] + pop_of_interest_suffices
df.drop(columns=['pop_of_interest'], inplace=True)

In [5]:
# Model has a significant effect on collapsed-population log-likelihood
Europe_models = df[df.region == "Europe"].model.unique()
scipy.stats.kruskal(*[df[(df.region == "Europe") & (df.model == model)].collapsed_pop_ll
                      for model in Europe_models]).pvalue

np.float64(1.1659734090792368e-41)

In [7]:
# The split model has the greatest average collapsed-population log-likelihood for
# European models.
[(model, df[(df.region == "Europe") & (df.model == model)].collapsed_pop_ll.mean())
 for model in Europe_models]

[('admixture_0', np.float64(-4.420659018597776)),
 ('admixture_1', np.float64(-4.4199019251863)),
 ('admixture_2', np.float64(-4.422253117397938)),
 ('split', np.float64(-4.406278823502346)),
 ('split_asymmig', np.float64(-4.405005749666792)),
 ('twosplits_0', np.float64(-4.4201607037639175)),
 ('twosplits_1', np.float64(-4.420270875329244)),
 ('twosplits_2', np.float64(-4.419368979506466))]

In [10]:
# The split model gives significantly greater collapsed-population log-likelihood
# than all each other models, indicating that the suture zone does not exist.
for model in Europe_models:
    if model == "split_asymmig":
        continue

    p = scipy.stats.wilcoxon(df[(df.region == "Europe") & (df.model == "split_asymmig")].collapsed_pop_ll,
                             df[(df.region == "Europe") & (df.model == model)].collapsed_pop_ll,
                             alternative='greater').pvalue
    print(model, p)

admixture_0 9.094947017729282e-13
admixture_1 9.094947017729282e-13
admixture_2 9.094947017729282e-13
split 1.8189894035458565e-12
twosplits_0 9.094947017729282e-13
twosplits_1 9.094947017729282e-13
twosplits_2 9.094947017729282e-13


In [11]:
# The two_epoch model gives significantly greater collapsed-population log-likelihood
# than the split model in the mainland samples, indicating that non-Caribbean American
# samples all fall into one cluster.
scipy.stats.wilcoxon(df[(df.region == "mainland") & (df.model == "two_epoch")].collapsed_pop_ll,
                        df[(df.region == "mainland") & (df.model == "split")].collapsed_pop_ll,
                        alternative='greater').pvalue

np.float64(7.025937520666048e-07)

In [12]:
# The two_epoch model gives significantly greater collapsed-population log-likelihood
# than the split model in the Americas, indicating that the Caribbean cluster is 
# not valid.
scipy.stats.wilcoxon(df[(df.region == "Americas") & (df.model == "two_epoch")].collapsed_pop_ll,
                        df[(df.region == "Americas") & (df.model == "split")].collapsed_pop_ll,
                        alternative='greater').pvalue

np.float64(6.897607818245888e-09)

In [17]:
# Model has a significant effect on collapsed-population log-likelihood for Transatlantic
# models.
Transatlantic_models = df[df.region == "Transatlantic_expandedAfr"].model.unique()
scipy.stats.kruskal(*[df[(df.region == "Transatlantic_expandedAfr") & (df.model == model)].collapsed_pop_ll
                      for model in Transatlantic_models]).pvalue

np.float64(1.4705182536706714e-29)

In [18]:
# The admixture_2 model, i.e. Americas as an admixture of Eastern Europe and Zambia,
# has the greatest average collapsed-population log-likelihood for the Transatlantic
# region.
[(model, df[(df.region == "Transatlantic_expandedAfr") & (df.model == model)].log_likelihood.mean())
 for model in Transatlantic_models]

[('admixture_0', np.float64(-6.326867131449948)),
 ('admixture_1', np.float64(-6.298916300128928)),
 ('admixture_2', np.float64(-6.076376404132022)),
 ('admixture_3', np.float64(-6.035658276768975))]

In [21]:
# The admixture_2 model gives significantly greater collapsed-population log-likelihood
# than all each other model, indicating that American flies are better described
# as an admixture of Zambia and Eastern Europe than as any other combination of
# Western or Eastern Europe and Guinea or Zambia.
putative_best_model = "admixture_3"
for model in Transatlantic_models:
    if model == putative_best_model:
        continue

    p = scipy.stats.wilcoxon(df[(df.region == "Transatlantic_expandedAfr") & (df.model == putative_best_model)].collapsed_pop_ll,
                             df[(df.region == "Transatlantic_expandedAfr") & (df.model == model)].collapsed_pop_ll,
                             alternative='greater').pvalue
    print(model, p)

admixture_0 9.094947017729282e-13
admixture_1 9.094947017729282e-13
admixture_2 9.094947017729282e-13


In [26]:
print("95% CIs for parameter estimates for split-asymmig model of Europe:")
print(f"L\t{np.array(df[(df.region == 'Europe') & (df.model == 'split_asymmig')]['collapsed_pop_ll'].quantile([0.025, 0.975]))}")
for i, param_name in enumerate(['nu_EUW', 'nu_EUE', 'T_split', 'mE_to_W', 'mW_to_E']):
    ci = np.array(df[(df.region == "Europe") & (df.model == "split_asymmig")]['est' + str(i)].quantile([0.025, 0.975]))
    print(f"{param_name}\t{ci}")

95% CIs for parameter estimates for split-asymmig model of Europe:
L	[-4.40834645 -4.40234982]
nu_EUW	[0.45125461 0.82471854]
nu_EUE	[0.27163499 0.50206239]
T_split	[0.81020623 1.03809993]
mE_to_W	[ 62.67444704 114.29616777]
mW_to_E	[0.00100921 0.00186056]


In [29]:
print("95% CIs for parameter estimates for two_epoch model of mainland Americas:")
print(f"L\t{np.array(df[(df.region == 'Americas') & (df.model == 'two_epoch')]['collapsed_pop_ll'].quantile([0.025, 0.975]))}")
for i, param_name in enumerate(['nu', 'T']):
    ci = np.array(df[(df.region == "mainland") & (df.model == "two_epoch")]['est' + str(i)].quantile([0.025, 0.975]))
    print(f"{param_name}\t{ci}")

95% CIs for parameter estimates for two_epoch model of mainland Americas:
L	[-2.61207082 -2.60715977]
nu	[1.85794596 2.10597247]
T	[0.02297162 0.02874046]


In [28]:
print("95% CIs for parameter estimates for two_epoch model of the Americas:")
print(f"L\t{np.array(df[(df.region == 'Americas') & (df.model == 'two_epoch')]['collapsed_pop_ll'].quantile([0.025, 0.975]))}")
for i, param_name in enumerate(['nu', 'T']):
    ci = np.array(df[(df.region == "Americas") & (df.model == "two_epoch")]['est' + str(i)].quantile([0.025, 0.975]))
    print(f"{param_name}\t{ci}")

95% CIs for parameter estimates for two_epoch model of the Americas:
L	[-2.61207082 -2.60715977]
nu	[1.58762755 1.8860231 ]
T	[0.01854634 0.03073693]


In [28]:
# p_admix is the proportion of genetic material from the African population in the
# admixture event.
print("95% CIs for parameter estimates for admixture model of the Americas as admixture of EUW and Zambia+Zimbabwe:")
print(f"L\t{np.array(df[(df.region == 'Transatlantic_expandedAfr') & (df.model == 'admixture_3')]['collapsed_pop_ll'].quantile([0.025, 0.975]))}")
for i, param_name in enumerate(['nu_Afr', 'nu_EUE', 'nu_Am', 'T_split', 'T_admix', 'm2', 'm3', 'p_admix']):
    ci = np.array(df[(df.region == "Transatlantic_expandedAfr") & (df.model == "admixture_3")]['est' + str(i)].quantile([0.025, 0.975]))
    print(f"{param_name}\t{ci}")

95% CIs for parameter estimates for admixture model of the Americas as admixture of EUW and Zambia+Zimbabwe:
L	[-6.23550877 -5.8667782 ]
nu_Afr	[0.05321871 0.50061038]
nu_EUE	[0.51206821 1.16831047]
nu_Am	[0.97344772 1.97800206]
T_split	[0.01462968 0.47775615]
T_admix	[0.01846828 0.02982267]
m2	[0.23253592 2.78676757]
m3	[1.00221290e-03 3.02915354e+00]
p_admix	[0.00608448 0.10156844]
