# Supplemental Tables

In [1]:
# Load packages.
import numpy as np
import pandas as pd
# Intialize the pandas preferences.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Print version numbers.
print('numpy', np.__version__)
print('pandas', pd.__version__)

numpy 1.22.3
pandas 1.4.2


In [2]:
# Define a function to convert a number in scientific notation for LaTeX.
def convert_sci_notation_to_latex(value):
    # Convert to scientific notation with 3 decimal places.
    sci_notation = '{:.3e}'.format(value)
    # Split into mantissa and exponent.
    mantissa, exponent = sci_notation.split('e')
    # Convert the exponent to an integer to remove leading zeros,
    exponent = int(exponent)
    # Construct the LaTeX string.
    latex_str = r'${}e^{{{}}}$'.format(mantissa, exponent)
    return latex_str

## S1

In [3]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_muc19_introgressed_tract_frequency_per_super_population.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXX},
cells={halign=c, valign=m},
hlines={solid, 1pt},
vlines={solid, 1pt},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S1. Frequency of introgressed tracts overlapping \textit{MUC19} among 1000 Genomes Project super populations.} \newline The frequency of introgressed tracts---i.e., the number of introgressed tracts normalized by the total number of chromosomes---and the mean tract length stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). Introgressed tracts at \textit{MUC19} are significantly enriched in AMR individuals (Fisher's Exact Test, Odds Ratio: 2.354, \textit{P-value}: $2.144e^{-12}$), with AMR populations exhibiting a higher proportion of introgressed tracts compared to non-AMR populations, excluding AFR (Proportions \textit{Z}-Test, \textit{Z}-statistic: 7.441, \textit{P-value}: $5.011e^{-14}$).  Note that a "---" denotes that there are no introgressed tracts overlapping \textit{MUC19} for that group.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_muc19_introgressed_tract_frequency_per_super_population.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        f'{mat[i, 0]}'+r' & '\
        + r' & '.join((
            '---' if np.isnan(mat[i, j]) else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(1, mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S1. Frequency of introgressed tracts overlapping \textit{MUC19} among 1000 Genomes Project super populations.} \newline The frequency of introgressed tracts---i.e., the number of introgressed tracts normalized by the total number of chromosomes---and the mean tract length stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). Introgressed tracts at \textit{MUC19} are significantly enriched in AMR individuals (Fisher's Exact Test, Odds Ratio: 2.354, \textit{P-value}: $2.144e^{-12}$), with AMR populations exhibiting a higher proportion of introgressed tracts compared to non-AMR populations, excluding AFR (Proportions \textit{Z}-Test, \textit{Z}-statistic: 7.441, \textit{P-value}: $5.011e^{-14}$).  Note that a "---" denotes that there are no introgressed tracts overlapping \textit{MUC19} for that group.

In [4]:
df

Unnamed: 0,Super Population,Total Number of Chromosomes,Number of Introgressed Tracts,Introgressed Tract Frequency,Mean Tract Length
0,AMR,694,127,0.182997,609692.913386
1,SAS,978,145,0.148262,266268.965517
2,EAS,1008,88,0.087302,349102.272727
3,EUR,1006,27,0.026839,467037.037037
4,AFR,1008,0,0.0,


## S2

In [5]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_muc19_introgressed_tract_frequency_per_population.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S2. Frequency of introgressed tracts overlapping \textit{MUC19} among 1000 Genomes Project populations.} \newline The frequency of introgressed tracts---i.e., the number of introgressed tracts normalized by the total number of chromosomes---and the mean tract length for each population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). Note that a "---" denotes that there are no introgressed tracts overlapping \textit{MUC19} for that group.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_muc19_introgressed_tract_frequency_per_population.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join((
            '---' if np.isnan(mat[i, j]) else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(2, mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S2. Frequency of introgressed tracts overlapping \textit{MUC19} among 1000 Genomes Project populations.} \newline The frequency of introgressed tracts---i.e., the number of introgressed tracts normalized by the total number of chromosomes---and the mean tract length for each population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). Note that a "---" denotes that there are no introgressed tracts overlapping \textit{MUC19} for that group.

In [6]:
df

Unnamed: 0,Super Population,Population,Total Number of Chromosomes,Number of Introgressed Tracts,Introgressed Tract Frequency,Mean Tract Length
0,AMR,MXL,128,39,0.304688,710666.666667
1,AMR,PEL,170,42,0.247059,574714.285714
2,AMR,CLM,188,23,0.12234,443304.347826
3,AMR,PUR,208,23,0.110577,668739.130435
4,SAS,BEB,172,34,0.197674,289941.176471
5,SAS,STU,204,33,0.161765,220878.787879
6,SAS,ITU,204,35,0.171569,227800.0
7,SAS,PJL,192,18,0.09375,238277.777778
8,SAS,GIH,206,25,0.121359,368000.0
9,EAS,CHB,206,11,0.053398,413454.545455


## S3

In [7]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_denisovan_specific_snp_denisty_742kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S3. Number of Denisovan-specific SNPs within the focal 742kb \textit{MUC19} region among non-African populations in the 1000 Genomes Project.} \newline The number of Denisovan-specific SNPs---i.e., SNPs rare or absent in African populations ($<1\%$), present in the non-African population ($>1\%$), and uniquely shared with the Denisovan---observed in the focal 742kb region for each non-African population, stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). For each non-African population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the Denisovan-specific SNP genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 742kb windows of comparable effective sequence length where the number of Denisovan-specific SNPs is greater than or equal to what is observed at the focal 742kb region. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_denisovan_specific_snp_denisty_742kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(2, 7))+r' & '\
        + (
            r'$<3.164e^{-4}$' if mat[i, 7][0] == '<' else
            r'$>0.99968$' if mat[i, 7][0] == '>' else
            convert_sci_notation_to_latex(float(mat[i, 7])) if float(mat[i, 7]) < 0.001 else
            '{:.3f}'.format(round(float(mat[i, 7]), 3)).rstrip('0').rstrip('.')
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S3. Number of Denisovan-specific SNPs within the focal 742kb \textit{MUC19} region among non-African populations in the 1000 Genomes Project.} \newline The number of Denisovan-specific SNPs---i.e., SNPs rare or absent in African populations ($<1\%$), present in the non-African population ($>1\%$), and uniquely shared with the Denisovan---observed in the focal 742kb region for each non-African population, stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). For each non-African population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the Denisovan-specific SNP genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 742kb windows of comparable effective sequence length where the number of Denisovan-specific SNPs is greater than or equal to what is observed at the focal 742kb region. A \textit{P-value} less than 0.05 is considered statistically significant.

In [8]:
df

Unnamed: 0,Super Population,Population,Focal 742kb Region (Denisovan-specific SNPs),742kb Non-overlapping Windows $\left( \mu \right)$,742kb Non-overlapping Windows $\left( \sigma \right)$,742kb Non-overlapping Windows $\left( SEM \right)$,742kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,AMR,MXL,135,1.750079,5.623103,0.100031,0.196131,<3.164e-04
1,AMR,PEL,135,1.663398,5.824271,0.103609,0.203148,<3.164e-04
2,AMR,CLM,135,2.001265,6.004427,0.106814,0.209432,<3.164e-04
3,AMR,PUR,135,1.608352,4.908564,0.087319,0.171209,<3.164e-04
4,SAS,BEB,135,4.686808,10.990951,0.19552,0.383359,0.00031635558367605187
5,SAS,STU,135,3.723505,9.674721,0.172106,0.33745,0.00031635558367605187
6,SAS,ITU,135,3.767479,9.997154,0.177841,0.348696,0.00031635558367605187
7,SAS,PJL,135,4.422335,10.531272,0.187343,0.367326,0.00031635558367605187
8,SAS,GIH,135,3.390383,8.953539,0.159276,0.312295,0.00031635558367605187
9,EAS,CHB,135,2.757039,9.942565,0.17687,0.346792,0.0006327111673521037


## S4

In [9]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_denisovan_specific_snp_denisty_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S4. Number of Denisovan-specific SNPs within the focal 72kb \textit{MUC19} region among non-African populations in the 1000 Genomes Project.} \newline The number of Denisovan-specific SNPs---i.e., SNPs rare or absent in African populations ($<1\%$), present in the non-African population ($>1\%$), and uniquely shared with the Denisovan---observed in the focal 72kb region for each non-African population, stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). For each non-African population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the Denisovan-specific SNP genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the number of Denisovan-specific SNPs is greater than or equal to what is observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_denisovan_specific_snp_denisty_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(2, 7))+r' & '\
        + (
            r'$<3.389e^{-5}$' if mat[i, 7][0] == '<' else
            r'$>0.9999966$' if mat[i, 7][0] == '>' else
            convert_sci_notation_to_latex(float(mat[i, 7])) if float(mat[i, 7]) < 0.001 else
            '{:.3f}'.format(round(float(mat[i, 7]), 3)).rstrip('0').rstrip('.')
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S4. Number of Denisovan-specific SNPs within the focal 72kb \textit{MUC19} region among non-African populations in the 1000 Genomes Project.} \newline The number of Denisovan-specific SNPs---i.e., SNPs rare or absent in African populations ($<1\%$), present in the non-African population ($>1\%$), and uniquely shared with the Denisovan---observed in the focal 72kb region for each non-African population, stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). For each non-African population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the Denisovan-specific SNP genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the number of Denisovan-specific SNPs is greater than or equal to what is observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant.

In [10]:
df

Unnamed: 0,Super Population,Population,Focal 72kb Region (Denisovan-specific SNPs),72kb Non-overlapping Windows $\left( \mu \right)$,72kb Non-overlapping Windows $\left( \sigma \right)$,72kb Non-overlapping Windows $\left( SEM \right)$,72kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,AMR,MXL,135,0.178079,1.536302,0.008943,0.017528,<3.389e-05
1,AMR,PEL,135,0.167711,1.503239,0.00875,0.017151,<3.389e-05
2,AMR,CLM,135,0.201321,1.6307,0.009492,0.018605,<3.389e-05
3,AMR,PUR,135,0.161782,1.441084,0.008388,0.016441,<3.389e-05
4,SAS,BEB,135,0.467118,2.449108,0.014256,0.027942,<3.389e-05
5,SAS,STU,135,0.371438,2.120086,0.012341,0.024188,<3.389e-05
6,SAS,ITU,135,0.368491,2.159334,0.012569,0.024636,<3.389e-05
7,SAS,PJL,135,0.432831,2.292055,0.013342,0.02615,<3.389e-05
8,SAS,GIH,135,0.334338,2.064443,0.012017,0.023553,<3.389e-05
9,EAS,CHB,135,0.2804,2.009982,0.0117,0.022932,<3.389e-05


## S5

In [11]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_neanderthal_specific_snp_denisty_742kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S5. Number of Neanderthal-specific SNPs within the focal 742kb \textit{MUC19} region among non-African populations in the 1000 Genomes Project.} \newline The number of Neanderthal-specific SNPs---i.e., SNPs rare or absent in African populations ($<1\%$), present in the non-African population ($>1\%$), and uniquely shared with at least one of the three high-coverage Neanderthals---observed in the focal 742kb region for each non-African population, stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). For each non-African population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the Neanderthal-specific SNP genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 742kb windows of comparable effective sequence length where the number of Neanderthal-specific SNPs is greater than or equal to what is observed at the focal 742kb region. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_neanderthal_specific_snp_denisty_742kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(2, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S5. Number of Neanderthal-specific SNPs within the focal 742kb \textit{MUC19} region among non-African populations in the 1000 Genomes Project.} \newline The number of Neanderthal-specific SNPs---i.e., SNPs rare or absent in African populations ($<1\%$), present in the non-African population ($>1\%$), and uniquely shared with at least one of the three high-coverage Neanderthals---observed in the focal 742kb region for each non-African population, stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). For each non-African population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the Neanderthal-specific SNP genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 742kb windows of comparable effective sequence length where the number of Neanderthal-specific SNPs is greater than or equal to what is observed at the focal 742kb region. A \textit{P-value} less than 0.05 is considered statistically significant.

In [12]:
df

Unnamed: 0,Super Population,Population,Focal 742kb Region (Neanderthal-specific SNPs),742kb Non-overlapping Windows $\left( \mu \right)$,742kb Non-overlapping Windows $\left( \sigma \right)$,742kb Non-overlapping Windows $\left( SEM \right)$,742kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,AMR,MXL,80,35.18317,45.209628,0.804243,1.576892,0.158811
1,AMR,PEL,95,31.069915,42.421685,0.754648,1.479649,0.096805
2,AMR,CLM,95,40.197722,48.246151,0.858261,1.682804,0.146156
3,AMR,PUR,95,35.199937,45.629117,0.811706,1.591523,0.116735
4,SAS,BEB,95,52.221765,56.294081,1.001427,1.963512,0.230623
5,SAS,STU,95,44.078773,52.299243,0.930362,1.824174,0.177475
6,SAS,ITU,93,43.830117,52.051303,0.925951,1.815526,0.185384
7,SAS,PJL,95,50.680165,55.329451,0.984267,1.929867,0.217336
8,SAS,GIH,94,42.474217,51.579525,0.917559,1.799071,0.169567
9,EAS,CHB,82,32.609301,47.829347,0.850846,1.668266,0.168301


## S6

In [13]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_neanderthal_specific_snp_denisty_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S6. Number of Neanderthal-specific SNPs within the focal 72kb \textit{MUC19} region among non-African populations in the 1000 Genomes Project.} \newline The number of Neanderthal-specific SNPs---i.e., SNPs rare or absent in African populations ($<1\%$), present in the non-African population ($>1\%$), and uniquely shared with at least one of the three high-coverage Neanderthals---observed in the focal 72kb region for each non-African population, stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). For each non-African population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the Neanderthal-specific SNP genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the number of Neanderthal-specific SNPs is greater than or equal to what is observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_neanderthal_specific_snp_denisty_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(2, 7))+r' & '\
        + (
            r'$<3.389e^{-5}$' if mat[i, 7][0] == '<' else
            r'$>0.9999966$' if mat[i, 7][0] == '>' else
            convert_sci_notation_to_latex(float(mat[i, 7])) if float(mat[i, 7]) < 0.001 else
            '{:.3f}'.format(round(float(mat[i, 7]), 3)).rstrip('0').rstrip('.')
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S6. Number of Neanderthal-specific SNPs within the focal 72kb \textit{MUC19} region among non-African populations in the 1000 Genomes Project.} \newline The number of Neanderthal-specific SNPs---i.e., SNPs rare or absent in African populations ($<1\%$), present in the non-African population ($>1\%$), and uniquely shared with at least one of the three high-coverage Neanderthals---observed in the focal 72kb region for each non-African population, stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). For each non-African population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the Neanderthal-specific SNP genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the number of Neanderthal-specific SNPs is greater than or equal to what is observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant.

In [14]:
df

Unnamed: 0,Super Population,Population,Focal 72kb Region (Neanderthal-specific SNPs),72kb Non-overlapping Windows $\left( \mu \right)$,72kb Non-overlapping Windows $\left( \sigma \right)$,72kb Non-overlapping Windows $\left( SEM \right)$,72kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,AMR,MXL,4,3.427444,7.030794,0.040925,0.080215,0.26257834999152974
1,AMR,PEL,4,3.024632,6.598628,0.03841,0.075284,0.23452481788920887
2,AMR,CLM,4,3.913569,7.441163,0.043314,0.084897,0.2957818058614264
3,AMR,PUR,4,3.426664,7.039147,0.040974,0.08031,0.2618668473657462
4,SAS,BEB,4,5.110215,8.376788,0.04876,0.095572,0.3750974080975775
5,SAS,STU,4,4.306048,7.852042,0.045706,0.089585,0.31997289513806537
6,SAS,ITU,4,4.29236,7.800725,0.045407,0.088999,0.31966796544130105
7,SAS,PJL,4,4.964527,8.33727,0.04853,0.095121,0.3642554633237337
8,SAS,GIH,4,4.158597,7.775598,0.045261,0.088713,0.3106555988480434
9,EAS,CHB,4,3.178181,6.90596,0.040199,0.078791,0.2376079959342707


## S7

In [15]:
# Load the dataframe.
df = pd.read_csv('./dataframes/u30_afr_b_den_742kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
cols[2] = r'Focal 742kb Region ($U_{AFR:B:DEN}$ $(1\%, 30\%, 100\%)$)'
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXp{3cm}XXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S7. $\boldsymbol{U_{AFR:B:DEN}(1\%, 30\%, 100\%)}$ values for the focal 742kb \textit{MUC19} region among non-African populations in the 1000 Genomes Project.} \newline The $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ values observed in the focal 742kb region for each non-African population, stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). The $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ statistic quantifies the number of sites where: 1) the Denisovan allele is found in the homozygous state, 2) the Denisovan allele is at low frequency ($<1\%$) in the African super population, and 3) the Denisovan allele is at high frequency ($>30\%$) in the non-African population (\textit{B}). For each non-African population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 742kb windows of comparable effective sequence length where the $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ value is greater than or equal to that observed at the focal 742kb region. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/u30_afr_b_den_742kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(2, 7))+r' & '\
        + (
            r'$<3.139e^{-4}$' if mat[i, 7][0] == '<' else
            r'$>0.999686$' if mat[i, 7][0] == '>' else
            convert_sci_notation_to_latex(float(mat[i, 7])) if float(mat[i, 7]) < 0.001 else
            '{:.3f}'.format(round(float(mat[i, 7]), 3)).rstrip('0').rstrip('.')
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S7. $\boldsymbol{U_{AFR:B:DEN}(1\%, 30\%, 100\%)}$ values for the focal 742kb \textit{MUC19} region among non-African populations in the 1000 Genomes Project.} \newline The $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ values observed in the focal 742kb region for each non-African population, stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). The $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ statistic quantifies the number of sites where: 1) the Denisovan allele is found in the homozygous state, 2) the Denisovan allele is at low frequency ($<1\%$) in the African super population, and 3) the Denisovan allele is at high frequency ($>30\%$) in the non-African population (\textit{B}). For each non-African population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 742kb windows of comparable effective sequence length where the $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ value is greater than or equal to that observed at the focal 742kb region. A \textit{P-value} less than 0.05 is considered statistically significant.

In [16]:
df

Unnamed: 0,Super Population,Population $\left( B \right)$,"Focal 742kb Region ($U_{AFR:B:DEN}$ $(1\%, 30\%, 100\%)$)",742kb Non-overlapping Windows $\left( \mu \right)$,742kb Non-overlapping Windows $\left( \sigma \right)$,742kb Non-overlapping Windows $\left( SEM \right)$,742kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,AMR,MXL,136,0.312304,2.77896,0.049241,0.096547,<3.139e-04
1,AMR,PEL,0,0.666981,4.357879,0.077218,0.151403,>0.999686
2,AMR,CLM,0,0.155995,1.829475,0.032417,0.06356,>0.999686
3,AMR,PUR,0,0.124922,1.640308,0.029065,0.056988,>0.999686
4,SAS,BEB,0,0.175455,1.829084,0.03241,0.063547,>0.999686
5,SAS,STU,0,0.129944,1.299017,0.023018,0.045131,>0.999686
6,SAS,ITU,0,0.180791,1.895734,0.033591,0.065862,>0.999686
7,SAS,PJL,0,0.161645,1.771979,0.031398,0.061563,>0.999686
8,SAS,GIH,0,0.174513,1.843787,0.032671,0.064057,>0.999686
9,EAS,CHB,0,0.438481,2.281262,0.040422,0.079256,>0.999686


## S8

In [17]:
# Load the dataframe.
df = pd.read_csv('./dataframes/u30_afr_b_den_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
cols[2] = r'Focal 72kb Region ($U_{AFR:B:DEN}$ $(1\%, 30\%, 100\%)$)'
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXp{3cm}XXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S8. $\boldsymbol{U_{AFR:B:DEN}(1\%, 30\%, 100\%)}$ values for the focal 72kb \textit{MUC19} region among non-African populations in the 1000 Genomes Project.} \newline The $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ values observed in the focal 72kb region for each non-African population, stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). The $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ statistic quantifies the number of sites where: 1) the Denisovan allele is found in the homozygous state, 2) the Denisovan allele is at low frequency ($<1\%$) in the African super population, and 3) the Denisovan allele is at high frequency ($>30\%$) in the non-African population (\textit{B}). For each non-African population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ value is greater than or equal to that observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/u30_afr_b_den_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join('{:.3f}'.format(mat[i, j]).rstrip('0').rstrip('.') for j in range(2, 7))+r' & '\
        + (
            r'$<3.284e^{-5}$' if mat[i, 7][0] == '<' else
            r'$>0.999967$' if mat[i, 7][0] == '>' else
            convert_sci_notation_to_latex(float(mat[i, 7])) if float(mat[i, 7]) < 0.001 else
            '{:.3f}'.format(round(float(mat[i, 7]), 3)).rstrip('0').rstrip('.')
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S8. $\boldsymbol{U_{AFR:B:DEN}(1\%, 30\%, 100\%)}$ values for the focal 72kb \textit{MUC19} region among non-African populations in the 1000 Genomes Project.} \newline The $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ values observed in the focal 72kb region for each non-African population, stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). The $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ statistic quantifies the number of sites where: 1) the Denisovan allele is found in the homozygous state, 2) the Denisovan allele is at low frequency ($<1\%$) in the African super population, and 3) the Denisovan allele is at high frequency ($>30\%$) in the non-African population (\textit{B}). For each non-African population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the $U_{AFR:B:DEN}(1\%, 30\%, 100\%)$ value is greater than or equal to that observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant.

In [18]:
df

Unnamed: 0,Super Population,Population $\left( B \right)$,"Focal 72kb Region ($U_{AFR:B:DEN}$ $(1\%, 30\%, 100\%)$)",72kb Non-overlapping Windows $\left( \mu \right)$,72kb Non-overlapping Windows $\left( \sigma \right)$,72kb Non-overlapping Windows $\left( SEM \right)$,72kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,AMR,MXL,136,0.030045,0.859689,0.004926,0.009656,<3.284e-05
1,AMR,PEL,0,0.066428,0.932168,0.005342,0.01047,>0.999967
2,AMR,CLM,0,0.015564,0.504027,0.002888,0.005661,>0.999967
3,AMR,PUR,0,0.012084,0.456902,0.002618,0.005132,>0.999967
4,SAS,BEB,0,0.017469,0.48246,0.002765,0.005419,>0.999967
5,SAS,STU,0,0.012511,0.379597,0.002175,0.004264,>0.999967
6,SAS,ITU,0,0.016451,0.472907,0.00271,0.005312,>0.999967
7,SAS,PJL,0,0.016221,0.476064,0.002728,0.005347,>0.999967
8,SAS,GIH,0,0.017633,0.486757,0.002789,0.005467,>0.999967
9,EAS,CHB,0,0.040881,0.545617,0.003127,0.006128,>0.999967


## S9

In [19]:
# Load the dataframe.
df = pd.read_csv('./dataframes/q95_u30_afr_b_den_72kb_and_742kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
cols[-2] = r'Focal 72kb Region ($Q95_{AFR:B:DEN}$ $(1\%, 100\%)$)'
cols[-1] = r'Focal 742kb Region ($Q95_{AFR:B:DEN}$ $(1\%, 100\%)$)'
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S9. $\boldsymbol{Q95_{AFR:B:DEN}(1\%, 100\%)}$ values for the focal 72kb and 742kb \textit{MUC19} regions among non-African populations in the 1000 Genomes Project.} \newline The $Q95_{AFR:B:DEN}(1\%, 100\%)$ values observed in the focal 72kb and 742kb regions for each non-African population, stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). The $Q95_{AFR:B:DEN}(1\%, 100\%)$ statistic quantifies $95^{th}$ percentile of the Denisovan alleles in the non-African population (\textit{B}) conditioned on: 1) the Denisovan allele is found in the homozygous state, and 2) the Denisovan allele is at low frequency ($<1\%$) in the African super population.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/q95_u30_afr_b_den_72kb_and_742kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(2, mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S9. $\boldsymbol{Q95_{AFR:B:DEN}(1\%, 100\%)}$ values for the focal 72kb and 742kb \textit{MUC19} regions among non-African populations in the 1000 Genomes Project.} \newline The $Q95_{AFR:B:DEN}(1\%, 100\%)$ values observed in the focal 72kb and 742kb regions for each non-African population, stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). The $Q95_{AFR:B:DEN}(1\%, 100\%)$ statistic quantifies $95^{th}$ percentile of the Denisovan alleles in the non-African population (\textit{B}) conditioned on: 1) the Denisovan allele is found in the homozygous state, and 2) the Denisovan allele is at low frequency ($<1\%$) in the African super population.

In [20]:
df

Unnamed: 0,Super Population,Population $\left( B \right)$,"Focal 72kb Region ($Q95_{AFR:B:DEN}$ $(1\%, 100\%)$)","Focal 742kb Region ($Q95_{AFR:B:DEN}$ $(1\%, 100\%)$)"
0,AMR,MXL,0.304688,0.304688
1,AMR,PEL,0.217647,0.217647
2,AMR,CLM,0.074468,0.074468
3,AMR,PUR,0.086538,0.086538
4,SAS,BEB,0.156977,0.156977
5,SAS,STU,0.112745,0.112745
6,SAS,ITU,0.107843,0.107843
7,SAS,PJL,0.052083,0.052083
8,SAS,GIH,0.097087,0.097087
9,EAS,CHB,0.033981,0.033981


## S10

In [21]:
# Load the dataframe.
df = pd.read_csv('./dataframes/mxl_chb_ceu_pbs_per_region_742kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
mat[:, 0] = np.array([r'$A =$ All MXL Inds.', r'$A =$ MXL Inds. $>50\%$ IAA', r'$A =$ MXL Inds. $<50\%$ IAA'])
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S10. $\boldsymbol{PBS_{A:CHB:CEU}}$ values for the focal 742kb \textit{MUC19} region among different Indigenous American ancestry (IAA) demes in MXL.} \newline The $PBS_{A:CHB:CEU}$ values observed in the focal 742kb region for different IAA-partitioned demes in MXL: all individuals in the MXL population ($n = 64$; \textit{A} = All MXL Individuals), MXL individuals with genome-wide IAA $> 50\%$ ($n = 27$; \textit{A} = MXL Individuals $> 50\%$ IAA), and MXL individuals with genome-wide IAA $< 50\%$ ($n = 37$; \textit{A} = MXL Individuals $< 50\%$ IAA). The $PBS_{A:CHB:CEU}$ statistic uses the logarithmic transformation of pairwise $F_{ST}$ estimates to measure the branch length in deme \textit{A} since its divergence from the two control populations: CHB and CEU. For each IAA-partitioned deme, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the $PBS_{A:CHB:CEU}$  genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 742kb windows of comparable effective sequence length where the $PBS_{A:CHB:CEU}$ value is greater than or equal to that observed at the focal 742kb region. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/mxl_chb_ceu_pbs_per_region_742kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(1))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if mat[i, j] < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(1, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S10. $\boldsymbol{PBS_{A:CHB:CEU}}$ values for the focal 742kb \textit{MUC19} region among different Indigenous American ancestry (IAA) demes in MXL.} \newline The $PBS_{A:CHB:CEU}$ values observed in the focal 742kb region for different IAA-partitioned demes in MXL: all individuals in the MXL population ($n = 64$; \textit{A} = All MXL Individuals), MXL individuals with genome-wide IAA $> 50\%$ ($n = 27$; \textit{A} = MXL Individuals $> 50\%$ IAA), and MXL individuals with genome-wide IAA $< 50\%$ ($n = 37$; \textit{A} = MXL Individuals $< 50\%$ IAA). The $PBS_{A:CHB:CEU}$ statistic uses the logarithmic transformation of pairwise $F_{ST}$ estimates to measure the branch length in deme \textit{A} since its divergence from the two control populations: CHB and CEU. For each IAA-partitioned deme, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the $PBS_{A:CHB:CEU}$  genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 742kb windows of comparable effective sequence length where the $PBS_{A:CHB:CEU}$ value is greater than or equal to that observed at the focal 742kb region. A \textit{P-value} less than 0.05 is considered statistically significant.

In [22]:
df

Unnamed: 0,$PBS_{A:CHB:CEU}$,Focal 742kb Region $\left( PBS \right)$,742kb Non-overlapping Windows $\left( \mu \right)$,742kb Non-overlapping Windows $\left( \sigma \right)$,742kb Non-overlapping Windows $\left( SEM \right)$,742kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,$A =$ All MXL Inds.,0.065842,0.005256,0.011149,0.000195,0.000382,0.003972
1,$A =$ MXL Inds. >50% IAA,0.133193,0.018499,0.023611,0.000413,0.000809,0.003361
2,$A =$ MXL Inds. <50% IAA,0.029065,0.002137,0.007044,0.000123,0.000241,0.009777


## S11

In [23]:
# Load the dataframe.
df = pd.read_csv('./dataframes/mxl_chb_ceu_pbs_per_region_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
mat[:, 0] = np.array([r'$A =$ All MXL Inds.', r'$A =$ MXL Inds. $>50\%$ IAA', r'$A =$ MXL Inds. $<50\%$ IAA'])
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S11. $\boldsymbol{PBS_{A:CHB:CEU}}$ values for the focal 72kb \textit{MUC19} region among different Indigenous American ancestry (IAA) demes in MXL.} \newline The $PBS_{A:CHB:CEU}$ values observed in the focal 72kb region for different IAA-partitioned demes in MXL: all individuals in the MXL population ($n = 64$; \textit{A} = All MXL Individuals), MXL individuals with genome-wide IAA $> 50\%$ ($n = 27$; \textit{A} = MXL Individuals $> 50\%$ IAA), and MXL individuals with genome-wide IAA $< 50\%$ ($n = 37$; \textit{A} = MXL Individuals $< 50\%$ IAA). The $PBS_{A:CHB:CEU}$ statistic uses the logarithmic transformation of pairwise $F_{ST}$ estimates to measure the branch length in deme \textit{A} since its divergence from the two control populations: CHB and CEU. For each IAA-partitioned deme, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the $PBS_{A:CHB:CEU}$  genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the $PBS_{A:CHB:CEU}$ value is greater than or equal to that observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/mxl_chb_ceu_pbs_per_region_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(1))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if mat[i, j] < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(1, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S11. $\boldsymbol{PBS_{A:CHB:CEU}}$ values for the focal 72kb \textit{MUC19} region among different Indigenous American ancestry (IAA) demes in MXL.} \newline The $PBS_{A:CHB:CEU}$ values observed in the focal 72kb region for different IAA-partitioned demes in MXL: all individuals in the MXL population ($n = 64$; \textit{A} = All MXL Individuals), MXL individuals with genome-wide IAA $> 50\%$ ($n = 27$; \textit{A} = MXL Individuals $> 50\%$ IAA), and MXL individuals with genome-wide IAA $< 50\%$ ($n = 37$; \textit{A} = MXL Individuals $< 50\%$ IAA). The $PBS_{A:CHB:CEU}$ statistic uses the logarithmic transformation of pairwise $F_{ST}$ estimates to measure the branch length in deme \textit{A} since its divergence from the two control populations: CHB and CEU. For each IAA-partitioned deme, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the $PBS_{A:CHB:CEU}$  genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the $PBS_{A:CHB:CEU}$ value is greater than or equal to that observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant.

In [24]:
df

Unnamed: 0,$PBS_{A:CHB:CEU}$,Focal 72kb Region $\left( PBS \right)$,72kb Non-overlapping Windows $\left( \mu \right)$,72kb Non-overlapping Windows $\left( \sigma \right)$,72kb Non-overlapping Windows $\left( SEM \right)$,72kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,$A =$ All MXL Inds.,0.127226,0.008982,0.01851,0.000104,0.000204,0.002241
1,$A =$ MXL Inds. >50% IAA,0.21744,0.022913,0.036177,0.000203,0.000398,0.003378
2,$A =$ MXL Inds. <50% IAA,0.071933,0.005169,0.013232,7.4e-05,0.000146,0.006345


## S12

In [25]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_ihs_critical_scores_proportions_742kb.csv.gz')
df = df[df['SNP Set'] == 'All SNPs']
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 3, 4, 5, 6, 7, 8, 9]]
# Intialize the columns.
cols = df.columns.values[[0, 1, 3, 4, 5, 6, 7, 8, 9]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S12. Integrated Haplotype Scores (\textit{iHS}) for the focal 742kb \textit{MUC19} region among 1000 Genomes Project populations.} \newline The normalized $\mid iHS \mid$ scores observed at the focal 742kb region for each population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). For each population, the normalized \textit{iHS} scores were calculated for all SNPs with a minor allele frequency $> 5\%$. For each population, the table reports the total number of \textit{iHS} scores observed, the number of extreme \textit{iHS} scores (i.e., $\mid iHS \mid > 2$), the observed proportion of extreme \textit{iHS} scores (calculated as the number of extreme \textit{iHS} scores normalized by the total number of \textit{iHS} scores), the $99^{th}$ percentile of the proportion of extreme \textit{iHS} scores determined from the genomic background distribution (i.e., non-overlapping 742kb windows with more than 10 SNPs), the total number of 742kb windows in the genomic background distribution, the number of 742kb windows where the observed proportion of extreme \textit{iHS} scores is greater than the genomic background distribution, and the percentile rank of the observed proportion of extreme \textit{iHS} scores (i.e., the percentage of the genomic background distribution that is less than the observed proportion of extreme \textit{iHS} scores). An observed proportion of extreme \textit{iHS} scores greater than the $99^{th}$ percentile of the genomic background distribution is considered evidence of positive selection.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_ihs_critical_scores_proportions_all_snps_742kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(3))+r' & '\
        + r' & '.join((
            '---' if np.isnan(mat[i, j]) else
            '0' if mat[i, j] == 0 else
            convert_sci_notation_to_latex(mat[i, j]) if mat[i, j] < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(3, mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S12. Integrated Haplotype Scores (\textit{iHS}) for the focal 742kb \textit{MUC19} region among 1000 Genomes Project populations.} \newline The normalized $\mid iHS \mid$ scores observed at the focal 742kb region for each population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). For each population, the normalized \textit{iHS} scores were calculated for all SNPs with a minor allele frequency $> 5\%$. For each population, the table reports the total number of \textit{iHS} scores observed, the number of extreme \textit{iHS} scores (i.e., $\mid iHS \mid > 2$), the observed proportion of extreme \textit{iHS} scores (calculated as the number of extreme \textit{iHS} scores normalized by the total number of \textit{iHS} scores), the $99^{th}$ percentile of the proportion of extreme \textit{iHS} scores determined from the genomic background distribution (i.e., non-overlapping 742kb windows with more than 10 SNPs), the total number of 742kb windows in the genomic background distribution, the number of 742kb windows where the observed proportion of extreme \textit{iHS} scores is greater than the genomic background distribution, and the percentile rank of the observed proportion of extreme \textit{iHS} scores (i.e., the percentage of the genomic background distribution that is less than the observed proportion of extreme \textit{iHS} scores). An observed proportion of extreme \textit{iHS} scores greater than the $99^{th}$ percentile of the genomic background distribution is considered evidence of positive selection.

In [26]:
df.iloc[:, [0, 1, 3, 4, 5, 6, 7, 8, 9]]

Unnamed: 0,Super Population,Population,Focal 742kb Region (Total SNPs),Focal 742kb Region (SNPs with $\mid iHS \mid > 2$),Focal 742kb Region (Prop. of SNPs with $\mid iHS \mid > 2$),742kb Non-overlapping Windows ($99^{th}$ Percentile),742kb Non-overlapping Windows (Total SNPs $> 10$),Focal 742kb Region $>$ 742kb Non-overlapping Windows,Focal 742kb Region (Percentile Rank)
0,AMR,MXL,2248,599,0.266459,0.210119,3595.0,3578.0,99.527121
4,AMR,PEL,2158,162,0.07507,0.28009,3591.0,2983.0,83.068783
8,AMR,CLM,2397,53,0.022111,0.237278,3596.0,1165.0,32.397108
12,AMR,PUR,2407,68,0.028251,0.213893,3597.0,1503.0,41.784821
16,SAS,BEB,2177,78,0.035829,0.273717,3595.0,2106.0,58.581363
20,SAS,STU,2197,65,0.029586,0.279665,3596.0,1781.0,49.527253
24,SAS,ITU,1976,23,0.01164,0.259856,3596.0,668.0,18.576196
28,SAS,PJL,2064,32,0.015504,0.272384,3596.0,945.0,26.279199
32,SAS,GIH,2195,24,0.010934,0.262338,3596.0,596.0,16.573971
36,EAS,CHB,1591,48,0.03017,0.233153,3594.0,1795.0,49.944352


## S13

In [27]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_ihs_critical_scores_proportions_72kb.csv.gz')
df = df[df['SNP Set'] == 'All SNPs']
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 3, 4, 5, 6, 7, 8, 9]]
# Intialize the columns.
cols = df.columns.values[[0, 1, 3, 4, 5, 6, 7, 8, 9]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S13. Integrated Haplotype Scores (\textit{iHS}) for the focal 72kb \textit{MUC19} region among 1000 Genomes Project populations.} \newline The normalized $\mid iHS \mid$ scores observed at the focal 742kb region for each population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). For each population, the normalized \textit{iHS} scores were calculated for all SNPs with a minor allele frequency $> 5\%$. For each population, the table reports the total number of \textit{iHS} scores observed, the number of extreme \textit{iHS} scores (i.e., $\mid iHS \mid > 2$), the observed proportion of extreme \textit{iHS} scores (calculated as the number of extreme \textit{iHS} scores normalized by the total number of \textit{iHS} scores), the $99^{th}$ percentile of the proportion of extreme \textit{iHS} scores determined from the genomic background distribution (i.e., non-overlapping 72kb windows with more than 10 SNPs), the total number of 72kb windows in the genomic background distribution, the number of 72kb windows where the observed proportion of extreme \textit{iHS} scores is greater than the genomic background distribution, and the percentile rank of the observed proportion of extreme \textit{iHS} scores (i.e., the percentage of the genomic background distribution that is less than the observed proportion of extreme \textit{iHS} scores). An observed proportion of extreme \textit{iHS} scores greater than the $99^{th}$ percentile of the genomic background distribution is considered evidence of positive selection.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_ihs_critical_scores_proportions_all_snps_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(3))+r' & '\
        + r' & '.join((
            '---' if np.isnan(mat[i, j]) else
            '0' if mat[i, j] == 0 else
            convert_sci_notation_to_latex(mat[i, j]) if mat[i, j] < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(3, mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S13. Integrated Haplotype Scores (\textit{iHS}) for the focal 72kb \textit{MUC19} region among 1000 Genomes Project populations.} \newline The normalized $\mid iHS \mid$ scores observed at the focal 742kb region for each population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). For each population, the normalized \textit{iHS} scores were calculated for all SNPs with a minor allele frequency $> 5\%$. For each population, the table reports the total number of \textit{iHS} scores observed, the number of extreme \textit{iHS} scores (i.e., $\mid iHS \mid > 2$), the observed proportion of extreme \textit{iHS} scores (calculated as the number of extreme \textit{iHS} scores normalized by the total number of \textit{iHS} scores), the $99^{th}$ percentile of the proportion of extreme \textit{iHS} scores determined from the genomic background distribution (i.e., non-overlapping 72kb windows with more than 10 SNPs), the total number of 72kb windows in the genomic background distribution, the number of 72kb windows where the observed proportion of extreme \textit{iHS} scores is greater than the genomic background distribution, and the percentile rank of the observed proportion of extreme \textit{iHS} scores (i.e., the percentage of the genomic background distribution that is less than the observed proportion of extreme \textit{iHS} scores). An observed proportion of extreme \textit{iHS} scores greater than the $99^{th}$ percentile of the genomic background distribution is considered evidence of positive selection.

In [28]:
df.iloc[:, [0, 1, 3, 4, 5, 6, 7, 8, 9]]

Unnamed: 0,Super Population,Population,Focal 72kb Region (Total SNPs),Focal 72kb Region (SNPs with $\mid iHS \mid > 2$),Focal 72kb Region (Prop. of SNPs with $\mid iHS \mid > 2$),72kb Non-overlapping Windows ($99^{th}$ Percentile),72kb Non-overlapping Windows (Total SNPs $> 10$),Focal 72kb Region $>$ 72kb Non-overlapping Windows,Focal 72kb Region (Percentile Rank)
0,AMR,MXL,425,229,0.538824,0.462339,36411.0,36202.0,99.425998
4,AMR,PEL,403,86,0.2134,0.4916,36293.0,34299.0,94.505828
8,AMR,CLM,431,7,0.016241,0.473293,36459.0,19972.0,54.779341
12,AMR,PUR,441,1,0.002268,0.447304,36474.0,11772.0,32.275045
16,SAS,BEB,420,6,0.014286,0.525785,36388.0,21536.0,59.184346
20,SAS,STU,413,0,0.0,0.523772,36376.0,0.0,0.0
24,SAS,ITU,403,0,0.0,0.52032,36383.0,0.0,0.0
28,SAS,PJL,413,0,0.0,0.516129,36409.0,0.0,0.0
32,SAS,GIH,415,0,0.0,0.5,36354.0,0.0,0.0
36,EAS,CHB,168,1,0.005952,0.489533,36193.0,17256.0,47.677728


## S14

In [29]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_short_read_repeat_region_introgressed_tract_frequency_per_super_population.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXX},
cells={halign=c, valign=m},
hlines={solid, 1pt},
vlines={solid, 1pt},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S14. Frequency of introgressed tracts overlapping \textit{MUC19} short-read repeat region among 1000 Genomes Project super populations.} \newline The frequency of introgressed tracts---i.e., the number of introgressed tracts normalized by the total number of chromosomes---and the mean tract length stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). Introgressed tracts at the \textit{MUC19} short-read repeat region (hg19, Chr12:40876395-40885001) are significantly enriched in AMR individuals (Fisher's Exact Test, Odds Ratio: 5.940, \textit{P-value}: $1.953e^{-31}$), with AMR populations exhibiting a higher proportion of introgressed tracts compared to non-AMR populations, excluding AFR (Proportions \textit{Z}-Test, \textit{Z}-statistic: 13.269, \textit{P-value}: $1.742e^{-40}$). Note that a "---" denotes that there are no introgressed tracts overlapping short-read repeat region for that group.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_short_read_repeat_region_introgressed_tract_frequency_per_super_population.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        f'{mat[i, 0]}'+r' & '\
        + r' & '.join((
            '---' if np.isnan(mat[i, j]) else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(1, mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S14. Frequency of introgressed tracts overlapping \textit{MUC19} short-read repeat region among 1000 Genomes Project super populations.} \newline The frequency of introgressed tracts---i.e., the number of introgressed tracts normalized by the total number of chromosomes---stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). Introgressed tracts at the \textit{MUC19} short-read repeat region (hg19, Chr12:40875941-40885367) are significantly enriched in AMR individuals (Fisher's Exact Test, Odds Ratio: 5.940, \textit{P-value}: $1.953e^{-31}$), with AMR populations exhibiting a higher proportion of introgressed tracts compared to non-AMR populations, excluding AFR (Proportions \textit{Z}-Test, \textit{Z}-statistic: 13.269, \textit{P-value}: $1.742e^{-40}$).

In [30]:
df

Unnamed: 0,Super Population,Total Number of Chromosomes,Number of Introgressed Tracts,Introgressed Tract Frequency,Mean Tract Length
0,AMR,694,109,0.157061,547954.12844
1,SAS,978,27,0.027607,283666.666667
2,EAS,1008,58,0.05754,335327.586207
3,EUR,1006,6,0.005964,460500.0
4,AFR,1008,0,0.0,


## S15

In [31]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_short_read_repeat_region_introgressed_tract_frequency_per_population.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S15. Frequency of introgressed tracts overlapping \textit{MUC19} short-read repeat region among 1000 Genomes Project populations.} \newline The frequency of introgressed tracts---i.e., the number of introgressed tracts normalized by the total number of chromosomes---and the mean tract length for each population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). Note that a "---" denotes that there are no introgressed tracts overlapping short-read repeat region for that group.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_short_read_repeat_region_introgressed_tract_frequency_per_population.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join((
            '---' if np.isnan(mat[i, j]) else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(2, mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S15. Frequency of introgressed tracts overlapping \textit{MUC19} short-read repeat region among 1000 Genomes Project populations.} \newline The frequency of introgressed tracts---i.e., the number of introgressed tracts normalized by the total number of chromosomes---and the mean tract length for each population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). Note that a "---" denotes that there are no introgressed tracts overlapping short-read repeat region for that group.

In [32]:
df

Unnamed: 0,Super Population,Population,Total Number of Chromosomes,Number of Introgressed Tracts,Introgressed Tract Frequency,Mean Tract Length
0,AMR,MXL,128,36,0.28125,653250.0
1,AMR,PEL,170,38,0.223529,511368.421053
2,AMR,CLM,188,18,0.095745,381166.666667
3,AMR,PUR,208,17,0.081731,583352.941176
4,SAS,BEB,172,4,0.023256,427750.0
5,SAS,STU,204,3,0.014706,136333.333333
6,SAS,ITU,204,8,0.039216,271250.0
7,SAS,PJL,192,7,0.036458,223142.857143
8,SAS,GIH,206,5,0.024272,361400.0
9,EAS,CHB,206,9,0.043689,363666.666667


## S16

In [33]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_short_read_vntr_population_summary.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S16. Estimated repeat copies of the \textit{MUC19} 30bp variable number tandem repeat from short-read sequencing data among 1000 Genomes Project populations.} \newline Summary of the distribution of repeat copy numbers for \textit{MUC19} per population stratified by super populations: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR).  For each population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the repeat copy distribution are reported. We also report the total number of individuals, the number of individuals with elevated repeat copies (i.e., those with $> 487$ repeat copies), and the proportion of individuals with elevated repeat copies per population. Populations are sorted by their mean repeat copy number.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_short_read_vntr_population_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(2, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S16. Estimated repeat copies of the \textit{MUC19} 30bp variable number tandem repeat from short-read sequencing data among 1000 Genomes Project populations.} \newline Summary of the distribution of repeat copy numbers for \textit{MUC19} per population stratified by super populations: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR).  For each population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the repeat copy distribution are reported. We also report the total number of individuals, the number of individuals with elevated repeat copies (i.e., those with $> 487$ repeat copies), and the proportion of individuals with elevated repeat copies per population.  Populations are sorted by their mean repeat copy number.

In [34]:
df

Unnamed: 0,Super Population,Population,Number of Inds.,Repeat Copies $\left( \mu \right)$,Repeat Copies $\left( \sigma \right)$,Repeat Copies $\left( SEM \right)$,Repeat Copies $\left( \pm CI_{95\%} \right)$,Number of Inds. with Elevated Repeats,Proportion of Inds. with Elevated Repeats
0,AMR,MXL,64,492.84375,147.127731,18.536352,37.041943,32,0.5
1,AMR,PEL,85,409.941176,140.779456,15.360298,30.545637,24,0.282353
2,AMR,CLM,94,398.882979,101.087265,10.482261,20.815693,20,0.212766
3,AMR,PUR,104,391.346154,95.725546,9.432118,18.706382,15,0.144231
4,EAS,CDX,93,365.032258,61.999385,6.463883,12.837829,6,0.064516
5,EAS,KHV,99,361.262626,50.016883,5.052468,10.026459,2,0.020202
6,EUR,TSI,107,358.803738,44.555843,4.327646,8.579979,1,0.009346
7,EUR,CEU,99,357.050505,48.049926,4.853776,9.63216,1,0.010101
8,SAS,GIH,103,356.902913,51.543462,5.103563,10.122893,3,0.029126
9,EAS,JPT,104,354.788462,38.661916,3.809472,7.555189,1,0.009615


## S17

In [35]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_short_read_vntr_superpopulation_summary.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S17. Estimated repeat copies of the \textit{MUC19} 30bp variable number tandem repeat from short-read sequencing data among 1000 Genomes Project super populations.} \newline Summary of the distribution of repeat copy numbers for \textit{MUC19} per super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR).  For each super population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the repeat copy distribution are reported. We also report the total number of individuals, the number of individuals with elevated repeat copies (i.e., those with $> 487$ repeat copies), and the proportion of individuals with elevated repeat copies per super population. Super populations are sorted by their mean repeat copy number.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_short_read_vntr_superpopulation_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(2, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S18. Estimated repeat copies of the \textit{MUC19} 30bp variable number tandem repeat from short-read sequencing data among 1000 Genomes Project super populations.} \newline Summary of the distribution of repeat copy numbers for \textit{MUC19} per super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR).  For each super population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the repeat copy distribution are reported. We also report the total number of individuals, the number of individuals with elevated repeat copies (i.e., those with $> 487$ repeat copies), and the proportion of individuals with elevated repeat copies per super population. Super populations are sorted by their mean repeat copy number.

In [36]:
df

Unnamed: 0,Super Population,Number of Inds.,Repeat Copies $\left( \mu \right)$,Repeat Copies $\left( \sigma \right)$,Repeat Copies $\left( SEM \right)$,Repeat Copies $\left( \pm CI_{95\%} \right)$,Number of Inds. with Elevated Repeats,Proportion of Inds. with Elevated Repeats
0,AMR,347,416.662824,125.383165,6.740641,13.257788,91,0.262248
1,EAS,504,355.172619,49.514147,2.207727,4.337502,11,0.021825
2,EUR,503,351.747515,43.319905,1.933462,3.798674,3,0.005964
3,SAS,489,351.713701,46.596776,2.109337,4.144503,10,0.02045
4,AFR,504,344.96627,37.93793,1.691569,3.323411,3,0.005952


## S18

In [37]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_short_read_vntr_group_comparisons_all_individuals.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 8, 9, 10, 11, 12, 13, 14, 15]]
# Intialize the columns.
cols = df.columns.values[[0, 1, 8, 9, 10, 11, 12, 13, 14, 15]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S18. Distribution of repeat copies for different focal groups among all individuals in the 1000 Genome Project.} \newline Summary of the distribution of repeat copy numbers for individuals with at least one introgressed tract overlapping the repeat region vs individuals with no introgressed tracts overlapping the repeat region and admixed American vs non-admixed American individuals. For each focal group, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the repeat copy distribution are reported. },
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_short_read_vntr_group_comparisons_all_individuals_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(2, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S17. Distribution of repeat copies for different focal groups among all individuals in the 1000 Genome Project.} \newline Summary of the distribution of repeat copy numbers for individuals with at least one introgressed tract overlapping the repeat region vs individuals with no introgressed tracts overlapping the repeat region and admixed American vs non-admixed American individuals. For each focal group, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the repeat copy distribution are reported. 

In [38]:
df.iloc[:, [0, 1, 8, 9, 10, 11, 12, 13, 14, 15]]

Unnamed: 0,Group 1,Group 2,Group 1 Repeat Copies $\left( \mu \right)$,Group 2 Repeat Copies $\left( \mu \right)$,Group 1 Repeat Copies $\left( \sigma \right)$,Group 2 Repeat Copies $\left( \sigma \right)$,Group 1 Repeat Copies $\left( SEM \right)$,Group 2 Repeat Copies $\left( SEM \right)$,Group 1 Repeat Copies $\left( \pm CI_{95\%} \right)$,Group 2 Repeat Copies $\left( \pm CI_{95\%} \right)$
0,All Inds.: $\geq 1$ Overlapping Tract,All Inds.: No Overlapping Tract,517.886486,347.160037,114.083294,39.384456,8.410332,0.847222,16.593085,1.661456
1,All AMR Inds.,All non-AMR Inds.,416.662824,350.8935,125.383165,44.689743,6.740641,0.999543,13.257788,1.960255


## S19

In [39]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_short_read_vntr_group_comparisons_all_individuals.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19]]
# Intialize the columns.
cols = df.columns.values[[0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXXp{1.55cm}XXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S19. Comparisons of the distributions of repeat copies between different focal groups among all individuals in the 1000 Genome Project.} \newline Comparing the distributions of repeat copy numbers between focal groups: individuals with at least one introgressed tract overlapping the repeat region vs individuals with no introgressed tracts overlapping the repeat region and admixed American vs non-admixed American individuals. For each comparison, the total number of individuals, the number of individuals with elevated repeat copies (i.e., those with $> 487$ repeat copies), the proportion of individuals with elevated repeat copies per population, and the results of the Proportions \textit{Z}-Test, and Mann-Whitney \textit{U}-Test is reported. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_short_read_vntr_group_comparisons_all_individuals_comparison.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join((
            convert_sci_notation_to_latex(mat[i, j]) if mat[i, j] < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(2, mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S18. Comparisons of the distributions of repeat copies between different focal groups among all individuals in the 1000 Genome Project.} \newline Comparing the distributions of repeat copy numbers between focal groups: individuals with at least one introgressed tract overlapping the repeat region vs individuals with no introgressed tracts overlapping the repeat region and admixed American vs non-admixed American individuals. For each comparison, the total number of individuals, the number of individuals with elevated repeat copies (i.e., those with $> 487$ repeat copies), the proportion of individuals with elevated repeat copies per population, and the results of the Proportions \textit{Z}-Test, and Mann-Whitney \textit{U}-Test is reported. A \textit{P-value} less than 0.05 is considered statistically significant.

In [40]:
df.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19]]

Unnamed: 0,Group 1,Group 2,Number of Inds. (Group 1),Number of Inds. (Group 2),Number of Inds. with Elevated Repeats (Group 1),Number of Inds. with Elevated Repeats (Group 2),Prop. of Inds. with Elevated Repeats (Group 1),Prop. of Inds. with Elevated Repeats (Group 2),$Z-Statistic$,$P-value$ (Prop. $Z$-Test),$U-Statistic$,$P-value$ (Mann-Whitney $U$-Test)
0,All Inds.: $\geq 1$ Overlapping Tract,All Inds.: No Overlapping Tract,185,2162,102,16,0.551351,0.007401,32.496294,6.014519e-232,375108.5,1.5966649999999999e-87
1,All AMR Inds.,All non-AMR Inds.,347,2000,91,27,0.262248,0.0135,19.5749,1.2656589999999999e-85,445495.0,1.424411e-17


## S20

In [41]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_short_read_vntr_group_comparisons_outlier_individuals.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 7, 8, 9, 10, 11, 12, 13, 14]]
# Intialize the columns.
cols = df.columns.values[[0, 1, 7, 8, 9, 10, 11, 12, 13, 14]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S20. Distribution of repeat copies for different focal groups among individuals with an elevated number of repeat copies in the 1000 Genome Project.} \newline Summary of the distribution of repeat copy numbers for different focal groups of outlier individuals: individuals with at least one introgressed tract overlapping the repeat region vs individuals with no introgressed tracts overlapping the repeat region and admixed American vs non-admixed American individuals. For each focal group, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the repeat copy distribution are reported.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_short_read_vntr_group_comparisons_outlier_individuals_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(2, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S19. Distribution of repeat copies for different focal groups among individuals with an elevated number of repeat copies in the 1000 Genome Project.} \newline Summary of the distribution of repeat copy numbers for different focal groups of outlier individuals: individuals with at least one introgressed tract overlapping the repeat region vs individuals with no introgressed tracts overlapping the repeat region and admixed American vs non-admixed American individuals. For each focal group, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the repeat copy distribution are reported. 

In [42]:
df.iloc[:, [0, 1, 7, 8, 9, 10, 11, 12, 13, 14]]

Unnamed: 0,Group 1,Group 2,Group 1 Repeat Copies $\left( \mu \right)$,Group 2 Repeat Copies $\left( \mu \right)$,Group 1 Repeat Copies $\left( \sigma \right)$,Group 2 Repeat Copies $\left( \sigma \right)$,Group 1 Repeat Copies $\left( SEM \right)$,Group 2 Repeat Copies $\left( SEM \right)$,Group 1 Repeat Copies $\left( \pm CI_{95\%} \right)$,Group 2 Repeat Copies $\left( \pm CI_{95\%} \right)$
0,Outlier Inds.: $\geq 1$ Overlapping Tract,Outlier Inds.: No Overlapping Tract,594.480392,510.6875,92.117011,16.115671,9.165985,4.161048,18.182849,8.869064
1,Outlier AMR Inds.,Outlier non-AMR Inds.,600.956044,523.0,94.367869,33.259919,9.947247,6.522807,19.761942,13.407821


## S21

In [43]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_short_read_vntr_group_comparisons_outlier_individuals.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 2, 3, 4, 5, 6, 15, 16, 17, 18]]
# Intialize the columns.
cols = df.columns.values[[0, 1, 2, 3, 4, 5, 6, 15, 16, 17, 18]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S21. Comparisons of the distributions of repeat copies between different focal groups among individuals with an elevated number of repeat copies in the 1000 Genome Project.} \newline Comparing the distributions of repeat copy numbers between focal groups of outlier individuals: individuals with at least one introgressed tract overlapping the repeat region vs individuals with no introgressed tracts overlapping the repeat region and admixed American vs non-admixed American individuals. For each comparison, the total number of individuals, the number of individuals with elevated repeat copies (i.e., those with $> 487$ repeat copies), the proportion of individuals with elevated repeat copies per population, and the results of the Proportions \textit{Z}-Test, and Mann-Whitney \textit{U}-Test is reported. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_short_read_vntr_group_comparisons_outlier_individuals_comparison.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join((
            convert_sci_notation_to_latex(mat[i, j]) if mat[i, j] < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(2, mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S20. Comparisons of the distributions of repeat copies between different focal groups among individuals with an elevated number of repeat copies in the 1000 Genome Project.} \newline Comparing the distributions of repeat copy numbers between focal groups of outlier individuals: individuals with at least one introgressed tract overlapping the repeat region vs individuals with no introgressed tracts overlapping the repeat region and admixed American vs non-admixed American individuals. For each comparison, the total number of individuals, the number of individuals with elevated repeat copies (i.e., those with $> 487$ repeat copies), the proportion of individuals with elevated repeat copies per population, and the results of the Proportions \textit{Z}-Test, and Mann-Whitney \textit{U}-Test is reported. A \textit{P-value} less than 0.05 is considered statistically significant.

In [44]:
df.iloc[:, [0, 1, 2, 3, 4, 5, 6, 15, 16, 17, 18]]

Unnamed: 0,Group 1,Group 2,Total Number of Inds. with Elevated Repeats,Number of Inds. (Group 1),Number of Inds. (Group 2),Prop. of Inds. (Group 1),Prop. of Inds. (Group 2),$Z-Statistic$,$P-value$ (Prop. $Z$-Test),$U-Statistic$,$P-value$ (Mann-Whitney $U$-Test)
0,Outlier Inds.: $\geq 1$ Overlapping Tract,Outlier Inds.: No Overlapping Tract,118,102,16,0.864407,0.135593,11.196246,2.126889e-29,1412.5,1.39845e-06
1,Outlier AMR Inds.,Outlier non-AMR Inds.,118,91,27,0.771186,0.228814,8.33209,3.971366e-17,1988.0,5.788634e-07


## S22

In [45]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_short_read_vntr_introgressed_tract_correlations.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S22. Correlation between the introgressed ancestry and repeat copies in the 1000 Genome Project.} \newline Spearman's correlation coefificients and associated \textit{P-values} between the number of introgressed tracts overlapping the repeat region and the number of repeat copies for all individuals in the 1000 Genome Project, individuals stratified by super population, and individuals stratified by population. A \textit{P-value} less than 0.05 is considered statistically significant, and a "---" denotes that there are no introgressed tracts overlapping the repeat region for that group.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_short_read_vntr_introgressed_tract_correlations.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(1))+r' & '\
        + r' & '.join((
            '---' if np.isnan(mat[i, j]) else
            convert_sci_notation_to_latex(mat[i, j]) if mat[i, j] < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(1, mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S21. Correlation between the introgressed ancestry and repeat copies in the 1000 Genome Project.} \newline Spearman's correlation coefificients and associated \textit{P-values} between the number of introgressed tracts overlapping the repeat region and the number of repeat copies for all individuals in the 1000 Genome Project, individuals stratified by super population, and individuals stratified by population. A \textit{P-value} less than 0.05 is considered statistically significant, and a "---" denotes that there are no introgressed tracts overlapping the repeat region for that group.

In [46]:
df

Unnamed: 0,Group,Spearman's $\rho$,$P-value$
0,TGP,0.409165,2.022428e-95
1,AFR,,
2,AMR,0.736302,1.741238e-60
3,SAS,0.21573,1.471162e-06
4,EAS,0.531768,3.914002e-38
5,EUR,0.104177,0.01943935
6,LWK,,
7,GWD,,
8,MSL,,
9,ESN,,


## S23

In [47]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_short_read_vntr_amr_ancestry_proportions_summary.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S23. Ancestry component distribution for the \textit{MUC19} short-read repeat region among admixed American populations in 1000 Genomes Project.} \newline Summary of the per-individual ancestry proportions at the \textit{MUC19} short-read repeat region (hg19, Chr12:40876395-40885001) for each of the admixed American populations in the 1000 Genomes project. For each ancestry component per population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the per-individual ancestry proportion distribution are reported.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_short_read_vntr_amr_ancestry_proportions_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(2, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S22. Ancestry component distribution for the \textit{MUC19} short-read repeat region among admixed American populations in 1000 Genomes Project.} \newline Summary of the per-individual ancestry proportions at the \textit{MUC19} short-read repeat region (hg19, Chr12:40875941-40885367) for each of the admixed American populations in the 1000 Genomes project. For each ancestry component per population, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the per-individual ancestry proportion distribution are reported.

In [48]:
df

Unnamed: 0,Population,Ancestry Componenet,Ancestry Proportion $\left( \mu \right)$,Ancestry Proportion $\left( \sigma \right)$,Ancestry Proportion $\left( SEM \right)$,Ancestry Proportion $\left( \pm CI_{95\%} \right)$
0,MXL,Indigenous American,0.601562,0.344371,0.043387,0.086701
1,MXL,European,0.335938,0.331364,0.041748,0.083427
2,MXL,African,0.054688,0.156055,0.019661,0.039289
3,PEL,Indigenous American,0.694118,0.343803,0.037512,0.074597
4,PEL,European,0.270588,0.339142,0.037003,0.073585
5,PEL,African,0.029412,0.117647,0.012836,0.025526
6,CLM,Indigenous American,0.281915,0.305608,0.03169,0.06293
7,CLM,European,0.648936,0.340758,0.035335,0.070168
8,CLM,African,0.06383,0.182098,0.018883,0.037497
9,PUR,Indigenous American,0.153846,0.250739,0.024706,0.048999


## S24

In [49]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_short_read_vntr_amr_ancestry_proportions_correlations.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S24. Correlation between ancestry components and repeat copies among Admixed American populations in the 1000 Genome Project.} \newline Spearman's correlation coefificients and associated \textit{P-values} between the ancestry proportion (per ancestry component) and the number of repeat copies for all admixed American individuals in the 1000 Genome Project stratified by population. After correcting for three multiple comparisons per admixed American population---i.e., one per ancestry component---a \textit{P-value} less than 0.0167 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_short_read_vntr_amr_ancestry_proportions_correlations.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join((
            convert_sci_notation_to_latex(mat[i, j]) if abs(mat[i, j]) < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(2, mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S23.Correlation between ancestry components and repeat copies among Admixed American populations in the 1000 Genome Project.} \newline Spearman's correlation coefificients and associated \textit{P-values} between the ancestry proportion (per ancestry component) and the number of repeat copies for all admixed American individuals in the 1000 Genome Project stratified by population. After correcting for three multiple comparisons per admixed American population---i.e., one per ancestry component---a \textit{P-value} less than 0.0167 is considered statistically significant.

In [50]:
df

Unnamed: 0,Population,Ancestry Componenet,Spearman's $\rho$,$P-value$
0,MXL,Indigenous American,0.4381,0.0002939568
1,MXL,European,-0.353268,0.004191332
2,MXL,African,-0.288633,0.02072134
3,PEL,Indigenous American,0.390762,0.0002177656
4,PEL,European,-0.313211,0.003514602
5,PEL,African,-0.199727,0.06685667
6,CLM,Indigenous American,0.553194,7.377577e-09
7,CLM,European,-0.494947,3.956818e-07
8,CLM,African,-0.047328,0.650569
9,PUR,Indigenous American,0.413539,1.28121e-05


## S25

In [51]:
# Load the dataframe.
df = pd.read_csv('../meta_data/muc19_72kb_denisovan_coding_muts_info.csv')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
mat[:, -1] = np.round(mat[:, -1].astype(float), 3)
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={Xp{2cm}p{2.25cm}XXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S25. Denisovan-specific coding mutations in the focal 72kb \textit{MUC19} region.} \newline Allelic and functional information for the 11 Denisovan-specific coding mutations found within the focal 72kb region. Each variant’s effect on protein structure is categorized using Grantham scores as follows: Grantham score $< 50$ = conservative, Grantham score 51–100 = moderately conservative, Grantham score 101–150 = moderately radical, and Grantham score $> 150$ = radical. The PhyloP score measures the conservation of the reference base-pair, with higher positive scores indicating stronger conservation.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/muc19_72kb_denisovan_coding_muts_info.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(mat.shape[1]))
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S24. Denisovan-specific coding mutations in the focal 72kb \textit{MUC19} region.} \newline Allelic and functional information for the 11 Denisovan-specific coding mutations found within the focal 72kb region. Each variant’s effect on protein structure is categorized using Grantham scores as follows: Grantham score $< 50$ = conservative, Grantham score 51–100 = moderately conservative, Grantham score 101–150 = moderately radical, and Grantham score $> 150$ = radical. The PhyloP score measures the conservation of the mutation, with higher positive scores indicating stronger conservation.

In [52]:
df

Unnamed: 0,Chr12 Position (Hg19),rsID,Mut. Type,Ref. Allele,Ref. Amino Acid,Denisovan Allele,Denisovan Amino Acid,Protein Position,Grantham Score,PhyloP Score
0,40808672,rs4768252,Missense,C,Ser,T,Leu,275,145,-1.22099
1,40808726,rs4768253,Missense,C,Thr,T,Met,293,81,-0.879299
2,40809983,rs142268259,Missense,C,Asp,A,Glu,410,45,-0.060835
3,40814107,rs2114566,Missense,A,Ile,G,Val,511,29,-0.188913
4,40814197,rs17467164,Missense,G,Val,A,Ile,541,29,-0.336425
5,40815060,rs149221842,Missense,C,His,T,Tyr,572,83,-0.569071
6,40820403,rs11564249,Synonymous,C,Asp,T,Asp,711,0,0.818386
7,40821795,rs61736852,Missense,G,Glu,A,Lys,766,56,0.581063
8,40821871,rs17467284,Missense,G,Arg,T,Leu,791,102,5.15
9,40826201,rs11564125,Missense,G,Val,A,Ile,883,29,-1.72208


## S26

In [53]:
# Load the dataframe.
df = pd.read_csv('../meta_data/ancient_sample_info.csv')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S26. Metadata for the ancient Indigenous American genomes.} \newline The location, sample ID, age, genomic coverage, and reference for the 23 ancient Indigenous American individuals that predate European colonization and the African slave trade used in this study.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/ancient_sample_info.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(mat.shape[1]))
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S25. Metadata for the ancient Indigenous American genomes.} \newline The location, sample ID, age, genomic coverage, and reference for the 23 ancient Indigenous American individuals that predate European colonization and the African slave trade used in this study.

In [54]:
df

Unnamed: 0,Location,ID,Age (years),Coverage (x),Reference
0,Alaska,USR1,11500,17.0,Moreno-Mayar et al. 2018 Nature
1,California,CR-01,1111,8.36,Scheib et al. 2018 Science
2,California,CT-01,387,6.68,Scheib et al. 2018 Science
3,California,NC\_C,1450,1.12,Scheib et al. 2018 Science
4,California,PS-06,1570,4.12,Scheib et al. 2018 Science
5,California,SC-05,1101,13.67,Scheib et al. 2018 Science
6,California,SM-02,826,7.38,Scheib et al. 2018 Science
7,California,SN-13,881,1.74,Scheib et al. 2018 Science
8,California,SN-17,4517,7.46,Scheib et al. 2018 Science
9,California,SN-44,4646,9.36,Scheib et al. 2018 Science


## S27

In [55]:
# Load the dataframe.
df = pd.read_csv('./dataframes/amr_v_non_amr_denisovan_specific_missense_mutation_proportions_ztest_fet.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 2, 3, 5, 6, 7, 8, 9, 10]]
# Intialize the columns.
cols = np.where(
    df.columns.values[[0, 2, 3, 5, 6, 7, 8, 9, 10]] == 'Denisovan-specific Mis. Mut. Pos.',
    'Position', df.columns.values[[0, 2, 3, 5, 6, 7, 8, 9, 10]],
)
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S27. Comparison of the per-individual frequency distributions for each Denisovan-specific missense mutation in the focal 72kb \textit{MUC19} region between admixed American and non-admixed American individuals in 1000 Genomes Project} \newline Comparisons of the per-individual frequency distribution for each of the nine Denisovan-specific missense mutations in the focal 72kb \textit{MUC19} region between the 347 admixed American (AMR) individuals ($n = 694$ chromosomes) and 1,496 non-admixed American (Non-AMR) individuals ($n = 2,992$ chromosomes), excluding African populations, in the 1000 Genomes Project. For each comparison, the number of chromosomes harboring the respective Denisovan-specific missense mutation, the frequency of the Denisovan-specific missense mutation, and the results of the Proportions \textit{Z}-Test and Fisher's Exact Test are reported. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/amr_v_non_amr_denisovan_specific_missense_mutation_proportions_ztest_fet.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join((
            f'{int(mat[i, j])}' if mat[i, j] > 100 else
            convert_sci_notation_to_latex(mat[i, j]) if mat[i, j] < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S26. Comparison of the per-individual frequency distributions for each Denisovan-specific missense mutation in the focal 72kb \textit{MUC19} region between admixed American and non-admixed American individuals in 1000 Genomes Project} \newline Comparisons of the per-individual frequency distribution for each of the nine Denisovan-specific missense mutations in the focal 72kb \textit{MUC19} region between the 347 admixed American (AMR) individuals ($n = 694$ chromosomes) and 1,496 non-admixed American (Non-AMR) individuals ($n = 2,992$ chromosomes), excluding African populations, in the 1000 Genomes Project. For each comparison, the number of chromosomes harboring the respective Denisovan-specific missense mutation, the frequency of the Denisovan-specific missense mutation, and the results of the Proportions \textit{Z}-Test and Fisher's Exact Test are reported. A \textit{P-value} less than 0.05 is considered statistically significant.

In [56]:
df.iloc[:, [0, 2, 3, 5, 6, 7, 8, 9, 10]]

Unnamed: 0,Denisovan-specific Mis. Mut. Pos.,Denisovan-specific Mis. Mut. Count (AMR),Denisovan-specific Mis. Freq. (AMR),Denisovan-specific Mis. Mut. Count (Non-AMR),Denisovan-specific Mis. Freq. (Non-AMR),$Z-Statistic$,$P-value$ (Prop. $Z$-Test),Odds Ratio,$P-value$ (Fisher's Exact Test)
0,40808672,109,0.157061,187,0.0625,8.258522,7.374357e-17,2.794872,1.939194e-14
1,40808726,108,0.15562,190,0.063503,8.020432,5.268677e-16,2.717945,9.213214e-14
2,40809983,108,0.15562,186,0.062166,8.187167,1.33723e-16,2.780359,3.175081e-14
3,40814107,108,0.15562,186,0.062166,8.187167,1.33723e-16,2.780359,3.175081e-14
4,40814197,108,0.15562,188,0.062834,8.103488,2.670271e-16,2.74882,5.430066e-14
5,40815060,108,0.15562,186,0.062166,8.187167,1.33723e-16,2.780359,3.175081e-14
6,40821795,107,0.154179,180,0.06016,8.327868,4.115575e-17,2.847662,1.319832e-14
7,40821871,107,0.154179,180,0.06016,8.327868,4.115575e-17,2.847662,1.319832e-14
8,40826201,108,0.15562,181,0.060495,8.399161,2.2483960000000003e-17,2.862256,8.010657e-15


## S28

In [57]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_denisovan_specific_missense_mutation_summary.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [10, 11, 12, 13, 14]].T
idx = df.iloc[:, [10, 11, 12, 13, 14]].T.index.values
# Intialize the columns.
cols = ['Super Population']
cols.extend(f'{pos}' for pos in df['Chr12 Position (Hg19)'].values)
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S28. Frequency of the Denisovan-specific missense mutations in the focal 72kb \textit{MUC19} region among super populations in the 1000 Genomes Project.} \newline Frequencies of the nine Denisovan-specific missense mutations found within the focal 72kb \textit{MUC19} region per super population in the 1000 Genomes Project.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_denisovan_specific_missense_mutation_summary_per_spop.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        f'{idx[i]} & '\
        + r' & '.join(f'{round(mat[i, j], 3)}' for j in range(mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S27. Frequency of the Denisovan-specific missense mutations in the focal 72kb \textit{MUC19} region among super populations in the 1000 Genomes Project.} \newline Frequencies of the nine Denisovan-specific missense mutations found within the focal 72kb \textit{MUC19} region per super population in the 1000 Genomes Project.

In [58]:
df.iloc[:, [10, 11, 12, 13, 14]].T

Unnamed: 0,0,1,2,3,4,5,6,7,8
AFR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AMR,0.157061,0.15562,0.15562,0.15562,0.15562,0.15562,0.154179,0.154179,0.15562
SAS,0.105317,0.108384,0.104294,0.104294,0.105317,0.104294,0.09816,0.09816,0.09816
EAS,0.060516,0.060516,0.060516,0.060516,0.061508,0.060516,0.060516,0.060516,0.061508
EUR,0.022863,0.022863,0.022863,0.022863,0.022863,0.022863,0.022863,0.022863,0.022863


## S29

In [59]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_denisovan_specific_missense_mutation_summary.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, 15:].T
idx = df.iloc[:, 15:].T.index.values
# Intialize the columns.
cols = ['Population']
cols.extend(f'{pos}' for pos in df['Chr12 Position (Hg19)'].values)
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S29. Frequency of the Denisovan-specific missense mutations in the focal 72kb \textit{MUC19} region among populations in the 1000 Genomes Project.} \newline Frequencies of the nine Denisovan-specific missense mutations found within the focal 72kb \textit{MUC19} region per population in the 1000 Genomes Project.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_denisovan_specific_missense_mutation_summary_per_pop.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        f'{idx[i]} & '\
        + r' & '.join(f'{round(mat[i, j], 3)}' for j in range(mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S28. Frequency of the Denisovan-specific missense mutations in the focal 72kb \textit{MUC19} region among populations in the 1000 Genomes Project.} \newline Frequencies of the nine Denisovan-specific missense mutations found within the focal 72kb \textit{MUC19} region per population in the 1000 Genomes Project.

In [60]:
df.iloc[:, 15:].T

Unnamed: 0,0,1,2,3,4,5,6,7,8
GWD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ESN,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MSL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YRI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LWK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PUR,0.086538,0.086538,0.086538,0.086538,0.086538,0.086538,0.086538,0.086538,0.091346
CLM,0.079787,0.074468,0.074468,0.074468,0.074468,0.074468,0.069149,0.069149,0.069149
PEL,0.217647,0.217647,0.217647,0.217647,0.217647,0.217647,0.217647,0.217647,0.217647
MXL,0.304688,0.304688,0.304688,0.304688,0.304688,0.304688,0.304688,0.304688,0.304688
PJL,0.052083,0.052083,0.052083,0.052083,0.052083,0.052083,0.052083,0.052083,0.052083


## S29

In [61]:
# Load the dataframe.
df = pd.read_csv('./dataframes/ancient_americans_denisovan_specific_missense_mutation_aac.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, 1:].T
idx = np.where(df.columns.values[1:] == 'NC_C', 'NC\_C', df.columns.values[1:])
# Intialize the columns.
cols = ['ID']
cols.extend(f'{int(pos)}' for pos in df['POS'].values)
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={p{2.25cm}XXXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S30. Denisovan-specific missense mutation allele counts in the focal 72kb \textit{MUC19} region among 23 ancient Indigenous American individuals.} \newline Allele counts (i.e., 0, 1, or 2) of the nine Denisovan-specific missense mutations found within the focal 72kb \textit{MUC19} region for each of the 23 ancient Indigenous American individuals that predate European colonization and the African slave trade. A "---"  denotes that the site did not pass quality control.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/ancient_americans_denisovan_specific_missense_mutation_aac.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        f'{idx[i]} & '\
        + r' & '.join((
            '---' if np.isnan(mat[i, j]) else
            f'{int(mat[i, j])}'
            for j in range(mat.shape[1])
        ))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S29. Denisovan-specific missense mutation allele counts in the focal 72kb \textit{MUC19} region among 23 ancient Indigenous American individuals.} \newline Allele counts (i.e., 0, 1, or 2) of the nine Denisovan-specific missense mutations found within the focal 72kb \textit{MUC19} region for each of the 23 ancient Indigenous American individuals that predate European colonization and the African slave trade. A "---"  denotes that the site did not pass quality control.

In [62]:
df.iloc[:, 1:].T

Unnamed: 0,0,1,2,3,4,5,6,7,8
USR1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CK-13,,2.0,0.0,,0.0,,,,
Anzick,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CR-01,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,2.0
CT-01,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
NC_C,,0.0,,,,,0.0,0.0,
PS-06,0.0,0.0,0.0,,,0.0,0.0,0.0,
SC-05,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0
SM-02,2.0,2.0,2.0,2.0,2.0,,,2.0,2.0
SN-13,,2.0,,2.0,,2.0,,,


## S31

In [63]:
# Load the dataframe.
df = pd.read_csv('./dataframes/mxl_sgdp_amr_anc_amr_denisovan_specific_missense_mutation_summary.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = np.where(df.columns.values == 'Denisovan-specific Mis. Mut. Pos.', 'Position', df.columns.values)
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S31. Per-individual frequency distribution for each Denisovan-specific missense mutation in the focal 72kb \textit{MUC19} region among Mexicans in 1000 Genomes Project, 23 ancient Indigenous American individuals, and Indigenous American individuals in the Simons Genome Diversity Project.} \newline Summary of the per-individual frequency (i.e., 0, 0.5, or 1) distribution for each of the nine Denisovan-specific missense mutations in the focal 72kb \textit{MUC19} region per group: 64 Mexican individuals in the 1000 Genomes project ($n = 128$ chromosomes), 23 ancient Indigenous American individuals ($n = 46$ chromosomes), and 22 Indigenous American individuals in the Simons Genome Diversity Project ($n = 44$ chromosomes). For each group, the total number of chromosomes that passed quality control, the mean (i.e., frequency), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the per-individual Denisovan-specific missense mutation frequency distribution are reported.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/mxl_sgdp_amr_anc_amr_denisovan_specific_missense_mutation_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join((
            f'{int(mat[i, j])}' if mat[i, j] > 1 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(2, mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S30. Per-individual frequency distribution for each Denisovan-specific missense mutation in the focal 72kb \textit{MUC19} region among Mexicans in 1000 Genomes Project, 23 ancient Indigenous American individuals, and Indigenous American individuals in the Simons Genome Diversity Project.} \newline Summary of the per-individual frequency (i.e., 0, 0.5, or 1) distribution for each of the nine Denisovan-specific missense mutations in the focal 72kb \textit{MUC19} region per group: 64 Mexican individuals in the 1000 Genomes project ($n = 128$ chromosomes), 23 ancient Indigenous American individuals ($n = 46$ chromosomes), and 22 Indigenous American individuals in the Simons Genome Diversity Project ($n = 44$ chromosomes). For each group, the total number of chromosomes that passed quality control, the mean (i.e., frequency), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the per-individual Denisovan-specific missense mutation frequency distribution are reported.

In [64]:
df

Unnamed: 0,Denisovan-specific Mis. Mut. Pos.,Group,Number of Chroms.,Denisovan-specific Mis. Mut. Freq.,Denisovan-specific Mis. Mut. $\left( \sigma \right)$,Denisovan-specific Mis. Mut. $\left( SEM \right)$,Denisovan-specific Mis. Mut. $\left( \pm CI_{95\%} \right)$
0,40808672,MXL,128,0.304688,0.301263,0.037956,0.075848
1,40808672,SDGP AMR,44,0.363636,0.308288,0.067274,0.139904
2,40808672,Ancient Americans,36,0.361111,0.401579,0.097397,0.20549
3,40808726,MXL,128,0.304688,0.301263,0.037956,0.075848
4,40808726,SDGP AMR,44,0.363636,0.308288,0.067274,0.139904
5,40808726,Ancient Americans,40,0.375,0.414578,0.095111,0.199069
6,40809983,MXL,128,0.304688,0.301263,0.037956,0.075848
7,40809983,SDGP AMR,44,0.363636,0.308288,0.067274,0.139904
8,40809983,Ancient Americans,32,0.375,0.414578,0.107044,0.228158
9,40814107,MXL,128,0.304688,0.301263,0.037956,0.075848


## S32

In [65]:
# Load the dataframe.
df = pd.read_csv('./dataframes/mxl_sgdp_amr_anc_amr_denisovan_specific_missense_mutation_proportions_ztest_fet.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 2, 3, 4, 6, 7, 9, 10, 11, 12]]
# Intialize the columns.
cols = np.where(
    df.columns.values[[0, 1, 2, 3, 4, 6, 7, 9, 10, 11, 12]] == 'Denisovan-specific Mis. Mut. Pos.',
    'Position', df.columns.values[[0, 1, 2, 3, 4, 6, 7, 9, 10, 11, 12]]
)
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S32. Comparison of the per-individual frequency distributions for each Denisovan-specific missense mutation in the focal 72kb \textit{MUC19} region between Mexicans in 1000 Genomes Project, 23 ancient Indigenous American individuals, and Indigenous American individuals in the Simons Genome Diversity Project.} \newline Comparisons of the per-individual frequency distribution for each of the nine Denisovan-specific missense mutations in the focal 72kb \textit{MUC19} region between groups: 64 Mexican individuals in the 1000 Genomes project ($n = 128$ chromosomes), 23 ancient Indigenous American individuals ($n = 46$ chromosomes), and 22 Indigenous American individuals in the Simons Genome Diversity Project ($n = 44$ chromosomes). For each comparison, the total number of chromosomes that passed quality control, the number of chromosomes harboring the respective Denisovan-specific missense mutation, and the results of the Proportions \textit{Z}-Test and Fisher's Exact Test are reported. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/mxl_sgdp_amr_anc_amr_denisovan_specific_missense_mutation_proportions_ztest_fet.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(3))+r' & '\
        + r' & '.join((
            f'{int(mat[i, j])}' if mat[i, j] > 1 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(3, mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S31. Comparison of the per-individual frequency distributions for each Denisovan-specific missense mutation in the focal 72kb \textit{MUC19} region between Mexicans in 1000 Genomes Project, 23 ancient Indigenous American individuals, and Indigenous American individuals in the Simons Genome Diversity Project.} \newline Comparisons of the per-individual frequency distribution for each of the nine Denisovan-specific missense mutations in the focal 72kb \textit{MUC19} region between groups: 64 Mexican individuals in the 1000 Genomes project ($n = 128$ chromosomes), 23 ancient Indigenous American individuals ($n = 46$ chromosomes), and 22 Indigenous American individuals in the Simons Genome Diversity Project ($n = 44$ chromosomes). For each comparison, the total number of chromosomes that passed quality control, the number of chromosomes harboring the respective Denisovan-specific missense mutation, and the results of the Proportions \textit{Z}-Test and Fisher's Exact Test are reported. A \textit{P-value} less than 0.05 is considered statistically significant.

In [66]:
df.iloc[:, [0, 1, 2, 3, 4, 6, 7, 9, 10, 11, 12]]

Unnamed: 0,Denisovan-specific Mis. Mut. Pos.,Group 1,Group 2,Number of Chroms. (Group 1),Denisovan-specific Mis. Mut. Count (Group 1),Number of Chroms. (Group 2),Denisovan-specific Mis. Mut. Count (Group 2),$Z-Statistic$,$P-value$ (Prop. $Z$-Test),Odds Ratio,$P-value$ (Fisher's Exact Test)
0,40808672,MXL,Ancient Americans,128,39.0,36,13.0,-0.64273,0.5204,0.775281,0.546776
1,40808726,MXL,Ancient Americans,128,39.0,40,15.0,-0.831137,0.405896,0.730337,0.440712
2,40809983,MXL,Ancient Americans,128,39.0,32,12.0,-0.763439,0.445202,0.730337,0.525291
3,40814107,MXL,Ancient Americans,128,39.0,36,11.0,-0.009995,0.992025,0.995914,1.0
4,40814197,MXL,Ancient Americans,128,39.0,32,13.0,-1.097134,0.272583,0.640449,0.295742
5,40815060,MXL,Ancient Americans,128,39.0,34,13.0,-0.862247,0.388552,0.707865,0.412878
6,40821795,MXL,Ancient Americans,128,39.0,28,9.0,-0.173859,0.861976,0.925094,0.825686
7,40821871,MXL,Ancient Americans,128,39.0,38,12.0,-0.130262,0.896359,0.949438,1.0
8,40826201,MXL,Ancient Americans,128,39.0,30,11.0,-0.656966,0.511203,0.756895,0.519337
9,40808672,MXL,SDGP AMR,128,39.0,44,16.0,-0.723264,0.469518,0.766854,0.461491


## S34

In [67]:
# Load the dataframe.
df = pd.read_csv('./dataframes/sgdp_denisovan_specific_missense_mutation_summary.csv.gz')
# Intialize the african columns.
afr_cols = [
    'AFR', 'Khomani_San', 'Ju_hoan_North', 'Dinka', 'Mandenka',
    'Mbuti', 'Yoruba', 'BantuHerero', 'BantuKenya', 'BantuTswana', 'Biaka', 'Esan',
    'Gambian', 'Luhya', 'Luo', 'Masai', 'Mende', 'Mozabite', 'Saharawi', 'Somali'
]
# Convert the dataframe to a numpy matrix.
mat = df[afr_cols].to_numpy().T
idx = [
    'AFR', 'Khomani San', r"Ju$|$'hoan North", 'Dinka', 'Mandenka',
    'Mbuti', 'Yoruba', 'Bantu Herero', 'Bantu Kenya', 'Bantu Tswana', 'Biaka', 'Esan',
    'Gambian', 'Luhya', 'Luo', 'Masai', 'Mende', 'Mozabite', 'Saharawi', 'Somali'
]
# Intialize the columns.
cols = ['Group']
cols.extend(f'{pos}' for pos in df['Chr12 Position (Hg19)'].values)
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S33. Frequency of the Denisovan-specific missense mutations in the focal 72kb \textit{MUC19} region among the African super population and African populations in the Simons Genome Diversity Project.} \newline Frequencies of the nine Denisovan-specific missense mutations found within the focal 72kb \textit{MUC19} region for the entire African super population and stratified by African population in the Simons Genome Diversity Project.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/sgdp_denisovan_specific_missense_mutation_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        f'{idx[i]} & '\
        + r' & '.join(f'{round(mat[i, j], 3)}' for j in range(mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S32. Frequency of the Denisovan-specific missense mutations in the focal 72kb \textit{MUC19} region among the African super population and African populations in the Simons Genome Diversity Project.} \newline Frequencies of the nine Denisovan-specific missense mutations found within the focal 72kb \textit{MUC19} region for the entire African super population and stratified by African population in the Simons Genome Diversity Project.

In [68]:
df[afr_cols].T

Unnamed: 0,0,1,2,3,4,5,6,7,8
AFR,0.011364,0.011364,0.011364,0.011364,0.011364,0.011364,0.011364,0.011364,0.011364
Khomani_San,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25
Ju_hoan_North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dinka,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mandenka,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mbuti,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Yoruba,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BantuHerero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BantuKenya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BantuTswana,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## S34

In [69]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_muc19_introgressed_hap_frequency_per_super_population_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXX},
cells={halign=c, valign=m},
hlines={solid, 1pt},
vlines={solid, 1pt},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S34. Frequency of introgressed haplotypes at the focal 72kb \textit{MUC19} region among 1000 Genomes Project super populations.} \newline The frequency of introgressed haplotypes---i.e., the number of introgressed haplotypes normalized by the total number of chromosomes---stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). Introgressed haplotypes at the focal 72kb \textit{MUC19} region are significantly enriched in AMR individuals (Fisher's Exact Test, Odds Ratio: 2.848, \textit{P-value}: $1.320e^{-14}$), with AMR populations exhibiting a higher proportion of introgressed tracts compared to non-AMR populations, excluding AFR (Proportions \textit{Z}-Test, \textit{Z}-statistic: 8.328, \textit{P-value}: $4.116e^{-17}$).},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_muc19_introgressed_hap_frequency_per_super_population_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        f'{mat[i, 0]}'+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(1, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S33. Frequency of introgressed haplotypes at the focal 72kb \textit{MUC19} region among 1000 Genomes Project super populations.} \newline The frequency of introgressed haplotypes---i.e., the number of introgressed haplotypes normalized by the total number of chromosomes---stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). Introgressed haplotypes at the focal 72kb \textit{MUC19} region are significantly enriched in AMR individuals (Fisher's Exact Test, Odds Ratio: 2.848, \textit{P-value}: $1.320e^{-14}$), with AMR populations exhibiting a higher proportion of introgressed tracts compared to non-AMR populations, excluding AFR (Proportions \textit{Z}-Test, \textit{Z}-statistic: 8.328, \textit{P-value}: $4.116e^{-17}$).

In [70]:
df

Unnamed: 0,Super Population,Total Number of Chromosomes,Number of Introgressed Haps.,Introgressed Hap. Frequency
0,AMR,694,107,0.154179
1,SAS,978,96,0.09816
2,EAS,1008,61,0.060516
3,EUR,1006,23,0.022863
4,AFR,1008,0,0.0


## S35

In [71]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_muc19_introgressed_hap_frequency_per_population_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S35. Frequency of introgressed haplotypes at the focal 72kb \textit{MUC19} region among 1000 Genomes Project populations.} \newline The frequency of introgressed haplotypes---i.e., the number of introgressed haplotypes normalized by the total number of chromosomes---for each population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR).},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_muc19_introgressed_hap_frequency_per_population_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(2, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S35. Frequency of introgressed haplotypes at the focal 72kb \textit{MUC19} region among 1000 Genomes Project populations.} \newline The frequency of introgressed haplotypes---i.e., the number of introgressed haplotypes normalized by the total number of chromosomes---for each population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR).

In [72]:
df

Unnamed: 0,Super Population,Population,Total Number of Chromosomes,Number of Introgressed Haps.,Introgressed Hap. Frequency
0,AMR,MXL,128,39,0.304688
1,AMR,PEL,170,37,0.217647
2,AMR,CLM,188,13,0.069149
3,AMR,PUR,208,18,0.086538
4,SAS,BEB,172,24,0.139535
5,SAS,STU,204,23,0.112745
6,SAS,ITU,204,21,0.102941
7,SAS,PJL,192,10,0.052083
8,SAS,GIH,206,18,0.087379
9,EAS,CHB,206,7,0.033981


## S36

In [73]:
# Load the dataframe.
df = pd.read_csv('./dataframes/altai_nean_afr_v_denisovan_seq_div_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S36. Sequence divergence from the Denisovan for the Altai Neanderthal and all African Individuals in the 1000 Genomes Project (1KG) at the focal 72kb \textit{MUC19} region.} \newline The observed sequence divergence from the Densisovan for the Altai Neanderthal (i.e., the number of pairwise differences between the Altai Neanderthal's and Denisovan's unphased genotypes normalized by the effective sequence length) and all Africans in the 1KG (i.e., the average number of pairwise differences between all African haplotypes in the 1KG and the Denisovan's unphased genotypes normalized by the effective sequence length). For each comparison to the Denisovan, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the sequence divergence genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the sequence divergence is greater than or equal to that observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant. Note that at the focal 72kb region, the effective sequence length with respect to the Denisovan is 46136bp and 48435bp for the Altai Neanderthal and African comparison, respectively.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/altai_nean_afr_v_denisovan_seq_div_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(1))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(1, 2))+r' & '\
        + r' & '.join('{:.6f}'.format(round(mat[i, j], 6)).rstrip('0').rstrip('.') for j in range(2, 3))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if abs(mat[i, j]) < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(3, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S36. Sequence divergence from the Denisovan for the Altai Neanderthal and all African Individuals in the 1000 Genomes Project (1KG) at the focal 72kb \textit{MUC19} region.} \newline The observed sequence divergence from the Densisovan for the Altai Neanderthal (i.e., the number of pairwise differences between the Altai Neanderthal's and Denisovan's unphased genotypes normalized by the effective sequence length) and all Africans in the 1KG (i.e., the average number of pairwise differences between all African haplotypes in the 1KG and the Denisovan's unphased genotypes normalized by the effective sequence length). For each comparison to the Denisovan, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the sequence divergence genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the sequence divergence is greater than or equal to that observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant. Note that at the focal 72kb region, the effective sequence length with respect to the Denisovan is 46136bp and 48435bp for the Altai Neanderthal and African comparison, respectively.

In [74]:
df

Unnamed: 0,Comparison,Focal 72kb Region (Pairwise Diffs.),Focal 72kb Region (Seq. Div.),72kb Nonoverlapping Windows $\left( \mu \right)$,72kb Nonoverlapping Windows $\left( \sigma \right)$,72kb Nonoverlapping Windows $\left( SEM \right)$,72kb Nonoverlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,Altai Nean.,174.5,0.003782,0.001076,0.000601,4e-06,7e-06,0.002305
1,AFR,194.884921,0.004024,0.001416,0.000553,3e-06,6e-06,0.001905


## S37

In [75]:
# Load the dataframe.
df = pd.read_csv('./dataframes/focal_mxl_sequence_divergence_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]]
# Intialize the columns.
cols = df.columns.values[[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={p{2cm}XXXXp{2cm}p{2cm}p{2cm}p{2cm}p{1cm}p{1cm}X},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S37. Sequence divergence between the \textit{Denisovan-like} haplotype in MXL and unphased archaic individuals at the focal 72kb \textit{MUC19} region.} \newline The observed sequence divergence---i.e., the number of pairwise differences between a modern human haplotype and an archaic genotype normalized by the effective sequence length---between the \textit{Denisovan-like} haplotype in the focal MXL individual (i.e., NA19664), who harbors two \textit{Denisovan-like} haplotypes and has no heterozygous sites at the focal 72kb region, and the four unphased high-coverage archaic individuals. For each archaic individual, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the sequence divergence genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the sequence divergence is less than or equal to that observed at the focal 72kb region. After correcting for two multiple comparisons---i.e., one per haplotype---a \textit{P-value} less than 0.025 is considered statistically significant. Note that at the focal 72kb region, the effective sequence length with respect to the NA19664 individual is 48435bp, 48654bp, 48121bp, and 48447bp for the Denisovan, Altai Neanderthal, Chagyrskaya Neanderthal, and Vindija Neanderthal, respectively.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/focal_mxl_sequence_divergence_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(1))+r' & '\
        + r' & '.join('{:.6f}'.format(round(mat[i, j], 6)).rstrip('0').rstrip('.') for j in range(1, 5))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if abs(mat[i, j]) < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(5, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S37. Sequence divergence between the \textit{Denisovan-like} haplotype in MXL and unphased archaic individuals at the focal 72kb \textit{MUC19} region.} \newline The observed sequence divergence---i.e., the number of pairwise differences between a modern human haplotype and an archaic genotype normalized by the effective sequence length---between the \textit{Denisovan-like} haplotype in the focal MXL individual (i.e., NA19664), who harbors two \textit{Denisovan-like} haplotypes and has no heterozygous sites at the focal 72kb region, and the four unphased high-coverage archaic individuals. For each archaic individual, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the sequence divergence genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the sequence divergence is less than or equal to that observed at the focal 72kb region. After correcting for two multiple comparisons---i.e., one per haplotype---a \textit{P-value} less than 0.025 is considered statistically significant. Note that at the focal 72kb region, the effective sequence length with respect to the NA19664 individual is 48435bp, 48654bp, 48121bp, and 48447bp for the Denisovan, Altai Neanderthal, Chagyrskaya Neanderthal, and Vindija Neanderthal, respectively.

In [76]:
df

Unnamed: 0,Individual,Super Population,Population,Archaic,Focal 72kb Region (Pairwise Diffs. Hap. 1),Focal 72kb Region (Pairwise Diffs. Hap. 2),Focal 72kb Region (Seq. Div. Hap. 1),Focal 72kb Region (Seq. Div. Hap. 2),72kb Non-overlapping Windows $\left( \mu \right)$,72kb Non-overlapping Windows $\left( \sigma \right)$,72kb Non-overlapping Windows $\left( SEM \right)$,72kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$ (Hap. 1),$P-value$ (Hap. 2)
0,NA19664,AMR,MXL,Denisovan,47.0,47.0,0.00097,0.00097,0.001413,0.000613,2e-06,5e-06,0.236521,0.236521
1,NA19664,AMR,MXL,Altai Nean.,177.5,177.5,0.003648,0.003648,0.001357,0.000603,2e-06,5e-06,0.994895,0.994895
2,NA19664,AMR,MXL,Chagyrskaya Nean.,87.5,87.5,0.001818,0.001818,0.001371,0.000607,2e-06,5e-06,0.810828,0.810828
3,NA19664,AMR,MXL,Vindija Nean.,88.0,88.0,0.001816,0.001816,0.001381,0.000604,2e-06,5e-06,0.806098,0.806098


## S38

In [77]:
# Load the dataframe.
df = pd.read_csv('./dataframes/dplus_yri_mxl_arcs_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 2, 9, 10, 11, 12, 13, 14]]
# Intialize the columns.
cols = df.columns.values[[0, 1, 2, 9, 10, 11, 12, 13, 14]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S38. \textit{D+} tests for introgression at the focal 72kb \textit{MUC19}.} \newline \textit{D+} results for the focal 72kb \textit{MUC19} region using the Yoruba in Ibadan, Nigeria population (YRI) as \textit{P1}, the introgressed haplotype in the focal MXL individual (i.e., NA19664) as \textit{P2}, the four high-coverage archaics as \textit{P3}, and the EPO ancestral sequence to polarize ancestral states. For each archaic(\textit{P3}), the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the \textit{D+} genomic background distribution used to compute the \textit{P-value} are also reported. \textit{P-values} were computed by building a \textit{Z}-distribution of \textit{D+} values from the non-overlapping 72kb windows of comparable effective sequence length. A \textit{P-value} less than 0.05 is considered statistically significant and suggests that introgressed haplotype in MXL (\textit{P2}) shares more derived and ancestral alleles with the given archaic individual (\textit{P3}) than expected under a model of no gene flow.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/dplus_yri_mxl_arcs_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(3))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if abs(mat[i, j]) < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(3, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S38. \textit{D+} tests for introgression at the focal 72kb \textit{MUC19}.} \newline \textit{D+} results for the focal 72kb \textit{MUC19} region using the Yoruba in Ibadan, Nigeria population (YRI) as \textit{P1}, the introgressed haplotype in the focal MXL individual (i.e., NA19664) as \textit{P2}, the four high-coverage archaics as \textit{P3}, and the EPO ancestral sequence to polarize ancestral states. For each archaic(\textit{P3}), the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the \textit{D+} genomic background distribution used to compute the \textit{P-value} are also reported. \textit{P-values} were computed by building a \textit{Z}-distribution of \textit{D+} values from the non-overlapping 72kb windows of comparable effective sequence length. A \textit{P-value} less than 0.05 is considered statistically significant and suggests that introgressed haplotype in MXL (\textit{P2}) shares more derived and ancestral alleles with the given archaic individual (\textit{P3}) than expected under a model of no gene flow.

In [78]:
df.iloc[:, [0, 1, 2, 9, 10, 11, 12, 13, 14]]

Unnamed: 0,$P1$,$P2$,$P3$,Focal 72kb Region $\left( D+ \right)$,72kb Non-overlapping Windows $\left( \mu \right)$,72kb Non-overlapping Windows $\left( \sigma \right)$,72kb Non-overlapping Windows $\left( SEM \right)$,72kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,YRI,NA19664,Denisovan,0.742784,0.006973,0.177212,0.00103,0.291487,1.4e-05
1,YRI,NA19664,Altai Nean.,-0.621697,0.019682,0.191165,0.001111,0.314438,0.999427
2,YRI,NA19664,Chagyrskaya Nean.,0.174662,0.020329,0.193022,0.001124,0.317493,0.182764
3,YRI,NA19664,Vindija Nean.,0.18196,0.021133,0.19356,0.001125,0.318379,0.173591


## S39

In [79]:
# Load the dataframe.
df = pd.read_csv('./dataframes/focal_mxl_sequence_divergence_742kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]]
# Intialize the columns.
cols = df.columns.values[[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={p{2cm}XXXXp{2cm}p{2cm}p{2cm}p{2cm}p{1cm}p{1cm}X},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S39. Sequence divergence between the longest introgressed tract in MXL and unphased archaic individuals at the focal 742kb \textit{MUC19} region.} \newline The observed sequence divergence---i.e., the number of pairwise differences between a modern human haplotype and an archaic genotype normalized by the effective sequence length---between the MXL individual (i.e., NA19725) harboring the longest introgressed tract in MXL (i.e., 742kb) and the four unphased high-coverage archaic individuals. For each archaic individual, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the sequence divergence genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 742kb windows of comparable effective sequence length where the sequence divergence is less than or equal to that observed at the focal 742kb region. After correcting for two multiple comparisons---i.e., one per haplotype---a \textit{P-value} less than 0.025 is considered statistically significant. Note that the 742kb longest introgressed tract in MXL corresponds to NA19725's second haplotype (i.e., Hap. 2). Note that at the focal 742kb region, the effective sequence length with respect to the NA19725 individual is 495788bp, 497642bp, 494606bp, and 499368bp for the Denisovan, Altai Neanderthal, Chagyrskaya Neanderthal, and Vindija Neanderthal, respectively.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/focal_mxl_sequence_divergence_742kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(1))+r' & '\
        + r' & '.join('{:.6f}'.format(round(mat[i, j], 6)).rstrip('0').rstrip('.') for j in range(1, 5))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if abs(mat[i, j]) < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(5, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S39. Sequence divergence between the longest introgressed tract in MXL and unphased archaic individuals at the focal 742kb \textit{MUC19} region.} \newline The observed sequence divergence---i.e., the number of pairwise differences between a modern human haplotype and an archaic genotype normalized by the effective sequence length---between the MXL individual (i.e., NA19725) harboring the longest introgressed tract in MXL (i.e., 742kb) and the four unphased high-coverage archaic individuals. For each archaic individual, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the sequence divergence genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 742kb windows of comparable effective sequence length where the sequence divergence is less than or equal to that observed at the focal 742kb region. After correcting for two multiple comparisons---i.e., one per haplotype---a \textit{P-value} less than 0.025 is considered statistically significant. Note that the 742kb longest introgressed tract in MXL corresponds to NA19725's second haplotype (i.e., Hap. 2). Note that at the focal 742kb region, the effective sequence length with respect to the NA19725 individual is 495788bp, 497642bp, 494606bp, and 499368bp for the Denisovan, Altai Neanderthal, Chagyrskaya Neanderthal, and Vindija Neanderthal, respectively.

In [80]:
df.iloc[:, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]]

Unnamed: 0,Archaic,Focal 742kb Region (Pairwise Diffs. Hap. 1),Focal 742kb Region (Pairwise Diffs. Hap. 2),Focal 742kb Region (Seq. Div. Hap. 1),Focal 742kb Region (Seq. Div. Hap. 2),742kb Non-overlapping Windows $\left( \mu \right)$,742kb Non-overlapping Windows $\left( \sigma \right)$,742kb Non-overlapping Windows $\left( SEM \right)$,742kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$ (Hap. 1),$P-value$ (Hap. 2)
0,Denisovan,792.5,399.5,0.001598,0.000806,0.001407,0.000346,4e-06,8e-06,0.755493,0.018832
1,Altai Nean.,580.5,450.5,0.001167,0.000905,0.001352,0.000338,4e-06,8e-06,0.29421,0.064632
2,Chagyrskaya Nean.,662.0,327.0,0.001338,0.000661,0.001365,0.00034,4e-06,8e-06,0.511927,0.006434
3,Vindija Nean.,658.5,327.5,0.001319,0.000656,0.001374,0.00034,4e-06,8e-06,0.469669,0.006879


## S40

In [81]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_muc19_introgressed_hap_frequency_per_super_population_742kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXX},
cells={halign=c, valign=m},
hlines={solid, 1pt},
vlines={solid, 1pt},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S40. Frequency of introgressed haplotypes at the focal 742kb \textit{MUC19} region among 1000 Genomes Project super populations.} \newline The frequency of introgressed haplotypes---i.e., the number of introgressed haplotypes normalized by the total number of chromosomes---stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). Introgressed haplotypes at the focal 742kb \textit{MUC19} region are significantly enriched in AMR individuals (Fisher's Exact Test, Odds Ratio: 10.033, \textit{P-value}: $1.601e^{-37}$), with AMR populations exhibiting a higher proportion of introgressed tracts compared to non-AMR populations, excluding AFR (Proportions \textit{Z}-Test, \textit{Z}-statistic: 14.909, \textit{P-value}: $1.441e^{-50}$).},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_muc19_introgressed_hap_frequency_per_super_population_742kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        f'{mat[i, 0]}'+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(1, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S40. Frequency of introgressed haplotypes at the focal 742kb \textit{MUC19} region among 1000 Genomes Project super populations.} \newline The frequency of introgressed haplotypes---i.e., the number of introgressed haplotypes normalized by the total number of chromosomes---stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR). Introgressed haplotypes at the focal 742kb \textit{MUC19} region are significantly enriched in AMR individuals (Fisher's Exact Test, Odds Ratio: 10.033, \textit{P-value}: $1.601e^{-37}$), with AMR populations exhibiting a higher proportion of introgressed tracts compared to non-AMR populations, excluding AFR (Proportions \textit{Z}-Test, \textit{Z}-statistic: 14.909, \textit{P-value}: $1.441e^{-50}$).

In [82]:
df

Unnamed: 0,Super Population,Total Number of Chromosomes,Number of Introgressed Haps.,Introgressed Hap. Frequency
0,AMR,694,94,0.135447
1,SAS,978,21,0.021472
2,EAS,1008,23,0.022817
3,EUR,1006,2,0.001988
4,AFR,1008,0,0.0


## S41

In [83]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_muc19_introgressed_hap_frequency_per_population_742kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S41. Frequency of introgressed haplotypes at the focal 742kb \textit{MUC19} region among 1000 Genomes Project populations.} \newline The frequency of introgressed haplotypes---i.e., the number of introgressed haplotypes normalized by the total number of chromosomes--- for each population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR).},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_muc19_introgressed_hap_frequency_per_population_742kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(2, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S41. Frequency of introgressed haplotypes at the focal 742kb \textit{MUC19} region among 1000 Genomes Project populations.} \newline The frequency of introgressed haplotypes---i.e., the number of introgressed haplotypes normalized by the total number of chromosomes--- for each population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), Europeans (EUR), and Africans (AFR).

In [84]:
df

Unnamed: 0,Super Population,Population,Total Number of Chromosomes,Number of Introgressed Haps.,Introgressed Hap. Frequency
0,AMR,MXL,128,40,0.3125
1,AMR,PEL,170,30,0.176471
2,AMR,CLM,188,10,0.053191
3,AMR,PUR,208,14,0.067308
4,SAS,BEB,172,6,0.034884
5,SAS,STU,204,5,0.02451
6,SAS,ITU,204,3,0.014706
7,SAS,PJL,192,1,0.005208
8,SAS,GIH,206,6,0.029126
9,EAS,CHB,206,3,0.014563


## S42

In [85]:
# Load the dataframe.
df = pd.read_csv('./dataframes/dplus_yri_mxl_arcs_742kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 2, 9, 10, 11, 12, 13, 14]]
# Intialize the columns.
cols = df.columns.values[[0, 1, 2, 9, 10, 11, 12, 13, 14]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S42. \textit{D+} tests for introgression at the focal 742kb \textit{MUC19}.} \newline \textit{D+} results for the focal 742kb \textit{MUC19} region using the Yoruba in Ibadan, Nigeria population (YRI) as \textit{P1}, the introgressed haplotype in the focal MXL individual (i.e., NA19664) as \textit{P2}, the four high-coverage archaics as \textit{P3}, and the EPO ancestral sequence to polarize ancestral states. For each archaic(\textit{P3}), the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the \textit{D+} genomic background distribution used to compute the \textit{P-value} are also reported. \textit{P-values} were computed by building a \textit{Z}-distribution of \textit{D+} values from the non-overlapping 742kb windows of comparable effective sequence length. A \textit{P-value} less than 0.05 is considered statistically significant and suggests that introgressed haplotype in MXL (\textit{P2}) shares more derived and ancestral alleles with the given archaic individual (\textit{P3}) than expected under a model of no gene flow.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/dplus_yri_mxl_arcs_742kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(3))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if abs(mat[i, j]) < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(3, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S42. \textit{D+} tests for introgression at the focal 742kb \textit{MUC19}.} \newline \textit{D+} results for the focal 742kb \textit{MUC19} region using the Yoruba in Ibadan, Nigeria population (YRI) as \textit{P1}, the introgressed haplotype in the focal MXL individual (i.e., NA19664) as \textit{P2}, the four high-coverage archaics as \textit{P3}, and the EPO ancestral sequence to polarize ancestral states. For each archaic(\textit{P3}), the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the \textit{D+} genomic background distribution used to compute the \textit{P-value} are also reported. \textit{P-values} were computed by building a \textit{Z}-distribution of \textit{D+} values from the non-overlapping 742kb windows of comparable effective sequence length. A \textit{P-value} less than 0.05 is considered statistically significant and suggests that introgressed haplotype in MXL (\textit{P2}) shares more derived and ancestral alleles with the given archaic individual (\textit{P3}) than expected under a model of no gene flow.

In [86]:
df.iloc[:, [0, 1, 2, 9, 10, 11, 12, 13, 14]]

Unnamed: 0,$P1$,$P2$,$P3$,Focal 742kb Region $\left( D+ \right)$,742kb Non-overlapping Windows $\left( \mu \right)$,742kb Non-overlapping Windows $\left( \sigma \right)$,742kb Non-overlapping Windows $\left( SEM \right)$,742kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,YRI,NA19664,Denisovan,0.376505,0.002145,0.072385,0.001291,0.119063,9.88884e-08
1,YRI,NA19664,Altai Nean.,0.090846,0.016794,0.08556,0.001526,0.140733,0.1441673
2,YRI,NA19664,Chagyrskaya Nean.,0.381192,0.017834,0.087986,0.001572,0.144725,7.374818e-06
3,YRI,NA19664,Vindija Nean.,0.382582,0.01855,0.088386,0.001575,0.145382,7.50523e-06


## S43

In [87]:
# Load the dataframe.
df = pd.read_csv('./dataframes/archaic_heterozygous_sites_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S43. The number of heterozygous sites in the focal 72kb \textit{MUC19} region among archaic individuals.} \newline The number of heterozygous observed in the focal 72kb \textit{MUC19} region for each of the high-coverage archaic individuals. For each archaic individual, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the heterozygous sites genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the number of heterozygous sites is greater than or equal to that observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/archaic_heterozygous_sites_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(1))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if abs(mat[i, j]) < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(1, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S43. The number of heterozygous sites in the focal 72kb \textit{MUC19} region among archaic individuals.} \newline The number of heterozygous observed in the focal 72kb \textit{MUC19} region for each of the high-coverage archaic individuals. For each archaic individual, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the heterozygous sites genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the number of heterozygous sites is greater than or equal to that observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant.

In [88]:
df

Unnamed: 0,Archaic,Focal 72kb Region (Het. Sites),Non-overlapping Windows $\left( \mu \right)$,Non-overlapping Windows $\left( \sigma \right)$,Non-overlapping Windows $\left( SEM \right)$,Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,Denisovan,6,8.746692,14.168528,0.081193,0.159141,0.454668
1,Altai Nean.,1,7.861556,15.261282,0.087445,0.171395,0.678825
2,Chagyrskaya Nean.,168,6.534416,13.241708,0.076011,0.148986,0.000231
3,Vindija Nean.,171,7.837157,13.597947,0.077908,0.152702,0.000328


## S44

In [89]:
# Load the dataframe.
df = pd.read_csv('./dataframes/afr_het_hom_heterozygous_sites_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S44. The average number of heterozygous sites in the focal 72kb \textit{MUC19} region among African individuals and individuals carrying a \textit{Denisovan-like} haplotype in the 1000 Genomes Project.} \newline The average number of heterozygous observed in the focal 72kb \textit{MUC19} region for each focal group in the 1000 Genomes Project: all African individuals ($n = 504$), individuals harboring one \textit{Denisovan-like} haplotype ($n = 255$), and individuals harboring two \textit{Denisovan-like} haplotypes ($n = 16$). For each group, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the average number of heterozygous sites genomic background distribution used to compute the \textit{P-value} are also reported. For the African individuals and individuals harboring one \textit{Denisovan-like} haplotype groups, the \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the average number of heterozygous sites is greater than or equal to that observed at the focal 72kb region. For the individuals harboring two \textit{Denisovan-like} haplotypes group, the \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the average number of heterozygous sites is less than or equal to that observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/afr_het_hom_heterozygous_sites_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(1))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if abs(mat[i, j]) < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(1, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S44. The average number of heterozygous sites in the focal 72kb \textit{MUC19} region among African individuals and individuals carrying a \textit{Denisovan-like} haplotype in the 1000 Genomes Project.} \newline The average number of heterozygous observed in the focal 72kb \textit{MUC19} region for each focal group in the 1000 Genomes Project: all African individuals ($n = 504$), individuals harboring one \textit{Denisovan-like} haplotype ($n = 255$), and individuals harboring two \textit{Denisovan-like} haplotypes ($n = 16$). For each group, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the average number of heterozygous sites genomic background distribution used to compute the \textit{P-value} are also reported. For the African individuals and individuals harboring one \textit{Denisovan-like} haplotype groups, the \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the average number of heterozygous sites is greater than or equal to that observed at the focal 72kb region. For the individuals harboring two \textit{Denisovan-like} haplotypes group, the \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the average number of heterozygous sites is less than or equal to that observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant.

In [90]:
df

Unnamed: 0,Group,Focal 72kb Region (Average Number of Het. Sites),Non-overlapping Windows $\left( \mu \right)$,Non-overlapping Windows $\left( \sigma \right)$,Non-overlapping Windows $\left( SEM \right)$,Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,African Inds.,75.315476,73.318893,25.707996,0.144443,0.283114,0.423606
1,Inds. with One $Denisovan-like$ Hap.,287.341176,54.639081,23.127764,0.129946,0.254698,3.2e-05
2,Inds. with Two $Denisovan-like$ Haps.,4.1875,52.579116,24.89072,0.139851,0.274113,0.000694


## S45

In [91]:
# Load the dataframe.
df = pd.read_csv('./dataframes/afr_archaic_heterozygosity_genome_wide.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S45. Genome-wide heterozygosity among archaic individuals and African individuals in the 1000 Genomes Project.} \newline The genome-wide heterozygosity (i.e., the number of heterozygous sites normalized by the effective sequence length) for each of the high coverage archaic individuals and the mean genome-wide heterozygosity among all African individuals in the 1000 Genomes Project.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/afr_archaic_heterozygosity_genome_wide.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(1))+r' & '\
        + r' & '.join(
            '{:.6f}'.format(round(mat[i, j], 6)).rstrip('0').rstrip('.')
        for j in range(1, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S45. Genome-wide heterozygosity among archaic individuals and African individuals in the 1000 Genomes Project.} \newline The genome-wide heterozygosity (i.e., the number of heterozygous sites normalized by the effective sequence length) for each of the high coverage archaic individuals and the mean genome-wide heterozygosity among all African individuals in the 1000 Genomes Project.

In [92]:
df

Unnamed: 0,Group,Genome-Wide Heterozygosity
0,AFR,0.000985
1,Denisovan,0.000191
2,Altai Nean.,0.000171
3,Chagyrskaya Nean.,0.000144
4,Vindija Nean.,0.00017


## S46

In [93]:
# Load the dataframe.
df = pd.read_csv('./dataframes/dplus_altai_neanderthal_p2_denisovan_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 2, 9, 10, 11, 12, 13, 14]]
# Intialize the columns.
cols = df.columns.values[[0, 1, 2, 9, 10, 11, 12, 13, 14]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S46. \textit{D+} tests for Denisovan introgression at the focal 72kb \textit{MUC19} region among the late Neanderthal individuals.} \newline \textit{D+} results for the focal 72kb \textit{MUC19} region using Altai Neanderthal as \textit{P1}, each of the late Neanderthals as \textit{P2}, the Denisovan as \textit{P3}, and the EPO ancestral sequence to polarize ancestral states. For each late Neanderthal (\textit{P2}), the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the \textit{D+} genomic background distribution used to compute the \textit{P-value} are also reported. \textit{P-values} were computed by building a \textit{Z}-distribution of \textit{D+} values from the non-overlapping 72kb windows of comparable effective sequence length. A \textit{P-value} less than 0.05 is considered statistically significant and suggests that the late Neanderthal (\textit{P2}) shares more derived and ancestral alleles with the Denisovan (\textit{P3}) than expected under a model of no gene flow.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/dplus_altai_neanderthal_p2_denisovan_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(3))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if abs(mat[i, j]) < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(3, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S46. \textit{D+} tests for Denisovan introgression at the focal 72kb \textit{MUC19} region among the late Neanderthal individuals.} \newline \textit{D+} results for the focal 72kb \textit{MUC19} region using Altai Neanderthal as \textit{P1}, each of the late Neanderthals as \textit{P2}, the Denisovan as \textit{P3}, and the EPO ancestral sequence to polarize ancestral states. For each late Neanderthal (\textit{P2}), the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the \textit{D+} genomic background distribution used to compute the \textit{P-value} are also reported. \textit{P-values} were computed by building a \textit{Z}-distribution of \textit{D+} values from the non-overlapping 72kb windows of comparable effective sequence length. A \textit{P-value} less than 0.05 is considered statistically significant and suggests that the late Neanderthal (\textit{P2}) shares more derived and ancestral alleles with the Denisovan (\textit{P3}) than expected under a model of no gene flow.

In [94]:
df.iloc[:, [0, 1, 2, 9, 10, 11, 12, 13, 14]]

Unnamed: 0,$P1$,$P2$,$P3$,Focal 72kb Region $\left( D+ \right)$,72kb Non-overlapping Windows $\left( \mu \right)$,72kb Non-overlapping Windows $\left( \sigma \right)$,72kb Non-overlapping Windows $\left( SEM \right)$,72kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,Altai Nean.,Chagyrskaya Nean.,Denisovan,0.783439,-0.113348,0.414336,0.002454,0.681522,0.029323
1,Altai Nean.,Vindija Nean.,Denisovan,0.81875,-0.169137,0.391438,0.002314,0.643858,0.018235


## S47

In [95]:
# Load the dataframe.
df = pd.read_csv('./dataframes/phased_late_neanderthals_archaic_pairwise_diffs_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S47. The number of pairwise differences at the focal 72kb \textit{MUC19} region among the Denisovan, Altai Neanderthal, and the phased late Neanderthal individuals.} \newline The number of pairwise differences observed at the focal 72kb \textit{MUC19} region between all pairwise possibilities of the unphased Denisovan genotypes, unphased Altai Neanderthal genotypes, phased Chagyrskaya Neanderthal haplotypes, and phased Vindija Neanderthal haplotypes.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/phased_late_neanderthals_archaic_pairwise_diffs_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join(
            '{:.6f}'.format(round(mat[i, j], 1)).rstrip('0').rstrip('.')
        for j in range(2, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S47. The number of pairwise differences at the focal 72kb \textit{MUC19} region among the Denisovan, Altai Neanderthal, and the phased late Neanderthal individuals.} \newline The number of pairwise differences observed at the focal 72kb \textit{MUC19} region between all pairwise possibilities of the unphased Denisovan genotypes, unphased Altai Neanderthal genotypes, phased Chagyrskaya Neanderthal haplotypes, and phased Vindija Neanderthal haplotypes.

In [96]:
df

Unnamed: 0,Archaic 1,Archaic 2,Focal 72kb Region (Pairwise Diffs.)
0,Denisovan,Altai Nean.,173.5
1,Denisovan,Chagyrskaya Nean. Hap. 1,164.0
2,Denisovan,Chagyrskaya Nean. Hap. 2,43.0
3,Denisovan,Vindija Nean. Hap. 1,170.0
4,Denisovan,Vindija Nean. Hap. 2,41.0
5,Altai Nean.,Chagyrskaya Nean. Hap. 1,3.5
6,Altai Nean.,Chagyrskaya Nean. Hap. 2,159.5
7,Altai Nean.,Vindija Nean. Hap. 1,4.0
8,Altai Nean.,Vindija Nean. Hap. 2,161.0
9,Chagyrskaya Nean. Hap. 1,Chagyrskaya Nean. Hap. 2,165.0


## S48

In [97]:
# Load the dataframe.
df = pd.read_csv('./dataframes/focal_mxl_haplotype_divergence_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]]
# Intialize the columns.
cols = df.columns.values[[3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={p{2cm}XXXXp{2cm}p{2cm}p{2cm}p{2cm}p{1cm}p{1cm}X},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S48. Sequence divergence between the \textit{Denisovan-like} haplotype in MXL and phased late Neanderthal haplotypes at the focal 72kb \textit{MUC19} region.} \newline The observed sequence divergence---i.e., the number of pairwise differences between a modern human haplotype and a late Neanderthal haplotype normalized by the effective sequence length---between the \textit{Denisovan-like} haplotype in the focal MXL individual (i.e., NA19664), who harbors two \textit{Denisovan-like} haplotypes and has no heterozygous sites at the focal 72kb region, and the phased haplotypes for both of the late Neanderthals. To assess the significance, we built a distribution of sequence divergence between each of the modern human's two chromosomes and the late Neanderthal's pseudo-haplotype, which was generated by randomly sampling one allele at each position from the unphased late Neanderthal's genotypes. For each late Neanderthal, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the pseudo-haplotype divergence genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the sequence divergence is less than or equal to that observed at the focal 72kb region. After correcting for four multiple comparisons---i.e., two per each modern human haplotype---a \textit{P-value} less than 0.0125 is considered statistically significant. Note that the \textit{Denisovan-like} in each of the late Neanderthal's corresponds to their second haplotype (i.e., Chagyrskaya Nean. Hap. 2 and Vindija Nean. Hap. 2). Also, note that at the phased 72kb region, the effective sequence length with respect to the NA19664 individual is 48119bp and 48444bp for the  Chagyrskaya and Vindija Neanderthal, respectively.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/focal_mxl_haplotype_divergence_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(1))+r' & '\
        + r' & '.join('{:.6f}'.format(round(mat[i, j], 6)).rstrip('0').rstrip('.') for j in range(1, 5))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if abs(mat[i, j]) < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(5, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S48. Sequence divergence between the \textit{Denisovan-like} haplotype in MXL and phased late Neanderthal haplotypes at the focal 72kb \textit{MUC19} region.} \newline The observed sequence divergence---i.e., the number of pairwise differences between a modern human haplotype and a late Neanderthal haplotype normalized by the effective sequence length---between the \textit{Denisovan-like} haplotype in the focal MXL individual (i.e., NA19664), who harbors two \textit{Denisovan-like} haplotypes and has no heterozygous sites at the focal 72kb region, and the phased haplotypes for both of the late Neanderthals. To assess the significance, we built a distribution of sequence divergence between each of the modern human's two chromosomes and the late Neanderthal's pseudo-haplotype, which was generated by randomly sampling one allele at each position from the unphased late Neanderthal's genotypes. For each late Neanderthal, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the pseudo-haplotype divergence genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the sequence divergence is less than or equal to that observed at the focal 72kb region. After correcting for four multiple comparisons---i.e., two per each modern human haplotype---a \textit{P-value} less than 0.0125 is considered statistically significant. Note that the \textit{Denisovan-like} in each of the late Neanderthal's corresponds to their second haplotype (i.e., Chagyrskaya Nean. Hap. 2 and Vindija Nean. Hap. 2). Also, note that at the phased 72kb region, the effective sequence length with respect to the NA19664 individual is 48119bp and 48444bp for the  Chagyrskaya and Vindija Neanderthal, respectively.

In [98]:
df

Unnamed: 0,Individual,Super Population,Population,Archaic Hap.,Focal 72kb Region (Pairwise Diffs. Hap. 1),Focal 72kb Region (Pairwise Diffs. Hap. 2),Focal 72kb Region (Seq. Div. Hap. 1),Focal 72kb Region (Seq. Div. Hap. 2),72kb Non-overlapping Windows $\left( \mu \right)$,72kb Non-overlapping Windows $\left( \sigma \right)$,72kb Non-overlapping Windows $\left( SEM \right)$,72kb Nonoverlapping Windows $\left( \pm CI_{{95\%}} \right)$,$P-value$ (Hap. 1),$P-value$ (Hap. 2)
0,NA19664,AMR,MXL,Chagyrskaya Nean. Hap. 1,168.0,168.0,0.003491,0.003491,0.001371,0.000608,2e-06,5e-06,0.993163,0.993163
1,NA19664,AMR,MXL,Chagyrskaya Nean. Hap. 2,5.0,5.0,0.000104,0.000104,0.001371,0.000608,2e-06,5e-06,0.002636,0.002636
2,NA19664,AMR,MXL,Vindija Nean. Hap. 1,169.0,169.0,0.003489,0.003489,0.001381,0.000605,2e-06,5e-06,0.99319,0.99319
3,NA19664,AMR,MXL,Vindija Nean. Hap. 2,4.0,4.0,8.3e-05,8.3e-05,0.001381,0.000605,2e-06,5e-06,0.001723,0.001723


## S49

In [99]:
# Load the dataframe.
df = pd.read_csv('./dataframes/pseudo_ancestry_painting_scores_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S49. Pseudo-Ancestry Painting scores for the focal 72kb \textit{MUC19} region.} \newline The Pseudo-Ancestry Painting (\textit{PAP}) scores observed in the focal 72kb \textit{MUC19} region. The \textit{PAP} score quantifies the number of heterozygous sites in the target individual (i.e., \textit{Target}) that can be explained by the two source individuals (i.e., $Source^{1}$ and $Source^{2}$) being fixed different allelic states and then normalized by the total number of heterozygous sites in the target individual. \textit{PAP} scores were first computed for the late Neanderthals as target individuals using the Denisovan and Altai Neanderthal as sources as well as a pairing of an MXL individual (i.e., NA19664) who is homozygous for the \textit{Denisovan-like} haplotype at the 72kb region and a YRI individual (i.e., NA19190) who is homozygous for the \textit{Human-like} haplotype. Additionally, as a negative control, we computed additional \textit{PAP} configurations where the Denisovan and Altai Neanderthal are the target individuals and the focal MXL (i.e., NA19664) and YRI (i.e., NA19190) individuals are sources. For each \textit{PAP} configuration, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the \textit{PAP} score genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the \textit{PAP} score is greater than or equal to that observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/pseudo_ancestry_painting_scores_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(3))+r' & '\
        + r' & '.join(
            r'$>0.999967$' if mat[i, j] == 1 else
            '0' if mat[i, j] == 0 else
            (convert_sci_notation_to_latex(mat[i, j]) if abs(mat[i, j]) < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(3, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S49. Pseudo-Ancestry Painting scores for the focal 72kb \textit{MUC19} region.} \newline The Pseudo-Ancestry Painting (\textit{PAP}) scores observed in the focal 72kb \textit{MUC19} region. The \textit{PAP} score quantifies the number of heterozygous sites in the target individual (i.e., \textit{Target}) that can be explained by the two source individuals (i.e., $Source^{1}$ and $Source^{2}$) being fixed different allelic states and then normalized by the total number of heterozygous sites in the target individual. \textit{PAP} scores were first computed for the late Neanderthals as target individuals using the Denisovan and Altai Neanderthal as sources as well as a pairing of an MXL individual (i.e., NA19664) who is homozygous for the \textit{Denisovan-like} haplotype at the 72kb region and a YRI individual (i.e., NA19190) who is homozygous for the \textit{Human-like} haplotype. Additionally, as a negative control, we computed additional \textit{PAP} configurations where the Denisovan and Altai Neanderthal are the target individuals and the focal MXL (i.e., NA19664) and YRI (i.e., NA19190) individuals are sources. For each \textit{PAP} configuration, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the \textit{PAP} score genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the \textit{PAP} score is greater than or equal to that observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant.

In [100]:
df

Unnamed: 0,$Source^{1}$,$Target$,$Source^{2}$,Focal 72kb Region $\left( PAP \text{ Score} \right)$,72kb Non-overlapping Windows $\left( \mu \right)$,72kb Non-overlapping Windows $\left( \sigma \right)$,72kb Non-overlapping Windows $\left( SEM \right)$,72kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,Denisovan,Chagyrskaya Nean.,Altai Nean.,0.791667,0.104511,0.173347,0.001255,0.00246,0.005398
1,Denisovan,Vindija Nean.,Altai Nean.,0.80117,0.084911,0.153893,0.001011,0.001982,0.00298
2,MXL (NA19664),Chagyrskaya Nean.,YRI (NA19190),0.94012,0.007686,0.046153,0.000335,0.000656,0.000368
3,MXL (NA19664),Vindija Nean.,YRI (NA19190),0.929412,0.006178,0.038118,0.000251,0.000492,8.7e-05
4,MXL (NA19664),Denisovan,YRI (NA19190),0.0,0.004781,0.031838,0.0002,0.000392,1.0
5,MXL (NA19664),Altai Nean.,YRI (NA19190),0.0,0.008424,0.043898,0.000311,0.00061,1.0


## S50

In [101]:
# Load the dataframe.
df = pd.read_csv('./dataframes/sprime_mxl_chb_ceu_pbs_per_region_742kb_and_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S50. $\boldsymbol{PBS_{MXL:CHB:CEU}}$ values for the focal 742kb and 72kb \textit{MUC19} using SPrime sites.} \newline The $PBS_{MXL:CHB:CEU}$ values observed in the focal 742kb and 72kb region for putatively introgressed sites in MXL that is a match with either the Altai Denisovan or the Altai Neanderthal as defined by the SPrime introgression map. For each focal region, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the $PBS_{MXL:CHB:CEU}$  genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping windows of comparable effective sequence length where the $PBS_{MXL:CHB:CEU}$ value, when only considering putatively introgressed SPrime sites, is greater than or equal to that observed at the focal respective focal region. A \textit{P-value} less than 0.05 is considered statistically significant.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/sprime_mxl_chb_ceu_pbs_per_region_742kb_and_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(1))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if mat[i, j] < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(1, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S50. $\boldsymbol{PBS_{MXL:CHB:CEU}}$ values for the focal 742kb and 72kb \textit{MUC19} using SPrime sites.} \newline The $PBS_{MXL:CHB:CEU}$ values observed in the focal 742kb and 72kb region for putatively introgressed sites in MXL that is a match with either the Altai Denisovan or the Altai Neanderthal as defined by the SPrime introgression map. For each focal region, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the $PBS_{MXL:CHB:CEU}$  genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping windows of comparable effective sequence length where the $PBS_{MXL:CHB:CEU}$ value, when only considering putatively introgressed SPrime sites, is greater than or equal to that observed at the focal respective focal region. A \textit{P-value} less than 0.05 is considered statistically significant.

In [102]:
df

Unnamed: 0,Focal Region,$PBS_{MXL:CHB:CEU}$,Non-overlapping Windows $\left( \mu \right)$,Non-overlapping Windows $\left( \sigma \right)$,Non-overlapping Windows $\left( SEM \right)$,Non-overlapping Windows $\left( \pm CI_{{95\%}} \right)$,$P-value$
0,742kb,0.291486,0.009563,0.027237,0.000769,0.001508,0.000796
1,72kb,0.279663,0.010036,0.030212,0.000416,0.000815,0.002083


## S51

In [103]:
# Load the dataframe.
df = pd.read_csv('./dataframes/amr_asn_eur_pbs_per_region_742kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S51. $\boldsymbol{PBS_{AMR:EAS/SAS:EUR}}$ values for the focal 742kb \textit{MUC19} region.} \newline The $PBS_{A:B:C}$ values observed in the focal 742kb region for all unique combinations of: AMR populations as the target population (\textit{A}), EAS and SAS populations as one of the control populations (\textit{B}), and EUR populations as the second control population (\textit{C}). For each unique $PBS_{A:B:C}$ configuration, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the $PBS_{A:B:C}$  genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 742kb windows of comparable effective sequence length where the $PBS_{A:B:C}$ value is greater than or equal to that observed at the focal 742kb region. A \textit{P-value} less than 0.05 is considered statistically significant. Note that when the target population is MXL, $PBS_{MXL:B:C}$ is always significant, and is never significant any other AMR population is the target population.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/amr_asn_eur_pbs_per_region_742kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(3))+r' & '\
        + r' & '.join(
            ('0' if mat[i, j] == 0 else
            convert_sci_notation_to_latex(mat[i, j]) if mat[i, j] < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(3, 8))+r' & '\
        + (
            r'$>0.99969$' if mat[i, 8] == 1 else
            convert_sci_notation_to_latex(float(mat[i, 8])) if float(mat[i, 8]) < 0.001 else
            '{:.3f}'.format(round(float(mat[i, 8]), 3)).rstrip('0').rstrip('.')
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S51. $\boldsymbol{PBS_{AMR:EAS/SAS:EUR}}$ values for the focal 742kb \textit{MUC19} region.} \newline The $PBS_{A:B:C}$ values observed in the focal 742kb region for all unique combinations of: AMR populations as the target population (\textit{A}), EAS and SAS populations as one of the control populations (\textit{B}), and EUR populations as the second control population (\textit{C}). For each unique $PBS_{A:B:C}$ configuration, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the $PBS_{A:B:C}$  genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 742kb windows of comparable effective sequence length where the $PBS_{A:B:C}$ value is greater than or equal to that observed at the focal 742kb region. A \textit{P-value} less than 0.05 is considered statistically significant. Note that when the target population is MXL, $PBS_{MXL:B:C}$ is always significant, and is never significant any other AMR population is the target population.

In [104]:
df

Unnamed: 0,$A$ Pop.,$B$ Pop.,$C$ Pop.,Focal 742kb Region $\left( PBS_{A:B:C} \right)$,742kb Non-overlapping Windows $\left( \mu \right)$,742kb Non-overlapping Windows $\left( \sigma \right)$,742kb Non-overlapping Windows $\left( SEM \right)$,742kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,CLM,BEB,CEU,0.00601,0.004268,0.006232,0.000109,0.000214,0.257562
1,CLM,BEB,FIN,0.0,0.005621,0.007057,0.000123,0.000242,1.0
2,CLM,BEB,GBR,0.007634,0.004288,0.006229,0.000109,0.000214,0.201344
3,CLM,BEB,IBS,0.002293,0.00376,0.005823,0.000102,0.0002,0.425298
4,CLM,BEB,TSI,0.005953,0.004534,0.006249,0.000109,0.000214,0.289337
5,CLM,CDX,CEU,0.0,0.001491,0.004624,8.1e-05,0.000159,1.0
6,CLM,CDX,FIN,0.0,0.003462,0.007061,0.000123,0.000242,1.0
7,CLM,CDX,GBR,0.0,0.001566,0.004953,8.7e-05,0.00017,1.0
8,CLM,CDX,IBS,0.0,0.001345,0.00435,7.6e-05,0.000149,1.0
9,CLM,CDX,TSI,0.0,0.001596,0.004769,8.3e-05,0.000163,1.0


## S52

In [105]:
# Load the dataframe.
df = pd.read_csv('./dataframes/amr_asn_eur_pbs_per_region_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S52. $\boldsymbol{PBS_{AMR:EAS/SAS:EUR}}$ values for the focal 72kb \textit{MUC19} region.} \newline The $PBS_{A:B:C}$ values observed in the focal 72kb region for all unique combinations of: AMR populations as the target population (\textit{A}), EAS and SAS populations as one of the control populations (\textit{B}), and EUR populations as the second control population (\textit{C}). For each unique $PBS_{A:B:C}$ configuration, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the $PBS_{A:B:C}$  genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the $PBS_{A:B:C}$ value is greater than or equal to that observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant. Note that when the target population is MXL, $PBS_{MXL:B:C}$ is always significant, and is never significant any other AMR population is the target population.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/amr_asn_eur_pbs_per_region_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(3))+r' & '\
        + r' & '.join(
            ('0' if mat[i, j] == 0 else
            convert_sci_notation_to_latex(mat[i, j]) if mat[i, j] < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(3, 8))+r' & '\
        + (
            r'$>0.999968$' if mat[i, 8] == 1 else
            convert_sci_notation_to_latex(float(mat[i, 8])) if float(mat[i, 8]) < 0.001 else
            '{:.3f}'.format(round(float(mat[i, 8]), 3)).rstrip('0').rstrip('.')
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S52. $\boldsymbol{PBS_{AMR:EAS/SAS:EUR}}$ values for the focal 72kb \textit{MUC19} region.} \newline The $PBS_{A:B:C}$ values observed in the focal 72kb region for all unique combinations of: AMR populations as the target population (\textit{A}), EAS and SAS populations as one of the control populations (\textit{B}), and EUR populations as the second control population (\textit{C}). For each unique $PBS_{A:B:C}$ configuration, the mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the $PBS_{A:B:C}$  genomic background distribution used to compute the \textit{P-value} are also reported. The \textit{P-value} represents the proportion of non-overlapping 72kb windows of comparable effective sequence length where the $PBS_{A:B:C}$ value is greater than or equal to that observed at the focal 72kb region. A \textit{P-value} less than 0.05 is considered statistically significant. Note that when the target population is MXL, $PBS_{MXL:B:C}$ is always significant, and is never significant any other AMR population is the target population.

In [106]:
df

Unnamed: 0,$A$ Pop.,$B$ Pop.,$C$ Pop.,Focal 72kb Region $\left( PBS_{A:B:C} \right)$,72kb Non-overlapping Windows $\left( \mu \right)$,72kb Non-overlapping Windows $\left( \sigma \right)$,72kb Non-overlapping Windows $\left( SEM \right)$,72kb Non-overlapping Windows $\left( \pm CI_{95\%} \right)$,$P-value$
0,CLM,BEB,CEU,0.0,0.0062,0.010694,6e-05,0.000118,1.0
1,CLM,BEB,FIN,0.0,0.007522,0.011968,6.7e-05,0.000132,1.0
2,CLM,BEB,GBR,0.0,0.006258,0.010736,6e-05,0.000118,1.0
3,CLM,BEB,IBS,0.0,0.005641,0.010115,5.7e-05,0.000111,1.0
4,CLM,BEB,TSI,0.0,0.00643,0.010848,6.1e-05,0.000119,1.0
5,CLM,CDX,CEU,0.0,0.004148,0.010036,5.6e-05,0.000111,1.0
6,CLM,CDX,FIN,0.0,0.006781,0.013645,7.7e-05,0.00015,1.0
7,CLM,CDX,GBR,0.0,0.004312,0.010417,5.9e-05,0.000115,1.0
8,CLM,CDX,IBS,0.0,0.003848,0.00949,5.3e-05,0.000105,1.0
9,CLM,CDX,TSI,0.0,0.00421,0.010086,5.7e-05,0.000111,1.0


## S53

In [107]:
# Load the dataframe.
df = pd.read_csv('./dataframes/pbs_empirical_significance_summary.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S53. Summary of per-SNP $\boldsymbol{PBS_{MXL:CHB:CEU}}$ values for the focal 742kb \textit{MUC19} region.} \newline The total number of SNPs, the number of SNPs with a $PBS_{MXL:CHB:CEU}$ value greater than the $99.95^{th}$ genome-wide percentile, and those identified as significant based on different statistical criteria: uncorrected \textit{P-values} less than 0.05, Bonferroni adjusted \textit{P-values} less than 0.05, and Benjamini-Hochberg adjusted \textit{P-values} less than 0.01. We report the results for all SNPs, as well as stratified by SNP set (see Methods for a description of these SNP sets).},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/pbs_empirical_significance_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(mat.shape[1]))
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S53. Summary of per-SNP $\boldsymbol{PBS_{MXL:CHB:CEU}}$ values for the focal 742kb \textit{MUC19} region.} \newline The total number of SNPs, the number of SNPs with a $PBS_{MXL:CHB:CEU}$ value greater than the $99.95^{th}$ genome-wide percentile, and those identified as significant based on different statistical criteria: uncorrected \textit{P-values} less than 0.05, Bonferroni adjusted \textit{P-values} less than 0.05, and Benjamini-Hochberg adjusted \textit{P-values} less than 0.01. We report the results for all SNPs, as well as stratified by SNP set (see Methods for a description of these SNP sets).

In [108]:
df

Unnamed: 0,SNP Set,Total SNPs,$PBS > 99.95^{th}$ Genome-Wide Percentile,Uncorrected $P-values < 0.05$,Bonferroni Adjusted $P-values < 0.05$,Benjamini-Hochberg Adjusted $P-values < 0.01$
0,All SNPs,6144,417,1111,0,485
1,Archaic SNPs,217,208,217,0,217
2,Denisovan-specific SNPs,135,135,135,0,135
3,Neanderthal-specific SNPs,80,72,80,0,80
4,Human-specific SNPs,5420,141,677,0,200
5,Shared Hominin SNPs,507,68,217,0,68


## S54

In [109]:
# Load the dataframe.
df = pd.read_csv('./dataframes/pbs_empirical_outlier_summary.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 2, 5, 8, 11, 14]]
# Intialize the columns.
cols = df.columns.values[[0, 1, 2, 5, 8, 11, 14]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S54. Summary of per-SNP $\boldsymbol{PBS_{MXL:CHB:CEU}}$ outliers for the focal 742kb \textit{MUC19} region.} \newline The total number of SNPs, the number of outlier SNPs with a $PBS_{MXL:CHB}$ value greater than the $99.95^{th}$ genome-wide percentile, the mean $PBS_{MXL:CHB}$ value among these outliers, and the associated 95\% confidence intervals ($\pm CI_{95\%}$) for the mean $PBS_{MXL:CHB}$ value. Additionally, the mean percentile rank of outlier SNPs and the associated 95\% confidence intervals ($\pm CI_{95\%}$) for the mean percentile rank is reported. We report the results for all SNPs, as well as stratified by SNP set (see Methods for a description of these SNP sets).},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/pbs_empirical_outlier_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(1))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if abs(mat[i, j]) < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(1, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S54. Summary of per-SNP $\boldsymbol{PBS_{MXL:CHB:CEU}}$ outliers for the focal 742kb \textit{MUC19} region.} \newline The total number of SNPs, the number of outlier SNPs with a $PBS_{MXL:CHB}$ value greater than the $99.95^{th}$ genome-wide percentile, the mean $PBS_{MXL:CHB}$ value among these outliers, and the associated 95\% confidence intervals ($\pm CI_{95\%}$) for the mean $PBS_{MXL:CHB}$ value. Additionally, the mean percentile rank of outlier SNPs and the associated 95\% confidence intervals ($\pm CI_{95\%}$) for the mean percentile rank is reported. We report the results for all SNPs, as well as stratified by SNP set (see Methods for a description of these SNP sets).

In [110]:
df.iloc[:, [0, 1, 2, 5, 8, 11, 14]]

Unnamed: 0,SNP Set,Total SNPs,$PBS > 99.95^{th}$ Genome-Wide Percentile,$PBS \; \left( \mu \right)$,$PBS \; \left( \pm CI_{95\%} \right)$,Percentile Rank $\left( \mu \right)$,Percentile Rank $\left( \pm CI_{95\%} \right)$
0,All SNPs,6144,417,0.301969,0.003209,99.975275,0.001031
1,Archaic SNPs,217,208,0.293409,0.003612,99.972262,0.001309
2,Denisovan-specific SNPs,135,135,0.279338,0.000319,99.966727,0.000229
3,Neanderthal-specific SNPs,80,72,0.320095,0.007143,99.982795,0.002248
4,Human-specific SNPs,5420,141,0.313188,0.006865,99.978522,0.001937
5,Shared Hominin SNPs,507,68,0.30489,0.006364,99.977758,0.002286


## S55

In [111]:
# Load the dataframe.
df = pd.read_csv('./dataframes/pbs_empirical_bhp_significance_summary.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 2, 5, 8, 11, 14]]
# Intialize the columns.
cols = df.columns.values[[0, 1, 2, 5, 8, 11, 14]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S55. Summary of significant per-SNP $\boldsymbol{PBS_{MXL:CHB:CEU}}$ values for the focal 742kb \textit{MUC19} region.} \newline The total number of SNPs, the number of significant SNPs (i.e., SNPs with a Benjamini-Hochberg adjusted \textit{P-values} less than 0.01), the mean $PBS_{MXL:CHB}$ value among these significant SNPs, and the associated 95\% confidence intervals ($\pm CI_{95\%}$) for the mean $PBS_{MXL:CHB}$ value. Additionally, the mean percentile rank of significant SNPs and the associated 95\% confidence intervals ($\pm CI_{95\%}$) for the mean percentile rank is reported. We report the results for all SNPs, as well as stratified by SNP set (see Methods for a description of these SNP sets).},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/pbs_empirical_bhp_significance_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(1))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if abs(mat[i, j]) < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(1, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S55. Summary of significant per-SNP $\boldsymbol{PBS_{MXL:CHB:CEU}}$ values for the focal 742kb \textit{MUC19} region.} \newline The total number of SNPs, the number of significant SNPs (i.e., SNPs with a Benjamini-Hochberg adjusted \textit{P-values} less than 0.01), the mean $PBS_{MXL:CHB}$ value among these significant SNPs, and the associated 95\% confidence intervals ($\pm CI_{95\%}$) for the mean $PBS_{MXL:CHB}$ value. Additionally, the mean percentile rank of significant SNPs and the associated 95\% confidence intervals ($\pm CI_{95\%}$) for the mean percentile rank is reported. We report the results for all SNPs, as well as stratified by SNP set (see Methods for a description of these SNP sets).

In [112]:
df.iloc[:, [0, 1, 2, 5, 8, 11, 14]]

Unnamed: 0,SNP Set,Total SNPs,Benjamini-Hochberg Adjusted $P-values < 0.01$,$PBS \; \left( \mu \right)$,$PBS \; \left( \pm CI_{95\%} \right)$,Percentile Rank $\left( \mu \right)$,Percentile Rank $\left( \pm CI_{95\%} \right)$
0,All SNPs,6144,485,0.293157,0.003382,99.968599,0.001737
1,Archaic SNPs,217,217,0.291069,0.003776,99.970305,0.00178
2,Denisovan-specific SNPs,135,135,0.279338,0.000319,99.966727,0.000229
3,Neanderthal-specific SNPs,80,80,0.311784,0.008503,99.977022,0.004372
4,Human-specific SNPs,5420,200,0.291433,0.00675,99.963633,0.003535
5,Shared Hominin SNPs,507,68,0.30489,0.006364,99.977758,0.002286


## S56

In [113]:
# Load the dataframe.
df = pd.read_csv('../meta_data/slim_demographic_info.txt', sep='\t')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S56. Demographic model from Medina-Mu\~{n}oz et al., 2023 with archaic introgression.} \newline Demographic parameters used for \textit{PBS} simulations.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/slim_demographic_info.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(1))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if abs(mat[i, j]) < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(1, 2))+r' & '\
        + r' & '.join(f'{mat[i, j]}' for j in range(2, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S56. Demographic model from Medina-Mu\~{n}oz et al., 2023 with archaic introgression.} \newline Demographic parameters used for \textit{PBS} simulations.

In [114]:
df

Unnamed: 0,Parameter,Value,Description,Reference
0,Population size,13580.0,Ancestral Pop. size,"Medina-Mu\~{n}oz et al., 2023"
1,Population size,13249.0,Archaic Pop. size,"Jacobs et al., 2019; Malaspinas et al., 2016"
2,Population size,27142.0,AMH Pop. size,"Medina-Mu\~{n}oz et al., 2023"
3,Population size,826.0,NEA Pop. size,"Jacobs et al., 2019; Malaspinas et al., 2016"
4,Population size,5083.0,DEN Pop. size,"Jacobs et al., 2019; Malaspinas et al., 2016"
5,Population size,1835.0,OOA Pop. size,"Medina-Mu\~{n}oz et al., 2023"
6,Population size,27142.0,AFR Pop. size,"Medina-Mu\~{n}oz et al., 2023"
7,Population size,2761.0,EUR Pop. size,"Medina-Mu\~{n}oz et al., 2023"
8,Population size,1955.0,EAS Pop. size,"Medina-Mu\~{n}oz et al., 2023"
9,Population size,1313.0,MXB Pop. size,"Medina-Mu\~{n}oz et al., 2023"


## S57

In [115]:
# Load the dataframe.
df = pd.read_csv('./dataframes/slim_per_snp_geq_freq_summary.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S57. Summary of SNPs segregating at high frequency in MXL across 10,000 simulated replicates.} \newline The total number of SNPs, the number of SNPs segregating a frequency greater than or equal to 30\%, and the associated \textit{P-value} based on 10,000 replicates per simulation model. The \textit{P-value} represents the proportion of SNPs across all 10,000 simulations that are segregating at a frequency greater than or equal to 30\%, where a \textit{P-value} less than 0.05 is considered statistically significant. We report the results for all SNPs, as well as archaic SNPs.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/slim_per_snp_geq_freq_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(2))+r' & '\
        + r' & '.join(
            (convert_sci_notation_to_latex(mat[i, j]) if abs(mat[i, j]) < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(2, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S57. Summary of SNPs segregating at high frequency in MXL across 10,000 simulated replicates.} \newline The total number of SNPs, the number of SNPs segregating a frequency greater than or equal to 30\%, and the associated \textit{P-value} based on 10,000 replicates per simulation model. The \textit{P-value} represents the proportion of SNPs across all 10,000 simulations that are segregating at a frequency greater than or equal to 30\%, where a \textit{P-value} less than 0.05 is considered statistically significant. We report the results for all SNPs, as well as archaic SNPs.

In [116]:
df

Unnamed: 0,Selection Model,SNP Set,Total SNPs,MXL SNPs $\geq 30\%$,$P-value$
0,Neutral,All SNPs,55296617,7572328,0.13694
1,Neutral,Archaic SNPs,55296617,47581,0.00086
2,Negative,All SNPs,53988190,6910329,0.127997
3,Negative,Archaic SNPs,53988190,65682,0.001217


## S58

In [117]:
# Load the dataframe.
df = pd.read_csv('./dataframes/slim_per_snp_pbs_significance_summary.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S58. Summary of significant per-SNP $\boldsymbol{PBS_{MXL:CHB:CEU}}$ values across 10,000 simulated replicates for all 417 outlier SNPs.} \newline The number of  $PBS_{MXL:CHB:CEU}$ values among the 417 SNPs with a per-SNP \textit{PBS} value greater than the empirical $99.95^{th}$ genome-wide percentile, and those identified as significant based on 10,000 replicate simulations per model and different statistical criteria: uncorrected \textit{P-values} less than 0.05, Bonferroni adjusted \textit{P-values} less than 0.05, and Benjamini-Hochberg adjusted \textit{P-values} less than 0.01. We report the results for all SNPs, as well as archaic SNPs.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/slim_per_snp_pbs_significance_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(mat.shape[1]))
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S58. Summary of significant per-SNP $\boldsymbol{PBS_{MXL:CHB:CEU}}$ values across 10,000 simulated replicates for all 417 outlier SNPs.} \newline The number of  $PBS_{MXL:CHB:CEU}$ values among the 417 SNPs with a per-SNP \textit{PBS} value greater than the empirical $99.95^{th}$ genome-wide percentile, and those identified as significant based on 10,000 replicate simulations per model and different statistical criteria: uncorrected \textit{P-values} less than 0.05, Bonferroni adjusted \textit{P-values} less than 0.05, and Benjamini-Hochberg adjusted \textit{P-values} less than 0.01. We report the results for all SNPs, as well as archaic SNPs.

In [118]:
df

Unnamed: 0,Selection Model,SNP Set,Number of $PBS_{MXL:CHB:CEU}$ Values,Uncorrected $P-value < 0.05$,Bonferroni Adj. $P-value < 0.05$,Benjamini-Hochberg Adj. $P-value < 0.01$
0,Negative,All SNPs,417,417,184,417
1,Negative,Archaic SNPs,208,208,208,208
2,Neutral,All SNPs,417,417,97,417
3,Neutral,Archaic SNPs,208,208,208,208


## S59

In [119]:
# Load the dataframe.
df = pd.read_csv('./dataframes/slim_per_snp_unique_pbs_significance_summary.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S59. Summary of significant per-SNP $\boldsymbol{PBS_{MXL:CHB:CEU}}$ values across 10,000 simulated replicates for the 57 unique outlier SNPs.} \newline The number of unique $PBS_{MXL:CHB:CEU}$ values among the 57 unique per-SNP \textit{PBS} value greater than the empirical $99.95^{th}$ genome-wide percentile, and those identified as significant based on 10,000 replicate simulations per model and different statistical criteria: uncorrected \textit{P-values} less than 0.05, Bonferroni adjusted \textit{P-values} less than 0.05, and Benjamini-Hochberg adjusted \textit{P-values} less than 0.01. We report the results for all SNPs, as well as archaic SNPs.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/slim_per_snp_pbs_unique_significance_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(mat.shape[1]))
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S59. Summary of significant per-SNP $\boldsymbol{PBS_{MXL:CHB:CEU}}$ values across 10,000 simulated replicates for the 57 unique outlier SNPs.} \newline The number of unique $PBS_{MXL:CHB:CEU}$ values among the 57 unique per-SNP \textit{PBS} value greater than the empirical $99.95^{th}$ genome-wide percentile, and those identified as significant based on 10,000 replicate simulations per model and different statistical criteria: uncorrected \textit{P-values} less than 0.05, Bonferroni adjusted \textit{P-values} less than 0.05, and Benjamini-Hochberg adjusted \textit{P-values} less than 0.01. We report the results for all SNPs, as well as archaic SNPs.

In [120]:
df

Unnamed: 0,Selection Model,SNP Set,Unique $PBS_{MXL:CHB:CEU}$ Values,Uncorrected $P-value < 0.05$,Bonferroni Adj. $P-value < 0.05$,Benjamini-Hochberg Adj. $P-value < 0.01$
0,Negative,All SNPs,57,57,57,57
1,Negative,Archaic SNPs,20,20,20,20
2,Neutral,All SNPs,57,57,57,57
3,Neutral,Archaic SNPs,20,20,20,20


## S60

In [121]:
# Load the dataframe.
df = pd.read_csv('./dataframes/slim_per_region_geq_freq_summary.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 2, 3, 5, 6]]
# Intialize the columns.
cols = df.columns.values[[0, 1, 2, 3, 5, 6]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S60. Comparison of the number of SNPs segregating at high frequency in MXL for the observed focal regions and across 10,000 simulated replicates.} \newline The number of SNPs segregating a frequency greater than or equal to 30\% observed in MXL, along with the number of simulated replicates where the number SNPs segregating a frequency greater than or equal to 30\% is greater than or equal to what is observed in the empirical data. The \textit{P-value} represents the proportion of the 10,000 simulation replicates where the simulated value is greater than or equal to what is observed in the empirical data, where a \textit{P-value} less than 0.05 is considered statistically significant. We report the results for both the focal 742kb and 72kb \textit{MUC19} regions, for all SNPs, as well as archaic SNPs.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/slim_per_region_geq_freq_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(5))+r' & '\
        + r' & '.join(
            r'$<0.0001$' if mat[i, j][0] == '<' else
            (convert_sci_notation_to_latex(float(mat[i, j])) if abs(float(mat[i, j])) < 0.0001 else
            '{:.4f}'.format(round(float(mat[i, j]), 4)).rstrip('0').rstrip('.'))
            for j in range(5, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S60. Comparison of the number of SNPs segregating at high frequency in MXL for the observed focal regions and across 10,000 simulated replicates.} \newline The number of SNPs segregating a frequency greater than or equal to 30\% observed in MXL, along with the number of simulated replicates where the number SNPs segregating a frequency greater than or equal to 30\% is greater than or equal to what is observed in the empirical data. The \textit{P-value} represents the proportion of the 10,000 simulation replicates where the simulated value is greater than or equal to what is observed in the empirical data, where a \textit{P-value} less than 0.05 is considered statistically significant. We report the results for both the focal 742kb and 72kb \textit{MUC19} regions, for all SNPs, as well as archaic SNPs.

In [122]:
df

Unnamed: 0,Selection Model,SNP Set,Region,MXL AAF $\geq 30\%$ (Observed),Total Replicates,Simulated Replicates $\geq$ Observed,$P-value$
0,Neutral,All SNPs,742kb,1311,10000,0,<0.0001
1,Neutral,Archaic SNPs,742kb,208,10000,0,<0.0001
2,Neutral,All SNPs,72kb,300,10000,0,<0.0001
3,Neutral,Archaic SNPs,72kb,140,10000,0,<0.0001
4,Negative,All SNPs,742kb,1311,10000,0,<0.0001
5,Negative,Archaic SNPs,742kb,208,10000,0,<0.0001
6,Negative,All SNPs,72kb,300,10000,0,<0.0001
7,Negative,Archaic SNPs,72kb,140,10000,0,<0.0001


## S61

In [123]:
# Load the dataframe.
df = pd.read_csv('./dataframes/slim_per_region_pbs_summary.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 2, 3, 5, 6]]
# Intialize the columns.
cols = df.columns.values[[0, 1, 2, 3, 5, 6]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S61. Comparison of per-region \textit{PBS} values for the observed focal regions and across 10,000 simulated replicates.} \newline The per-region \textit{PBS} values observed in MXL, along with the number of simulated replicates where the per-region \textit{PBS} value is greater than or equal to what is observed in the empirical data. The \textit{P-value} represents the proportion of the 10,000 simulation replicates where the simulated value is greater than or equal to what is observed in the empirical data, where a \textit{P-value} less than 0.05 is considered statistically significant. We report the results for both the focal 742kb and 72kb \textit{MUC19} regions, for all SNPs, as well as archaic SNPs.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/slim_per_region_pbs_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(3))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(3, 5))+r' & '\
        + r' & '.join(
            r'$<0.0001$' if mat[i, j][0] == '<' else
            (convert_sci_notation_to_latex(float(mat[i, j])) if abs(float(mat[i, j])) < 0.0001 else
            '{:.4f}'.format(round(float(mat[i, j]), 4)).rstrip('0').rstrip('.'))
            for j in range(5, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S61. Comparison of per-region \textit{PBS} values for the observed focal regions and across 10,000 simulated replicates.} \newline The per-region \textit{PBS} values observed in MXL, along with the number of simulated replicates where the per-region \textit{PBS} value is greater than or equal to what is observed in the empirical data. The \textit{P-value} represents the proportion of the 10,000 simulation replicates where the simulated value is greater than or equal to what is observed in the empirical data, where a \textit{P-value} less than 0.05 is considered statistically significant. We report the results for both the focal 742kb and 72kb \textit{MUC19} regions, for all SNPs, as well as archaic SNPs.

In [124]:
df

Unnamed: 0,Selection Model,SNP Set,Region,$PBS$ (Observed),Total Replicates,Simulated Replicates $\geq$ Observed,$P-value$
0,Neutral,All SNPs,742kb,0.065842,10000,0,<0.0001
1,Neutral,Archaic SNPs,742kb,0.291188,10000,0,<0.0001
2,Neutral,All SNPs,72kb,0.127226,10000,3,0.0003
3,Neutral,Archaic SNPs,72kb,0.279597,10000,6,0.0006
4,Negative,All SNPs,742kb,0.065842,10000,0,<0.0001
5,Negative,Archaic SNPs,742kb,0.291188,10000,0,<0.0001
6,Negative,All SNPs,72kb,0.127226,10000,1,0.0001
7,Negative,Archaic SNPs,72kb,0.279597,10000,1,0.0001


## S62

In [125]:
# Load the dataframe.
df = pd.read_csv('./dataframes/slim_pbs_power_summary.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S62. Power to detect positive selection in MXL using per-region \textit{PBS} values.} \newline The power to detect positive selection for different strengths of positive selection: strong ($s = 0.1$), weak ($s = 0.01$), and extremely weak ($s = 0.0015$). The power represents the proportion of the 1,000 simulated replicates with per-region \textit{PBS} values greater than the upper 5\% quantile from its respective neutral distribution of per-region \textit{PBS} values consisting of 10,000 replicates. For each selection coefficient we report the results for both the simulated 742kb and 72kb \textit{MUC19} regions, for all SNPs, as well as archaic SNPs.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/slim_pbs_power_summary.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(
            '{:.4f}'.format(round(mat[i, j], 4)).rstrip('0').rstrip('.')
        for j in range(mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S62. Power to detect positive selection in MXL using per-region \textit{PBS} values.} \newline The power to detect positive selection for different strengths of positive selection: strong ($s = 0.1$), weak ($s = 0.01$), and extremely weak ($s = 0.0015$). The power represents the proportion of the 1,000 simulated replicates with per-region \textit{PBS} values greater than the upper 5\% quantile from its respective neutral distribution of per-region \textit{PBS} values consisting of 10,000 replicates. For each selection coefficient we report the results for both the simulated 742kb and 72kb \textit{MUC19} regions, for all SNPs, as well as archaic SNPs.

In [126]:
df

Unnamed: 0,$s$,All SNPs (72kb),Archaic SNPs (72kb),All SNPs (742kb),Archaic SNPs (742kb)
0,0.1,0.986,0.984,0.982,0.981
1,0.01,0.873,0.935,0.794,0.857
2,0.0015,0.096,0.193,0.081,0.126


## S63

In [127]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_ihs_critical_scores_proportions_742kb.csv.gz')
df = df[(df['SNP Set'] == 'Archaic SNPs') & (df['Super Population'] != 'AFR')]
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 3, 4, 5, 6, 7, 8, 9]]
# Intialize the columns.
cols = df.columns.values[[0, 1, 3, 4, 5, 6, 7, 8, 9]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S63. Integrated Haplotype Scores (\textit{iHS}) for archaic SNPs in the focal 742kb \textit{MUC19} region among 1000 Genomes Project non-African populations.} \newline The normalized $\mid iHS \mid$ scores for archaic SNPs observed at the focal 742kb region for each non-African population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). For each population, the normalized \textit{iHS} scores were calculated for archaic SNPs with a minor allele frequency $> 5\%$. For each population, the table reports the total number of \textit{iHS} scores observed for archaic SNPs, the number of extreme \textit{iHS} scores for archaic SNPs (i.e., $\mid iHS \mid > 2$), the observed proportion of extreme \textit{iHS} scores for archaic SNPs (calculated as the number of extreme \textit{iHS} scores normalized by the total number of \textit{iHS} scores), the $99^{th}$ percentile of the proportion of extreme \textit{iHS} scores for archaic SNPs determined from the genomic background distribution (i.e., non-overlapping 742kb windows with more than 10 archaic SNPs), the total number of 742kb windows in the genomic background distribution, the number of 742kb windows where the observed proportion of extreme \textit{iHS} scores for archaic SNPs is greater than the genomic background distribution, and the percentile rank of the observed proportion of extreme \textit{iHS} scores for archaic SNPs (i.e., the percentage of the genomic background distribution that is less than the observed proportion of extreme \textit{iHS} scores). An observed proportion of extreme \textit{iHS} scores greater than the $99^{th}$ percentile of the genomic background distribution is considered evidence of positive selection. Note that a "---" denotes that the population did not meet the minimum number of archaic SNPs threshold for the \textit{iHS} analysis.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_ihs_critical_scores_proportions_archaic_snps_742kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(3))+r' & '\
        + r' & '.join((
            '---' if np.isnan(mat[i, j]) else
            '0' if mat[i, j] == 0 else
            convert_sci_notation_to_latex(mat[i, j]) if mat[i, j] < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(3, mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S63. Integrated Haplotype Scores (\textit{iHS}) for archaic SNPs in the focal 742kb \textit{MUC19} region among 1000 Genomes Project non-African populations.} \newline The normalized $\mid iHS \mid$ scores for archaic SNPs observed at the focal 742kb region for each non-African population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). For each population, the normalized \textit{iHS} scores were calculated for archaic SNPs with a minor allele frequency $> 5\%$. For each population, the table reports the total number of \textit{iHS} scores observed for archaic SNPs, the number of extreme \textit{iHS} scores for archaic SNPs (i.e., $\mid iHS \mid > 2$), the observed proportion of extreme \textit{iHS} scores for archaic SNPs (calculated as the number of extreme \textit{iHS} scores normalized by the total number of \textit{iHS} scores), the $99^{th}$ percentile of the proportion of extreme \textit{iHS} scores for archaic SNPs determined from the genomic background distribution (i.e., non-overlapping 742kb windows with more than 10 archaic SNPs), the total number of 742kb windows in the genomic background distribution, the number of 742kb windows where the observed proportion of extreme \textit{iHS} scores for archaic SNPs is greater than the genomic background distribution, and the percentile rank of the observed proportion of extreme \textit{iHS} scores for archaic SNPs (i.e., the percentage of the genomic background distribution that is less than the observed proportion of extreme \textit{iHS} scores). An observed proportion of extreme \textit{iHS} scores greater than the $99^{th}$ percentile of the genomic background distribution is considered evidence of positive selection. Note that a "---" denotes that the population did not meet the minimum number of archaic SNPs threshold for the \textit{iHS} analysis.

In [128]:
df.iloc[:, [0, 1, 3, 4, 5, 6, 7, 8, 9]]

Unnamed: 0,Super Population,Population,Focal 742kb Region (Total SNPs),Focal 742kb Region (SNPs with $\mid iHS \mid > 2$),Focal 742kb Region (Prop. of SNPs with $\mid iHS \mid > 2$),742kb Non-overlapping Windows ($99^{th}$ Percentile),742kb Non-overlapping Windows (Total SNPs $> 10$),Focal 742kb Region $>$ 742kb Non-overlapping Windows,Focal 742kb Region (Percentile Rank)
1,AMR,MXL,217,208,0.958525,0.73332,1250.0,1248.0,99.84
5,AMR,PEL,217,66,0.304147,0.686908,1119.0,1063.0,94.995532
9,AMR,CLM,217,6,0.02765,0.52043,1276.0,1032.0,80.877743
13,AMR,PUR,217,0,0.0,0.5625,1148.0,0.0,0.0
17,SAS,BEB,197,0,0.0,0.514489,1263.0,0.0,0.0
21,SAS,STU,214,1,0.004673,0.494474,1264.0,908.0,71.835443
25,SAS,ITU,184,0,0.0,0.544545,1269.0,0.0,0.0
29,SAS,PJL,156,0,0.0,0.493746,1312.0,0.0,0.0
33,SAS,GIH,214,0,0.0,0.650556,1288.0,0.0,0.0
37,EAS,CHB,0,0,,,,,


## S64

In [129]:
# Load the dataframe.
df = pd.read_csv('./dataframes/tgp_ihs_critical_scores_proportions_72kb.csv.gz')
df = df[(df['SNP Set'] == 'Archaic SNPs') & (df['Super Population'] != 'AFR')]
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, [0, 1, 3, 4, 5, 6, 7, 8, 9]]
# Intialize the columns.
cols = df.columns.values[[0, 1, 3, 4, 5, 6, 7, 8, 9]]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S64. Integrated Haplotype Scores (\textit{iHS}) for archaic SNPs in the focal 72kb \textit{MUC19} region among 1000 Genomes Project non-African populations.} \newline The normalized $\mid iHS \mid$ scores for archaic SNPs observed at the focal 72kb region for each non-African population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). For each population, the normalized \textit{iHS} scores were calculated for archaic SNPs with a minor allele frequency $> 5\%$. For each population, the table reports the total number of \textit{iHS} scores observed for archaic SNPs, the number of extreme \textit{iHS} scores for archaic SNPs (i.e., $\mid iHS \mid > 2$), the observed proportion of extreme \textit{iHS} scores for archaic SNPs (calculated as the number of extreme \textit{iHS} scores normalized by the total number of \textit{iHS} scores), the $99^{th}$ percentile of the proportion of extreme \textit{iHS} scores for archaic SNPs determined from the genomic background distribution (i.e., non-overlapping 72kb windows with more than 10 archaic SNPs), the total number of 72kb windows in the genomic background distribution, the number of 72kb windows where the observed proportion of extreme \textit{iHS} scores for archaic SNPs is greater than the genomic background distribution, and the percentile rank of the observed proportion of extreme \textit{iHS} scores for archaic SNPs (i.e., the percentage of the genomic background distribution that is less than the observed proportion of extreme \textit{iHS} scores). An observed proportion of extreme \textit{iHS} scores greater than the $99^{th}$ percentile of the genomic background distribution is considered evidence of positive selection. Note that a "---" denotes that the population did not meet the minimum number of archaic SNPs threshold for the \textit{iHS} analysis.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/tgp_ihs_critical_scores_proportions_archaic_snps_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(3))+r' & '\
        + r' & '.join((
            '---' if np.isnan(mat[i, j]) else
            '0' if mat[i, j] == 0 else
            convert_sci_notation_to_latex(mat[i, j]) if mat[i, j] < 0.001 else
            '{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.'))
            for j in range(3, mat.shape[1])
        )\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S64. Integrated Haplotype Scores (\textit{iHS}) for archaic SNPs in the focal 72kb \textit{MUC19} region among 1000 Genomes Project non-African populations.} \newline The normalized $\mid iHS \mid$ scores for archaic SNPs observed at the focal 72kb region for each non-African population stratified by super population: Admixed Americans (AMR), South Asians (SAS), East Asians (EAS), and Europeans (EUR). For each population, the normalized \textit{iHS} scores were calculated for archaic SNPs with a minor allele frequency $> 5\%$. For each population, the table reports the total number of \textit{iHS} scores observed for archaic SNPs, the number of extreme \textit{iHS} scores for archaic SNPs (i.e., $\mid iHS \mid > 2$), the observed proportion of extreme \textit{iHS} scores for archaic SNPs (calculated as the number of extreme \textit{iHS} scores normalized by the total number of \textit{iHS} scores), the $99^{th}$ percentile of the proportion of extreme \textit{iHS} scores for archaic SNPs determined from the genomic background distribution (i.e., non-overlapping 72kb windows with more than 10 archaic SNPs), the total number of 72kb windows in the genomic background distribution, the number of 72kb windows where the observed proportion of extreme \textit{iHS} scores for archaic SNPs is greater than the genomic background distribution, and the percentile rank of the observed proportion of extreme \textit{iHS} scores for archaic SNPs (i.e., the percentage of the genomic background distribution that is less than the observed proportion of extreme \textit{iHS} scores). An observed proportion of extreme \textit{iHS} scores greater than the $99^{th}$ percentile of the genomic background distribution is considered evidence of positive selection. Note that a "---" denotes that the population did not meet the minimum number of archaic SNPs threshold for the \textit{iHS} analysis.

In [130]:
df.iloc[:, [0, 1, 3, 4, 5, 6, 7, 8, 9]]

Unnamed: 0,Super Population,Population,Focal 72kb Region (Total SNPs),Focal 72kb Region (SNPs with $\mid iHS \mid > 2$),Focal 72kb Region (Prop. of SNPs with $\mid iHS \mid > 2$),72kb Non-overlapping Windows ($99^{th}$ Percentile),72kb Non-overlapping Windows (Total SNPs $> 10$),Focal 72kb Region $>$ 72kb Non-overlapping Windows,Focal 72kb Region (Percentile Rank)
1,AMR,MXL,140,135,0.964286,0.9375,2699.0,2677.0,99.184883
5,AMR,PEL,140,58,0.414286,1.0,2332.0,2225.0,95.411664
9,AMR,CLM,140,6,0.042857,0.823529,2630.0,2339.0,88.935361
13,AMR,PUR,140,0,0.0,0.87785,2344.0,0.0,0.0
17,SAS,BEB,140,0,0.0,0.731524,3048.0,0.0,0.0
21,SAS,STU,140,0,0.0,0.733333,2901.0,0.0,0.0
25,SAS,ITU,140,0,0.0,0.733344,2900.0,0.0,0.0
29,SAS,PJL,139,0,0.0,0.801667,2952.0,0.0,0.0
33,SAS,GIH,140,0,0.0,0.848132,2883.0,0.0,0.0
37,EAS,CHB,0,0,,,,,


## S65

In [131]:
# Load the dataframe.
df = pd.read_csv('./dataframes/chagyrskaya_nean_manual_phasing_info.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, 1:]
# Intialize the columns.
cols = df.columns.values[1:]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S65. Read-based phasing information for the Chagyrskaya Neanderthal.} \newline The four heterozygous sites that could not be resolved with statistical phasing, preceding and proceeding heterozygous sites and their distance from the heterozygous site to resolve, and the results from manual read-based phasing for the Chagyrskaya Neanderthal at the focal 72kb \textit{MUC19} region.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/chagyrskaya_nean_manual_phasing_info.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(mat.shape[1]))
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S65. Read-based phasing information for the Chagyrskaya Neanderthal.} \newline The four heterozygous sites that could not be resolved with statistical phasing, preceding and proceeding heterozygous sites and their distance from the heterozygous site to resolve, and the results from manual read-based phasing for the Chagyrskaya Neanderthal at the focal 72kb \textit{MUC19} region.

In [132]:
df

Unnamed: 0,Late Neanderthal,Het. Pos. to Resolve,Preceding Het. Pos.,Dist. to Preceding Het. Pos.,Proceeding Het. Pos.,Dist. to Proceeding Het. Pos.,Is Resvolable?,Phasing Info.
0,Chagyrskaya Nean.,40767590,40761946,5644,40768905,1315,False,Distant Flanking Het. Sites
1,Chagyrskaya Nean.,40821807,40821795,12,40821847,40,True,G-C-C and A-T-A
2,Chagyrskaya Nean.,40826155,40826138,17,40826201,46,True,T-A-G and G-G-A
3,Chagyrskaya Nean.,40828121,40828019,102,40828306,185,False,Not Enough Overlapping Reads


## S66

In [133]:
# Load the dataframe.
df = pd.read_csv('./dataframes/vindija_nean_manual_phasing_info.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()[:, 1:]
# Intialize the columns.
cols = df.columns.values[1:]
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S66. Read-based phasing information for the Vindija Neanderthal.} \newline The five heterozygous sites that could not be resolved with statistical phasing, preceding and proceeding heterozygous sites and their distance from the heterozygous site to resolve, and the results from manual read-based phasing for the Vindija Neanderthal at the focal 72kb \textit{MUC19} region. Note that position 40829306 is the last heterozygous site in the focal 72kb \textit{MUC19} region for the Vindija Neanderthal, and thus, the proceeding heterozygous site information is denoted by a "-1".},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/vindija_nean_manual_phasing_info.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(mat.shape[1]))
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S66. Read-based phasing information for the Vindija Neanderthal.} \newline The five heterozygous sites that could not be resolved with statistical phasing, preceding and proceeding heterozygous sites and their distance from the heterozygous site to resolve, and the results from manual read-based phasing for the Vindija Neanderthal at the focal 72kb \textit{MUC19} region. Note that position 40829306 is the last heterozygous site in the focal 72kb \textit{MUC19} region for the Vindija Neanderthal, and thus, the proceeding heterozygous site information is denoted by a "-1".

In [134]:
df

Unnamed: 0,Late Neanderthal,Het. Pos. to Resolve,Preceding Het. Pos.,Dist. to Preceding Het. Pos.,Proceeding Het. Pos.,Dist. to Proceeding Het. Pos.,Is Resvolable?,Phasing Info.
0,Vindija Nean.,40765370,40761946,3424,40768905,3535,False,Distant Flanking Het. Sites
1,Vindija Nean.,40783344,40775127,8217,40784418,1074,False,Distant Flanking Het. Sites
2,Vindija Nean.,40821807,40821795,12,40821847,40,True,G-C-C and A-T-A
3,Vindija Nean.,40824215,40824154,61,40824266,51,True,G-A and A-C
4,Vindija Nean.,40829306,40828306,1000,-1,-1,False,Distant Flanking Het. Sites


## S67

In [135]:
# Load the dataframe.
df = pd.read_csv('./dataframes/287_denisovan_haps_v_archaics_pairwise_diffs_72kb.csv.gz')
# Convert the dataframe to a numpy matrix.
mat = df.to_numpy()
# Intialize the columns.
cols = df.columns.values
# Intialize a table header.
header = r'''\begin{longtblr}
{
colspec={XXXXX},
hlines={solid, 1pt},
vlines={solid, 1pt},
cells={halign=c, valign=m},
width=\linewidth,
rowhead=1,
row{odd}={Grey},
row{1}={Brown, fg=white, halign=c, valign=m},
caption={\textbf{Table S67. Patterns of pairwise differences at the focal 72kb \textit{MUC19} region between the \textit{Denisovan-like} haplotypes in 1000 Genomes Project and the archaic individuals} \newline The mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the number of pairwise distance distribution for the focal 72kb \textit{MUC19} region between the 287 \textit{Denisovan-like} haplotypes in 1000 Genomes Project and the four high-coverage archaic individuals. For the Denisovan and Altai Neanderthal, we compute the number of pairwise differences between a modern human haplotype and the two unphased archaic chromosomes, and for the Chagyrskaya and Vindija Neanderthals, we compute the number of pairwise differences between a modern human haplotype and a phased archaic haplotype.},
}'''+'\n'\
+ r' & '.join(f'{col}' for col in cols)+r' \\'+'\n'
# Open the table.
table = open('./supp_tables/287_denisovan_haps_v_archaics_pairwise_diffs_72kb.tex', 'w')
# Write the header.
table.write(header)
# For every row...
for i in range(mat.shape[0]):
    # Write the results.
    table.write(
        r' & '.join(f'{mat[i, j]}' for j in range(1))+r' & '\
        + r' & '.join('{:.3f}'.format(round(mat[i, j], 3)).rstrip('0').rstrip('.') for j in range(1, mat.shape[1]))\
        + r' \\'+'\n'
    )
# Write the footer.
table.write(r'\end{longtblr}'+'\n')
# Close the table file.
table.close()

\textbf{Table S67. Patterns of pairwise differences at the focal 72kb \textit{MUC19} region between the \textit{Denisovan-like} haplotypes in 1000 Genomes Project and the archaic individuals} \newline The mean ($\mu$), standard deviation ($\sigma$), standard error of the mean (\textit{SEM}), and 95\% confidence intervals ($\pm CI_{95\%}$) of the number of pairwise distance distribution for the focal 72kb \textit{MUC19} region between the 287 \textit{Denisovan-like} haplotypes in 1000 Genomes Project and the four high-coverage archaic individuals. For the Denisovan and Altai Neanderthal, we compute the number of pairwise differences between a modern human haplotype and the two unphased archaic chromosomes, and for the Chagyrskaya and Vindija Neanderthals, we compute the number of pairwise differences between a modern human haplotype and a phased archaic haplotype.

In [136]:
df

Unnamed: 0,Archaic,Focal 72kb Region Pairwise Diffs. $\left( \mu\right)$,Focal 72kb Region Pairwise Diffs. $\left( \sigma \right)$,Focal 72kb Region Pairwise Diffs. $\left( SEM \right)$,Focal 72kb Region Pairwise Diffs. $\left( \pm CI_{95\%} \right)$
0,Denisovan,49.010453,1.993866,0.1179,0.232061
1,Altai Nean.,179.837979,2.50311,0.148012,0.291331
2,Chagyrskaya Nean. Hap. 1,170.337979,2.456745,0.14527,0.285935
3,Chagyrskaya Nean. Hap. 2,7.547038,2.840177,0.167943,0.330561
4,Vindija Nean. Hap. 1,170.898955,2.178506,0.128818,0.253551
5,Vindija Nean. Hap. 2,6.108014,2.611721,0.154434,0.303972
