# Windowing Summary
* 72kb: Chr12: 40759001-40831000 
* 742kb: Chr12: 40272001-41014000

In [1]:
# Import packages.
import analysis_functions as af
import numpy as np
import pandas as pd
# Print version numbers.
print('numpy', np.__version__)
print('pandas', pd.__version__)

numpy 1.22.3
pandas 1.4.2


In [2]:
# Intialize an archaic list.
arc_list = ['ALT', 'CHA', 'VIN', 'DEN']

## All Archaic Datasets

### Without Ancestral Allele Calls (72kb)

In [3]:
# Load the non-overlapping windows.
arc_var_df = af.load_windows('arcs_masked_no_aa', 'variant', 72)
arc_invar_df = af.load_windows('arcs_masked_no_aa', 'invariant', 72)
# Concatenate the two dataframes.
arc_winds_df = pd.concat([arc_var_df, arc_invar_df])
# Compute the mean and standard deviation per archaic.
arc_esl_winds_avg = np.mean(arc_winds_df[arc_list].to_numpy(), axis=0)
arc_esl_winds_std = np.std(arc_winds_df[arc_list].to_numpy(), axis=0)
print('AVG', arc_esl_winds_avg)
print('STD', arc_esl_winds_std)
# Extract the average effective sequence length per window.
arc_var_esl_winds = np.all(
    (arc_var_df[arc_list].to_numpy() >= (arc_esl_winds_avg - arc_esl_winds_std))\
    & (arc_var_df[arc_list].to_numpy() <= (arc_esl_winds_avg + arc_esl_winds_std)), 
    axis=1,
)
arc_invar_esl_winds = np.all(
    (arc_invar_df[arc_list].to_numpy() >= (arc_esl_winds_avg - arc_esl_winds_std))\
    & (arc_invar_df[arc_list].to_numpy() <= (arc_esl_winds_avg + arc_esl_winds_std)), 
    axis=1,
)
# Determine the indicies that meet the esl criteria.
arc_var_esl_qc_idx = np.where(arc_var_esl_winds)[0]
arc_invar_esl_qc_idx = np.where(arc_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', arc_var_esl_qc_idx.size)
print('INVAR', arc_invar_esl_qc_idx.size)
print('TOTAL', (arc_var_esl_qc_idx.size + arc_invar_esl_qc_idx.size))

AVG [44891.52244102 44282.52509548 44793.36022102 44650.44573255]
STD [10489.63266445 10410.90108903 10466.64055609 10449.05907335]
VAR 29504
INVAR 0
TOTAL 29504


In [4]:
# Subset out the windows that did not pass QC.
arc_var_qced_df = arc_var_df.iloc[arc_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
arc_var_qced_df.groupby(['CHR'])[arc_list].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,ALT,ALT,ALT,CHA,CHA,CHA,VIN,VIN,VIN,DEN,DEN,DEN
Unnamed: 0_level_1,count,mean,std,count,mean,std,count,mean,std,count,mean,std
CHR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,2382,46263.896725,5211.981236,2382,45604.151553,5240.34551,2382,46139.268682,5198.68957,2382,46014.02351,5204.53516
2,2676,47210.172646,4890.382692,2676,46615.939836,4904.660572,2676,47136.490284,4889.041693,2676,47016.408072,4870.768426
3,2235,46681.70962,5023.950331,2235,46117.89396,5021.041988,2235,46621.830425,5028.867037,2235,46493.918121,5031.758353
4,2298,46508.187554,4824.887888,2298,46012.860748,4829.198262,2298,46510.265013,4823.356947,2298,46393.171018,4806.515494
5,2034,46710.121436,5004.841358,2034,46143.89528,4986.597729,2034,46677.725172,5002.303409,2034,46545.753687,4977.546582
6,1993,46569.63422,4963.869298,1993,45987.921726,4949.46828,1993,46530.521826,4977.477495,1993,46416.284496,4929.123929
7,1619,46633.303274,5151.313811,1619,46040.233477,5177.889597,1619,46577.157505,5206.550745,1619,46414.869055,5200.739407
8,1691,47109.033116,4791.236815,1691,46555.640449,4813.184748,1691,47052.567711,4810.729691,1691,46910.602011,4798.212707
9,1234,46537.263371,5062.067218,1234,45881.085089,5091.140571,1234,46422.491086,5070.291962,1234,46260.160454,5058.661265
10,1368,46739.303363,5040.851553,1368,46104.95614,5045.664417,1368,46620.809942,5036.939632,1368,46500.971491,5039.833383


In [5]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/arcs_masked_no_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [arc_var_esl_qc_idx], fmt='%d',
# )

### With Ancestral Allele Calls (72kb)

In [6]:
# Load the non-overlapping windows.
arc_var_df = af.load_windows('arcs_masked_aa', 'variant', 72)
arc_invar_df = af.load_windows('arcs_masked_aa', 'invariant', 72)
# Concatenate the two dataframes.
arc_winds_df = pd.concat([arc_var_df, arc_invar_df])
# Compute the mean and standard deviation per archaic.
arc_esl_winds_avg = np.mean(arc_winds_df[arc_list].to_numpy(), axis=0)
arc_esl_winds_std = np.std(arc_winds_df[arc_list].to_numpy(), axis=0)
print('AVG', arc_esl_winds_avg)
print('STD', arc_esl_winds_std)
# Extract the average effective sequence length per window.
arc_var_esl_winds = np.all(
    (arc_var_df[arc_list].to_numpy() >= (arc_esl_winds_avg - arc_esl_winds_std))\
    & (arc_var_df[arc_list].to_numpy() <= (arc_esl_winds_avg + arc_esl_winds_std)), 
    axis=1,
)
arc_invar_esl_winds = np.all(
    (arc_invar_df[arc_list].to_numpy() >= (arc_esl_winds_avg - arc_esl_winds_std))\
    & (arc_invar_df[arc_list].to_numpy() <= (arc_esl_winds_avg + arc_esl_winds_std)), 
    axis=1,
)
# Determine the indicies that meet the esl criteria.
arc_var_esl_qc_idx = np.where(arc_var_esl_winds)[0]
arc_invar_esl_qc_idx = np.where(arc_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', arc_var_esl_qc_idx.size)
print('INVAR', arc_invar_esl_qc_idx.size)
print('TOTAL', (arc_var_esl_qc_idx.size + arc_invar_esl_qc_idx.size))

AVG [44949.47824898 44340.00850354 44850.06307932 44710.56481557]
STD [10099.78824914 10036.1558132  10085.21973516 10066.38328233]
VAR 28696
INVAR 0
TOTAL 28696


In [7]:
# Subset out the windows that did not pass QC.
arc_var_qced_df = arc_var_df.iloc[arc_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
arc_var_qced_df.groupby(['CHR'])[arc_list].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,ALT,ALT,ALT,CHA,CHA,CHA,VIN,VIN,VIN,DEN,DEN,DEN
Unnamed: 0_level_1,count,mean,std,count,mean,std,count,mean,std,count,mean,std
CHR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,2314,46122.084702,5074.417474,2314,45465.889801,5113.546194,2314,45999.480985,5075.024813,2314,45876.530251,5080.276694
2,2604,47048.456989,4775.173866,2604,46459.813364,4786.057491,2604,46977.102535,4774.058326,2604,46858.402074,4756.885318
3,2178,46649.76079,4885.258956,2178,46085.552801,4884.012793,2178,46590.593205,4892.636929,2178,46461.593205,4894.652817
4,2258,46386.04163,4761.858701,2258,45894.261736,4763.338132,2258,46387.516829,4760.725566,2258,46271.575731,4744.856279
5,1983,46576.011094,4895.811067,1983,46016.764498,4869.962023,1983,46546.575391,4890.604847,1983,46414.251135,4861.229943
6,1953,46468.572965,4855.087162,1953,45889.55658,4840.132627,1953,46430.215566,4872.575347,1953,46314.110599,4828.132158
7,1569,46526.212874,5035.293348,1569,45939.616953,5069.924404,1569,46471.629063,5088.561831,1569,46306.173996,5098.54623
8,1639,46925.248322,4675.295489,1639,46371.829164,4696.706155,1639,46868.173887,4690.62709,1639,46734.912752,4691.732577
9,1208,46436.563742,4971.097041,1208,45784.073675,4990.954779,1208,46321.797185,4970.836737,1208,46164.110927,4962.736
10,1330,46594.123308,4955.320844,1330,45968.450376,4952.494527,1330,46478.381203,4948.861589,1330,46361.574436,4957.169205


In [8]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/arcs_masked_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [arc_var_esl_qc_idx], fmt='%d',
# )

## Single Archaic Datasets

### Denisovan Without Ancestral Allele Calls (72kb)

In [9]:
# Load the non-overlapping windows.
arc_var_df = af.load_windows('den_masked_no_aa', 'variant', 72)
arc_invar_df = af.load_windows('den_masked_no_aa', 'invariant', 72)
# Concatenate the two dataframes.
arc_winds_df = pd.concat([arc_var_df, arc_invar_df])
# Compute the mean and standard deviation per archaic.
arc_esl_winds_avg = np.mean(arc_winds_df['DEN'].to_numpy(), axis=0)
arc_esl_winds_std = np.std(arc_winds_df['DEN'].to_numpy(), axis=0)
print('AVG', arc_esl_winds_avg)
print('STD', arc_esl_winds_std)
# Extract the average effective sequence length per window.
arc_var_esl_winds = ((arc_var_df['DEN'].to_numpy() >= (arc_esl_winds_avg - arc_esl_winds_std)) & (arc_var_df['DEN'].to_numpy() <= (arc_esl_winds_avg + arc_esl_winds_std)))
arc_invar_esl_winds = ((arc_invar_df['DEN'].to_numpy() >= (arc_esl_winds_avg - arc_esl_winds_std)) & (arc_invar_df['DEN'].to_numpy() <= (arc_esl_winds_avg + arc_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
arc_var_esl_qc_idx = np.where(arc_var_esl_winds)[0]
arc_invar_esl_qc_idx = np.where(arc_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', arc_var_esl_qc_idx.size)
print('INVAR', arc_invar_esl_qc_idx.size)
print('TOTAL', (arc_var_esl_qc_idx.size + arc_invar_esl_qc_idx.size))

AVG 44556.680028110386
STD 10635.879969716942
VAR 26318
INVAR 4135
TOTAL 30453


In [10]:
# Subset out the windows that did not pass QC.
arc_var_qced_df = arc_var_df.iloc[arc_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
arc_var_qced_df.groupby(['CHR'])['DEN'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2237,46179.500224,5400.983924
2,2332,47141.372642,5111.844274
3,1945,46603.240617,5254.87303
4,1811,46520.659856,4927.193445
5,1841,46728.934818,5101.904829
6,1785,46490.177031,5119.870413
7,1495,46648.517057,5341.111312
8,1557,47079.928709,4931.064871
9,1074,46534.875233,5221.908011
10,1262,46759.791601,5168.348268


In [11]:
# Subset out the windows that did not pass QC.
arc_invar_qced_df = arc_var_df.iloc[arc_invar_esl_qc_idx]
# Show a summary of the QC'ed windows.
arc_invar_qced_df.groupby(['CHR'])['DEN'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2140,45365.040654,9096.229515
2,1863,47013.947397,7985.54907
3,132,47716.378788,7755.228459


In [12]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/den_masked_no_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [arc_var_esl_qc_idx], fmt='%d',
# )
# np.savetxt(
#     f'../windowing/den_masked_no_aa/72kb_esl_qced_nonoverlapping_invariant_windows.txt.gz',
#     [arc_invar_esl_qc_idx], fmt='%d',
# )

### Altai Nean. Without Ancestral Allele Calls (72kb)

In [13]:
# Load the non-overlapping windows.
arc_var_df = af.load_windows('alt_masked_no_aa', 'variant', 72)
arc_invar_df = af.load_windows('alt_masked_no_aa', 'invariant', 72)
# Concatenate the two dataframes.
arc_winds_df = pd.concat([arc_var_df, arc_invar_df])
# Compute the mean and standard deviation per archaic.
arc_esl_winds_avg = np.mean(arc_winds_df['ALT'].to_numpy(), axis=0)
arc_esl_winds_std = np.std(arc_winds_df['ALT'].to_numpy(), axis=0)
print('AVG', arc_esl_winds_avg)
print('STD', arc_esl_winds_std)
# Extract the average effective sequence length per window.
arc_var_esl_winds = ((arc_var_df['ALT'].to_numpy() >= (arc_esl_winds_avg - arc_esl_winds_std)) & (arc_var_df['ALT'].to_numpy() <= (arc_esl_winds_avg + arc_esl_winds_std)))
arc_invar_esl_winds = ((arc_invar_df['ALT'].to_numpy() >= (arc_esl_winds_avg - arc_esl_winds_std)) & (arc_invar_df['ALT'].to_numpy() <= (arc_esl_winds_avg + arc_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
arc_var_esl_qc_idx = np.where(arc_var_esl_winds)[0]
arc_invar_esl_qc_idx = np.where(arc_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', arc_var_esl_qc_idx.size)
print('INVAR', arc_invar_esl_qc_idx.size)
print('TOTAL', (arc_var_esl_qc_idx.size + arc_invar_esl_qc_idx.size))

AVG 44804.46627558055
STD 10663.624101620531
VAR 20677
INVAR 9783
TOTAL 30460


In [14]:
# Subset out the windows that did not pass QC.
arc_var_qced_df = arc_var_df.iloc[arc_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
arc_var_qced_df.groupby(['CHR'])['ALT'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1789,46524.641699,5323.132609
2,1998,47230.896396,5073.218045
3,1684,47024.744062,5136.967821
4,1443,46459.297297,4995.583547
5,1344,46948.756696,5234.248027
6,1577,46802.703234,5074.793785
7,1304,46714.332055,5299.022716
8,1302,47512.373272,4743.938512
9,584,46944.97089,5312.485906
10,1029,47144.515063,5120.85676


In [15]:
# Subset out the windows that did not pass QC.
arc_invar_qced_df = arc_var_df.iloc[arc_invar_esl_qc_idx]
# Show a summary of the QC'ed windows.
arc_invar_qced_df.groupby(['CHR'])['ALT'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1730,45726.387283,9282.901027
2,2017,46789.171542,8271.516278
3,1530,47119.156209,7339.155348
4,1239,45614.589992,8186.322549
5,1364,46712.460411,8032.723635
6,1318,45702.317147,8037.70039
7,585,42203.774359,13410.694139


In [16]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/alt_masked_no_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [arc_var_esl_qc_idx], fmt='%d',
# )
# np.savetxt(
#     f'../windowing/alt_masked_no_aa/72kb_esl_qced_nonoverlapping_invariant_windows.txt.gz',
#     [arc_invar_esl_qc_idx], fmt='%d',
# )

### Chagyrskaya Nean. Without Ancestral Allele Calls (72kb)

In [17]:
arc_var_df = af.load_windows('cha_masked_no_aa', 'variant', 72)
arc_invar_df = af.load_windows('cha_masked_no_aa', 'invariant', 72)
# Concatenate the two dataframes.
arc_winds_df = pd.concat([arc_var_df, arc_invar_df])
# Compute the mean and standard deviation per archaic.
arc_esl_winds_avg = np.mean(arc_winds_df['CHA'].to_numpy(), axis=0)
arc_esl_winds_std = np.std(arc_winds_df['CHA'].to_numpy(), axis=0)
print('AVG', arc_esl_winds_avg)
print('STD', arc_esl_winds_std)
# Extract the average effective sequence length per window.
arc_var_esl_winds = ((arc_var_df['CHA'].to_numpy() >= (arc_esl_winds_avg - arc_esl_winds_std)) & (arc_var_df['CHA'].to_numpy() <= (arc_esl_winds_avg + arc_esl_winds_std)))
arc_invar_esl_winds = ((arc_invar_df['CHA'].to_numpy() >= (arc_esl_winds_avg - arc_esl_winds_std)) & (arc_invar_df['CHA'].to_numpy() <= (arc_esl_winds_avg + arc_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
arc_var_esl_qc_idx = np.where(arc_var_esl_winds)[0]
arc_invar_esl_qc_idx = np.where(arc_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', arc_var_esl_qc_idx.size)
print('INVAR', arc_invar_esl_qc_idx.size)
print('TOTAL', (arc_var_esl_qc_idx.size + arc_invar_esl_qc_idx.size))

AVG 44189.595859123714
STD 10595.015711406359
VAR 19601
INVAR 10748
TOTAL 30349


In [18]:
# Subset out the windows that did not pass QC.
arc_var_qced_df = arc_var_df.iloc[arc_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
arc_var_qced_df.groupby(['CHR'])['CHA'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1805,45748.593906,5368.454294
2,1729,46704.721805,5101.170862
3,1575,46322.654603,5195.231364
4,1608,46009.056592,4888.725918
5,1302,46415.938556,5135.339884
6,1210,46227.454545,5106.133703
7,1067,46350.918463,5289.529024
8,1168,46601.837329,4933.637136
9,961,45876.915713,5176.943932
10,1023,46422.054741,5269.111907


In [19]:
# Subset out the windows that did not pass QC.
arc_invar_qced_df = arc_var_df.iloc[arc_invar_esl_qc_idx]
# Show a summary of the QC'ed windows.
arc_invar_qced_df.groupby(['CHR'])['CHA'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1754,45014.815849,9523.155454
2,1731,46400.940497,8023.097317
3,1494,46129.886212,7553.379438
4,1444,44979.273546,8001.441348
5,1189,46274.264087,7628.73647
6,1149,45278.086162,8022.225919
7,935,44935.97754,9849.338574
8,983,46027.109868,7508.188354
9,69,44775.855072,7381.766534


In [20]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/cha_masked_no_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [arc_var_esl_qc_idx], fmt='%d',
# )
# np.savetxt(
#     f'../windowing/cha_masked_no_aa/72kb_esl_qced_nonoverlapping_invariant_windows.txt.gz',
#     [arc_invar_esl_qc_idx], fmt='%d',
# )

### Vindija Nean. Without Ancestral Allele Calls (72kb)

In [21]:
# Load the non-overlapping windows.
arc_var_df = af.load_windows('vin_masked_no_aa', 'variant', 72)
arc_invar_df = af.load_windows('vin_masked_no_aa', 'invariant', 72)
# Concatenate the two dataframes.
arc_winds_df = pd.concat([arc_var_df, arc_invar_df])
# Compute the mean and standard deviation per archaic.
arc_esl_winds_avg = np.mean(arc_winds_df['VIN'].to_numpy(), axis=0)
arc_esl_winds_std = np.std(arc_winds_df['VIN'].to_numpy(), axis=0)
print('AVG', arc_esl_winds_avg)
print('STD', arc_esl_winds_std)
# Extract the average effective sequence length per window.
arc_var_esl_winds = ((arc_var_df['VIN'].to_numpy() >= (arc_esl_winds_avg - arc_esl_winds_std)) & (arc_var_df['VIN'].to_numpy() <= (arc_esl_winds_avg + arc_esl_winds_std)))
arc_invar_esl_winds = ((arc_invar_df['VIN'].to_numpy() >= (arc_esl_winds_avg - arc_esl_winds_std)) & (arc_invar_df['VIN'].to_numpy() <= (arc_esl_winds_avg + arc_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
arc_var_esl_qc_idx = np.where(arc_var_esl_winds)[0]
arc_invar_esl_qc_idx = np.where(arc_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', arc_var_esl_qc_idx.size)
print('INVAR', arc_invar_esl_qc_idx.size)
print('TOTAL', (arc_var_esl_qc_idx.size + arc_invar_esl_qc_idx.size))

AVG 44687.1038992623
STD 10678.663696598771
VAR 23933
INVAR 6532
TOTAL 30465


In [22]:
# Subset out the windows that did not pass QC.
arc_var_qced_df = arc_var_df.iloc[arc_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
arc_var_qced_df.groupby(['CHR'])['VIN'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1993,46285.84295,5445.465841
2,2143,47237.94587,5083.626599
3,1749,46767.253859,5301.596204
4,1953,46581.272401,4928.233454
5,1578,46871.043726,5134.372736
6,1505,46675.631229,5154.506607
7,1326,46712.647059,5337.987825
8,1262,47089.454041,5036.530875
9,1077,46568.564531,5263.669779
10,1211,46812.457473,5260.924354


In [23]:
# Subset out the windows that did not pass QC.
arc_invar_qced_df = arc_var_df.iloc[arc_invar_esl_qc_idx]
# Show a summary of the QC'ed windows.
arc_invar_qced_df.groupby(['CHR'])['VIN'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2015,45101.512159,9824.597562
2,2049,46840.602245,8200.595107
3,1594,46850.663739,7579.693441
4,874,45574.416476,7365.138949


In [24]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/vin_masked_no_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [arc_var_esl_qc_idx], fmt='%d',
# )
# np.savetxt(
#     f'../windowing/vin_masked_no_aa/72kb_esl_qced_nonoverlapping_invariant_windows.txt.gz',
#     [arc_invar_esl_qc_idx], fmt='%d',
# )

## TGP + All Archaic Datasets

### Without Ancestral Allele Calls (742kb)

In [25]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_arcs_masked_no_aa', 'variant', 742)
tgp_invar_df = af.load_windows('tgp_arcs_masked_no_aa', 'invariant', 742)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df[arc_list].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df[arc_list].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = np.all(
    (tgp_var_df[arc_list].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std))\
    & (tgp_var_df[arc_list].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)), 
    axis=1,
)
tgp_invar_esl_winds = np.all(
    (tgp_invar_df[arc_list].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std))\
    & (tgp_invar_df[arc_list].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)), 
    axis=1,
)
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG [454461.16776678 448301.83250825 453466.16116612 452030.76155116]
STD [97219.13179505 96576.07382438 97218.20817278 97202.57135828]
VAR 3161
INVAR 0
TOTAL 3161


In [26]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])[arc_list].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,ALT,ALT,ALT,CHA,CHA,CHA,VIN,VIN,VIN,DEN,DEN,DEN
Unnamed: 0_level_1,count,mean,std,count,mean,std,count,mean,std,count,mean,std
CHR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,261,469300.049808,43227.453563,261,462723.896552,43992.286059,261,468009.601533,43567.195889,261,466775.697318,43727.659201
2,291,484197.786942,39806.899887,291,478192.484536,39951.50344,291,483555.714777,39874.605067,291,482318.676976,39727.936938
3,249,480513.341365,39617.826584,249,474555.927711,39971.910283,249,479730.843373,39685.917598,249,478106.871486,40308.894147
4,240,475868.375,31696.768234,240,470585.408333,32105.763572,240,475515.991667,32411.817313,240,474459.0375,32230.849191
5,219,478058.351598,37431.150966,219,472128.881279,37367.156566,219,477635.461187,37537.074608,219,476300.146119,37447.164417
6,216,476130.578704,37537.665897,216,470293.361111,37388.780773,216,475802.703704,37604.224454,216,474674.435185,37169.903309
7,167,477340.011976,40619.157738,167,471455.227545,40730.499983,167,476858.215569,41090.720184,167,475227.694611,41358.38855
8,181,484542.281768,33320.076268,181,478857.690608,34098.860643,181,483969.701657,33750.726575,181,482576.497238,33873.17641
9,135,477330.518519,38182.499318,135,470422.562963,38842.442257,135,476134.155556,38662.499724,135,474423.081481,38830.495894
10,150,476505.0,44679.501193,150,470057.833333,44682.758231,150,475258.86,44604.71457,150,474077.953333,44584.296128


In [27]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_arcs_masked_no_aa/742kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### Without Ancestral Allele Calls (72kb)

In [28]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_arcs_masked_no_aa', 'variant', 72)
tgp_invar_df = af.load_windows('tgp_arcs_masked_no_aa', 'invariant', 72)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df[arc_list].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df[arc_list].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = np.all(
    (tgp_var_df[arc_list].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std))\
    & (tgp_var_df[arc_list].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)), 
    axis=1,
)
tgp_invar_esl_winds = np.all(
    (tgp_invar_df[arc_list].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std))\
    & (tgp_invar_df[arc_list].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)), 
    axis=1,
)
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG [44849.43316    44240.99271477 44751.35153288 44608.23472538]
STD [10494.19811375 10415.27647401 10471.25280003 10453.63384715]
VAR 29515
INVAR 0
TOTAL 29515


In [29]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])[arc_list].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,ALT,ALT,ALT,CHA,CHA,CHA,VIN,VIN,VIN,DEN,DEN,DEN
Unnamed: 0_level_1,count,mean,std,count,mean,std,count,mean,std,count,mean,std
CHR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,2382,46228.515533,5208.571616,2382,45569.301847,5236.827067,2382,46103.963476,5195.281445,2382,45978.552897,5201.075521
2,2677,47175.895032,4888.93842,2677,46581.724692,4902.667076,2677,47101.648487,4886.743879,2677,46981.68136,4868.823757
3,2237,46642.805543,5027.854712,2237,46079.591864,5024.20241,2237,46583.295932,5032.408666,2237,46455.270451,5035.252295
4,2298,46469.916884,4821.404327,2298,45975.002611,4825.759479,2298,46472.047868,4819.89233,2298,46354.600087,4803.050698
5,2036,46670.299116,5009.521019,2036,46105.038802,4990.781673,2036,46638.036837,5007.980942,2036,46506.007859,4982.599262
6,1993,46531.078274,4961.095052,1993,45949.792273,4946.837912,1993,46491.913698,4974.74483,1993,46377.501254,4926.39135
7,1620,46588.688889,5153.732527,1620,45995.611111,5181.15007,1620,46532.240741,5209.548184,1620,46369.427778,5204.413901
8,1690,47066.240828,4785.165343,1690,46513.37929,4807.177319,1690,47009.854438,4804.713586,1690,46867.736095,4791.93328
9,1234,46498.808752,5059.396434,1234,45843.222853,5088.3931,1234,46384.169368,5067.570088,1234,46221.601297,5056.06647
10,1369,46707.751644,5040.270374,1369,46073.934989,5045.202521,1369,46589.565376,5036.644343,1369,46469.383492,5039.635911


In [30]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_arcs_masked_no_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

## TGP + Single Archaic Datasets

### TGP + Denisovan Without Ancestral Allele Calls (742kb)

In [31]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_den_masked_no_aa', 'variant', 742)
tgp_invar_df = af.load_windows('tgp_den_masked_no_aa', 'invariant', 742)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['DEN'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['DEN'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['DEN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['DEN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['DEN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['DEN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 451785.9282572842
STD 97752.5147852619
VAR 3186
INVAR 0
TOTAL 3186


In [32]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['DEN'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,264,466248.57197,44792.790333
2,292,482548.339041,39847.351532
3,251,477125.948207,41629.489079
4,241,473983.59751,33013.26962
5,222,476425.932432,38664.959
6,217,474155.820276,37872.199174
7,169,476097.39645,41870.360693
8,181,482581.690608,33874.305744
9,136,474971.477941,39203.861933
10,151,473289.397351,45489.462651


In [33]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_den_masked_no_aa/742kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### TGP + Denisovan Without Ancestral Allele Calls (72kb)

In [34]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_den_masked_no_aa', 'variant', 72)
tgp_invar_df = af.load_windows('tgp_den_masked_no_aa', 'invariant', 72)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['DEN'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['DEN'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['DEN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['DEN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['DEN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['DEN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 44520.870584101416
STD 10628.173401536731
VAR 30454
INVAR 0
TOTAL 30454


In [35]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['DEN'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2465,46043.172008,5422.398293
2,2757,46984.932535,5077.629318
3,2304,46491.211372,5229.947563
4,2348,46372.011925,4937.034287
5,2093,46589.074056,5146.200639
6,2048,46370.327148,5127.088622
7,1673,46420.09205,5393.466322
8,1745,46975.045845,4980.174537
9,1272,46284.982704,5252.627697
10,1416,46588.43291,5231.927533


In [36]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_den_masked_no_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### TGP + Denisovan With Ancestral Allele Calls (742kb)

In [37]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_den_masked_aa', 'variant', 742)
tgp_invar_df = af.load_windows('tgp_den_masked_aa', 'invariant', 742)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['DEN'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['DEN'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['DEN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['DEN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['DEN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['DEN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 450281.15299640986
STD 95873.81902740178
VAR 3146
INVAR 0
TOTAL 3146


In [38]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['DEN'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,259,464133.65251,44281.217986
2,285,480181.336842,39577.955868
3,249,476285.626506,40879.523926
4,241,472829.141079,33416.044234
5,219,474654.228311,38616.259642
6,215,472781.483721,38423.487733
7,167,474187.934132,41995.590147
8,181,479958.59116,35546.680053
9,135,471246.918519,39610.09067
10,150,471267.686667,45670.282645


In [39]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_den_masked_aa/742kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### TGP + Denisovan With Ancestral Allele Calls (72kb)

In [40]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_den_masked_aa', 'variant', 72)
tgp_invar_df = af.load_windows('tgp_den_masked_aa', 'invariant', 72)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['DEN'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['DEN'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['DEN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['DEN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['DEN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['DEN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 44597.44297335663
STD 10208.010839838424
VAR 29594
INVAR 0
TOTAL 29594


In [41]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['DEN'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2393,45923.10071,5275.734946
2,2687,46858.716785,4948.765103
3,2240,46453.819643,5067.544271
4,2305,46226.166594,4860.741356
5,2034,46482.345133,4989.340838
6,2001,46307.576212,4978.132799
7,1623,46338.577942,5267.71112
8,1695,46823.39115,4857.769984
9,1242,46190.126409,5124.155326
10,1373,46385.009468,5137.571855


In [42]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_den_masked_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### TGP + Altai Nean. Without Ancestral Allele Calls (742kb)

In [43]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_alt_masked_no_aa', 'variant', 742)
tgp_invar_df = af.load_windows('tgp_alt_masked_no_aa', 'invariant', 742)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['ALT'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['ALT'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['ALT'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['ALT'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['ALT'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['ALT'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 454216.54507971415
STD 97775.6356186113
VAR 3195
INVAR 0
TOTAL 3195


In [44]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['ALT'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,265,468458.554717,44536.581554
2,293,484659.52901,40052.642598
3,253,480119.067194,41230.775084
4,241,475379.813278,32546.558092
5,221,477859.420814,38390.542219
6,217,475615.304147,38226.835886
7,169,477070.786982,41794.227047
8,182,483889.357143,34399.466548
9,137,478408.211679,38917.872565
10,150,476510.773333,44680.335692


In [45]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_alt_masked_no_aa/742kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### TGP + Altai Nean. Without Ancestral Allele Calls (72kb)

In [46]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_alt_masked_no_aa', 'variant', 72)
tgp_invar_df = af.load_windows('tgp_alt_masked_no_aa', 'invariant', 72)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['ALT'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['ALT'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['ALT'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['ALT'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['ALT'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['ALT'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 44770.14138956475
STD 10653.587883259566
VAR 30458
INVAR 0
TOTAL 30458


In [47]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['ALT'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2456,46253.145765,5411.867576
2,2765,47228.21953,5102.631626
3,2309,46656.696405,5242.226863
4,2342,46454.062767,4950.911745
5,2095,46746.680191,5193.763999
6,2042,46577.791381,5128.628567
7,1674,46660.712067,5357.282078
8,1737,47147.843408,4953.69002
9,1274,46561.083203,5261.651153
10,1417,46804.661962,5255.550492


In [48]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_alt_masked_no_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### TGP + Altai Nean. With Ancestral Allele Calls (742kb)

In [49]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_alt_masked_aa', 'variant', 742)
tgp_invar_df = af.load_windows('tgp_alt_masked_aa', 'invariant', 742)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['ALT'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['ALT'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['ALT'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['ALT'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['ALT'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['ALT'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 452683.5136702568
STD 95803.4537059149
VAR 3145
INVAR 0
TOTAL 3145


In [50]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['ALT'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,260,466203.369231,44176.618813
2,284,482492.950704,38982.513925
3,250,477424.488,41309.206664
4,240,473925.070833,32556.942829
5,218,476073.885321,38362.157488
6,214,474776.453271,38003.644936
7,168,475558.97619,42198.183438
8,181,481908.878453,34844.504067
9,135,474113.148148,38902.42889
10,150,473669.026667,45779.886107


In [51]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_alt_masked_aa/742kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### TGP + Altai Nean. With Ancestral Allele Calls (72kb)

In [52]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_alt_masked_aa', 'variant', 72)
tgp_invar_df = af.load_windows('tgp_alt_masked_aa', 'invariant', 72)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['ALT'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['ALT'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['ALT'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['ALT'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['ALT'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['ALT'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 44846.47026539974
STD 10220.48824472767
VAR 29601
INVAR 0
TOTAL 29601


In [53]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['ALT'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2387,46121.926267,5263.786022
2,2683,47079.486769,4948.922854
3,2237,46660.173,5035.188899
4,2307,46381.769831,4877.064361
5,2034,46616.186824,5037.790693
6,2000,46514.177,4998.785821
7,1618,46467.941904,5230.890101
8,1696,47009.258844,4855.791333
9,1249,46446.944756,5172.721254
10,1375,46649.515636,5143.700243


In [54]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_alt_masked_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### TGP + Chagyrskaya Nean. Without Ancestral Allele Calls (742kb)

In [55]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_cha_masked_no_aa', 'variant', 742)
tgp_invar_df = af.load_windows('tgp_cha_masked_no_aa', 'invariant', 742)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['CHA'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['CHA'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['CHA'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['CHA'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['CHA'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['CHA'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 447937.46276449575
STD 97390.83271403892
VAR 3186
INVAR 0
TOTAL 3186


In [56]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['CHA'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,262,462328.648855,44383.896387
2,292,478422.041096,40067.278844
3,251,473575.454183,41309.232825
4,241,470128.232365,32833.203465
5,223,471728.775785,39265.174562
6,217,470641.230415,37642.240821
7,170,471584.217647,42074.97341
8,181,478864.381215,34100.608636
9,136,470970.772059,39212.76595
10,150,470063.353333,44683.573962


In [57]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_cha_masked_no_aa/742kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### TGP + Chagyrskaya Nean. Without Ancestral Allele Calls (72kb)

In [58]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_cha_masked_no_aa', 'variant', 72)
tgp_invar_df = af.load_windows('tgp_cha_masked_no_aa', 'invariant', 72)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['CHA'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['CHA'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['CHA'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['CHA'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['CHA'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['CHA'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 44154.53777333297
STD 10587.42697669884
VAR 30348
INVAR 0
TOTAL 30348


In [59]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['CHA'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2447,45606.740499,5409.887949
2,2762,46634.595945,5101.534069
3,2308,46098.869151,5221.391056
4,2337,45974.836115,4932.064923
5,2094,46189.520057,5164.013137
6,2051,46024.888347,5120.862896
7,1668,46051.044365,5352.3702
8,1736,46598.497696,4960.002017
9,1267,45917.122336,5247.807614
10,1409,46129.049681,5226.562363


In [60]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_cha_masked_no_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### TGP + Chagyrskaya Nean. With Ancestral Allele Calls (742kb)

In [61]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_cha_masked_aa', 'variant', 742)
tgp_invar_df = af.load_windows('tgp_cha_masked_aa', 'invariant', 742)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['CHA'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['CHA'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['CHA'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['CHA'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['CHA'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['CHA'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 446428.7087244616
STD 95552.01356811354
VAR 3134
INVAR 0
TOTAL 3134


In [62]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['CHA'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,257,460173.63035,43902.380218
2,286,476310.531469,39925.81603
3,247,472174.684211,40242.344082
4,241,468199.456432,33730.012298
5,219,470547.146119,38588.013971
6,214,468994.336449,37880.052921
7,169,470086.928994,42507.073774
8,180,475885.727778,35393.570107
9,135,467265.162963,39556.55277
10,149,466772.979866,45511.6729


In [63]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_cha_masked_aa/742kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### TGP + Chagyrskaya Nean. With Ancestral Allele Calls (72kb)

In [64]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_cha_masked_aa', 'variant', 72)
tgp_invar_df = af.load_windows('tgp_cha_masked_aa', 'invariant', 72)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['CHA'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['CHA'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['CHA'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['CHA'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['CHA'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['CHA'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 44231.21255801256
STD 10168.74273436129
VAR 29488
INVAR 0
TOTAL 29488


In [65]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['CHA'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2380,45487.091597,5272.486986
2,2673,46523.721287,4909.565311
3,2234,46115.899284,5005.653205
4,2301,45902.119079,4857.752748
5,2032,46064.117618,5002.584871
6,2001,45952.001999,4962.866142
7,1620,45965.82037,5231.691317
8,1700,46438.738824,4901.401674
9,1234,45801.256888,5109.742566
10,1372,46039.143586,5107.882465


In [66]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_cha_masked_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### TGP + Vindija Nean. Without Ancestral Allele Calls (742kb)

In [67]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_vin_masked_no_aa', 'variant', 742)
tgp_invar_df = af.load_windows('tgp_vin_masked_no_aa', 'invariant', 742)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['VIN'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['VIN'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['VIN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['VIN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['VIN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['VIN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 453097.5559219566
STD 98046.7034263633
VAR 3198
INVAR 0
TOTAL 3198


In [68]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['VIN'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,265,467119.871698,44935.730777
2,294,483577.091837,40747.931104
3,251,478749.450199,41037.739902
4,240,475521.141667,32412.335083
5,222,476874.896396,39328.709355
6,217,475279.640553,38315.682182
7,169,476585.538462,42243.909504
8,183,483638.349727,35228.174948
9,137,477198.671533,39357.948959
10,150,475264.46,44605.539248


In [69]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_vin_masked_no_aa/742kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### TGP + Vindija Nean. Without Ancestral Allele Calls (72kb)

In [70]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_vin_masked_no_aa', 'variant', 72)
tgp_invar_df = af.load_windows('tgp_vin_masked_no_aa', 'invariant', 72)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['VIN'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['VIN'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['VIN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['VIN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['VIN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['VIN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 44651.658929391735
STD 10671.031205241909
VAR 30469
INVAR 0
TOTAL 30469


In [71]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['VIN'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2468,46161.138979,5425.726541
2,2759,47118.713664,5098.845899
3,2309,46601.342573,5238.519077
4,2348,46478.891823,4967.094593
5,2087,46657.342118,5179.376319
6,2040,46512.89902,5133.011374
7,1661,46585.804335,5359.176961
8,1746,47130.73425,4997.170111
9,1280,46470.636719,5305.276005
10,1420,46701.833099,5262.784207


In [72]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_vin_masked_no_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### TGP + Vindija Nean. With Ancestral Allele Calls (742kb)

In [73]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_vin_masked_aa', 'variant', 742)
tgp_invar_df = af.load_windows('tgp_vin_masked_aa', 'invariant', 742)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['VIN'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['VIN'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['VIN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['VIN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['VIN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['VIN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 451433.3417057687
STD 96439.81660258546
VAR 3151
INVAR 0
TOTAL 3151


In [74]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['VIN'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,260,465663.865385,44299.904317
2,285,481417.666667,39686.186891
3,250,477398.744,40943.224628
4,241,473866.941909,33526.91285
5,219,475966.511416,38785.832878
6,214,474460.126168,38094.294657
7,168,475073.494048,42636.820125
8,180,480969.411111,35100.052603
9,135,472921.933333,39407.928806
10,150,472420.326667,45728.08802


In [75]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_vin_masked_aa/742kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### TGP + Vindija Nean. With Ancestral Allele Calls (72kb)

In [76]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_vin_masked_aa', 'variant', 72)
tgp_invar_df = af.load_windows('tgp_vin_masked_aa', 'invariant', 72)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['VIN'].to_numpy(), axis=0)
tgp_esl_winds_std = np.std(tgp_winds_df['VIN'].to_numpy(), axis=0)
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = ((tgp_var_df['VIN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['VIN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
tgp_invar_esl_winds = ((tgp_invar_df['VIN'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['VIN'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)))
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 44730.01733173208
STD 10241.607979162914
VAR 29620
INVAR 0
TOTAL 29620


In [77]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['VIN'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2397,46024.607843,5280.699923
2,2681,46999.231257,4941.480614
3,2244,46585.835116,5063.785519
4,2304,46336.851562,4885.93239
5,2039,46573.512016,5054.419891
6,2001,46453.377811,5012.011596
7,1609,46417.246737,5247.14893
8,1689,46943.947898,4850.619974
9,1240,46319.700806,5132.763156
10,1379,46583.949964,5141.958983


In [78]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_vin_masked_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

## TGP Only Datasets

### Without Ancestral Allele Calls (742kb)

In [79]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_mod_no_aa', 'variant', 742)
tgp_invar_df = af.load_windows('tgp_mod_no_aa', 'invariant', 742)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['HUM'].to_numpy())
tgp_esl_winds_std = np.std(tgp_winds_df['HUM'].to_numpy())
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = (tgp_var_df['HUM'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['HUM'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std))
tgp_invar_esl_winds = (tgp_invar_df['HUM'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['HUM'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)) 
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 741114.2028423772
STD 275.364211610457
VAR 3273
INVAR 0
TOTAL 3273


In [80]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['HUM'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,279,741104.716846,95.573066
2,307,741083.42671,102.640739
3,251,741071.306773,103.566508
4,229,741030.707424,101.584668
5,224,741063.995536,109.603505
6,203,741044.246305,96.916359
7,184,741078.73913,112.466473
8,173,741097.526012,105.543751
9,135,741059.703704,105.778145
10,162,741055.333333,107.936835


In [81]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_mod_no_aa/742kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )

### Without Ancestral Allele Calls (72kb)

In [82]:
# Load the non-overlapping windows.
tgp_var_df = af.load_windows('tgp_mod_no_aa', 'variant', 72)
tgp_invar_df = af.load_windows('tgp_mod_no_aa', 'invariant', 72)
# Concatenate the two dataframes.
tgp_winds_df = pd.concat([tgp_var_df, tgp_invar_df])
# Compute the mean and standard deviation per archaic.
tgp_esl_winds_avg = np.mean(tgp_winds_df['HUM'].to_numpy())
tgp_esl_winds_std = np.std(tgp_winds_df['HUM'].to_numpy())
print('AVG', tgp_esl_winds_avg)
print('STD', tgp_esl_winds_std)
# Extract the average effective sequence length per window.
tgp_var_esl_winds = (tgp_var_df['HUM'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_var_df['HUM'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std))
tgp_invar_esl_winds = (tgp_invar_df['HUM'].to_numpy() >= (tgp_esl_winds_avg - tgp_esl_winds_std)) & (tgp_invar_df['HUM'].to_numpy() <= (tgp_esl_winds_avg + tgp_esl_winds_std)) 
# Determine the indicies that meet the esl criteria.
tgp_var_esl_qc_idx = np.where(tgp_var_esl_winds)[0]
tgp_invar_esl_qc_idx = np.where(tgp_invar_esl_winds)[0]
# Print a summary of the windows that passed QC.
print('VAR', tgp_var_esl_qc_idx.size)
print('INVAR', tgp_invar_esl_qc_idx.size)
print('TOTAL', (tgp_var_esl_qc_idx.size + tgp_invar_esl_qc_idx.size))

AVG 71914.02022348324
STD 32.923713614372076
VAR 31678
INVAR 0
TOTAL 31678


In [83]:
# Subset out the windows that did not pass QC.
tgp_var_qced_df = tgp_var_df.iloc[tgp_var_esl_qc_idx]
# Show a summary of the QC'ed windows.
tgp_var_qced_df.groupby(['CHR'])['HUM'].agg(['count', 'mean', 'std'])

Unnamed: 0_level_0,count,mean,std
CHR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2752,71913.919331,14.430068
2,2963,71912.683091,14.519718
3,2419,71912.06821,14.713612
4,2249,71908.601601,14.080365
5,2173,71910.924988,14.689085
6,2001,71909.271364,14.344435
7,1762,71910.946084,15.13839
8,1666,71911.310324,14.648977
9,1319,71910.59439,15.119905
10,1551,71910.745326,14.08505


In [84]:
# Export the QC'ed windows indicies.
# np.savetxt(
#     f'../windowing/tgp_mod_no_aa/72kb_esl_qced_nonoverlapping_variant_windows.txt.gz',
#     [tgp_var_esl_qc_idx], fmt='%d',
# )