# Validating Denisovan-specific SNPs with Long-Read Data

In [1]:
# Import packages.
import analysis_functions as af
import allel
import gzip
import numpy as np
import pandas as pd
# Report the version of each third-party package, one per line, for reproducibility.
for pkg_name, pkg in (('allel', allel), ('numpy', np), ('pandas', pd)):
    print(pkg_name, pkg.__version__)

allel 1.3.5
numpy 1.22.3
pandas 1.4.2


In [2]:
# Load the TGP meta-information with haplotype identities for the 72kb region.
tgp_ann_df, tgp_ann_info = af.annotate_tgp_hap_identities_72kb(70, 100)
# Build boolean row masks for the two modern-human groups of interest.
is_afr = tgp_ann_df['SUPERPOP'] == 'AFR'
is_mxl = tgp_ann_df['POP'] == 'MXL'
# Initialize the focal indices: each archaic gets a single hard-coded index (2347-2350),
# which is later used along axis 1 of the genotype matrix.
idx_dicc = {
    arc: np.array([arc_idx])
    for arc, arc_idx in zip(('ALT', 'CHA', 'VIN', 'DEN'), range(2347, 2351))
}
# The modern-human groups use the dataframe index labels of their rows.
idx_dicc['AFR'] = tgp_ann_df.loc[is_afr].index.values
idx_dicc['MXL'] = tgp_ann_df.loc[is_mxl].index.values

In [3]:
# Load the genotype matrix and its site positions through the project helper.
# The last two arguments span 72,000 bp (40759001-40831000), i.e. the 72kb focal region.
# NOTE(review): the arguments presumably are (dataset prefix, chromosome, start, end) — confirm in analysis_functions.
tgp_arcs_no_aa_72kb_gt, tgp_arcs_no_aa_72kb_pos = af.load_hap_region('tgp_arcs_masked_no_aa', 12, 40759001, 40831000)

In [4]:
# Find the archaic SNP partitions; the result is indexed by archaic label downstream (e.g. is_arc_snps_72kb['DEN']).
# NOTE(review): 'MXL' and 0.01 presumably name the focal population and a frequency threshold — confirm in af.find_arc_alleles.
is_arc_snps_72kb = af.find_arc_alleles(tgp_arcs_no_aa_72kb_gt, idx_dicc, 'MXL', 0.01)

In [5]:
# Subset the Denisovan-specific SNPs: index the position array with the 'DEN' partition
# (presumably a per-site boolean mask — confirm) to obtain their hg19 positions.
hg19_den_snps = tgp_arcs_no_aa_72kb_pos[is_arc_snps_72kb['DEN']]

In [6]:
# Sanity check that the Denisovan is homozygous for the alternative allele at every position (135 sites x 2 alleles = 270).
# Steps: keep the Denisovan-specific sites (axis 0), keep the Denisovan column (axis 1), count alleles per site,
# then sum column 1 of the counts (presumably the alternative-allele count of a scikit-allel allele-counts array — confirm).
tgp_arcs_no_aa_72kb_gt.compress(is_arc_snps_72kb['DEN'], axis=0).take(idx_dicc['DEN'], axis=1).count_alleles()[:, 1].sum()

270

## hg19 $\rightarrow$ hg38 Coordinate Conversion

In [7]:
# Initialize the path to the VCF file.
vcf = '../vcf_data/muc19/tgp_mod_all_archaics_masked_var_sites_no_aa_calls_muc19_72kb.vcf.gz'
# Initialize a dictionary that collects the hg19 position and alleles of every Denisovan-specific SNP.
hg19_info = {'POS': [], 'REF': [], 'ALT': []}
# Initialize a 1-based Denisovan SNP counter for the progress messages.
c_den_snp = 1
# Stream the gzipped VCF file in text mode.
with gzip.open(vcf, 'rt') as data:
    for line in data:
        # Meta lines carry no site information, so move on.
        if line.startswith('##'):
            continue
        # The header line: remember the label of its last column (the last individual).
        if line.startswith('#'):
            last_ind = line.split()[-1]
            continue
        # Everything else is a site record; split it into its columns.
        spline = line.split()
        # Ignore sites that are not Denisovan-specific SNPs.
        if int(spline[1]) not in hg19_den_snps:
            continue
        # Grab the position, both alleles, and the first three characters of the last column (its genotype).
        pos = int(spline[1])
        ref = spline[3]
        alt = spline[4]
        den_gt = spline[-1][:3]
        # Fill the dictionary.
        hg19_info['POS'].append(pos)
        hg19_info['REF'].append(ref)
        hg19_info['ALT'].append(alt)
        # Print a summary.
        print(
            f'Found {c_den_snp} out of {hg19_den_snps.size} Denisovan-specific SNPs...\nhg19:{pos}-{ref}-{alt}\t{last_ind}: {den_gt}'
        )
        # Increment.
        c_den_snp += 1

Found 1 out of 135 Denisovan-specific SNPs...
hg19:40788647-G-C	Denisova: 1/1
Found 2 out of 135 Denisovan-specific SNPs...
hg19:40789962-C-T	Denisova: 1/1
Found 3 out of 135 Denisovan-specific SNPs...
hg19:40790924-C-T	Denisova: 1/1
Found 4 out of 135 Denisovan-specific SNPs...
hg19:40791852-T-C	Denisova: 1/1
Found 5 out of 135 Denisovan-specific SNPs...
hg19:40792197-G-C	Denisova: 1/1
Found 6 out of 135 Denisovan-specific SNPs...
hg19:40792300-G-A	Denisova: 1/1
Found 7 out of 135 Denisovan-specific SNPs...
hg19:40792856-T-G	Denisova: 1/1
Found 8 out of 135 Denisovan-specific SNPs...
hg19:40793515-G-A	Denisova: 1/1
Found 9 out of 135 Denisovan-specific SNPs...
hg19:40793899-T-C	Denisova: 1/1
Found 10 out of 135 Denisovan-specific SNPs...
hg19:40794293-C-T	Denisova: 1/1
Found 11 out of 135 Denisovan-specific SNPs...
hg19:40794452-T-C	Denisova: 1/1
Found 12 out of 135 Denisovan-specific SNPs...
hg19:40796334-T-C	Denisova: 1/1
Found 13 out of 135 Denisovan-specific SNPs...
hg19:40796365-

In [8]:
# Replace every list in the dictionary with a NumPy array (the keys are untouched, so iterating while assigning is safe).
for key in hg19_info:
    values = hg19_info[key]
    converted = np.array(values)
    hg19_info[key] = converted
    # Sanity check: report how many entries each field holds.
    print(key, converted.size)

POS 135
REF 135
ALT 135


In [9]:
# Format the positions as single-base 'chr12:pos-pos' intervals for the UCSC genome browser liftover tool
# (https://genome.ucsc.edu/cgi-bin/hgLiftOver).
hg19_liftover_pos = np.array(['chr12:{0}-{0}'.format(pos) for pos in hg19_den_snps])
# The export below is left commented out; uncomment it to (re)write the liftover input file.
# np.savetxt('../hg38_data/hg19_den_snps_72kb.txt', hg19_liftover_pos, fmt='%s')

In [10]:
# Load the hg38 coordinates as strings, one interval per line.
hg38_liftover_pos = np.loadtxt('../hg38_data/hg38_den_snps_72kb.txt', dtype=str)
# Sanity check: the hg19 input and the hg38 result should hold the same number of intervals.
# The tuple is only displayed, not enforced; downstream code pairs hg19 and hg38 entries by index.
hg19_liftover_pos.size, hg38_liftover_pos.size

(135, 135)

In [11]:
# Extract the positions: drop the first six characters of each interval (the 'chr12:' prefix written
# when the intervals were exported) and keep the integer before the dash.
hg38_den_snps = np.array([int(coord[6:].partition('-')[0]) for coord in hg38_liftover_pos])
# Sanity check: show the number of positions and whether they are already in ascending order.
hg38_den_snps.size, np.array_equal(hg38_den_snps, np.sort(hg38_den_snps))

(135, True)

## Validation

In [12]:
# Define a function for bookkeeping.
def hg38_den_snp_tracker(split_vcf_line, hg19_dicc, hg38_pos):
    """
    Build a progress message when a VCF record sits on a Denisovan-specific SNP.

    Parameters
    ----------
    split_vcf_line : sequence of str
        A VCF record split into columns; index 1 is read as the position,
        index 3 as the reference allele and index 4 as the alternative allele.
    hg19_dicc : dict
        Maps 'POS', 'REF' and 'ALT' to sequences that are index-aligned with `hg38_pos`.
    hg38_pos : array-like of int
        The hg38 positions of the Denisovan-specific SNPs.

    Returns
    -------
    str or None
        A message pairing the hg19 and hg38 information of the site, or None when
        the record's position is not in `hg38_pos`.
    """
    # Accept lists as well as arrays so the element-wise comparison below always broadcasts.
    hg38_pos = np.asarray(hg38_pos)
    # Grab the current position.
    pos_hg38 = int(split_vcf_line[1])
    # Find every index at which this position occurs (a single scan covers both membership and lookup).
    match_idx = np.flatnonzero(hg38_pos == pos_hg38)
    # If the current position is not a Denisovan-specific SNP there is nothing to report.
    if match_idx.size == 0:
        return None
    # Take the first match explicitly: converting a 1-D array straight to int is deprecated
    # in recent NumPy releases and raises when a position is listed more than once.
    pos_idx = int(match_idx[0])
    # Grab the hg38 alleles.
    ref_hg38, alt_hg38 = split_vcf_line[3], split_vcf_line[4]
    # Grab the hg19 info.
    pos_hg19, ref_hg19, alt_hg19 = hg19_dicc['POS'][pos_idx], hg19_dicc['REF'][pos_idx], hg19_dicc['ALT'][pos_idx]
    return f'Processed {pos_idx+1} out of {hg38_pos.size} Denisovan-specific SNPs...'+'\n'+f'hg19:{pos_hg19}-{ref_hg19}-{alt_hg19}'+'\t'+f'hg38:{pos_hg38}-{ref_hg38}-{alt_hg38}'+'\n'

### HGSV2

In [13]:
# Load the HGSV2 meta data.
hgsv2_meta_df = pd.read_csv('../hg38_data/HGSV2.csv')
# Determine which HGSV2 individuals have the Denisovan-like haplotype at the 72kb region.
# Collect the sample IDs listed in the 'IND' column of the meta data.
hgsv2_inds = hgsv2_meta_df['IND'].values
# Flag the TGP entries whose ID appears among the HGSV2 samples.
is_in_hgsv2 = np.isin(tgp_ann_info['IND'], hgsv2_inds)
# Keep only those with at least one Denisovan-like haplotype (N_DEN_HAPS_72KB > 0).
# NOTE(review): the mask is used to index tgp_ann_df, so tgp_ann_info is presumably row-aligned with it — confirm.
is_focal_hgsv2_ind = is_in_hgsv2 & (tgp_ann_info['N_DEN_HAPS_72KB'] > 0)

In [14]:
# Show the focal HGSV2 individuals for validation: the rows of the annotation table selected by the focal mask.
tgp_ann_df[is_focal_hgsv2_ind]

Unnamed: 0,IND,POP,SUPERPOP,IS_HAP1_DEN_72KB,IS_HAP2_DEN_72KB,IS_HAP1_HUM_72KB,IS_HAP2_HUM_72KB,IS_HAP1_REC_72KB,IS_HAP2_REC_72KB,N_DEN_HAPS_72KB,N_HUM_HAPS_72KB,N_REC_HAPS_72KB
310,HG00864,CDX,EAS,False,True,True,False,False,False,1,1,0
1050,HG03009,BEB,SAS,False,True,True,False,False,False,1,1,0
2246,NA20847,GIH,SAS,False,True,True,False,False,False,1,1,0


In [15]:
# Initialize a list to store the Denisovan-specific sites.
hgsv2_den_snps = []
# Track whether chromosome 12 records have been reached, so the scan can stop at the first contig after it.
seen_chr12 = False
# Read the HGSV2 VCF file.
with gzip.open('../hg38_data/variants_freeze4_snv_snv_alt.vcf.gz', 'rt') as hgsv2_data:
    # Iterate through every line.
    for line in hgsv2_data:
        # Skip the meta information.
        if line.startswith('##'):
            continue
        # Else-if this is a header line.
        elif line.startswith('#'):
            # Split and grab the header line.
            hgsv2_header_line = np.array(line.split())
        # Else, the line contains genotype information.
        else:
            # Split the line by whitespace.
            spline = line.split()
            # If this is not chromosome 12...
            if spline[0] != 'chr12':
                # ...and chromosome 12 has already been parsed, we are done.
                # (Previously this was a separate `== 'chr13'` branch that could never be reached.)
                # NOTE(review): assumes the chr12 records are contiguous in the file.
                if seen_chr12:
                    break
                # Otherwise chromosome 12 has not started yet, skip.
                continue
            # We are in chromosome 12.
            seen_chr12 = True
            # Check to see if the position is a Denisovan-specific site.
            den_snp_check = hg38_den_snp_tracker(spline, hg19_info, hg38_den_snps)
            if den_snp_check:
                # Print an update.
                print(den_snp_check)
                # Update the list.
                hgsv2_den_snps.append(spline)
            # Stop at (or past) the last position of interest, so a missing final record
            # no longer forces a scan of the remainder of the file.
            # NOTE(review): assumes records are sorted by position within chr12.
            if int(spline[1]) >= hg38_den_snps[-1]:
                break
# Convert the list to an array.
hgsv2_den_snps = np.array(hgsv2_den_snps)
# Construct a dataframe.
hgsv2_den_snps_vcf_df = pd.DataFrame(hgsv2_den_snps, columns=hgsv2_header_line)
# Grab only the necessary columns.
hgsv2_den_snps_vcf_df = hgsv2_den_snps_vcf_df[['#CHROM', 'POS', 'REF', 'ALT', 'HG00864', 'HG03009', 'NA20847']].copy()

Processed 1 out of 135 Denisovan-specific SNPs...
hg19:40788647-G-C	hg38:40394845-G-C

Processed 2 out of 135 Denisovan-specific SNPs...
hg19:40789962-C-T	hg38:40396160-C-T

Processed 3 out of 135 Denisovan-specific SNPs...
hg19:40790924-C-T	hg38:40397122-C-T

Processed 4 out of 135 Denisovan-specific SNPs...
hg19:40791852-T-C	hg38:40398050-T-C

Processed 5 out of 135 Denisovan-specific SNPs...
hg19:40792197-G-C	hg38:40398395-G-C

Processed 6 out of 135 Denisovan-specific SNPs...
hg19:40792300-G-A	hg38:40398498-G-A

Processed 7 out of 135 Denisovan-specific SNPs...
hg19:40792856-T-G	hg38:40399054-T-G

Processed 8 out of 135 Denisovan-specific SNPs...
hg19:40793515-G-A	hg38:40399713-G-A

Processed 9 out of 135 Denisovan-specific SNPs...
hg19:40793899-T-C	hg38:40400097-T-C

Processed 10 out of 135 Denisovan-specific SNPs...
hg19:40794293-C-T	hg38:40400491-C-T

Processed 11 out of 135 Denisovan-specific SNPs...
hg19:40794452-T-C	hg38:40400650-T-C

Processed 12 out of 135 Denisovan-specifi

In [16]:
# Sanity check: the hg19 alleles must equal the HGSV2 (hg38) alleles, element for element and in the same order.
for allele_col in ('REF', 'ALT'):
    passed_qc = np.array_equal(hg19_info[allele_col], hgsv2_den_snps_vcf_df[allele_col].values)
    print(f'Passed {allele_col} QC: {passed_qc}')

Passed REF QC: True
Passed ALT QC: True


In [17]:
# Inspect the HGSV2 records kept at the Denisovan-specific SNPs (site columns plus the three focal samples).
hgsv2_den_snps_vcf_df

Unnamed: 0,#CHROM,POS,REF,ALT,HG00864,HG03009,NA20847
0,chr12,40394845,G,C,0|1,1|0,1|0
1,chr12,40396160,C,T,0|1,1|0,1|0
2,chr12,40397122,C,T,0|1,1|0,1|0
3,chr12,40398050,T,C,0|1,1|0,1|0
4,chr12,40398395,G,C,0|1,1|0,1|0
5,chr12,40398498,G,A,0|1,1|0,1|0
6,chr12,40399054,T,G,0|1,1|0,1|0
7,chr12,40399713,G,A,0|1,1|0,1|0
8,chr12,40400097,T,C,0|1,1|0,1|0
9,chr12,40400491,C,T,0|1,1|0,1|0


### 1KG-ONT

In [18]:
# Load the 1KG-ONT meta data.
ont_1kg_meta_df = pd.read_csv('../hg38_data/1000G-ONT-F100.csv')
# Determine which 1KG-ONT individuals have the Denisovan-like haplotype at the 72kb region.
# Collect the sample IDs listed in the 'IND' column of the meta data.
ont_1kg_inds = ont_1kg_meta_df['IND'].values
# Flag the TGP entries whose ID appears among the 1KG-ONT samples.
is_in_ont_1kg = np.isin(tgp_ann_info['IND'], ont_1kg_inds)
# Keep only AMR individuals with at least one Denisovan-like haplotype (N_DEN_HAPS_72KB > 0).
# NOTE(review): the mask is used to index tgp_ann_df, so tgp_ann_info is presumably row-aligned with it — confirm.
is_focal_ont_1kg_ind = is_in_ont_1kg & (tgp_ann_info['N_DEN_HAPS_72KB'] > 0) & (tgp_ann_info['SUPERPOP'] == 'AMR')

In [19]:
# Show the focal 1KG-ONT individuals for validation: the rows of the annotation table selected by the focal mask.
tgp_ann_df[is_focal_ont_1kg_ind]

Unnamed: 0,IND,POP,SUPERPOP,IS_HAP1_DEN_72KB,IS_HAP2_DEN_72KB,IS_HAP1_HUM_72KB,IS_HAP2_HUM_72KB,IS_HAP1_REC_72KB,IS_HAP2_REC_72KB,N_DEN_HAPS_72KB,N_HUM_HAPS_72KB,N_REC_HAPS_72KB
366,HG01122,CLM,AMR,False,True,True,False,False,False,1,1,0
817,HG02252,PEL,AMR,False,True,True,False,False,False,1,1,0
821,HG02262,PEL,AMR,True,True,False,False,False,False,2,0,0


In [20]:
# Initialize a list to store the Denisovan-specific sites.
HG01122_clair3_den_snps = []
# Track whether chromosome 12 records have been reached, so the scan can stop at the first contig after it.
seen_chr12 = False
# Read the HG01122 Clair3 VCF file.
with gzip.open('../hg38_data/HG01122-ONT-hg38-R9-LSK110-guppy-sup-5mC.clair3.notPhased.vcf.gz', 'rt') as HG01122_clair3_data:
    # Iterate through every line.
    for line in HG01122_clair3_data:
        # Skip the meta information.
        if line.startswith('##'):
            continue
        # Else-if this is a header line.
        elif line.startswith('#'):
            # Split and grab the header line.
            HG01122_clair3_header_line = np.array(line.split())
        # Else, the line contains genotype information.
        else:
            # Split the line by whitespace.
            spline = line.split()
            # If this is not chromosome 12...
            if spline[0] != 'chr12':
                # ...and chromosome 12 has already been parsed, we are done.
                # (Previously this was a separate `== 'chr13'` branch that could never be reached.)
                # NOTE(review): assumes the chr12 records are contiguous in the file.
                if seen_chr12:
                    break
                # Otherwise chromosome 12 has not started yet, skip.
                continue
            # We are in chromosome 12.
            seen_chr12 = True
            # Check to see if the position is a Denisovan-specific site.
            den_snp_check = hg38_den_snp_tracker(spline, hg19_info, hg38_den_snps)
            if den_snp_check:
                # Print an update.
                print(den_snp_check)
                # Clean up the genotype information for the final table: keep only the first
                # ':'-delimited sub-field of the sample column (the genotype) rather than a fixed-width slice.
                spline[-1] = spline[-1].split(':')[0]
                # Update the list.
                HG01122_clair3_den_snps.append(spline)
            # Stop at (or past) the last position of interest, so a missing final record
            # no longer forces a scan of the remainder of the file.
            # NOTE(review): assumes records are sorted by position within chr12.
            if int(spline[1]) >= hg38_den_snps[-1]:
                break
# Convert the list to an array.
HG01122_clair3_den_snps = np.array(HG01122_clair3_den_snps)
# Construct a dataframe.
HG01122_clair3_den_snps_vcf_df = pd.DataFrame(HG01122_clair3_den_snps, columns=HG01122_clair3_header_line)
# Grab only the necessary columns.
HG01122_clair3_den_snps_vcf_df = HG01122_clair3_den_snps_vcf_df[['#CHROM', 'POS', 'REF', 'ALT', 'SAMPLE']].copy()

Processed 1 out of 135 Denisovan-specific SNPs...
hg19:40788647-G-C	hg38:40394845-G-C

Processed 2 out of 135 Denisovan-specific SNPs...
hg19:40789962-C-T	hg38:40396160-C-T

Processed 3 out of 135 Denisovan-specific SNPs...
hg19:40790924-C-T	hg38:40397122-C-T

Processed 4 out of 135 Denisovan-specific SNPs...
hg19:40791852-T-C	hg38:40398050-T-C

Processed 5 out of 135 Denisovan-specific SNPs...
hg19:40792197-G-C	hg38:40398395-G-C

Processed 6 out of 135 Denisovan-specific SNPs...
hg19:40792300-G-A	hg38:40398498-G-A

Processed 7 out of 135 Denisovan-specific SNPs...
hg19:40792856-T-G	hg38:40399054-T-G

Processed 8 out of 135 Denisovan-specific SNPs...
hg19:40793515-G-A	hg38:40399713-G-A

Processed 9 out of 135 Denisovan-specific SNPs...
hg19:40793899-T-C	hg38:40400097-T-C

Processed 10 out of 135 Denisovan-specific SNPs...
hg19:40794293-C-T	hg38:40400491-C-T

Processed 11 out of 135 Denisovan-specific SNPs...
hg19:40794452-T-C	hg38:40400650-T-C

Processed 12 out of 135 Denisovan-specifi

In [21]:
# Sanity check: the hg19 alleles must equal the HG01122 Clair3 (hg38) alleles, element for element and in the same order.
for allele_col in ('REF', 'ALT'):
    passed_qc = np.array_equal(hg19_info[allele_col], HG01122_clair3_den_snps_vcf_df[allele_col].values)
    print(f'Passed {allele_col} QC: {passed_qc}')

Passed REF QC: True
Passed ALT QC: True


In [22]:
# Inspect the HG01122 Clair3 records kept at the Denisovan-specific SNPs (site columns plus the sample genotype).
HG01122_clair3_den_snps_vcf_df

Unnamed: 0,#CHROM,POS,REF,ALT,SAMPLE
0,chr12,40394845,G,C,0/1
1,chr12,40396160,C,T,0/1
2,chr12,40397122,C,T,0/1
3,chr12,40398050,T,C,0/1
4,chr12,40398395,G,C,0/1
5,chr12,40398498,G,A,0/1
6,chr12,40399054,T,G,0/1
7,chr12,40399713,G,A,0/1
8,chr12,40400097,T,C,0/1
9,chr12,40400491,C,T,0/1


In [23]:
# Initialize a list to store the Denisovan-specific sites.
HG01122_pmdv_den_snps = []
# Track whether chromosome 12 records have been reached, so the scan can stop at the first contig after it.
seen_chr12 = False
# Read the HG01122 PMDV VCF file.
with gzip.open('../hg38_data/HG01122-ONT-hg38-R9-LSK110-guppy-sup-5mC.PMDV_FINAL.vcf.gz', 'rt') as HG01122_pmdv_data:
    # Iterate through every line.
    for line in HG01122_pmdv_data:
        # Skip the meta information.
        if line.startswith('##'):
            continue
        # Else-if this is a header line.
        elif line.startswith('#'):
            # Split and grab the header line.
            HG01122_pmdv_header_line = np.array(line.split())
        # Else, the line contains genotype information.
        else:
            # Split the line by whitespace.
            spline = line.split()
            # If this is not chromosome 12...
            if spline[0] != 'chr12':
                # ...and chromosome 12 has already been parsed, we are done.
                # (Previously this was a separate `== 'chr13'` branch that could never be reached.)
                # NOTE(review): assumes the chr12 records are contiguous in the file.
                if seen_chr12:
                    break
                # Otherwise chromosome 12 has not started yet, skip.
                continue
            # We are in chromosome 12.
            seen_chr12 = True
            # Check to see if the position is a Denisovan-specific site.
            den_snp_check = hg38_den_snp_tracker(spline, hg19_info, hg38_den_snps)
            if den_snp_check:
                # Print an update.
                print(den_snp_check)
                # Clean up the genotype information for the final table: keep only the first
                # ':'-delimited sub-field of the sample column (the genotype) rather than a fixed-width slice.
                spline[-1] = spline[-1].split(':')[0]
                # Update the list.
                HG01122_pmdv_den_snps.append(spline)
            # Stop at (or past) the last position of interest, so a missing final record
            # no longer forces a scan of the remainder of the file.
            # NOTE(review): assumes records are sorted by position within chr12.
            if int(spline[1]) >= hg38_den_snps[-1]:
                break
# Convert the list to an array.
HG01122_pmdv_den_snps = np.array(HG01122_pmdv_den_snps)
# Construct a dataframe.
HG01122_pmdv_den_snps_vcf_df = pd.DataFrame(HG01122_pmdv_den_snps, columns=HG01122_pmdv_header_line)
# Grab only the necessary columns.
HG01122_pmdv_den_snps_vcf_df = HG01122_pmdv_den_snps_vcf_df[['#CHROM', 'POS', 'REF', 'ALT', 'Sample']].copy()

Processed 1 out of 135 Denisovan-specific SNPs...
hg19:40788647-G-C	hg38:40394845-G-C

Processed 2 out of 135 Denisovan-specific SNPs...
hg19:40789962-C-T	hg38:40396160-C-T

Processed 3 out of 135 Denisovan-specific SNPs...
hg19:40790924-C-T	hg38:40397122-C-T

Processed 4 out of 135 Denisovan-specific SNPs...
hg19:40791852-T-C	hg38:40398050-T-C

Processed 5 out of 135 Denisovan-specific SNPs...
hg19:40792197-G-C	hg38:40398395-G-C

Processed 6 out of 135 Denisovan-specific SNPs...
hg19:40792300-G-A	hg38:40398498-G-A

Processed 7 out of 135 Denisovan-specific SNPs...
hg19:40792856-T-G	hg38:40399054-T-G

Processed 8 out of 135 Denisovan-specific SNPs...
hg19:40793515-G-A	hg38:40399713-G-A

Processed 9 out of 135 Denisovan-specific SNPs...
hg19:40793899-T-C	hg38:40400097-T-C

Processed 10 out of 135 Denisovan-specific SNPs...
hg19:40794293-C-T	hg38:40400491-C-T

Processed 11 out of 135 Denisovan-specific SNPs...
hg19:40794452-T-C	hg38:40400650-T-C

Processed 12 out of 135 Denisovan-specifi

In [24]:
# Sanity check: the hg19 alleles must equal the HG01122 PMDV (hg38) alleles, element for element and in the same order.
for allele_col in ('REF', 'ALT'):
    passed_qc = np.array_equal(hg19_info[allele_col], HG01122_pmdv_den_snps_vcf_df[allele_col].values)
    print(f'Passed {allele_col} QC: {passed_qc}')

Passed REF QC: True
Passed ALT QC: True


In [25]:
# Inspect the HG01122 PMDV records kept at the Denisovan-specific SNPs (site columns plus the sample genotype).
HG01122_pmdv_den_snps_vcf_df

Unnamed: 0,#CHROM,POS,REF,ALT,Sample
0,chr12,40394845,G,C,0/1
1,chr12,40396160,C,T,0/1
2,chr12,40397122,C,T,0/1
3,chr12,40398050,T,C,0/1
4,chr12,40398395,G,C,0/1
5,chr12,40398498,G,A,0/1
6,chr12,40399054,T,G,0/1
7,chr12,40399713,G,A,0/1
8,chr12,40400097,T,C,0/1
9,chr12,40400491,C,T,0/1


In [26]:
# Initialize a list to store the Denisovan-specific sites.
HG02252_clair3_den_snps = []
# Track whether chromosome 12 records have been reached, so the scan can stop at the first contig after it.
seen_chr12 = False
# Read the HG02252 Clair3 VCF file.
with gzip.open('../hg38_data/HG02252-ONT-hg38-R9-LSK110-guppy-sup-5mC.clair3.notPhased.vcf.gz', 'rt') as HG02252_clair3_data:
    # Iterate through every line.
    for line in HG02252_clair3_data:
        # Skip the meta information.
        if line.startswith('##'):
            continue
        # Else-if this is a header line.
        elif line.startswith('#'):
            # Split and grab the header line.
            HG02252_clair3_header_line = np.array(line.split())
        # Else, the line contains genotype information.
        else:
            # Split the line by whitespace.
            spline = line.split()
            # If this is not chromosome 12...
            if spline[0] != 'chr12':
                # ...and chromosome 12 has already been parsed, we are done.
                # (Previously this was a separate `== 'chr13'` branch that could never be reached.)
                # NOTE(review): assumes the chr12 records are contiguous in the file.
                if seen_chr12:
                    break
                # Otherwise chromosome 12 has not started yet, skip.
                continue
            # We are in chromosome 12.
            seen_chr12 = True
            # Check to see if the position is a Denisovan-specific site.
            den_snp_check = hg38_den_snp_tracker(spline, hg19_info, hg38_den_snps)
            if den_snp_check:
                # Print an update.
                print(den_snp_check)
                # Clean up the genotype information for the final table: keep only the first
                # ':'-delimited sub-field of the sample column (the genotype) rather than a fixed-width slice.
                spline[-1] = spline[-1].split(':')[0]
                # Update the list.
                HG02252_clair3_den_snps.append(spline)
            # Stop at (or past) the last position of interest, so a missing final record
            # no longer forces a scan of the remainder of the file.
            # NOTE(review): assumes records are sorted by position within chr12.
            if int(spline[1]) >= hg38_den_snps[-1]:
                break
# Convert the list to an array.
HG02252_clair3_den_snps = np.array(HG02252_clair3_den_snps)
# Construct a dataframe.
HG02252_clair3_den_snps_vcf_df = pd.DataFrame(HG02252_clair3_den_snps, columns=HG02252_clair3_header_line)
# Grab only the necessary columns.
HG02252_clair3_den_snps_vcf_df = HG02252_clair3_den_snps_vcf_df[['#CHROM', 'POS', 'REF', 'ALT', 'SAMPLE']].copy()

Processed 1 out of 135 Denisovan-specific SNPs...
hg19:40788647-G-C	hg38:40394845-G-C

Processed 2 out of 135 Denisovan-specific SNPs...
hg19:40789962-C-T	hg38:40396160-C-T

Processed 3 out of 135 Denisovan-specific SNPs...
hg19:40790924-C-T	hg38:40397122-C-T

Processed 4 out of 135 Denisovan-specific SNPs...
hg19:40791852-T-C	hg38:40398050-T-C

Processed 5 out of 135 Denisovan-specific SNPs...
hg19:40792197-G-C	hg38:40398395-G-C

Processed 6 out of 135 Denisovan-specific SNPs...
hg19:40792300-G-A	hg38:40398498-G-A

Processed 7 out of 135 Denisovan-specific SNPs...
hg19:40792856-T-G	hg38:40399054-T-G

Processed 8 out of 135 Denisovan-specific SNPs...
hg19:40793515-G-A	hg38:40399713-G-A

Processed 9 out of 135 Denisovan-specific SNPs...
hg19:40793899-T-C	hg38:40400097-T-C

Processed 10 out of 135 Denisovan-specific SNPs...
hg19:40794293-C-T	hg38:40400491-C-T

Processed 11 out of 135 Denisovan-specific SNPs...
hg19:40794452-T-C	hg38:40400650-T-C

Processed 12 out of 135 Denisovan-specifi

In [27]:
# Sanity check: the hg19 alleles must equal the HG02252 Clair3 (hg38) alleles, element for element and in the same order.
for allele_col in ('REF', 'ALT'):
    passed_qc = np.array_equal(hg19_info[allele_col], HG02252_clair3_den_snps_vcf_df[allele_col].values)
    print(f'Passed {allele_col} QC: {passed_qc}')

Passed REF QC: True
Passed ALT QC: True


In [28]:
# Inspect the HG02252 Clair3 records kept at the Denisovan-specific SNPs (site columns plus the sample genotype).
HG02252_clair3_den_snps_vcf_df

Unnamed: 0,#CHROM,POS,REF,ALT,SAMPLE
0,chr12,40394845,G,C,0/1
1,chr12,40396160,C,T,0/1
2,chr12,40397122,C,T,0/1
3,chr12,40398050,T,C,0/1
4,chr12,40398395,G,C,0/1
5,chr12,40398498,G,A,0/1
6,chr12,40399054,T,G,0/1
7,chr12,40399713,G,A,0/1
8,chr12,40400097,T,C,0/1
9,chr12,40400491,C,T,0/1


In [29]:
# Initialize a list to store the Denisovan-specific sites.
HG02252_pmdv_den_snps = []
# Track whether chromosome 12 records have been reached, so the scan can stop at the first contig after it.
seen_chr12 = False
# Read the HG02252 PMDV VCF file.
with gzip.open('../hg38_data/HG02252-ONT-hg38-R9-LSK110-guppy-sup-5mC.PMDV_FINAL.vcf.gz', 'rt') as HG02252_pmdv_data:
    # Iterate through every line.
    for line in HG02252_pmdv_data:
        # Skip the meta information.
        if line.startswith('##'):
            continue
        # Else-if this is a header line.
        elif line.startswith('#'):
            # Split and grab the header line.
            HG02252_pmdv_header_line = np.array(line.split())
        # Else, the line contains genotype information.
        else:
            # Split the line by whitespace.
            spline = line.split()
            # If this is not chromosome 12...
            if spline[0] != 'chr12':
                # ...and chromosome 12 has already been parsed, we are done.
                # (Previously this was a separate `== 'chr13'` branch that could never be reached.)
                # NOTE(review): assumes the chr12 records are contiguous in the file.
                if seen_chr12:
                    break
                # Otherwise chromosome 12 has not started yet, skip.
                continue
            # We are in chromosome 12.
            seen_chr12 = True
            # Check to see if the position is a Denisovan-specific site.
            den_snp_check = hg38_den_snp_tracker(spline, hg19_info, hg38_den_snps)
            if den_snp_check:
                # Print an update.
                print(den_snp_check)
                # Clean up the genotype information for the final table: keep only the first
                # ':'-delimited sub-field of the sample column (the genotype) rather than a fixed-width slice.
                spline[-1] = spline[-1].split(':')[0]
                # Update the list.
                HG02252_pmdv_den_snps.append(spline)
            # Stop at (or past) the last position of interest, so a missing final record
            # no longer forces a scan of the remainder of the file.
            # NOTE(review): assumes records are sorted by position within chr12.
            if int(spline[1]) >= hg38_den_snps[-1]:
                break
# Convert the list to an array.
HG02252_pmdv_den_snps = np.array(HG02252_pmdv_den_snps)
# Construct a dataframe.
HG02252_pmdv_den_snps_vcf_df = pd.DataFrame(HG02252_pmdv_den_snps, columns=HG02252_pmdv_header_line)
# Grab only the necessary columns.
HG02252_pmdv_den_snps_vcf_df = HG02252_pmdv_den_snps_vcf_df[['#CHROM', 'POS', 'REF', 'ALT', 'Sample']].copy()

Processed 1 out of 135 Denisovan-specific SNPs...
hg19:40788647-G-C	hg38:40394845-G-C

Processed 2 out of 135 Denisovan-specific SNPs...
hg19:40789962-C-T	hg38:40396160-C-T

Processed 3 out of 135 Denisovan-specific SNPs...
hg19:40790924-C-T	hg38:40397122-C-T

Processed 4 out of 135 Denisovan-specific SNPs...
hg19:40791852-T-C	hg38:40398050-T-C

Processed 5 out of 135 Denisovan-specific SNPs...
hg19:40792197-G-C	hg38:40398395-G-C

Processed 6 out of 135 Denisovan-specific SNPs...
hg19:40792300-G-A	hg38:40398498-G-A

Processed 7 out of 135 Denisovan-specific SNPs...
hg19:40792856-T-G	hg38:40399054-T-G

Processed 8 out of 135 Denisovan-specific SNPs...
hg19:40793515-G-A	hg38:40399713-G-A

Processed 9 out of 135 Denisovan-specific SNPs...
hg19:40793899-T-C	hg38:40400097-T-C

Processed 10 out of 135 Denisovan-specific SNPs...
hg19:40794293-C-T	hg38:40400491-C-T

Processed 11 out of 135 Denisovan-specific SNPs...
hg19:40794452-T-C	hg38:40400650-T-C

Processed 12 out of 135 Denisovan-specifi

In [30]:
# Sanity check: the hg19 alleles must equal the HG02252 PMDV (hg38) alleles, element for element and in the same order.
for allele_col in ('REF', 'ALT'):
    passed_qc = np.array_equal(hg19_info[allele_col], HG02252_pmdv_den_snps_vcf_df[allele_col].values)
    print(f'Passed {allele_col} QC: {passed_qc}')

Passed REF QC: True
Passed ALT QC: True


In [31]:
# Inspect the HG02252 PMDV records kept at the Denisovan-specific SNPs (site columns plus the sample genotype).
HG02252_pmdv_den_snps_vcf_df

Unnamed: 0,#CHROM,POS,REF,ALT,Sample
0,chr12,40394845,G,C,0/1
1,chr12,40396160,C,T,0/1
2,chr12,40397122,C,T,0/1
3,chr12,40398050,T,C,0/1
4,chr12,40398395,G,C,0/1
5,chr12,40398498,G,A,0/1
6,chr12,40399054,T,G,0/1
7,chr12,40399713,G,A,0/1
8,chr12,40400097,T,C,0/1
9,chr12,40400491,C,T,0/1


In [32]:
# Initialize a list to store the Denisovan-specific sites.
HG02262_clair3_den_snps = []
# Track whether chromosome 12 records have been reached, so the scan can stop at the first contig after it.
seen_chr12 = False
# Read the HG02262 Clair3 VCF file.
with gzip.open('../hg38_data/HG02262-ONT-hg38-R9-LSK110-guppy-sup-5mC.clair3.notPhased.vcf.gz', 'rt') as HG02262_clair3_data:
    # Iterate through every line.
    for line in HG02262_clair3_data:
        # Skip the meta information.
        if line.startswith('##'):
            continue
        # Else-if this is a header line.
        elif line.startswith('#'):
            # Split and grab the header line.
            HG02262_clair3_header_line = np.array(line.split())
        # Else, the line contains genotype information.
        else:
            # Split the line by whitespace.
            spline = line.split()
            # If this is not chromosome 12...
            if spline[0] != 'chr12':
                # ...and chromosome 12 has already been parsed, we are done.
                # (Previously this was a separate `== 'chr13'` branch that could never be reached.)
                # NOTE(review): assumes the chr12 records are contiguous in the file.
                if seen_chr12:
                    break
                # Otherwise chromosome 12 has not started yet, skip.
                continue
            # We are in chromosome 12.
            seen_chr12 = True
            # Check to see if the position is a Denisovan-specific site.
            den_snp_check = hg38_den_snp_tracker(spline, hg19_info, hg38_den_snps)
            if den_snp_check:
                # Print an update.
                print(den_snp_check)
                # Clean up the genotype information for the final table: keep only the first
                # ':'-delimited sub-field of the sample column (the genotype) rather than a fixed-width slice.
                spline[-1] = spline[-1].split(':')[0]
                # Update the list.
                HG02262_clair3_den_snps.append(spline)
            # Stop at (or past) the last position of interest, so a missing final record
            # no longer forces a scan of the remainder of the file.
            # NOTE(review): assumes records are sorted by position within chr12.
            if int(spline[1]) >= hg38_den_snps[-1]:
                break
# Convert the list to an array.
HG02262_clair3_den_snps = np.array(HG02262_clair3_den_snps)
# Construct a dataframe.
HG02262_clair3_den_snps_vcf_df = pd.DataFrame(HG02262_clair3_den_snps, columns=HG02262_clair3_header_line)
# Grab only the necessary columns.
HG02262_clair3_den_snps_vcf_df = HG02262_clair3_den_snps_vcf_df[['#CHROM', 'POS', 'REF', 'ALT', 'SAMPLE']].copy()

Processed 1 out of 135 Denisovan-specific SNPs...
hg19:40788647-G-C	hg38:40394845-G-C

Processed 2 out of 135 Denisovan-specific SNPs...
hg19:40789962-C-T	hg38:40396160-C-T

Processed 3 out of 135 Denisovan-specific SNPs...
hg19:40790924-C-T	hg38:40397122-C-T

Processed 4 out of 135 Denisovan-specific SNPs...
hg19:40791852-T-C	hg38:40398050-T-C

Processed 5 out of 135 Denisovan-specific SNPs...
hg19:40792197-G-C	hg38:40398395-G-C

Processed 6 out of 135 Denisovan-specific SNPs...
hg19:40792300-G-A	hg38:40398498-G-A

Processed 7 out of 135 Denisovan-specific SNPs...
hg19:40792856-T-G	hg38:40399054-T-G

Processed 8 out of 135 Denisovan-specific SNPs...
hg19:40793515-G-A	hg38:40399713-G-A

Processed 9 out of 135 Denisovan-specific SNPs...
hg19:40793899-T-C	hg38:40400097-T-C

Processed 10 out of 135 Denisovan-specific SNPs...
hg19:40794293-C-T	hg38:40400491-C-T

Processed 11 out of 135 Denisovan-specific SNPs...
hg19:40794452-T-C	hg38:40400650-T-C

Processed 12 out of 135 Denisovan-specifi

In [33]:
# Sanity check: the hg19 alleles must equal the HG02262 Clair3 (hg38) alleles, element for element and in the same order.
for allele_col in ('REF', 'ALT'):
    passed_qc = np.array_equal(hg19_info[allele_col], HG02262_clair3_den_snps_vcf_df[allele_col].values)
    print(f'Passed {allele_col} QC: {passed_qc}')

Passed REF QC: True
Passed ALT QC: True


In [34]:
# Inspect the HG02262 Clair3 records kept at the Denisovan-specific SNPs (site columns plus the sample genotype).
HG02262_clair3_den_snps_vcf_df

Unnamed: 0,#CHROM,POS,REF,ALT,SAMPLE
0,chr12,40394845,G,C,1/1
1,chr12,40396160,C,T,1/1
2,chr12,40397122,C,T,1/1
3,chr12,40398050,T,C,1/1
4,chr12,40398395,G,C,1/1
5,chr12,40398498,G,A,1/1
6,chr12,40399054,T,G,1/1
7,chr12,40399713,G,A,1/1
8,chr12,40400097,T,C,1/1
9,chr12,40400491,C,T,1/1


In [35]:
# Initialize a list to store the Denisovan-specific sites.
HG02262_pmdv_den_snps = []
# Track whether chromosome 12 records have been reached, so the scan can stop at the first contig after it.
seen_chr12 = False
# Read the HG02262 PMDV VCF file.
with gzip.open('../hg38_data/HG02262-ONT-hg38-R9-LSK110-guppy-sup-5mC.PMDV_FINAL.vcf.gz', 'rt') as HG02262_pmdv_data:
    # Iterate through every line.
    for line in HG02262_pmdv_data:
        # Skip the meta information.
        if line.startswith('##'):
            continue
        # Else-if this is a header line.
        elif line.startswith('#'):
            # Split and grab the header line.
            HG02262_pmdv_header_line = np.array(line.split())
        # Else, the line contains genotype information.
        else:
            # Split the line by whitespace.
            spline = line.split()
            # If this is not chromosome 12...
            if spline[0] != 'chr12':
                # ...and chromosome 12 has already been parsed, we are done.
                # (Previously this was a separate `== 'chr13'` branch that could never be reached.)
                # NOTE(review): assumes the chr12 records are contiguous in the file.
                if seen_chr12:
                    break
                # Otherwise chromosome 12 has not started yet, skip.
                continue
            # We are in chromosome 12.
            seen_chr12 = True
            # Check to see if the position is a Denisovan-specific site.
            den_snp_check = hg38_den_snp_tracker(spline, hg19_info, hg38_den_snps)
            if den_snp_check:
                # Print an update.
                print(den_snp_check)
                # Clean up the genotype information for the final table: keep only the first
                # ':'-delimited sub-field of the sample column (the genotype) rather than a fixed-width slice.
                spline[-1] = spline[-1].split(':')[0]
                # Update the list.
                HG02262_pmdv_den_snps.append(spline)
            # Stop at (or past) the last position of interest, so a missing final record
            # no longer forces a scan of the remainder of the file.
            # NOTE(review): assumes records are sorted by position within chr12.
            if int(spline[1]) >= hg38_den_snps[-1]:
                break
# Convert the list to an array.
HG02262_pmdv_den_snps = np.array(HG02262_pmdv_den_snps)
# Construct a dataframe.
HG02262_pmdv_den_snps_vcf_df = pd.DataFrame(HG02262_pmdv_den_snps, columns=HG02262_pmdv_header_line)
# Grab only the necessary columns.
HG02262_pmdv_den_snps_vcf_df = HG02262_pmdv_den_snps_vcf_df[['#CHROM', 'POS', 'REF', 'ALT', 'Sample']].copy()

Processed 1 out of 135 Denisovan-specific SNPs...
hg19:40788647-G-C	hg38:40394845-G-C

Processed 2 out of 135 Denisovan-specific SNPs...
hg19:40789962-C-T	hg38:40396160-C-T

Processed 3 out of 135 Denisovan-specific SNPs...
hg19:40790924-C-T	hg38:40397122-C-T

Processed 4 out of 135 Denisovan-specific SNPs...
hg19:40791852-T-C	hg38:40398050-T-C

Processed 5 out of 135 Denisovan-specific SNPs...
hg19:40792197-G-C	hg38:40398395-G-C

Processed 6 out of 135 Denisovan-specific SNPs...
hg19:40792300-G-A	hg38:40398498-G-A

Processed 7 out of 135 Denisovan-specific SNPs...
hg19:40792856-T-G	hg38:40399054-T-G

Processed 8 out of 135 Denisovan-specific SNPs...
hg19:40793515-G-A	hg38:40399713-G-A

Processed 9 out of 135 Denisovan-specific SNPs...
hg19:40793899-T-C	hg38:40400097-T-C

Processed 10 out of 135 Denisovan-specific SNPs...
hg19:40794293-C-T	hg38:40400491-C-T

Processed 11 out of 135 Denisovan-specific SNPs...
hg19:40794452-T-C	hg38:40400650-T-C

Processed 12 out of 135 Denisovan-specifi

In [36]:
# Sanity check: the hg19 alleles should match the alleles in the HG02262 PMDV dataframe.
# Compare the reference alleles and report.
ref_qc_passed = np.array_equal(hg19_info["REF"], HG02262_pmdv_den_snps_vcf_df["REF"].values)
print(f'Passed REF QC: {ref_qc_passed}')
# Compare the alternative alleles and report.
alt_qc_passed = np.array_equal(hg19_info["ALT"], HG02262_pmdv_den_snps_vcf_df["ALT"].values)
print(f'Passed ALT QC: {alt_qc_passed}')

Passed REF QC: True
Passed ALT QC: True


In [37]:
# Inspect the HG02262 PMDV dataframe (#CHROM, POS, REF, ALT, and Sample columns);
# as the last expression in the cell it is rendered as the cell output.
HG02262_pmdv_den_snps_vcf_df

Unnamed: 0,#CHROM,POS,REF,ALT,Sample
0,chr12,40394845,G,C,1/1
1,chr12,40396160,C,T,1/1
2,chr12,40397122,C,T,1/1
3,chr12,40398050,T,C,1/1
4,chr12,40398395,G,C,1/1
5,chr12,40398498,G,A,1/1
6,chr12,40399054,T,G,1/1
7,chr12,40399713,G,A,1/1
8,chr12,40400097,T,C,1/1
9,chr12,40400491,C,T,1/1


### Validation Table

In [38]:
# Initialize a dictionary with one entry per column of the validation table.
# Note: I have demonstrated that the reference and alternative alleles are the same
# across all datasets, so the Hg38 allele columns are just grabbed from the HG02262
# PMDV dataframe since the choice is arbitrary.
validation_dicc = {
    'Chr12 Position (Hg19)': hg19_info['POS'],
    'Ref. Allele (Hg19)': hg19_info['REF'],
    'Denisovan Allele (Hg19)': hg19_info['ALT'],
    'Chr12 Position (Hg38)': hg38_den_snps,
    'Ref. Allele (Hg38)': HG02262_pmdv_den_snps_vcf_df['REF'].values,
    'Denisovan Allele (Hg38)': HG02262_pmdv_den_snps_vcf_df['ALT'].values,
    'HG00864 (PAV)': hgsv2_den_snps_vcf_df['HG00864'].values,
    'HG03009 (PAV)': hgsv2_den_snps_vcf_df['HG03009'].values,
    'NA20847 (PAV)': hgsv2_den_snps_vcf_df['NA20847'].values,
    'HG01122 (Clair3)': HG01122_clair3_den_snps_vcf_df['SAMPLE'].values,
    'HG01122 (PMDV)': HG01122_pmdv_den_snps_vcf_df['Sample'].values,
    'HG02252 (Clair3)': HG02252_clair3_den_snps_vcf_df['SAMPLE'].values,
    'HG02252 (PMDV)': HG02252_pmdv_den_snps_vcf_df['Sample'].values,
    'HG02262 (Clair3)': HG02262_clair3_den_snps_vcf_df['SAMPLE'].values,
    'HG02262 (PMDV)': HG02262_pmdv_den_snps_vcf_df['Sample'].values,
}
# Initialize the sample columns.
samp_cols = [
    'HG00864 (PAV)', 'HG03009 (PAV)', 'NA20847 (PAV)',
    'HG01122 (Clair3)', 'HG01122 (PMDV)',
    'HG02252 (Clair3)', 'HG02252 (PMDV)',
    'HG02262 (Clair3)', 'HG02262 (PMDV)'
]
# For every sample.
for samp in samp_cols:
    # The HGSV2 (PAV) genotypes are split on '|' and the 1KG-ONT genotypes on '/'.
    gt_sep = '|' if 'PAV' in samp else '/'
    # Initialize a list for the recoded genotypes. The recoded values are collected in a
    # new list instead of being written back into the `.values` array: that array can be
    # a view of the source dataframe, so writing into it would overwrite the original
    # genotype calls and make this cell fail when it is run a second time.
    recoded_gts = []
    # Iterate through each position.
    for gt, ref, alt in zip(
        validation_dicc[samp],
        validation_dicc['Ref. Allele (Hg38)'],
        validation_dicc['Denisovan Allele (Hg38)'],
    ):
        # Initialize the allele mapping.
        gt2allele = {'0': ref, '1': alt}
        # Recode the genotype with the alleles; any code other than '0' or '1'
        # (for example a '.' no-call) is kept as-is so it stays visible in the table.
        recoded_gts.append(gt_sep.join(gt2allele.get(allele, allele) for allele in gt.split(gt_sep)))
    # Replace the dictionary entry with the recoded genotypes (the source dataframe is untouched).
    validation_dicc[samp] = recoded_gts
# Convert the dictionary to a dataframe.
validation_df = pd.DataFrame(validation_dicc)

In [39]:
# Export (the write is commented out in this cell; uncomment the next line to save the table as a gzipped csv).
# validation_df.to_csv('./dataframes/denisovan_snp_hgsv2_1kg_ont_long_read_validation.csv.gz', index=False)
# View the validation dataframe (the last expression in the cell is rendered as the cell output).
validation_df

Unnamed: 0,Chr12 Position (Hg19),Ref. Allele (Hg19),Denisovan Allele (Hg19),Chr12 Position (Hg38),Ref. Allele (Hg38),Denisovan Allele (Hg38),HG00864 (PAV),HG03009 (PAV),NA20847 (PAV),HG01122 (Clair3),HG01122 (PMDV),HG02252 (Clair3),HG02252 (PMDV),HG02262 (Clair3),HG02262 (PMDV)
0,40788647,G,C,40394845,G,C,G|C,C|G,C|G,G/C,G/C,G/C,G/C,C/C,C/C
1,40789962,C,T,40396160,C,T,C|T,T|C,T|C,C/T,C/T,C/T,C/T,T/T,T/T
2,40790924,C,T,40397122,C,T,C|T,T|C,T|C,C/T,C/T,C/T,C/T,T/T,T/T
3,40791852,T,C,40398050,T,C,T|C,C|T,C|T,T/C,T/C,T/C,T/C,C/C,C/C
4,40792197,G,C,40398395,G,C,G|C,C|G,C|G,G/C,G/C,G/C,G/C,C/C,C/C
5,40792300,G,A,40398498,G,A,G|A,A|G,A|G,G/A,G/A,G/A,G/A,A/A,A/A
6,40792856,T,G,40399054,T,G,T|G,G|T,G|T,T/G,T/G,T/G,T/G,G/G,G/G
7,40793515,G,A,40399713,G,A,G|A,A|G,A|G,G/A,G/A,G/A,G/A,A/A,A/A
8,40793899,T,C,40400097,T,C,T|C,C|T,C|T,T/C,T/C,T/C,T/C,C/C,C/C
9,40794293,C,T,40400491,C,T,C|T,T|C,T|C,C/T,C/T,C/T,C/T,T/T,T/T
