# Missense Mutations

In [1]:
# Import packages.
import analysis_functions as af
import allel
import numpy as np
import pandas as pd
# Report the version of each core dependency, one per line.
for pkg_label, pkg_module in (('allel', allel), ('numpy', np), ('pandas', pd)):
    print(pkg_label, pkg_module.__version__)

allel 1.3.5
numpy 1.22.3
pandas 1.4.2


In [2]:
# Initialize the pandas display preferences: setting both limits to None
# removes the cap on how many columns and rows are rendered.
for display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_opt, None)

In [8]:
# Read the missense mutation table from the meta data directory.
den_mis_csv = '../meta_data/denisovan_specific_derived_missense_muts_info.csv'
den_mis_df = pd.read_csv(den_mis_csv)
# Keep the mutation positions as a standalone array; later cells use it to
# build site masks.
den_mis_pos = den_mis_df.loc[:, 'POS'].values

## TGP & SGDP

In [9]:
# Read the tab-delimited sample tables for the tgp and the sgdp papuans. The
# column names are supplied explicitly; the papuan table carries one extra
# leading 'IDX' column.
sample_cols = ['IND', 'POP', 'SUPERPOP']
tgp_df = pd.read_csv('../meta_data/tgp_mod.txt', sep='\t', names=sample_cols)
pap_df = pd.read_csv('../meta_data/sgdp_pap.txt', sep='\t', names=['IDX'] + sample_cols)
# Read the sgdp american sample table (column names come from the csv itself).
sgdp_amr_df = pd.read_csv('../meta_data/sgdp_amr.csv')
# Initialize the tgp population list, grouped by super population; the
# flattened list keeps the super populations and their members in this order.
tgp_pops_by_superpop = {
    'AFR': ['LWK', 'GWD', 'MSL', 'ESN', 'YRI'],
    'SAS': ['BEB', 'STU', 'ITU', 'PJL', 'GIH'],
    'EAS': ['CHB', 'KHV', 'CHS', 'JPT', 'CDX'],
    'EUR': ['TSI', 'CEU', 'IBS', 'GBR', 'FIN'],
    'AMR': ['PEL', 'MXL', 'CLM', 'PUR'],
}
tgp_pop_list = [
    tgp_pop
    for superpop_members in tgp_pops_by_superpop.values()
    for tgp_pop in superpop_members
]
# Initialize the sgdp american populations that get their own column.
sgdp_amr_list = [
    'Mayan',
    'Mixe',
    'Mixtec',
    'Pima',
    'Zapotec',
    'Chane',
    'Karitiana',
    'Surui',
    'Piapoco',
    'Quechua',
]
# Initialize the population -> country lookup for the sgdp american samples,
# expanded from a per-country grouping.
sgdp_amr_dicc = {
    amr_pop: country
    for country, country_pops in (
        ('Canada', ('Chipewyan', 'Cree')),
        ('Mexico', ('Mayan', 'Mixe', 'Mixtec', 'Nahua', 'Pima', 'Zapotec')),
        ('Argentina', ('Chane',)),
        ('Brazil', ('Karitiana', 'Surui')),
        ('Colombia', ('Piapoco',)),
        ('Peru', ('Quechua',)),
    )
    for amr_pop in country_pops
}

In [10]:
# Define the 72kb focal region once so both datasets are guaranteed to use the
# same coordinates. The values are forwarded positionally to
# af.load_hap_region (presumably chromosome, start, end -- confirm against
# analysis_functions).
region_72kb = (12, 40758000, 40830000)
# Load the genotype matrices and their position arrays.
tgp_72kb_gt, tgp_72kb_pos = af.load_hap_region('tgp_mod_arc_anc', *region_72kb)
sgdp_72kb_gt, sgdp_72kb_pos = af.load_hap_region('sgdp_arc_anc', *region_72kb)
# Create missense mutation masks. np.isin is the supported replacement for
# np.in1d, which is deprecated as of NumPy 2.0.
tgp_mis_mask = np.isin(tgp_72kb_pos, den_mis_pos)
sgdp_mis_mask = np.isin(sgdp_72kb_pos, den_mis_pos)
# The frequency tables built later place these rows next to the rows of
# den_mis_df, so fail early (with a clear message) if any site is missing.
assert tgp_mis_mask.sum() == den_mis_pos.size, 'tgp is missing missense sites'
assert sgdp_mis_mask.sum() == den_mis_pos.size, 'sgdp is missing missense sites'
# Polarize the genotype matrices at the missense sites only.
tgp_mis_p_gt = af.polarize_gt(tgp_72kb_gt.compress(tgp_mis_mask, axis=0))
sgdp_mis_p_gt = af.polarize_gt(sgdp_72kb_gt.compress(sgdp_mis_mask, axis=0))

In [11]:
# Pair every annotation column title with the missense-table column it is
# copied from.
site_info_cols = {
    'Position (hg19)': 'POS',
    'rsID': 'ID',
    'Reference Allele': 'REF',
    'Alternative Allele': 'ALT',
    'Ancestral Allele': 'ANC',
    'Derived Allele': 'DER',
    r'Anc. AA $\rightarrow$ Der. AA': 'AA',
}
# Initialize the tgp dictionary with the per-site annotations.
tgp_dicc = {
    col_title: den_mis_df[src_col].values
    for col_title, src_col in site_info_cols.items()
}
# Add the two sgdp groups; their samples are selected by the stored 'IDX'
# column indices.
sgdp_groups = (
    (r'Papuan ($n$ = 15)', pap_df),
    (r'SGDP AMR ($n$ = 22)', sgdp_amr_df),
)
for group_title, group_df in sgdp_groups:
    group_gt = sgdp_mis_p_gt.take(group_df['IDX'].values, axis=1)
    tgp_dicc[group_title] = af.calc_alt_freqs(group_gt)
# Add one column per tgp population; the row labels of tgp_df are used as the
# sample (column) indices of the genotype matrix.
for pop in tgp_pop_list:
    pop_rows = tgp_df.loc[tgp_df['POP'] == pop]
    pop_gt = tgp_mis_p_gt.take(pop_rows.index.values, axis=1)
    tgp_dicc[pop] = af.calc_alt_freqs(pop_gt)
# Assemble the dataframe from the collected columns.
tgp_den_mis_freq_df = pd.DataFrame(tgp_dicc)

In [12]:
# Show the results for tgp + sgdp: one row per missense mutation, with the
# site annotations followed by one af.calc_alt_freqs column per group.
tgp_den_mis_freq_df

Unnamed: 0,Position (hg19),rsID,Reference Allele,Alternative Allele,Ancestral Allele,Derived Allele,Anc. AA $\rightarrow$ Der. AA,Papuan ($n$ = 15),SGDP AMR ($n$ = 22),LWK,GWD,MSL,ESN,YRI,BEB,STU,ITU,PJL,GIH,CHB,KHV,CHS,JPT,CDX,TSI,CEU,IBS,GBR,FIN,PEL,MXL,CLM,PUR
0,40808672,rs4768252,C,T,C,T,Ser$\rightarrow$Leu,0.1,0.363636,0.0,0.0,0.0,0.0,0.0,0.156977,0.112745,0.112745,0.052083,0.097087,0.033981,0.085859,0.038095,0.028846,0.123656,0.051402,0.015152,0.03271,0.005495,0.005051,0.217647,0.304688,0.079787,0.086538
1,40808726,rs4768253,C,T,C,T,Thr$\rightarrow$Met,0.1,0.363636,0.0,0.0,0.0,0.0,0.0,0.168605,0.112745,0.112745,0.052083,0.101942,0.033981,0.085859,0.038095,0.028846,0.123656,0.051402,0.015152,0.03271,0.005495,0.005051,0.217647,0.304688,0.074468,0.086538
2,40815060,rs149221842,C,T,C,T,His$\rightarrow$Tyr,0.1,0.363636,0.0,0.0,0.0,0.0,0.0,0.156977,0.112745,0.107843,0.052083,0.097087,0.033981,0.085859,0.038095,0.028846,0.123656,0.051402,0.015152,0.03271,0.005495,0.005051,0.217647,0.304688,0.074468,0.086538
3,40821795,rs61736852,G,A,G,A,Glu$\rightarrow$Lys,0.1,0.363636,0.0,0.0,0.0,0.0,0.0,0.139535,0.112745,0.102941,0.052083,0.087379,0.033981,0.085859,0.038095,0.028846,0.123656,0.051402,0.015152,0.03271,0.005495,0.005051,0.217647,0.304688,0.069149,0.086538
4,40821847,rs11564170,C,A,C,A,Thr$\rightarrow$Lys,0.1,0.363636,0.0,0.0,0.0,0.0,0.0,0.139535,0.112745,0.102941,0.052083,0.087379,0.033981,0.085859,0.038095,0.028846,0.123656,0.051402,0.015152,0.03271,0.005495,0.005051,0.217647,0.304688,0.069149,0.086538
5,40821871,rs17467284,G,T,G,T,Arg$\rightarrow$Leu,0.1,0.363636,0.0,0.0,0.0,0.0,0.0,0.139535,0.112745,0.102941,0.052083,0.087379,0.033981,0.085859,0.038095,0.028846,0.123656,0.051402,0.015152,0.03271,0.005495,0.005051,0.217647,0.304688,0.069149,0.086538


In [13]:
# Initialize the sgdp dictionary with the per-site annotations, given as
# (column title, missense-table column) pairs.
annotation_pairs = [
    ('Position (hg19)', 'POS'),
    ('rsID', 'ID'),
    ('Reference Allele', 'REF'),
    ('Alternative Allele', 'ALT'),
    ('Ancestral Allele', 'ANC'),
    ('Derived Allele', 'DER'),
    (r'Anc. AA $\rightarrow$ Der. AA', 'AA'),
]
sgdp_dicc = {
    col_title: den_mis_df[src_col].values
    for col_title, src_col in annotation_pairs
}
# Add the papuan column, selecting samples by their stored 'IDX' indices.
pap_gt = sgdp_mis_p_gt.take(pap_df['IDX'].values, axis=1)
sgdp_dicc[r'Papuan ($n$ = 15)'] = af.calc_alt_freqs(pap_gt)
# Add one column per sgdp american population, with the number of samples in
# that population written into the column title.
for pop in sgdp_amr_list:
    pop_rows = sgdp_amr_df.loc[sgdp_amr_df['POP'] == pop]
    col_title = rf'{pop} ($n$ = {len(pop_rows)})'
    pop_gt = sgdp_mis_p_gt.take(pop_rows['IDX'].values, axis=1)
    sgdp_dicc[col_title] = af.calc_alt_freqs(pop_gt)
# Assemble the dataframe from the collected columns.
sgdp_den_mis_freq_df = pd.DataFrame(sgdp_dicc)

In [14]:
# Show the results for the sgdp: one row per missense mutation, with the site
# annotations followed by the papuan column and one column per population in
# sgdp_amr_list.
sgdp_den_mis_freq_df

Unnamed: 0,Position (hg19),rsID,Reference Allele,Alternative Allele,Ancestral Allele,Derived Allele,Anc. AA $\rightarrow$ Der. AA,Papuan ($n$ = 15),Mayan ($n$ = 2),Mixe ($n$ = 3),Mixtec ($n$ = 2),Pima ($n$ = 2),Zapotec ($n$ = 2),Chane ($n$ = 1),Karitiana ($n$ = 3),Surui ($n$ = 2),Piapoco ($n$ = 2),Quechua ($n$ = 3)
0,40808672,rs4768252,C,T,C,T,Ser$\rightarrow$Leu,0.1,0.5,0.666667,0.0,0.25,0.5,0.0,0.166667,0.5,0.5,0.333333
1,40808726,rs4768253,C,T,C,T,Thr$\rightarrow$Met,0.1,0.5,0.666667,0.0,0.25,0.5,0.0,0.166667,0.5,0.5,0.333333
2,40815060,rs149221842,C,T,C,T,His$\rightarrow$Tyr,0.1,0.5,0.666667,0.0,0.25,0.5,0.0,0.166667,0.5,0.5,0.333333
3,40821795,rs61736852,G,A,G,A,Glu$\rightarrow$Lys,0.1,0.5,0.666667,0.0,0.25,0.5,0.0,0.166667,0.5,0.5,0.333333
4,40821847,rs11564170,C,A,C,A,Thr$\rightarrow$Lys,0.1,0.5,0.666667,0.0,0.25,0.5,0.0,0.166667,0.5,0.5,0.333333
5,40821871,rs17467284,G,T,G,T,Arg$\rightarrow$Leu,0.1,0.5,0.666667,0.0,0.25,0.5,0.0,0.166667,0.5,0.5,0.333333


## Ancient Americans

In [15]:
# Extract the positions, ancestral, and derived alleles.
missense_sites = den_mis_df['POS'].values
anc_alleles = den_mis_df['ANC'].values
der_alleles = den_mis_df['DER'].values
# Load the ancient sample ids.
anc_samps = np.array([
    'USR1', 'CK-13', 'Anzick',
    'CR-01', 'CT-01', 'NC_C',
    'PS-06', 'SC-05', 'SM-02',
    'SN-13', 'SN-17', 'SN-44',
    '2417J', '2417Q', '333B',
    'SRR8144644', 'SRR8144646',
    'SRR8144647', 'SRR8144650',
    'ERR2270782', 'ERR2270783',
    'ERR2270784', 'ERR2270785',

])
# Initialize a dictionary to determine the allele from angsd: the index of a
# read-count column maps to the nucleotide it counts.
angsd_allele_dicc = {
    0: 'A', 1: 'C',
    2: 'G', 3: 'T',
}


def _call_anc_genotype(read_counts, anc, der, min_reads=2):
    """Call a genotype string for one site from per-allele read counts.

    Parameters
    ----------
    read_counts : array-like of int
        Read counts indexed the same way as the keys of `angsd_allele_dicc`.
    anc, der : str
        Ancestral and derived alleles at the site.
    min_reads : int, optional
        An allele is considered observed when it has at least this many reads.

    Returns
    -------
    str
        'der/der' or 'anc/anc' when exactly one allele is observed and it is
        the derived or ancestral allele, 'anc/der' when exactly the ancestral
        and derived alleles are observed, and '.' in every other case (no
        observed allele, more than two observed alleles, or an observed allele
        that is neither ancestral nor derived).
    """
    missing = '.'
    # Indices of the alleles supported by at least `min_reads` reads.
    allele_idx = np.where(np.asarray(read_counts) >= min_reads)[0]
    # Only mono- and bi-allelic sites can be called.
    if allele_idx.size not in (1, 2):
        return missing
    observed = {angsd_allele_dicc[j] for j in allele_idx}
    if allele_idx.size == 1:
        # Mono-allelic: homozygous for whichever known allele was observed.
        (allele,) = observed
        return allele + '/' + allele if allele in (anc, der) else missing
    # Bi-allelic: heterozygous only if the pair is exactly {anc, der}.
    return anc + '/' + der if observed == {anc, der} else missing


# Initialize a dictionary for the ancient samples.
anc_dicc = {
    'Position (hg19)': den_mis_df['POS'].values,
    'rsID': den_mis_df['ID'].values,
    'Reference Allele': den_mis_df['REF'].values,
    'Alternative Allele': den_mis_df['ALT'].values,
    'Ancestral Allele': den_mis_df['ANC'].values,
    'Derived Allele': den_mis_df['DER'].values,
    r'Anc. AA $\rightarrow$ Der. AA': den_mis_df['AA'].values,
}
# For all ancient samples...
for samp in anc_samps:
    # Load the read data.
    anc_df = pd.read_csv(f'../ancient_americans/{samp}_ac.txt.gz', sep='\t')
    # Extract the positions array.
    anc_pos = anc_df['pos'].values
    # Collect one genotype call per Denisovan missense mutation.
    samp_calls = []
    for pos, anc, der in zip(missense_sites, anc_alleles, der_alleles):
        # Determine the index in the ancient sample.
        pos_idx = np.where(anc_pos == pos)[0]
        # A site that is absent from the sample's table gets no call.
        if pos_idx.size == 0:
            samp_calls.append('.')
            continue
        # Extract read counts from the last four columns (assumed to be the
        # A, C, G, T counts in angsd_allele_dicc order -- confirm against the
        # *_ac.txt.gz schema).
        read_counts = anc_df.iloc[pos_idx, -4:].to_numpy().flatten()
        samp_calls.append(_call_anc_genotype(read_counts, anc, der))
    # Store the calls for this individual.
    anc_dicc[samp] = samp_calls
# Convert the dictionary to a dataframe.
anc_genos_df = pd.DataFrame(data=anc_dicc)

In [16]:
# Show the results for the ancient american samples: one row per missense
# mutation, with the site annotations followed by one genotype-call column per
# sample ('.' marks a site without a call).
anc_genos_df

Unnamed: 0,Position (hg19),rsID,Reference Allele,Alternative Allele,Ancestral Allele,Derived Allele,Anc. AA $\rightarrow$ Der. AA,USR1,CK-13,Anzick,CR-01,CT-01,NC_C,PS-06,SC-05,SM-02,SN-13,SN-17,SN-44,2417J,2417Q,333B,SRR8144644,SRR8144646,SRR8144647,SRR8144650,ERR2270782,ERR2270783,ERR2270784,ERR2270785
0,40808672,rs4768252,C,T,C,T,Ser$\rightarrow$Leu,C/T,.,C/T,T/T,C/C,.,C/C,C/T,T/T,.,C/C,C/T,.,C/C,.,C/C,C/C,C/C,C/C,T/T,T/T,C/C,C/T
1,40808726,rs4768253,C,T,C,T,Thr$\rightarrow$Met,C/C,T/T,C/C,T/T,C/C,C/C,C/C,C/T,T/T,T/T,C/T,C/T,.,.,.,C/C,C/C,C/C,C/C,T/T,C/T,C/C,C/T
2,40815060,rs149221842,C,T,C,T,His$\rightarrow$Tyr,C/C,.,C/C,T/T,C/C,.,C/C,T/T,.,T/T,T/T,C/T,.,.,.,C/C,C/C,C/C,C/C,T/T,C/T,C/C,C/T
3,40821795,rs61736852,G,A,G,A,Glu$\rightarrow$Lys,G/G,.,G/G,A/A,.,G/G,G/G,G/A,.,.,A/A,.,.,G/G,.,.,G/G,G/G,G/G,A/A,G/A,.,G/A
4,40821847,rs11564170,C,A,C,A,Thr$\rightarrow$Lys,C/C,A/A,C/C,A/A,C/C,C/C,C/C,C/A,A/A,.,C/C,C/C,.,.,A/A,.,C/C,C/C,C/C,A/A,C/A,C/C,C/A
5,40821871,rs17467284,G,T,G,T,Arg$\rightarrow$Leu,G/G,.,G/G,T/T,G/G,G/G,G/G,G/T,T/T,.,G/G,.,T/T,.,G/T,G/T,G/G,G/G,G/G,T/T,G/G,G/G,G/T
