# Table 2
Aggregate low-frequency SNVs per gene.

In [1]:
import pandas as pd
# Notebook-wide pandas display settings: allow wide text columns (up to 300
# characters) and set how many rows a truncated DataFrame repr may show.
pd.set_option('display.max_colwidth', 300)
pd.set_option('display.min_rows', 101)
import numpy as np
import os
from tqdm import tqdm
import Bio
import Bio.SeqUtils

Process the data from get_all_variants: aggregate SNVs by gene name and
annotate the genes.

In [3]:
# Variant spreadsheets, one per species; the first two spreadsheet columns
# become the row MultiIndex.
variant_paths = (
    '../data/ec_variant_SS_rescuedlt5.xlsx',
    '../data/kp_variant_SS_rescuedlt5.xlsx',
)
ec_variants, kp_variants = [
    pd.read_excel(path, index_col=[0, 1]) for path in variant_paths
]

# Gene annotation table, indexed by its first column.
genesDF = pd.read_excel(io="../data/miniKegg_annotation.xlsx", index_col=0)

In [4]:
species = 'ec' # choose ec or kp

# Map each supported species code to its variant table. An unknown code used to
# fall through both branches, leaving `variants` undefined (or, worse, silently
# pointing at the table chosen in a previous run of this cell); fail loudly instead.
variants_by_species = {'ec': ec_variants, 'kp': kp_variants}
if species not in variants_by_species:
    raise ValueError("species must be 'ec' or 'kp', got {!r}".format(species))
variants = variants_by_species[species]

# Aggregate SNVs

In [5]:
# Cut-offs deciding which variants enter the table.
mAF_threshold = 0.5  # keep variants whose AF_mean is strictly below this value
indv_treshold = 0  # keep variants whose 'individuals' value is strictly above this

is_low_af = variants['AF_mean'] < mAF_threshold
has_enough_individuals = variants['individuals'] > indv_treshold
filtered_variants = variants[is_low_af & has_enough_individuals]

# Number of non-null HGVS.P entries per GENENAME index level, largest first.
SNVcounts = (
    filtered_variants
    .groupby(level='GENENAME')['HGVS.P']
    .count()
    .sort_values(ascending=False)
    .to_frame('SNV counts')
)

# Add properties

In [6]:
# Annotation rows aligned to the genes in SNVcounts. reindex() already aligns on
# the index (genes absent from genesDF yield NaN), so it is done once and reused.
gene_info = genesDF.reindex(SNVcounts.index)
SNVcounts['Protein Length'] = gene_info['prot_len']
SNVcounts['Normalized counts'] = SNVcounts['SNV counts'] / SNVcounts['Protein Length']

# Per-gene counts of non-null HGVS.P entries, split on the ANN label. Rows whose
# ANN is anything other than 'synonymous_variant' count as non-synonymous.
is_synonymous = filtered_variants['ANN'] == 'synonymous_variant'
is_non_synonymous = filtered_variants['ANN'] != 'synonymous_variant'
syn_counts = (filtered_variants.loc[is_synonymous, 'HGVS.P']
              .groupby(level='GENENAME').count())
nonsyn_counts = (filtered_variants.loc[is_non_synonymous, 'HGVS.P']
                 .groupby(level='GENENAME').count())

# A gene with no variant of a given class is missing from that groupby result.
# Plain assignment would align it to NaN; a missing class really means a count
# of 0, so fill it, keeping the per-length normalisations defined as well.
SNVcounts['Synonymous'] = syn_counts.reindex(SNVcounts.index).fillna(0)
SNVcounts['Normalized Synonymous'] = SNVcounts['Synonymous'] / SNVcounts['Protein Length']
SNVcounts['Non-Synonymous'] = nonsyn_counts.reindex(SNVcounts.index).fillna(0)
SNVcounts['Normalized Non-Synonymous'] = SNVcounts['Non-Synonymous'] / SNVcounts['Protein Length']

# The ratio is undefined when a gene has no synonymous variants: mask the zero
# denominators so those genes get NaN rather than inf.
SNVcounts['N/S'] = (SNVcounts['Non-Synonymous']
                    / SNVcounts['Synonymous'].where(SNVcounts['Synonymous'] > 0))
SNVcounts['Annotation'] = gene_info['annotation']

# Show table
SNVcounts

Unnamed: 0_level_0,SNV counts,Protein Length,Normalized counts,Synonymous,Normalized Synonymous,Non-Synonymous,Normalized Non-Synonymous,N/S,Annotation
GENENAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
rpoC,147,1407.0,0.104478,143.0,0.101635,4.0,0.002843,0.027972,"rpoC; RNA polymerase, beta prime subunit"
rpoB,144,1342.0,0.107303,135.0,0.100596,9.0,0.006706,0.066667,"rpoB; RNA polymerase, beta subunit"
ECIAI39_0530,121,1159.0,0.104400,87.0,0.075065,34.0,0.029336,0.390805,Host specificity protein J of prophage
ECIAI39_4258,105,2836.0,0.037024,57.0,0.020099,48.0,0.016925,0.842105,putative invasin/intimin protein
fusA,71,704.0,0.100852,63.0,0.089489,8.0,0.011364,0.126984,"fusA; protein chain elongation factor EF-G, GTP-binding"
rpoD,67,613.0,0.109299,64.0,0.104405,3.0,0.004894,0.046875,"rpoD; RNA polymerase, sigma 70 (sigma D) factor"
metG,67,677.0,0.098966,65.0,0.096012,2.0,0.002954,0.030769,metG; methionyl-tRNA synthetase
ECIAI39_4895,60,1232.0,0.048701,45.0,0.036526,15.0,0.012175,0.333333,Host specificity protein J
lacZ,58,1024.0,0.056641,41.0,0.040039,17.0,0.016602,0.414634,lacZ; beta-D-galactosidase
rpsA,57,557.0,0.102334,56.0,0.100539,1.0,0.001795,0.017857,rpsA; 30S ribosomal subunit protein S1
