# Table 3
Calculate the number of SNVs with an allele frequency (AF) change greater than 0.3 in each gene.

In [5]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import os
import Bio
from itertools import permutations
from tqdm import tqdm
import Bio
import Bio.SeqUtils
import Bio.codonalign
from Bio.Seq import Seq
import uniprot
import Bio.KEGG.REST as BKR
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from time import sleep
from scipy.stats import chisquare, chi2_contingency, fisher_exact, poisson, binom_test
from statsmodels.stats import multitest

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load SNVs with an AF change greater than 0.3

In [6]:
# When True, only SNVs counted in more than one individual are kept downstream.
filter_2orMore_indv = True
# Species code used to pick the input file: 'ec' or 'kp'.
species = 'ec'
# Path of the filtered SNV table for the selected species.
INPUT = f'../data/{species}_indiv_0.3_filtered_rescued.bed'

# Aggregate SNVs in each gene

In [10]:
# Column names for the headerless, tab-separated input file.
header = [
    'CHROM', 'POS-1', 'POS', 'INDIV', '1VISIT', '2VISIT',
    '1POS', '1ID', '1REF', '1ALT', '1QUAL', '1AF', '1DEPTH',
    '2POS', '2ID', '2REF', '2ALT', '2QUAL', '2AF', '2DEPTH',
    'ANN', 'PUTIMPACT', 'GENENAME', 'BIOTYPE', 'HGVS.C', 'HGVS.P',
]
SNV = pd.read_csv(INPUT, sep='\t', names=header)

# Columns that identify one SNV; defined once so the two group-bys below
# cannot drift apart.
snv_key_cols = [
    'CHROM', 'POS-1', 'POS',
    '1POS', '1REF', '1ALT', '2POS', '2REF', '2ALT',
    'ANN', 'PUTIMPACT', 'GENENAME', 'BIOTYPE', 'HGVS.C', 'HGVS.P',
]
# Same key plus the individual (INDIV inserted after POS, as in the input layout).
indiv_key_cols = snv_key_cols[:3] + ['INDIV'] + snv_key_cols[3:]

# Remove multiple counts within an individual: collapse to one row per
# (SNV, individual) combination.
SNV = SNV.groupby(indiv_key_cols, as_index=False).size().drop(columns='size')

# Group by SNV: 'count' is the number of individuals carrying each SNV.
aggSNV = SNV.groupby(snv_key_cols, as_index=False).size().rename(columns={'size': 'count'})

# Filter for SNVs that occur in more than one individual
if filter_2orMore_indv:
    aggSNV = aggSNV[aggSNV['count'] > 1]

# Filter for non-synonymous SNVs.
# .copy() makes the result an independent frame, so the column assignments
# below do not raise SettingWithCopyWarning.
aggSNV = aggSNV[aggSNV['ANN'] == 'missense_variant'].copy()

# Combine AA substitution and number of individuals.
# The dot is escaped so that only a literal "p." prefix is stripped; the
# unescaped pattern would also match "p" followed by any character.
aggSNV['HGVS.P'] = aggSNV['HGVS.P'].str.extract(r'p\.(.*)', expand=False)
aggSNV['Variant'] = [
    f"{aa_change} ({n_indiv})"
    for aa_change, n_indiv in zip(aggSNV['HGVS.P'], aggSNV['count'])
]


def parse_feature(x):
    """Join the string form of every item in ``x`` with ', '."""
    return ', '.join([str(i) for i in x])


# Group by genename: concatenated variant labels plus the number of SNVs per gene.
snv_by_gene = aggSNV.groupby(['GENENAME'])
aggSNV_genename = pd.concat(
    [
        snv_by_gene['Variant'].agg(parse_feature),
        snv_by_gene.size().rename('SNV counts'),
    ],
    axis=1,
)
aggSNV_genename = aggSNV_genename.sort_values('SNV counts', ascending=False)

# Get Annotation.
# reindex (rather than .loc) leaves NaN for genes absent from the annotation
# sheet instead of raising KeyError.
# NOTE(review): reindex requires unique 'genename' values in the sheet - confirm.
miniKegg = pd.read_excel('../data/miniKegg_annotation.xlsx')
gene_annotation = miniKegg.set_index('genename').reindex(aggSNV_genename.index)
aggSNV_genename['prot_len'] = np.array(gene_annotation['prot_len'])
aggSNV_genename['annotation'] = np.array(gene_annotation['annotation'])

# Show table
aggSNV_genename

Unnamed: 0_level_0,Variant,SNV counts,prot_len,annotation
GENENAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
lacZ,"Cys77Arg (2), Val86Ile (2), Thr109Ala (2), Phe...",12,1024.0,lacZ; beta-D-galactosidase
tRNA-Sec,"Thr27Met (2), Ala23Thr (2), Ser6Leu (3)",3,,
pflB,"Thr16Ala (3), Asn383Ser (3)",2,760.0,pflB; pyruvate formate lyase I
lpp,"Ala3Pro (2), Lys2Asn (3)",2,78.0,lpp; murein lipoprotein
rpsF,Asp114Glu (2),1,131.0,rpsF; 30S ribosomal subunit protein S6
pnp,Ile122Val (3),1,734.0,pnp; polynucleotide phosphorylase/polyadenylase
pykF,His216Gln (2),1,470.0,pykF; pyruvate kinase I
rnpA,Lys91Arg (3),1,119.0,rnpA; protein C5 component of RNase P
rplQ,Ser119Ala (2),1,127.0,rplQ; 50S ribosomal subunit protein L17
rpmE,Ser38Gly (2),1,70.0,rpmE; 50S ribosomal subunit protein L31
