In [87]:
import numpy as np
import pandas as pd
import networkx as nx
import plotly.io as pio
import plotly.express as plx 
from IPython.display import HTML, display
import IPython 
from plotly.offline import init_notebook_mode
import sys, os, argparse

# Optional file whose whitespace-separated contents stand in for the command
# line when this notebook is executed.
CONFIG_FILE = '.ipynb.config'

# argparse reads its arguments from sys.argv (skipping the first entry, which
# it treats as the program name), so sys.argv is overwritten here before the
# parser runs.
if not os.path.isfile(CONFIG_FILE):
    # No config file: fall back to a built-in default invocation.
    # NOTE(review): this default is an absolute, user-specific path — confirm
    # it is still wanted as the fallback.
    sys.argv = ['run_notebook.py', '/home/john.palmer/work/vibrio/results-june15-1/virulence']
else:
    with open(CONFIG_FILE) as config_handle:
        sys.argv = config_handle.read().split()

# Single positional argument: the directory holding the virulence factor
# outputs that the following cells load.
parser = argparse.ArgumentParser()
parser.add_argument(
    "vf_path",
    help="Input path to all virulence factor outputs for one sequencing run.",
)
args = parser.parse_args()

# Plotly setup: initialise notebook mode and make "notebook" the default
# renderer for figures shown below.
init_notebook_mode(connected=True)
pio.renderers.default = "notebook"

# Inject CSS that forces elements with class "container" to full width
# (presumably to widen the notebook page for the large heatmap — confirm).
display(HTML("<style>.container { width:100% !important; }</style>"))

In [88]:
from glob import glob

# Find every per-sample virulence factor table (files matching "*vf.tsv")
# directly under args.vf_path. glob() returns matches in arbitrary order, so
# they are sorted to make the load order reproducible between runs.
vf_files = sorted(glob(os.path.join(args.vf_path, "*vf.tsv")))
if not vf_files:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" error that hides the real cause.
    raise FileNotFoundError(f"No '*vf.tsv' files found in {args.vf_path!r}")

# One DataFrame per tab-separated input file.
dflist = [pd.read_csv(filename, sep='\t') for filename in vf_files]

# Stack all tables and give the two columns used downstream friendlier names.
fulldf = pd.concat(dflist)
fulldf = fulldf.rename({'#FILE':"sample_name", 'GENE':"vf"},axis=1)
# Keep only the text before the first "." of each sample_name value
# (presumably dropping a file extension; assumes names contain no other "." —
# TODO confirm).
fulldf['sample_name'] = fulldf['sample_name'].str.split(".").str[0]

In [89]:
# Build a presence/absence matrix from fulldf: each distinct
# (sample_name, vf) pair is marked with status 1, pairs that never occur are
# filled with 0, and the result is oriented with one row per vf and one
# column per sample_name.
presence_pairs = (
    fulldf.loc[:, ['sample_name', 'vf']]
    .drop_duplicates()
    .sort_values(by=['sample_name', 'vf'])
    .assign(status=1)
)

sample_by_vf = presence_pairs.pivot(index='sample_name', columns='vf', values='status')
plot_df = sample_by_vf.fillna(0).T


['VP1611' 'VPA0450' 'exsA' 'exsD' 'sycN' 'tlh' 'tyeA' 'vcrD' 'vcrG' 'vcrH'
 'vcrR' 'vcrV' 'vecA' 'virG' 'vopB' 'vopD' 'vopN' 'vopQ' 'vopR' 'vopS'
 'vscB' 'vscC' 'vscD' 'vscF' 'vscG' 'vscH' 'vscI' 'vscJ' 'vscK' 'vscL'
 'vscN' 'vscO' 'vscP' 'vscQ' 'vscR' 'vscS' 'vscT' 'vscU' 'vscX' 'vscY'
 'vxsC' 'tdh' 'trhX' 'hlyB']
Index(['VP1611', 'VPA0450', 'exsA', 'exsD', 'hlyB', 'sycN', 'tdh', 'tlh',
       'trhX', 'tyeA', 'vcrD', 'vcrG', 'vcrH', 'vcrR', 'vcrV', 'vecA', 'virG',
       'vopB', 'vopD', 'vopN', 'vopQ', 'vopR', 'vopS', 'vscB', 'vscC', 'vscD',
       'vscF', 'vscG', 'vscH', 'vscI', 'vscJ', 'vscK', 'vscL', 'vscN', 'vscO',
       'vscP', 'vscQ', 'vscR', 'vscS', 'vscT', 'vscU', 'vscX', 'vscY', 'vxsC'],
      dtype='object', name='vf')


In [92]:
# Reorder the presence/absence matrix so that Tess's virulence factors of
# particular interest form one sorted group placed after all other rows.
target_genes = sorted([
    'vopQ','vopR', 'vopS','VPA0450','exsD','hutC','toxR','vpm','tdh','trh','tlh',
    'vopA/vopP','vopC','vopL','vopT','vopV','vopZ','VPA1380','vtrB','pilA'
])
# NOTE(review): matching below is by exact name. An earlier output in this
# notebook lists 'trhX' among the detected genes, which does not match the
# target 'trh' — confirm which name is intended.

# Only the target genes that actually occur as rows of plot_df.
targets_present = plot_df.index[plot_df.index.isin(target_genes)]

target_df = plot_df.loc[targets_present].sort_index()
# Store the remaining rows under a new name instead of overwriting plot_df:
# reassigning plot_df here made the cell non-idempotent, because a second run
# would find no target rows left and silently drop them from final_df.
non_target_df = plot_df.drop(targets_present)
final_df = pd.concat([non_target_df, target_df])


sample_name,BUR-FI-1999-MI-00278-01,BUR-FI-1999-MI-00282-01,BUR-FI-1999-MI-00285-01,BUR-FI-1999-MI-00286-01,BUR-FI-1999-MI-00289-01,BUR-FI-1999-MI-00291-02,BUR-FI-1999-MI-00300-01,BUR-FI-1999-MI-00314-02,BUR-FI-1999-MI-00325-01,BUR-FI-1999-MI-00334-02,...,BUR-FI-2021-MI-00490-01,BUR-FI-2021-MI-00490-73,BUR-FI-2022-MI-00564-01,BUR-FI-2022-MI-00564-25,BUR-FI-2022-MI-00564-62,BUR-FI-2022-MI-00593-03,BUR-FI-2022-MI-00641-01,BUR-FI-2022-MI-00641-06,BUR-FI-2022-MI-00748-07,BUR-FI-2022-MI-00756-04
vf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
VPA0450,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
exsD,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
tdh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
tlh,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
vopQ,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
vopR,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
vopS,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [99]:

# Scale the figure with the matrix: a fixed base size plus a per-row
# allowance for the height and a per-column allowance for the width.
n_rows, n_cols = final_df.shape
fig_height = 200 + 18 * n_rows
fig_width = 600 + 20 * n_cols

# Heatmap of final_df; aspect='auto' lets cells stretch to fill the figure
# instead of being drawn square.
fig = plx.imshow(final_df, aspect='auto', height=fig_height, width=fig_width)

fig.update_xaxes(tickangle=45, showgrid=True, title='Sample Name')
fig.update_yaxes(title='Virulence Factor')
fig.show()