## Script to create an interactive visualization of the taxonomy from the latest GTDB release

Input files were taken from the latest GTDB release at the time of writing (release207/207.0): <br>
https://data.ace.uq.edu.au/public/gtdb/data/releases/release207/207.0/ <br>
Note: the code below currently reads the release 214 taxonomy files.

Output: <br>
Interactive HTML plots visualizing the taxonomy (written to `results/`)

In [7]:
# import packages
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns 
import os
import glob
import plotly.express as px

In [36]:
# Column names for the seven ';'-separated taxonomy ranks in the GTDB files
colnames = ['dom','phylum','class','order','family','genus','sp']

# GTDB release to visualise. The `rel` label (used in plot titles and output
# file names) and both input file names are derived from this single number,
# so switching release is a one-line edit. The file naming pattern used here
# matches the release 207 and release 214 files.
release_number = 214
rel = 'release' + str(release_number)

# The files are split on ';' only, so the first column ('dom') still holds
# "<accession>\td__<Domain>", as the preview of the table shows.
bac = pd.read_csv('input/bac120_taxonomy_r' + str(release_number) + '.tsv',
                  sep=';', names=colnames, header=None)
arh = pd.read_csv('input/ar53_taxonomy_r' + str(release_number) + '.tsv',
                  sep=';', names=colnames, header=None)

bac.head(2)

Unnamed: 0,dom,phylum,class,order,family,genus,sp
0,RS_GCF_016456235.1\td__Bacteria,p__Pseudomonadota,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia coli
1,RS_GCF_023534435.1\td__Bacteria,p__Pseudomonadota,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia coli


In [37]:
# Clean the bac120 table: strip GTDB rank prefixes and keep unique lineages

# 'dom' holds "<accession>\td__<Domain>" (the file was split on ';' only),
# so the piece after '__' is the bare domain name.
bac['domain'] = bac['dom'].str.split('__', expand=True)[1]

# Remove each rank prefix only where it starts the value. The pattern is
# anchored and regex=True is passed explicitly, so the result does not depend
# on the installed pandas version's default for `regex`, and a name that
# merely contains e.g. 'p__' further in is left intact.
rank_prefixes = {'phylum': 'p__', 'class': 'c__', 'order': 'o__',
                 'family': 'f__', 'genus': 'g__', 'sp': 's__'}
for rank, prefix in rank_prefixes.items():
    bac[rank] = bac[rank].str.replace('^' + prefix, '', regex=True)

# Drop the raw 'dom' column, then collapse rows that share the same lineage
bac = bac[['domain','phylum','class','order','family','genus','sp']].drop_duplicates()

bac.head(2)

Unnamed: 0,domain,phylum,class,order,family,genus,sp
0,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
33849,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Klebsiella,Klebsiella pneumoniae


In [38]:
# Clean the ar53 table: strip GTDB rank prefixes and keep unique lineages

# 'dom' holds "<accession>\td__<Domain>" (the file was split on ';' only),
# so the piece after '__' is the bare domain name.
arh['domain'] = arh['dom'].str.split('__', expand=True)[1]

# Remove each rank prefix only where it starts the value. The pattern is
# anchored and regex=True is passed explicitly, so the result does not depend
# on the installed pandas version's default for `regex`, and a name that
# merely contains e.g. 'p__' further in is left intact.
rank_prefixes = {'phylum': 'p__', 'class': 'c__', 'order': 'o__',
                 'family': 'f__', 'genus': 'g__', 'sp': 's__'}
for rank, prefix in rank_prefixes.items():
    arh[rank] = arh[rank].str.replace('^' + prefix, '', regex=True)

# Drop the raw 'dom' column, then collapse rows that share the same lineage
arh = arh[['domain','phylum','class','order','family','genus','sp']].drop_duplicates()

arh.head(2)

Unnamed: 0,domain,phylum,class,order,family,genus,sp
0,Archaea,Halobacteriota,Methanosarcinia,Methanosarcinales,Methanosarcinaceae,Methanosarcina,Methanosarcina mazei
78,Archaea,Thermoproteota,Thermoprotei_A,Sulfolobales,Sulfolobaceae,Sulfolobus,Sulfolobus acidocaldarius


In [39]:
# Stack the bacterial and archaeal lineage tables into one GTDB-wide table.
# Row labels from the two source tables are kept as-is (no re-indexing).
frames = [bac, arh]
gtdb = pd.concat(objs=frames)

gtdb.head(n=2)

Unnamed: 0,domain,phylum,class,order,family,genus,sp
0,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
33849,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Klebsiella,Klebsiella pneumoniae


### Draw plots

In [40]:
# Sunburst of all GTDB classes (domain -> phylum -> class), Bacteria + Archaea

# Every row counts once, so a sector's size is the number of rows beneath it
gtdb['count'] = 1

fig = px.sunburst(gtdb,
                  path=['domain','phylum','class'],
                  values='count',
                  color_discrete_sequence=px.colors.qualitative.Pastel,
                  title="GTDB, "+str(rel)+", all classes",
                  width=750, height=750)

# Make sure the output folder exists so the export does not fail on a
# fresh checkout where "results/" has not been created yet
os.makedirs("results", exist_ok=True)
fig.write_html("results/"+str(rel)+"_GTDB_all_class.html")
fig.show()

In [43]:
# Sunburst of the bacterial (bac120) taxonomy, from domain down to genus

# Every row counts once, so a sector's size is the number of rows beneath it
bac['count'] = 1

fig = px.sunburst(bac,
                  path=['domain','phylum','class','order','family','genus'],
                  values='count',
                  color_discrete_sequence=px.colors.qualitative.Pastel,
                  title="GTDB, "+str(rel)+", bac120",
                  width=750, height=750)

# Make sure the output folder exists so the export does not fail on a
# fresh checkout where "results/" has not been created yet.
# The figure is only written to disk here; it is not displayed inline.
os.makedirs("results", exist_ok=True)
fig.write_html("results/"+str(rel)+"_GTDB_bac120.html")

In [44]:
# Sunburst of the archaeal (ar53) taxonomy, from domain down to genus

# Every row counts once, so a sector's size is the number of rows beneath it
arh['count'] = 1

fig = px.sunburst(arh,
                  path=['domain','phylum','class','order','family','genus'],
                  values='count',
                  color_discrete_sequence=px.colors.qualitative.Pastel,
                  title="GTDB, "+str(rel)+", ar53",
                  width=750, height=750)

# Make sure the output folder exists so the export does not fail on a
# fresh checkout where "results/" has not been created yet.
# The figure is only written to disk here; it is not displayed inline.
os.makedirs("results", exist_ok=True)
fig.write_html("results/"+str(rel)+"_GTDB_arch.html")