## Script to create the taxonomy table files

Output is: <br>

1. <i> File "Tax_gtdb.xlsx" <br></i>
Taxonomy table file. The taxonomy was taken from the latest GTDB release (release207/207.0) <br>
https://data.ace.uq.edu.au/public/gtdb/data/releases/release207/207.0/ <br>
<br>
2. <i> File "Tax_gtdb-ncbi_v2.xlsx" <br></i>
Taxonomy table file. The table includes the GTDB taxonomy and the NCBI names for all species for which a correspondence between GTDB and NCBI species names was found. The GTDB-vs-NCBI species name match files were taken from here: <br>
https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/


In [1]:
# import packages
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
from matplotlib_venn import venn3
import pandas as pd
import seaborn as sns 
import os
import glob
import plotly.express as px
import numpy as np

## Tax_gtdb

In [2]:
colnames = ['dom','phylum','class','order','family','genus','sp']


def _clean_gtdb_taxonomy(raw):
    """Strip GTDB rank prefixes from a raw taxonomy frame and de-duplicate it.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame read with the `colnames` columns
        ('dom', 'phylum', 'class', 'order', 'family', 'genus', 'sp').
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns 'domain', 'phylum', 'class', 'order', 'family', 'genus', 'sp'
        with duplicate rows dropped. The original row labels are kept.
    """
    rank_prefixes = {
        'phylum': 'p__',
        'class': 'c__',
        'order': 'o__',
        'family': 'f__',
        'genus': 'g__',
        'sp': 's__',
    }
    cleaned = raw.copy()
    # 'domain' is the second piece of 'dom' when split on '__'.
    # NOTE(review): the .tsv files are split on ';' only, so 'dom' presumably
    # still carries the leading accession field before 'd__' - confirm.
    cleaned['domain'] = cleaned['dom'].str.split('__', expand=True)[1]
    for column, prefix in rank_prefixes.items():
        # regex=False: the prefixes are plain text, so the result does not
        # depend on the pandas version's default for `regex`.
        cleaned[column] = cleaned[column].str.replace(prefix, '', regex=False)
    ordered = cleaned[['domain','phylum','class','order','family','genus','sp']]
    return ordered.drop_duplicates()


# Raw frames are kept under their own names so re-running this cell is safe.
bac_raw = pd.read_csv('input/bac120_taxonomy_r207.tsv', sep=';', names=colnames, header=None)
arh_raw = pd.read_csv('input/ar53_taxonomy_r207.tsv', sep=';', names=colnames, header=None)

# Same clean-up for the bacterial (bac120) and archaeal (ar53) tables.
bac = _clean_gtdb_taxonomy(bac_raw)
arh = _clean_gtdb_taxonomy(arh_raw)

# Stack both domains; row labels of the source frames are preserved.
frames = [bac,arh]
gtdb = pd.concat(frames)

gtdb.to_excel('Tax_gtdb.xlsx', index=False)
gtdb.head(2)

Unnamed: 0,domain,phylum,class,order,family,genus,sp
0,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
26859,Bacteria,Firmicutes,Bacilli,Staphylococcales,Staphylococcaceae,Staphylococcus,Staphylococcus aureus


In [3]:
# Count the rows of the GTDB table whose domain is Bacteria.
n_bacteria = (gtdb['domain'] == 'Bacteria').sum()
print('Total bacteria species number in GTDB is ', n_bacteria)

Total bacteria species number in GTDB is  62291


## Tax_gtdb-ncbi

In [4]:
# Original correspondence file (GTDB species vs NCBI species, bacteria).
# NOTE(review): `convB` is rebuilt from the same workbook in the following cell,
# so the frame produced here is superseded when the notebook is run top to bottom.
convB = pd.read_excel('input/gtdb_vs_ncbi_bacteria.xlsx', sheet_name='names')

# Keep only the first entry of the comma-separated NCBI species list.
convB['top_sp_ncbi'] = convB['List of NCBI species'].str.split(',', expand=True)[0]
# Remove the 's__' rank prefix; regex=False because the pattern is plain text.
convB['GTDB R207 species'] = convB['GTDB R207 species'].str.replace('s__', '', regex=False)
convB['top_sp_ncbi'] = convB['top_sp_ncbi'].str.replace('s__', '', regex=False)
convB = convB.rename(columns={'GTDB R207 species': 'sp_gtdb'})

# Split once from the right on a space and keep the left part, i.e. drop the
# last space-separated token of the entry.
# `n` must be passed by keyword: it is keyword-only in pandas >= 2.0.
new = convB['top_sp_ncbi'].str.rsplit(' ', n=1, expand=True)\
  .rename(columns=lambda x: 'col{}'.format(x + 1))
convB['sp_ncbi'] = new['col1']

convB = convB[['sp_gtdb','sp_ncbi']]
# Discard rows whose NCBI name is exactly the string "(g__)".
convB = convB.loc[convB['sp_ncbi'] != "(g__)"]

convB.head(2)

Unnamed: 0,sp_gtdb,sp_ncbi
0,Zymomonas pomaceae,Zymomonas mobilis
1,Zymomonas mobilis_B,Zymomonas mobilis


In [64]:
# Manually reworked correspondence file (GTDB species vs NCBI species, bacteria).
convB = pd.read_excel('input/gtdb_vs_ncbi_bacteria.xlsx', sheet_name='names')
print('Unique bacteria species by gtdb - ',len(convB['GTDB R207 species'].unique()))

convB = convB.rename(columns={'GTDB R207 species': 'sp_gtdb', "List of NCBI species": 'ncbi'})

# Split once from the right on a space and keep the left part, i.e. drop the
# last space-separated token of the entry.
# `n` must be passed by keyword: it is keyword-only in pandas >= 2.0.
new = convB['ncbi'].str.rsplit(' ', n=1, expand=True)\
  .rename(columns=lambda x: 'col{}'.format(x + 1))
convB['sp_ncbi'] = new['col1']
# Remove a single leading space. '^ ' is a regular expression, so regex=True is
# required: with the pandas >= 2.0 default (regex=False) it would be matched
# literally and nothing would be stripped.
convB['sp_ncbi'] = convB['sp_ncbi'].str.replace(r'^ ', '', regex=True)
convB = convB[['sp_gtdb','sp_ncbi']]
convB = convB.reset_index(drop=True)

convB.tail()

Unique bacteria species by gtdb -  9759


Unnamed: 0,sp_gtdb,sp_ncbi
11572,Streptomyces albidoflavus,Streptomyces albus
11573,Pseudomonas_E amygdali,Pseudomonas coronafaciens
11574,Enterobacter hormaechei_A,Escherichia coli
11575,Escherichia coli,Enterobacter hormaechei
11576,Brucella melitensis,Brucella neotomae


In [63]:
# Correspondence file (GTDB species vs NCBI species, archaea).
convA = pd.read_excel('input/gtdb_vs_ncbi_r207_archaea.xlsx', sheet_name='Species')
# This count is taken on the full sheet, before any rows are filtered out below.
print('Unique archaea species by gtdb - ',len(convA['GTDB R207 species'].unique()))

# Keep only the first entry of the comma-separated NCBI species list.
convA['top_sp_ncbi'] = convA['List of NCBI species'].str.split(',', expand=True)[0]
# Remove the 's__' rank prefix; regex=False because the pattern is plain text.
convA['GTDB R207 species'] = convA['GTDB R207 species'].str.replace('s__', '', regex=False)
convA['top_sp_ncbi'] = convA['top_sp_ncbi'].str.replace('s__', '', regex=False)
convA = convA.rename(columns={'GTDB R207 species': 'sp_gtdb'})

# Split once from the right on a space and keep the left part, i.e. drop the
# last space-separated token of the entry.
# `n` must be passed by keyword: it is keyword-only in pandas >= 2.0.
new = convA['top_sp_ncbi'].str.rsplit(' ', n=1, expand=True)\
  .rename(columns=lambda x: 'col{}'.format(x + 1))
convA['sp_ncbi'] = new['col1']

convA = convA[['sp_gtdb','sp_ncbi']]
# Discard rows whose NCBI name is exactly the string "(g__)".
convA = convA.loc[convA['sp_ncbi'] != "(g__)"]
convA = convA.reset_index(drop=True)

convA.tail(2)

Unique archaea species by gtdb -  3102


Unnamed: 0,sp_gtdb,sp_ncbi
233,Ignisphaera aggregans_G,Ignisphaera aggregans
234,Methanobrevibacter_D curvatus,Methanobrevibacter curvatus


In [66]:
# Stack the archaeal and bacterial GTDB -> NCBI name tables into one table
# with a fresh 0..n-1 index.
frames = [convA, convB]
conv = pd.concat(frames, ignore_index=True)

n_unique_gtdb = len(conv['sp_gtdb'].unique())
print('Unique species by gtdb - ', n_unique_gtdb)
conv.tail(2)

Unique species by gtdb -  9994


Unnamed: 0,sp_gtdb,sp_ncbi
11810,Escherichia coli,Enterobacter hormaechei
11811,Brucella melitensis,Brucella neotomae


In [67]:
# Attach NCBI species names to the GTDB taxonomy. The inner join keeps only
# GTDB species that have an entry in the correspondence table.
gtdbN = (
    gtdb
    .merge(conv, how="inner", left_on="sp", right_on="sp_gtdb")
    .sort_values("sp_ncbi")
    .reset_index(drop=True)
)
gtdbN.to_excel('Tax_gtdb-ncbi_v2.xlsx', index=False)

gtdbN.head(2)

Unnamed: 0,domain,phylum,class,order,family,genus,sp,sp_gtdb,sp_ncbi
0,Bacteria,Firmicutes,Bacilli,Erysipelotrichales,Erysipelotrichaceae,Absicoccus,Absicoccus porci,Absicoccus porci,Absicoccus porci
1,Bacteria,Proteobacteria,Gammaproteobacteria,Nevskiales,OUC007,Abyssibacter,Abyssibacter profundi,Abyssibacter profundi,Abyssibacter profundi


### Spot check of the merged table

In [68]:
# Show the merged rows for one NCBI species name as a spot check.
gtdbN[gtdbN['sp_ncbi'].eq('Treponema succinifaciens')]

Unnamed: 0,domain,phylum,class,order,family,genus,sp,sp_gtdb,sp_ncbi
11343,Bacteria,Spirochaetota,Spirochaetia,Treponematales,Treponemataceae,Treponema_D,Treponema_D succinifaciens,Treponema_D succinifaciens,Treponema succinifaciens
