# GTDB-Tk _de novo_ trees

GTDB taxonomy was assigned to genomes from Atanasova et al., 2013 using the [GTDB-Tk _de novo_ workflow](https://ecogenomics.github.io/GTDBTk/commands/de_novo_wf.html). The output of the _de novo_ workflow is a tree that includes the GTDB reference genomes and the newly provided genomes.

Below is code to process these trees.

In [9]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from Bio import Phylo

# If the kernel was started in a directory whose path ends in 'notebook',
# move one level up (presumably so the `src` imports below resolve from the
# project root). `cwd` always reflects the directory in effect afterwards.
if os.getcwd().endswith('notebook'):
    os.chdir('..')
cwd = os.getcwd()

from src.tree.tree_util import prune_leaves_with_unknown_id
from src.tree.itol_annotation import itol_labels

In [10]:
# Folder holding the GTDB-Tk de novo results, one sub-folder per genus.
base_folder = (
    Path(os.path.expanduser('~'))
    / 'Documents' / 'Chahrazad' / 'Atanasova_genomes' / 'GTDB-Tk_de_novo'
)
# Fail early with the offending path in the message; a bare `assert` is
# stripped under `python -O` and says nothing about which folder is missing.
if not base_folder.is_dir():
    raise FileNotFoundError(f'GTDB-Tk de novo folder not found: {base_folder}')

# Folder holding the GTDB r220 metadata tables (relative to the working directory).
gtdb_folder = Path('../data/gtdb_r220/')
if not gtdb_folder.is_dir():
    raise FileNotFoundError(f'GTDB metadata folder not found: {gtdb_folder.resolve()}')

## Load trees

In [13]:
# Tree files follow <base_folder>/<genus>/<genus>.<infix>.decorated.tree,
# where the infix is 'ar53' or 'bac120' depending on the genus.
tree_infix_by_genus = {
    'Haloferax': 'ar53',
    'Halorubrum': 'ar53',
    'Pontibacillus': 'bac120',
}

# Parse every decorated tree (Newick format), keyed by genus name.
raw_trees = {
    name: Phylo.read(base_folder / name / f'{name}.{infix}.decorated.tree', 'newick')
    for name, infix in tree_infix_by_genus.items()
}

# Alphabetically ordered genus names, used by the per-genus loops further down.
genera = sorted(raw_trees)

## Load GTDB metadata

In [14]:
def _rank_name(taxonomy: str, rank_index: int, prefix: str) -> str:
    """Return the name at position `rank_index` of a ';'-separated taxonomy string.

    Only a leading `prefix` (e.g. 'g__') is removed, so the same token occurring
    later in the name is left untouched.
    """
    name = taxonomy.split(';')[rank_index]
    return name[len(prefix):] if name.startswith(prefix) else name


def _load_gtdb_metadata(path: Path) -> pd.DataFrame:
    """Read one GTDB metadata TSV and add the derived columns used in this notebook.

    Added columns:
      - assembly_accession: `accession` without its first three characters
        (the 'RS_' / 'GB_' prefix).
      - gtdb_genus / gtdb_species: 6th and 7th fields of `gtdb_taxonomy`,
        without the 'g__' / 's__' rank prefix.

    The returned table is indexed by the original `accession` column.
    """
    table = pd.read_csv(path, sep='\t')
    table['assembly_accession'] = [acc[3:] for acc in table['accession']]
    table['gtdb_genus'] = table['gtdb_taxonomy'].apply(lambda t: _rank_name(t, 5, 'g__'))
    table['gtdb_species'] = table['gtdb_taxonomy'].apply(lambda t: _rank_name(t, 6, 's__'))
    return table.set_index('accession', drop=True)


# Both tables share the same layout, so they go through the same loader.
bac_metadata = _load_gtdb_metadata(gtdb_folder / 'bac120_metadata_r220.tsv.gz')
arc_metadata = _load_gtdb_metadata(gtdb_folder / 'ar53_metadata_r220.tsv.gz')

In [16]:
# Stack the ar53 and bac120 tables (ar53 rows first) into one accession-indexed
# lookup, then preview the first rows.
metadata = pd.concat((arc_metadata, bac_metadata), axis=0)
metadata.head()

Unnamed: 0_level_0,ambiguous_bases,checkm2_completeness,checkm2_contamination,checkm2_model,checkm_completeness,checkm_contamination,checkm_marker_count,checkm_marker_lineage,checkm_marker_set_count,checkm_strain_heterogeneity,...,ssu_silva_blast_perc_identity,ssu_silva_blast_subject_id,ssu_silva_taxonomy,total_gap_length,trna_aa_count,trna_count,trna_selenocysteine_count,assembly_accession,gtdb_genus,gtdb_species
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RS_GCF_000485535.1,0,98.72,0.11,Specific,99.84,0.43,359,f__Halobacteriaceae (UID84),235,100.0,...,100.0,CP002062.13692.15164,Archaea;Halobacterota;Halobacteria;Halobactera...,10,19,48,0,GCF_000485535.1,Halalkalicoccus,Halalkalicoccus jeotgali
GB_GCA_030638685.1,0,85.56,1.82,General,78.5,4.05,149,k__Archaea (UID2),107,16.67,...,98.214,FN820420.1.1346,Archaea;Asgardarchaeota;Heimdallarchaeia;uncul...,0,15,36,0,GCA_030638685.1,DAOWED01,DAOWED01 sp030638685
GB_GCA_003163595.1,489,77.92,0.0,Specific,93.95,0.65,228,p__Euryarchaeota (UID49),153,0.0,...,98.681,JF789589.1.1440,Archaea;Halobacterota;Methanomicrobia;Methanom...,6666,17,36,0,GCA_003163595.1,Bog-38,Bog-38 sp003139855
GB_GCA_002782805.1,15,81.39,0.14,General,77.57,0.0,149,k__Archaea (UID2),107,0.0,...,100.0,MNUF01000028.16163.17137,Archaea;Aenigmarchaeota;Aenigmarchaeia;Aenigma...,1891,19,36,0,GCA_002782805.1,CG10238-14,CG10238-14 sp002789635
GB_GCA_939800415.1,0,67.91,0.25,Specific,87.07,0.82,234,p__Euryarchaeota (UID54),153,50.0,...,96.519,CP000559.403632.405096,Archaea;Halobacterota;Methanomicrobia;Methanom...,0,20,41,0,GCA_939800415.1,Methanocorpusculum,Methanocorpusculum faecipullorum


## Prune each tree to keep only the relevant genus and the new genomes

In [20]:
# Pruned tree per genus, filled in by the loop below.
trees = {}
for genus in genera:
    full_tree = raw_trees[genus]

    # Start from the GTDB genomes whose GTDB genus matches the current one ...
    keep_ids = set(metadata.loc[metadata['gtdb_genus'] == genus].index)
    # ... and add every leaf that lacks the 'RS_' / 'GB_' prefix, i.e. the
    # genomes that were supplied to the de novo workflow.
    keep_ids.update(
        leaf.name
        for leaf in full_tree.get_terminals()
        if not leaf.name.startswith(('RS_', 'GB_'))
    )

    pruned_tree = prune_leaves_with_unknown_id(full_tree, keep_ids)

    # NOTE(review): the file is written as PhyloXML although it is named
    # '.tree' - confirm this is the intended format.
    out_path = base_folder / genus / f'{genus}_de_novo.tree'
    with out_path.open('w') as handle:
        Phylo.write([pruned_tree], handle, 'phyloxml')

    trees[genus] = pruned_tree

## Annotations: labels

In [19]:
# Build one iTOL label file per genus, mapping each leaf id to a display label.
for genus in genera:
    tree = trees[genus]

    labels = []
    for leaf in tree.get_terminals():
        if leaf.name.startswith(('RS_', 'GB_')):
            # GTDB genome: show the NCBI organism name plus the assembly accession.
            row = metadata.loc[leaf.name]
            label = f"{row['ncbi_organism_name']} [{row['assembly_accession']}]"
        else:
            # Leaf without a GTDB prefix: keep its own name as the label.
            label = leaf.name

        labels.append([leaf.name, label])

    # `itol_labels` is defined elsewhere; create the output folder up front so
    # that writing cannot fail merely because 'annotations' does not exist yet.
    annotations_folder = base_folder / genus / 'annotations'
    annotations_folder.mkdir(parents=True, exist_ok=True)

    itol_labels(
        labels,
        annotations_folder / f'{genus}_labels.txt'
    )