# Subgroup/Class Tree Sequences Preparation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO
from tqdm import tqdm
import os
import shutil
import warnings
warnings.filterwarnings('ignore')

In [2]:
def copy_files(file_list, source_dir, destination_dir):
    """
    Copy every named file from one directory into another.

    The destination directory is created up front (missing parents included)
    and is left untouched if it already exists. Each file is then copied with
    ``shutil.copy`` under its original name while a tqdm progress bar tracks
    the loop.

    :param file_list: Filenames (relative to ``source_dir``) to copy.
    :param source_dir: Directory the files are read from.
    :param destination_dir: Directory the copies are written to.
    """
    os.makedirs(destination_dir, exist_ok=True)

    progress = tqdm(file_list, desc="Copying files")
    for name in progress:
        # Same filename on both sides; only the parent directory differs.
        shutil.copy(
            os.path.join(source_dir, name),
            os.path.join(destination_dir, name),
        )

# Example usage:
# copy_files(['file1.txt', 'file2.txt'], '/path/to/source', '/path/to/destination')

In [3]:
# Root of the project's data tree. The default is the location the notebook was
# written against; set the COMPLEX_I_DATA_DIR environment variable to point the
# notebook at a copy of the data stored elsewhere.
DATA_DIR = os.environ.get('COMPLEX_I_DATA_DIR', '/Users/akshayonly/Work/04-Complex-I/Data')
METADATA_DIR = os.path.join(DATA_DIR, '02-Genomic-Records-and-Metadata')

# Genome metadata: the "_unique" table and the full table.
nr_genomes_dataset = pd.read_csv(os.path.join(METADATA_DIR, '02-Genome-Metadata', 'genomes_dataset_unique.csv'))
genomes_dataset = pd.read_csv(os.path.join(METADATA_DIR, '02-Genome-Metadata', 'genomes_dataset.csv'))

# Taxonomy metadata (provides the 'Class' column used by the later cells).
taxonomy = pd.read_csv(os.path.join(METADATA_DIR, '03-Taxonomy-Metadata', 'taxonomy.csv'))

In [4]:
# Attach the Group/SubGroup labels from the genome table to the taxonomy records.
# No join keys are passed, so pandas joins on every column name the two frames share.
group_labels = genomes_dataset[['Group', 'SubGroup', 'Species']]
taxonomy = group_labels.merge(taxonomy).drop_duplicates()

# Keep only the rows that carry a class assignment.
taxonomy = taxonomy[taxonomy['Class'].notnull()]

# Discard classes whose name begins with "Candidatus" in either capitalisation.
is_candidatus = (
    taxonomy['Class'].str.startswith('Candidatus')
    | taxonomy['Class'].str.startswith('candidatus')
)
taxonomy = taxonomy[~is_candidatus]

In [5]:
# Number of distinct subgroups and classes left after filtering, as a (SubGroup, Class) tuple.
tuple(taxonomy[column].nunique() for column in ('SubGroup', 'Class'))

(65, 100)

In [6]:
# Hand-curated lookup: taxonomic class name -> species name chosen to represent that class.
# Keys are compared against the 'Class' column and values against the 'Species' column
# of the metadata tables by the selection cell that follows, so both must match the
# spelling used in those tables exactly.
representative_species = {
    "Acidimicrobiia": "Acidimicrobium ferrooxidans",
    "Acidithiobacillia": "Acidithiobacillus ferrooxidans",
    "Actinomycetes": "Streptomyces griseus",
    "Alphaproteobacteria": "Rhizobium leguminosarum",
    "Anaerolineae": "Anaerolinea thermophila",
    "Aquificia": "Aquifex aeolicus",
    "Archaeoglobi": "Archaeoglobus fulgidus",
    "Armatimonadia": "Armatimonas rosea",
    "Bacilli": "Bacillus subtilis",
    "Bacteriovoracia": "Bacteriovorax stolpii",
    "Bacteroidia": "Bacteroides fragilis",
    "Balneolia": "Balneola vulgaris",
    "Bdellovibrionia": "Bdellovibrio bacteriovorus",
    "Betaproteobacteria": "Neisseria meningitidis",
    "Blastocatellia": "Blastocatella fastidiosa",
    "Caldilineae": "Caldilinea aerophila",
    "Caldisericia": "Caldisericum exile",
    "Chitinophagia": "Chitinophaga pinensis",
    "Chlamydiia": "Chlamydia trachomatis",
    "Chlorobiia": "Chlorobium tepidum",
    "Chloroflexia": "Chloroflexus aurantiacus",
    "Chrysiogenia": "Chrysiogenes arsenatis",
    "Chthonomonadia": "Chthonomonas calidirosea",
    "Clostridia": "Clostridium botulinum",
    # NOTE(review): the genus name does not share the class stem (Conexibacter vs
    # Conexivisphaer-) -- confirm this is the intended species for this class.
    "Conexivisphaeria": "Conexibacter woesei",
    "Coriobacteriia": "Coriobacterium glomerans",
    "Cyanophyceae": "Synechocystis sp. PCC 6803",
    "Cytophagia": "Cytophaga hutchinsonii",
    "Deferribacteres": "Deferribacter thermophilus",
    "Dehalococcoidia": "Dehalococcoides mccartyi",
    "Deinococci": "Deinococcus radiodurans",
    "Desulfarculia": "Desulfarculus baarsii",
    "Desulfobaccia": "Desulfobacca acetoxidans",
    "Desulfobacteria": "Desulfobacter hydrogenophilus",
    "Desulfobulbia": "Desulfobulbus propionicus",
    "Desulfomonilia": "Desulfomonile tiedjei",
    "Desulfovibrionia": "Desulfovibrio vulgaris",
    "Desulfurellia": "Desulfurella acetivorans",
    "Desulfuromonadia": "Desulfuromonas acetoxidans",
    "Dictyoglomia": "Dictyoglomus thermophilum",
    "Elusimicrobia": "Elusimicrobium minutum",
    "Endomicrobiia": "Endomicrobium proavitum",
    "Epsilonproteobacteria": "Campylobacter jejuni",
    "Erysipelotrichia": "Erysipelothrix rhusiopathiae",
    "Fibrobacteria": "Fibrobacter succinogenes",
    "Fimbriimonadia": "Fimbriimonas ginsengisoli",
    "Flavobacteriia": "Flavobacterium johnsoniae",
    "Fusobacteriia": "Fusobacterium nucleatum",
    "Gammaproteobacteria": "Escherichia coli",
    "Gemmatimonadia": "Gemmatimonas aurantiaca",
    "Halobacteria": "Halobacterium salinarum",
    "Holophagae": "Holophaga foetida",
    "Hydrogenophilia": "Hydrogenophilus thermoluteolus",
    "Ignavibacteria": "Ignavibacterium album",
    "Kiritimatiellia": "Kiritimatiella glycovorans",
    "Ktedonobacteria": "Ktedonobacter racemifer",
    "Limnochordia": "Limnochorda pilosa",
    "Methanobacteria": "Methanobacterium formicicum",
    "Methanococci": "Methanococcus jannaschii",
    "Methanomicrobia": "Methanosarcina barkeri",
    "Methanonatronarchaeia": "Methanonatronarchaeum thermophilum",
    "Methanopyri": "Methanopyrus kandleri",
    "Methylacidiphilae": "Methylacidiphilum infernorum",
    "Mollicutes": "Mycoplasma pneumoniae",
    "Myxococcia": "Myxococcus xanthus",
    "Nanobdellia": "Nanobdella aerobia",
    "Negativicutes": "Veillonella parvula",
    "Nitriliruptoria": "Nitriliruptor alkaliphilus",
    "Nitrososphaeria": "Nitrosopumilus maritimus",
    "Nitrospinia": "Nitrospina gracilis",
    "Nitrospiria": "Nitrospira moscoviensis",
    "Oligoflexia": "Oligoflexus tunisiensis",
    "Opitutia": "Opitutus terrae",
    "Phycisphaerae": "Phycisphaera mikurensis",
    "Planctomycetia": "Planctomyces maris",
    "Rhodothermia": "Rhodothermus marinus",
    "Rubrobacteria": "Rubrobacter radiotolerans",
    "Saprospiria": "Saprospira grandis",
    "Sphingobacteriia": "Sphingobacterium spiritivorum",
    "Spirochaetia": "Treponema pallidum",
    "Synergistia": "Synergistes jonesii",
    "Syntrophia": "Syntrophus aciditrophicus",
    "Syntrophobacteria": "Syntrophobacter fumaroxidans",
    "Tepidiformia": "Tepidiforma aggregans",
    "Terriglobia": "Terriglobus roseus",
    "Thermococci": "Thermococcus kodakarensis",
    "Thermodesulfobacteria": "Thermodesulfobacterium commune",
    "Thermodesulfobiia": "Thermodesulfobium narugense",
    "Thermodesulfovibrionia": "Thermodesulfovibrio yellowstonii",
    "Thermoflexia": "Thermoflexus hugenholtzii",
    "Thermoleophilia": "Thermoleophilum album",
    "Thermomicrobia": "Thermomicrobium roseum",
    "Thermoprotei": "Sulfolobus solfataricus",
    "Thermotogae": "Thermotoga maritima",
    # NOTE(review): placeholder -- the only entry without a species. The None value is
    # still part of representative_species.values(); verify that consumers handle it.
    "Tichowtungiia": None,
    "Tissierellia": "Tissierella praeacuta",
    "Verrucomicrobiia": "Verrucomicrobium spinosum",
    "Vicinamibacteria": "Vicinamibacter silvestris",
    "Vulcanimicrobiia": "Vulcanimicrobium thermophilum",
    "Zetaproteobacteria": "Mariprofundus ferrooxydans"
}


In [7]:
# Goal: choose one representative species per taxonomic class, then collect one genome
# record per chosen species into `genomes_for_tree`.

# Step 1: Identify all classes and classify them as available or unavailable.
# A class is "available" only when its own curated representative is listed in
# `taxonomy` under that same class. Matching on the (Class, Species) pair, rather than
# on species membership alone, stops a class from being marked available merely because
# a representative curated for a *different* class falls into it; such a class would
# otherwise keep a species that is absent from the data and never receive a fallback.
# Placeholder entries (None) are skipped so they are never compared against Species.
all_classes = set(taxonomy['Class'])
class_species_pairs = set(zip(taxonomy['Class'], taxonomy['Species']))
available_classes = {
    class_ for class_, species in representative_species.items()
    if species is not None and (class_, species) in class_species_pairs
}
unavailable_classes = all_classes - available_classes

# Step 2: Filter representative_species to only include available classes
# (iterating the curated dict keeps its original ordering).
filtered_representative_species = {
    class_: species for class_, species in representative_species.items()
    if class_ in available_classes
}

# Step 3: Merge nr_genomes_dataset with taxonomy to get class information
merged_genomes = nr_genomes_dataset.merge(taxonomy[['Species', 'Class']], on='Species').drop_duplicates()

# Step 4: For every unavailable class, fall back to the first species in
# merged_genomes whose Reference field is set (anything other than '-').
unavailable_class_species = (
    merged_genomes[
        merged_genomes['Class'].isin(unavailable_classes) & (merged_genomes['Reference'] != '-')
    ]
    .drop_duplicates(subset=['Class'])
    .set_index('Class')['Species']
    .to_dict()
)

# Step 5: Combine filtered representative species with unavailable class species
final_representative_species = {**filtered_representative_species, **unavailable_class_species}

# Step 6: Handle remaining classes (if any) that only have genomes without a reference
# flag; again the first species encountered per class is used.
remaining_class_species = (
    merged_genomes[
        merged_genomes['Class'].isin(all_classes - set(final_representative_species.keys())) &
        (merged_genomes['Reference'] == '-')
    ]
    .drop_duplicates(subset=['Class'])
    .set_index('Class')['Species']
    .to_dict()
)

# Step 7: Update the final representative species with remaining classes
final_representative_species.update(remaining_class_species)

# Step 8: Split genomes with and without references based on final species list
final_species = set(final_representative_species.values())

tree_columns = ['Organism', 'Species', 'Reference', 'Size (Mb)', 'ProteomeFile']
is_final_species = genomes_dataset['Species'].isin(final_species)
has_reference = genomes_dataset['Reference'] != '-'

# One row per species, preferring the first genome that carries a reference flag.
genomes_with_ref = (
    genomes_dataset.loc[is_final_species & has_reference, tree_columns]
    .drop_duplicates(subset='Species', keep='first')
)

genomes_without_ref = genomes_dataset.loc[is_final_species & ~has_reference, tree_columns]

# Exclude species already present in genomes_with_ref, then keep one row per species.
# The result is reassigned instead of using inplace=True, which would mutate a
# filtered slice of genomes_dataset.
genomes_without_ref = (
    genomes_without_ref[~genomes_without_ref['Species'].isin(genomes_with_ref['Species'])]
    .drop_duplicates(subset='Species', keep='first')
)

# Step 9: Combine genomes with and without references for tree generation
genomes_for_tree = pd.concat([genomes_with_ref, genomes_without_ref], ignore_index=True)

# Final Output: `genomes_for_tree` contains the combined data


In [8]:
# Label every selected genome with its taxonomic class. The join keys are left implicit,
# so pandas joins on the column names shared by both frames.
class_lookup = taxonomy[['Species', 'Class']]
genomes_for_tree = pd.merge(genomes_for_tree, class_lookup).drop_duplicates()

In [9]:
# copy_files(genomes_for_tree['ProteomeFile'].unique(), '/Users/akshayonly/Work/04-Complex-I/Data/01-Sequence-Data/03-Proteomes', '/Users/akshayonly/Work/04-Complex-I/Data/06-Phylogeny-Data/Class-Tree/seqs')

In [10]:
# tree_info = pd.read_csv('/Users/akshayonly/Work/04-Complex-I/Data/06-Phylogeny-Data/Class-Tree/results/Genomes_summary_info.tsv', sep='\t')

In [11]:
# Derive the tree node label from the proteome filename by dropping its '.faa' extension.
# The pattern is anchored to the end of the name so only a trailing '.faa' is removed
# (a plain str.replace would also delete '.faa' occurring elsewhere in the name), and
# the vectorised string accessor passes a missing filename through as NaN instead of
# raising on it.
genomes_for_tree['Node'] = genomes_for_tree['ProteomeFile'].str.replace(r'\.faa$', '', regex=True)

In [12]:
# Preview the first five selected genomes, including the new Class and Node columns.
genomes_for_tree.head(n=5)

Unnamed: 0,Organism,Species,Reference,Size (Mb),ProteomeFile,Class,Node
0,Bacteria,Campylobacter jejuni,REFR,1.64148,GCA_000009085.1_ASM908v1_cds_proteins.faa,Epsilonproteobacteria,GCA_000009085.1_ASM908v1_cds_proteins
33,Bacteria,Escherichia coli,REFR,4.64165,GCA_000005845.2_ASM584v2_cds_proteins.faa,Gammaproteobacteria,GCA_000005845.2_ASM584v2_cds_proteins
239,Bacteria,Neisseria meningitidis,REPR,2.18133,GCA_022869645.1_ASM2286964v1_cds_proteins.faa,Betaproteobacteria,GCA_022869645.1_ASM2286964v1_cds_proteins
257,Bacteria,Fusobacterium nucleatum,REPR,2.1801,GCF_003019295.1_ASM301929v1_cds_proteins.faa,Fusobacteriia,GCF_003019295.1_ASM301929v1_cds_proteins
262,Bacteria,Chlorobium phaeobacteroides,REPR,3.1339,GCA_000015125.1_ASM1512v1_cds_proteins.faa,Chlorobiia,GCA_000015125.1_ASM1512v1_cds_proteins


In [13]:
# genomes_for_tree[['Node', 'Class']].to_csv(
#     '/Users/akshayonly/Work/04-Complex-I/Data/06-Phylogeny-Data/Class-Tree/class_labels.txt',
#     sep=' ',
#     index=False
# )

In [14]:
# Hex colour assigned to each value of the 'Organism' column.
organism_colors = dict(Bacteria='#40407a', Archaea='#218c74')

# Translate each genome's organism into its colour; organisms missing from the
# mapping come out as NaN.
organism_color_column = genomes_for_tree['Organism'].map(organism_colors)
genomes_for_tree['OrganismColors'] = organism_color_column

In [15]:
# genomes_for_tree[['Node', 'OrganismColors']].to_csv(
#     '/Users/akshayonly/Work/04-Complex-I/Data/06-Phylogeny-Data/Class-Tree/organism.txt',
#     sep=' ',
#     index=False
# )