### Splitting up diamond classification of ORFs into taxonomic categories for future plotting

In [1]:
import glob
import os
import pandas as pd
import pysam
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Directory holding the whole-community transcript files. The literal is split in two only
# to keep the lines short; the trailing slash is kept because other cells append file names
# to this string by plain concatenation.
working_dir = (
    '/vortexfs1/omics/env-bio/collaboration/dinoflagellates_METZYME/data/'
    'metaT_trimmed_reads/fasta_files/paired/mRNA/whole_community_transcripts/'
)

In [3]:
# Display names of the whole-community categories, in the order they are listed for plotting.
# NOTE(review): the "Other ..." labels here use spaces, while the df_dict keys (and therefore
# the output file names) built further down use underscores, e.g. 'Other_cyanobacteria' —
# confirm that downstream code maps between the two spellings.
wc_categories = [
    # Bacteria
    'Bacteroidetes',
    'Alphaproteobacteria',
    'Betaproteobacteria',
    'Deltaproteobacteria',
    'Gammaproteobacteria',
    'Other proteobacteria',
    'Other bacteria',
    # Archaea
    'Archaea',
    # Cyanobacteria
    'Prochlorococcus',
    'Synechococcus',
    'Other cyanobacteria',
    # Eukaryotes
    'Chlorophyta',
    'Cryptophyta',
    'Diatom',
    'Other stramenopiles',
    'Dinophyta',
    'Haptophyta',
    'Amoebozoa',
    'Ciliophora',
    'Excavata',
    'Fungi',
    'Metazoa',
    'Other eukaryotes',
    'Rhizaria',
    # Viruses
    'Viruses',
]

In [4]:
# Load the taxonomy table produced by metaT_taxonomy.ipynb; the cells below split it into
# one dataframe (and eventually one TSV) per taxonomic category.

# The path is relative to working_dir: one directory up, then into diamond_output/.
taxonomy_tsv_path = working_dir + ('../diamond_output/dino_metzyme_annotated_coassembly_diamond_out'
                                   '_taxonomy.tsv')
taxon_df = pd.read_csv(taxonomy_tsv_path, sep='\t')

### Start with bacteria kingdom

The approach is to first subset the classifications where we want everything (e.g. all ORFs in the Synechococcus genus), and then, for each "Other" category, keep whatever remains of the parent group once those subsets are removed (e.g. "Other cyanobacteria" is every ORF whose taxonomy contains Cyanobacteria but neither Prochlorococcus nor Synechococcus).

We will generate a dictionary of dataframes, one for each taxon of interest.

In [213]:
# Starting with the most specific levels of bacteria: every ORF whose taxonomy string
# mentions the group name is assigned to that group.
#
# NOTE(review): this is a plain substring match, so any lineage that merely mentions one of
# these names is also picked up (e.g. a virus named after its host, if the table contains
# such entries) — confirm against the actual taxonomy strings.

df_dict = {}
for category in ['Bacteroidetes', 'Alphaproteobacteria', 'Betaproteobacteria', 'Deltaproteobacteria',
                 'Gammaproteobacteria', 'Prochlorococcus', 'Synechococcus']:
    # Look for rows that contain the category name (the match is case-sensitive, which helps
    # prevent us from including mismatches).
    # regex=False: the names are literal text, not patterns.
    # na=False: rows with a missing taxonomy count as "no match" instead of putting NaN into
    # the boolean mask, which pandas refuses to index with.
    df = taxon_df[taxon_df['taxonomy'].str.contains(category, regex=False, na=False)]
    df_dict[category] = df

In [215]:
# Other cyanobacteria: ORFs whose taxonomy mentions Cyanobacteria but neither of the two
# genera that get their own category.
#
# This is expressed as a boolean mask on taxon_df rather than as a merge-based anti-join.
# The selected rows, their order and their columns are the same, but the rows keep their
# original taxon_df index (a merge replaces it with a positional one), so the table is
# indexed consistently with the directly filtered categories. It also avoids joining the
# table against itself on every column once per subcategory.
taxonomy_col = taxon_df['taxonomy']

# regex=False: literal names; na=False: a missing taxonomy is simply "no match".
is_other = taxonomy_col.str.contains('Cyanobacteria', regex=False, na=False)

# Subtract every subcategory that already has its own dataframe
for category in ['Synechococcus', 'Prochlorococcus']:
    is_other = is_other & ~taxonomy_col.str.contains(category, regex=False, na=False)

other_df = taxon_df[is_other]

df_dict['Other_cyanobacteria'] = other_df

In [216]:
# Other proteobacteria: ORFs whose taxonomy mentions Proteobacteria but none of the four
# classes that get their own category.
#
# A boolean mask on taxon_df replaces the merge-based anti-join: the selected rows, their
# order and their columns are the same, but the original taxon_df index is kept (a merge
# replaces it with a positional one) and the table is no longer joined against itself on
# every column once per class.
taxonomy_col = taxon_df['taxonomy']

# regex=False: literal names; na=False: a missing taxonomy is simply "no match".
is_other = taxonomy_col.str.contains('Proteobacteria', regex=False, na=False)

for category in ['Alphaproteobacteria', 'Betaproteobacteria', 'Deltaproteobacteria',
                 'Gammaproteobacteria']:
    is_other = is_other & ~taxonomy_col.str.contains(category, regex=False, na=False)

other_df = taxon_df[is_other]

df_dict['Other_proteobacteria'] = other_df

In [217]:
# Other bacteria: ORFs whose taxonomy mentions Bacteria but none of the three major phyla
# of interest.
#
# A boolean mask on taxon_df replaces the merge-based anti-join: the selected rows, their
# order and their columns are the same, but the original taxon_df index is kept (a merge
# replaces it with a positional one) and the table is no longer joined against itself on
# every column once per phylum.
taxonomy_col = taxon_df['taxonomy']

# The match is case-sensitive, so 'Bacteria' does not match the lowercase "bacteria" inside
# names such as "Cyanobacteria".
# regex=False: literal names; na=False: a missing taxonomy is simply "no match".
is_other = taxonomy_col.str.contains('Bacteria', regex=False, na=False)

for category in ['Cyanobacteria', 'Proteobacteria', 'Bacteroidetes']:
    # Remove every row that belongs to this phylum
    is_other = is_other & ~taxonomy_col.str.contains(category, regex=False, na=False)

other_df = taxon_df[is_other]

df_dict['Other_bacteria'] = other_df

### Repeat with archaea and viruses

In [218]:
# Just pick out rows whose taxonomy contains "Archaea" or "Viruses" (case-sensitive).

for category in ['Archaea', 'Viruses']:
    # regex=False: the names are literal text, not patterns.
    # na=False: rows with a missing taxonomy count as "no match" instead of putting NaN into
    # the boolean mask, which pandas refuses to index with.
    df = taxon_df[taxon_df['taxonomy'].str.contains(category, regex=False, na=False)]
    df_dict[category] = df

### Lastly, do Eukaryota — same approach as for bacteria above

In [None]:
# Eukaryotic groups that each get their own category: every ORF whose taxonomy string
# mentions the group name is assigned to that group.
for category in ['Chlorophyta', 'Cryptophyta', 'Bacillariophyta', 'Dinophyta', 'Excavata',
                 'Haptophyta', 'Amoebozoa', 'Ciliophora', 'Fungi', 'Metazoa', 'Rhizaria']:
    # regex=False: the names are literal text, not patterns.
    # na=False: rows with a missing taxonomy count as "no match" instead of putting NaN into
    # the boolean mask, which pandas refuses to index with.
    df = taxon_df[taxon_df['taxonomy'].str.contains(category, regex=False, na=False)]
    df_dict[category] = df

# Using common name for diatom for ease of interpretation
df_dict['Diatom'] = df_dict.pop('Bacillariophyta')

In [220]:
# Other stramenopiles: ORFs whose taxonomy mentions "Stramenopile" but that are not diatoms.
#
# A boolean mask on taxon_df replaces the merge-based anti-join: the selected rows, their
# order and their columns are the same, but the original taxon_df index is kept (a merge
# replaces it with a positional one).
taxonomy_col = taxon_df['taxonomy']

# regex=False: literal names; na=False: a missing taxonomy is simply "no match".
is_other = taxonomy_col.str.contains('Stramenopile', regex=False, na=False)

# The 'Diatom' dataframe holds the rows matching 'Bacillariophyta', so that is the name to
# exclude here.
for category in ['Bacillariophyta']:
    is_other = is_other & ~taxonomy_col.str.contains(category, regex=False, na=False)

other_df = taxon_df[is_other]

df_dict['Other_stramenopiles'] = other_df

In [221]:
# Other eukaryotes: ORFs whose taxonomy mentions Eukaryota but none of the eukaryotic groups
# that already have their own category.
#
# A boolean mask on taxon_df replaces the merge-based anti-join: the selected rows, their
# order and their columns are the same, but the original taxon_df index is kept (a merge
# replaces it with a positional one) and the table is no longer joined against itself on
# every column once per group.
taxonomy_col = taxon_df['taxonomy']

# regex=False: literal names; na=False: a missing taxonomy is simply "no match".
is_other = taxonomy_col.str.contains('Eukaryota', regex=False, na=False)

# 'Bacillariophyta' covers the Diatom category; together with 'Stramenopile' it also covers
# Other_stramenopiles (stramenopiles that are not diatoms).
for category in ['Chlorophyta', 'Cryptophyta', 'Bacillariophyta', 'Dinophyta', 'Excavata',
                 'Haptophyta', 'Amoebozoa', 'Ciliophora', 'Fungi', 'Metazoa', 'Rhizaria', 'Stramenopile']:
    is_other = is_other & ~taxonomy_col.str.contains(category, regex=False, na=False)

other_df = taxon_df[is_other]

df_dict['Other_eukaryotes'] = other_df

In [222]:
# Write every category's dataframe to its own TSV for ease of making the plots.
# The eukaryote files can also be used on their own for eukaryote-only relative abundance plots.

# Output paths have the form <working_dir>transcripts_{key}.tsv
transcript_file_name = working_dir + 'transcripts_'

for key, category_df in df_dict.items():
    # The dataframe index is written as the first (unnamed) column, since index=False is not set.
    category_df.to_csv(f'{transcript_file_name}{key}.tsv', sep='\t')