### Plotting whole-community abundance and eukaryotic community abundance from metatranscriptomic analysis

In [1]:
import glob
import os
import pandas as pd
import pysam
import re
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
working_dir = '/vortexfs1/omics/env-bio/collaboration/dinoflagellates_METZYME/data/' \
            'metaT_trimmed_reads/fasta_files/paired/mRNA/whole_community_transcripts/'

In [3]:
wc_categories = ['Bacteroidetes', 'Alphaproteobacteria', 'Betaproteobacteria', 'Deltaproteobacteria', 
                 'Gammaproteobacteria', 'Other proteobacteria', 'Other bacteria', 'Archaea', 'Prochlorococcus', 
                 'Synechococcus', 'Other cyanobacteria', 'Chlorophyta', 'Cryptophyta', 'Diatom', 
                 'Other stramenopiles', 'Dinophyta', 'Haptophyta', 'Amoebozoa', 'Ciliophora', 'Excavata', 'Fungi', 
                 'Metazoa', 'Other eukaryotes', 'Rhizaria', 'Viruses']

In [4]:
taxonomy_files = glob.glob(working_dir + '*')

#### Interpreting relative community abundance as the fraction of original reads that mapped to a given phylum, super group, etc. relative to the total number of reads. 
In the paper, they use a combination of phylum, supergroup, genus and other taxonomic classifications, depending on what's present and interesting, so we have used the exact same breakdown, as opposed to a straight phylum-only breakdown or other approach.

To do this, we want to use the output of the BWA alignment to get the number of reads that correspond to an individual contig, then sum at the individual levels to get a total number of reads for each classification. Using a similar approach to `TPM_normalization_and_plotting.ipynb`, but we are not normalizing by gene length because we're not looking at individual gene expression.

In [5]:
### Already have taxon_df generated from before

In [6]:
bam_paths = ['../bwa_mem_nucleotide_files/30B8Z_S11_001_40m_mRNA_mapped_sorted.bam',
            '../bwa_mem_nucleotide_files/30B90_S12_001_70m_mRNA_mapped_sorted.bam',
            '../bwa_mem_nucleotide_files/30B91_S28_001_380m_mRNA_mapped_sorted.bam']

In [7]:
def join_read_counts(df_taxon, bam_path):
    # This function takes in the location of bam files and a dataframe with contig and taxonomy info
    # and returns the same dataframe with read counts added
    samfile = pysam.AlignmentFile(working_dir+path, "rb")
    sample_id = path.split('/')[1].split('_mRNA')[0]

    idx_stats = samfile.get_index_statistics()

    contigs = []
    mapped = []
    unmapped = []

    for contig_stat in idx_stats:
        contigs.append(contig_stat.contig)
        mapped.append(contig_stat.mapped)
        unmapped.append(contig_stat.unmapped)

    ORF_read_counts = pd.DataFrame({'contig': contigs, 'mapped': mapped, 'unmapped': unmapped})

    #Ignore all contigs with no mapping hits
    ORF_read_counts = ORF_read_counts[ORF_read_counts['mapped'] != 0]

    taxon_ORF_reads = ORF_read_counts.join(df_taxon.set_index('qseqid'), on='contig', how='inner')

    return taxon_ORF_reads

In [None]:
abundance_dict = {}

for path in bam_paths:
    
    #Use regex to get the name of the site
    sample_name = re.search('[0-9]{2,3}m', path)[0]
    print(sample_name)
    
    sample_abundance_dict = {}

    for file in taxonomy_files:
        #Get just the name of the classification, which is the very end of the file i.e. .../transcripts_Rhizaria.tsv
        name = re.search('transcripts_.*.tsv', file)[0].split('transcripts_')[1].split('.tsv')[0]
        print(name)
        #Read in data
        taxon_df = pd.read_csv(file, sep='\t')
        #Do mapping with original reads
        temp_df = join_read_counts(taxon_df, path)
        #Calculate "abundance" as the sum of total reads mapped to the group
        abundance = temp_df['mapped'].sum()
        #Add to a dictionary
        sample_abundance_dict[name] = abundance
    
    abundance_dict[sample_name] = sample_abundance_dict
    

40m
Other_cyanobacteria
Rhizaria
Betaproteobacteria
Fungi
Amoebozoa
Other_stramenopiles
Deltaproteobacteria
Archaea
Viruses
Alphaproteobacteria
Other_proteobacteria
Bacteroidetes
Dinophyta
Other_eukaryotes
Haptophyta
Prochlorococcus
Cryptophyta
Gammaproteobacteria
Excavata
Metazoa
Other_bacteria
Ciliophora
Chlorophyta
Diatom
Synechococcus
70m
Other_cyanobacteria
Rhizaria
Betaproteobacteria


In [None]:
abundance_df = pd.DataFrame(abundance_dict)

In [None]:
abundance_df.index = abundance_df.index.str.replace('_', ' ')
abundance_df = abundance_df.loc[wc_categories] #sort similar to original plot

In [None]:
rel_abundance_df = abundance_df*100/abundance_df.sum()
#Add prefix to identify it as station 9
rel_abundance_df = rel_abundance_df.add_prefix('9_')
rel_abundance_df.index = rel_abundance_df.index.str.capitalize()

In [None]:
df_len = rel_abundance_df.shape[0]

In [None]:
custom_palette = sns.color_palette("tab20b", df_len).as_hex()

In [None]:
sns.set()

fig, ax = plt.subplots(figsize = (6, 8))
rel_abundance_df.transpose().plot.bar(stacked='True', ax=ax, color=custom_palette)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel('Site depth')
plt.ylabel('Percent')

### Filtering for eukaryotes only (Fig. 1b)

In [None]:
colors = ['#B2DF8A', '#CAB2D6', '#FDBF6F', '#FFFF99', '#1F78B4', '#E7298A', 
         '#1B9E77', '#D95F02', "#7570B3", '#E6AB02', '#66A61E']

#Trying to use similar colors to similar plot made from 18S data

In [None]:
euks = ['Rhizaria', 'Fungi', 'Amoebozoa', 'Other stramenopiles', 'Diatom', 'Other eukaryotes', 'Dinophyta',
        'Haptophyta', 'Excavata', 'Metazoa', 'Ciliophora', 'Cryptophyta', 'Chlorophyta']
euks.sort() #sort alphabetically

In [None]:
euk_abundance_df = abundance_df.loc[euks]
rel_abundance_df = euk_abundance_df*100/euk_abundance_df.sum()
#Add prefix to identify it as station 9
rel_abundance_df = rel_abundance_df.add_prefix('9_')
rel_abundance_df.index = rel_abundance_df.index.str.capitalize()

In [None]:
sns.set()

fig, ax = plt.subplots(figsize = (6, 8))
rel_abundance_df.transpose().plot.bar(stacked='True', ax=ax, color=colors)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel('Site depth')
plt.ylabel('Percent')