# Format MEGAN output into an updated taxonomy table for all ASVs

## Import Libraries

In [1]:
import sys
import pandas as pd
import numpy as np
import glob

## Set Locations

In [4]:
# Folder holding the BLAST/MEGAN files; the taxonomy-path table read below lives in it.
directory = '../blastNR/'
taxa_table_file = f'{directory}tpath.txt'
print('taxa table:', taxa_table_file)

taxa table: ../blastNR/tpath.txt


## Functions

In [9]:
def from_fasta_to_df(file):
    """Read a FASTA file into a DataFrame of sequences indexed by record ID.

    A line starting with '>' opens a new record; the text after '>' (with
    surrounding whitespace removed) is the record ID.  All following
    non-blank lines up to the next header are concatenated into that
    record's sequence, so wrapped (multi-line) FASTA records are read
    correctly.  Blank lines are skipped, and any sequence text that appears
    before the first header is ignored because it has no ID to attach to.

    The file path and the number of IDs and sequences found are printed.

    Parameters
    ----------
    file : str
        Path of the FASTA file to read.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ID (used as the index) with a single
        'sequence' column.  If an ID occurs more than once, the last
        sequence read for it is kept.
    """
    print(file)
    Ids = []
    seqs = []
    # Pieces of the record currently being read; None until the first header.
    current = None
    with open(file) as f:
        for strline in f:
            if strline.startswith('>'):
                # A new header closes the previous record, if any.
                if current is not None:
                    seqs.append(''.join(current))
                Ids.append(strline[1:].strip())
                current = []
                continue
            piece = strline.strip()
            if not piece:
                # Blank lines carry no sequence and must not count as records.
                continue
            if current is not None:
                current.append(piece)
    # Close the final record (there is no following header to trigger it).
    if current is not None:
        seqs.append(''.join(current))
    print('Number of Ids:', len(Ids))
    print('Number of Seqs:', len(seqs))
    seq_dict = dict(zip(Ids, seqs))
    # make pandas df
    df = pd.DataFrame.from_dict(seq_dict, orient='index', columns=['sequence'])
    return df

## Format MEGAN output

In [8]:
# Taxa table: split each ASV's taxonomy path string into one column per rank.
levels = ['Domain', 'Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
df = pd.read_csv(taxa_table_file, sep="\t", header=None, names=['ASV', 'taxonomy'])
df = df.set_index('ASV')

# Each rank is tagged in the path as "[X] name;" -- capture the name, i.e.
# everything after the tag up to the next ';'. Ranks absent from a path stay NaN.
rank_patterns = {
    'Domain': r'\[D\] ([^;]*);.*',
    'Kingdom': r'\[K\] ([^;]*);.*',
    'Phylum': r'\[P\] ([^;]*);.*',
    'Class': r'\[C\] ([^;]*);.*',
    'Order': r'\[O\] ([^;]*);.*',
    'Family': r'\[F\] ([^;]*);.*',
    'Genus': r'\[G\] ([^;]*);.*',
    'Species': r'\[S\] ([^;]*);.*',
}
for level in levels:
    df[level] = df['taxonomy'].str.extract(rank_patterns[level])

# Order rows by taxonomy, from the broadest rank to the narrowest.
df = df.sort_values(by=levels)

# MEGAN already exports an 'unknown' term but is inconsistent for some ranks
# (e.g. 'Kingdom'), so only rows with no taxonomy string at all are relabelled.
no_taxonomy = df['taxonomy'].isna()
df.loc[no_taxonomy, levels] = 'unassigned'

# The raw path string is no longer needed once the rank columns exist.
df = df.drop(columns='taxonomy')
taxa_tab = df.copy()
taxa_tab.head()

Unnamed: 0_level_0,Domain,Kingdom,Phylum,Class,Order,Family,Genus,Species
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ASV_121,Archaea,,Candidatus Thermoplasmatota,Candidatus Poseidoniia,Candidatus Poseidoniales,Candidatus Thalassarchaeaceae,Candidatus Thalassarchaeum,Candidatus Thalassoarchaea mediterranii
ASV_2848,Archaea,,Thaumarchaeota,,,,,
ASV_3016,Archaea,,unknown,unknown,unknown,unknown,unknown,archaeon
ASV_3182,Archaea,,unknown,unknown,unknown,unknown,unknown,archaeon
ASV_3359,Archaea,,unknown,unknown,unknown,unknown,unknown,archaeon


## Join with ASV sequence

In [15]:
# The FASTA read here only includes unique sequences.
file = directory + 'GLOMICON_seq_merged_unique.fasta'
# Attach the taxonomy columns by ASV ID, then re-key the table on the sequence
# itself, keeping the ASV ID as an ordinary column named 'ASV'.
df = (
    from_fasta_to_df(file)
    .join(taxa_tab)
    .reset_index()
    .set_index('sequence')
    .rename(columns={'index': 'ASV'})
)
taxa_tab_seq = df.copy()
taxa_tab_seq.head()

../blastNR/GLOMICON_seq_merged_unique.fasta
Number of Ids: 13542
Number of Seqs: 13542


Unnamed: 0_level_0,ASV,Domain,Kingdom,Phylum,Class,Order,Family,Genus,Species
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CAATAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTCGGATTTCGGGTCGGGCCGAGCGGTCTGCCGATGGGTATGCACTGTTTGGCGCGGCCTTCTTTCCGGAGACCGCGGCTACTCTTAACTGAGCGGGCGTGGGAGACGGATCGTTTACTTTGAAAAAATCAGAGTGTTTCTAGCAGGCAGCTCGCTCTTGCATAGGTTAGCATGGGATAATTTAATAGGACTCTGGTGCTATTTTGTTGGTTTCGAACACCGGAGTAATGATTAAAAGGGGCAGTCAGGGGCACTCGTATTCCGTCGAGAGAGGTGAAATTCTCAGACCAATGGAAGACGAACCACTGC,ASV_1,Eukaryota,,Haptophyta,unknown,Phaeocystales,Phaeocystaceae,Phaeocystis,
GCACCTACCGATTGAATGGTCCGGTGAAGACTCGGGATTGTGGTCTGGCTCCTTCATTGGGGCCAGACCGTGAGAACTTGTCTGAACCTTATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC,ASV_2,Eukaryota,,Bacillariophyta,Coscinodiscophyceae,Thalassiosirales,Thalassiosiraceae,,
GCACCTACCGATTGAATGGTCCGGTGAGGCCTCGGGATCGTGGCGAACTTTCTTCATTGGAGGTGAGCTGTGAGAACTTGTCCAAATCTTATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC,ASV_3,Eukaryota,,Bacillariophyta,Coscinodiscophyceae,Melosirales,Melosiraceae,Melosira,
GCACCTACCGATTGAATGGTCCGGTGAAGCCTCGGGATTGTGGTTGGTTTCCTTTATTGGAATCTGACCACGAGAACCTGTCTAAACCTTATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC,ASV_4,Eukaryota,,Bacillariophyta,Bacillariophyceae,Bacillariales,Bacillariaceae,Pseudo-nitzschia,
GCTCCTACCGATTGAGTGATCCGGTGAATAATTCGGACTGCAGCAGTGTTCAGTTCCTGAACGTTGCAGTGGAAAGTTTAGTGAACCTTATCACTTAGAGGAAGGAGAAGTCGTAACAAGGTTTCC,ASV_5,Eukaryota,,unknown,Dinophyceae,,,,


## Add back in duplicate ASVs:

In [24]:
file = '../data/GLOMICON_seq_merged.csv'
# Taxonomy keyed by sequence only; the ASV IDs come from the CSV instead.
# (Joining the full taxa_tab_seq with lsuffix='orig' can be used to check
# that the ASV IDs from the two sources match.)
taxonomy_by_sequence = taxa_tab_seq.drop(columns='ASV')
# Look up taxonomy for every row of the merged table by its sequence, then
# index the result by ASV ID and discard the institute column.
df = (
    pd.read_csv(file)
    .set_index('sequence')
    .join(taxonomy_by_sequence)
    .set_index('ASV')
    .drop(columns='Analyzing_Institute')
)
taxa_tab_all = df.copy()
taxa_tab_all.head()

Unnamed: 0_level_0,Domain,Kingdom,Phylum,Class,Order,Family,Genus,Species
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ASV_1950,Archaea,,unknown,unknown,unknown,unknown,unknown,archaeon
ASV_9581,,,,,,,,
ASV_63,Archaea,,unknown,unknown,unknown,unknown,unknown,archaeon
ASV_3182,Archaea,,unknown,unknown,unknown,unknown,unknown,archaeon
ASV_603,Archaea,,unknown,unknown,unknown,unknown,unknown,archaeon


## Export to csv file

In [25]:
# Write the all-ASV taxonomy table to the data folder.
# Alternative export of the unique-sequence table:
# taxa_tab_seq.to_csv(directory +'GLOMICON_taxa_blastnr.csv')
output_dir = '../data/'
taxa_tab_all.to_csv(output_dir + 'GLOMICON_taxa_blastnr.csv')