# Limit dataset by Taxonomy (only Eukaryotic Phytoplankton/Protists)

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import glob

### File Locations

In [2]:
# Filename prefix shared by the input and output CSV tables used in this notebook
prefix = 'GLOMICON'

#Data Directory
# Relative path (with trailing slash); file names are appended to it directly
directory = '../data/'


### Functions

In [3]:
# Dada2 Banzai Output Functions
# Taxonomic rank names, listed from Kingdom down to Species.
# NOTE: the taxonomy-filtering cells further down rebind `levels` to the
# column list of the taxa table, so this initial value is overwritten there.
levels = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']

def make_metadata(infile):
    """Load a sample-metadata CSV with its first column as the index.

    The index is labelled 'sample_name' and the number of rows (samples) is
    printed. No column parsing (e.g. of dates) is performed.

    Parameters
    ----------
    infile : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame indexed by 'sample_name'.
    """
    metadata = pd.read_csv(infile, index_col=0).rename_axis('sample_name')
    print('Number samples:', metadata.shape[0])
    return metadata

def make_taxa_asv(infile):
    """Load an ASV-level CSV (counts, taxonomy or sequences) as-is.

    The file is read with pandas defaults, so no column is promoted to the
    index here; callers set the index themselves. The number of rows (ASVs)
    is printed.

    Parameters
    ----------
    infile : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame with a default integer index.
    """
    table = pd.read_csv(infile)
    n_asvs = table.shape[0]
    print('Number ASVs:', n_asvs)
    return table

def from_metadata_to_taxareads(meta_data, otu_table, taxa_table):
    """Limit the OTU and taxa tables to what is present in the given samples.

    Parameters
    ----------
    meta_data : DataFrame with sample names as the index.
    otu_table : DataFrame with OTUs/ASVs as the index and sample names as
        columns (read counts as values).
    taxa_table : DataFrame with OTUs/ASVs as the index.

    Returns
    -------
    (otu_lim, taxa_lim)
        otu_lim  -- ``otu_table`` restricted to samples that also appear in
                    ``meta_data`` and to OTUs whose summed reads across those
                    samples are > 0.
        taxa_lim -- ``taxa_table`` restricted to the OTUs kept in ``otu_lim``.

    Notes
    -----
    Columns are selected by position and rows by a boolean mask, so a sample
    that happens to be called 'Total', or a metadata/taxonomy column whose
    name matches an OTU/sample name, is no longer silently dropped.
    """
    # Inner join on sample name keeps only samples found in both tables;
    # metadata columns come first, OTU columns after them.
    n_meta_cols = meta_data.shape[1]
    merged = pd.concat([meta_data, otu_table.T], join='inner', axis=1)
    otu_lim = merged.iloc[:, n_meta_cols:].T

    # Keep only OTUs that have at least one read in the retained samples.
    has_reads = otu_lim.sum(axis=1) > 0
    otu_lim = otu_lim.loc[has_reads]

    # Inner join on OTU name, then keep only the taxonomy columns
    # (everything after the sample columns).
    n_samples = otu_lim.shape[1]
    joined = pd.concat([otu_lim, taxa_table], axis=1, join='inner')
    taxa_lim = joined.iloc[:, n_samples:]
    return otu_lim, taxa_lim

## Load Data

In [4]:
marker = '18S'
print('#####' + marker + '#####')

# otu table (indexed by ASV)
otu_path = directory + prefix + '_asv_merged.csv'
print(otu_path)
otu_all = make_taxa_asv(otu_path).set_index('ASV')

# taxa table (indexed by ASV)
taxa_path = directory + prefix + '_taxa_blastnr.csv'
print(taxa_path)
taxa_all = make_taxa_asv(taxa_path).set_index('ASV')

# metadata (indexed by sample_name, set inside make_metadata)
meta_path = directory + prefix + '_meta_merged.csv'
print(meta_path)
meta_all = make_metadata(meta_path)

# sequence table (indexed by ASV)
seq_path = directory + prefix + '_seq_merged.csv'
print(seq_path)
seq_all = make_taxa_asv(seq_path).set_index('ASV')

# preview the first rows of the sequence table
seq_all.head()

#####18S#####
../data/GLOMICON_asv_merged.csv
Number ASVs: 14547
../data/GLOMICON_taxa_blastnr.csv
Number ASVs: 14547
../data/GLOMICON_meta_merged.csv
Number samples: 170
../data/GLOMICON_seq_merged.csv
Number ASVs: 14547


Unnamed: 0_level_0,sequence,Analyzing_Institute
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1
ASV_1,CAATAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTCGG...,UDAL
ASV_2,GCACCTACCGATTGAATGGTCCGGTGAAGACTCGGGATTGTGGTCT...,MBARI
ASV_3,GCACCTACCGATTGAATGGTCCGGTGAGGCCTCGGGATCGTGGCGA...,MBARI
ASV_4,GCACCTACCGATTGAATGGTCCGGTGAAGCCTCGGGATTGTGGTTG...,MBARI
ASV_5,GCTCCTACCGATTGAGTGATCCGGTGAATAATTCGGACTGCAGCAG...,MBARI


## Filter by Taxonomy

### Look at spread of values

In [5]:
# Domains that are excluded outright (prokaryotes and unclassified hits)
excluded_domains = ['Bacteria', 'Archaea', 'no_hit', 'unassigned']

df = taxa_all.copy()
df = df[~df['Domain'].isin(excluded_domains)]
print(df['Kingdom'].unique())

# Drop animals and fungi; rows with a missing Kingdom are kept
df = df[~df['Kingdom'].isin(['Metazoa', 'Fungi'])]

levels = df.columns.tolist()
# Keep every column after the first one (expected to be 'Domain') and remove
# rows where all of those remaining ranks are NaN
print(levels[1:])
df = df[levels[1:]].dropna(how='all')

# Count ASVs per Kingdom/Phylum combination (NaN groups included)
df = (
    df.assign(**{'Number ASVs': 1})
      .groupby(['Kingdom', 'Phylum'], dropna=False)
      .sum(numeric_only=True)
)
df

[nan 'Metazoa' 'Viridiplantae' 'Fungi']
['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']


Unnamed: 0_level_0,Unnamed: 1_level_0,Number ASVs
Kingdom,Phylum,Unnamed: 2_level_1
Viridiplantae,Chlorophyta,368
Viridiplantae,Prasinodermophyta,11
Viridiplantae,Streptophyta,29
Viridiplantae,,7
,Apicomplexa,104
,Bacillariophyta,1327
,Cercozoa,349
,Ciliophora,730
,Discosea,25
,Endomyxa,71


In [6]:
# Domains that are excluded outright (prokaryotes and unclassified hits)
excluded_domains = ['Bacteria', 'Archaea', 'no_hit', 'unassigned']

df = taxa_all.copy()
df = df[~df['Domain'].isin(excluded_domains)]
print(df['Kingdom'].unique())

# Drop animals and fungi; rows with a missing Kingdom are kept
df = df[~df['Kingdom'].isin(['Metazoa', 'Fungi'])]

levels = df.columns.tolist()
# Keep every column after the first one (expected to be 'Domain') and remove
# rows where all of those remaining ranks are NaN
print(levels[1:])
df = df[levels[1:]].dropna(how='all')

# Look only at ASVs whose Phylum is the literal string 'unknown' and count
# them per Kingdom/Phylum/Class combination (NaN groups included)
df = df.assign(**{'Number ASVs': 1})
df = df[df['Phylum'] == 'unknown']
df = (
    df.groupby(['Kingdom', 'Phylum', 'Class'], dropna=False)
      .sum(numeric_only=True)
)
df

[nan 'Metazoa' 'Viridiplantae' 'Fungi']
['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Number ASVs
Kingdom,Phylum,Class,Unnamed: 3_level_1
,unknown,Acantharea,51
,unknown,Bigyra,358
,unknown,Bolidophyceae,67
,unknown,Centroplasthelida,21
,unknown,Choanoflagellata,143
,unknown,Chrysomerophyceae,1
,unknown,Chrysophyceae,88
,unknown,Cryptophyceae,237
,unknown,Developea,5
,unknown,Dictyochophyceae,71


### Proceed with limited data

In [7]:
# Domains that are excluded outright (prokaryotes and unclassified hits)
excluded_domains = ['Bacteria', 'Archaea', 'no_hit', 'unassigned']

df = taxa_all.copy()
df = df[~df['Domain'].isin(excluded_domains)]
print(df['Kingdom'].unique())

# Drop animals and fungi; rows with a missing Kingdom are kept
df = df[~df['Kingdom'].isin(['Metazoa', 'Fungi'])]

levels = df.columns.tolist()
# Keep every column after the first one (expected to be 'Domain') and remove
# rows where all of those remaining ranks are NaN
print(levels[1:])
df = df[levels[1:]].dropna(how='all')

# This filtered taxonomy table is what the rest of the notebook works with
taxa_new = df.copy()
taxa_new

[nan 'Metazoa' 'Viridiplantae' 'Fungi']
['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']


Unnamed: 0_level_0,Kingdom,Phylum,Class,Order,Family,Genus,Species
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ASV_13705,,unknown,Dinophyceae,Syndiniales,,,
ASV_11132,,Cercozoa,Thecofilosea,Tectofilosida,Chlamydophryidae,Lecythium,
ASV_13578,,unknown,Dinophyceae,Gonyaulacales,Pyrophacaceae,Fragilidium,
ASV_13105,,Haptophyta,unknown,Phaeocystales,Phaeocystaceae,,
ASV_13582,,Haptophyta,unknown,Phaeocystales,Phaeocystaceae,,
...,...,...,...,...,...,...,...
ASV_3017,,Rhodophyta,Florideophyceae,Hapalidiales,,,
ASV_4157,,Bacillariophyta,Coscinodiscophyceae,Melosirales,Melosiraceae,Melosira,
ASV_9654,,Bacillariophyta,Coscinodiscophyceae,Melosirales,Melosiraceae,Melosira,
ASV_12880,,Bacillariophyta,Bacillariophyceae,Mastogloiales,Achnanthaceae,Achnanthes,


In [8]:
# limit otu table to the ASVs that survived the taxonomy filter:

keep_ASVs = taxa_new.index.tolist()
print(len(keep_ASVs))
# missing counts are treated as zero reads
otu_new = otu_all.loc[otu_all.index.isin(keep_ASVs)].fillna(0)

# check minimum reads per sample:
# transpose so samples are rows, total the reads of each sample, and list
# the samples from fewest to most reads
df = (
    otu_new.T
    .assign(tot_reads=lambda per_sample: per_sample.sum(axis=1))
    .sort_values('tot_reads')
)
df

10809


ASV,ASV_1,ASV_2,ASV_3,ASV_4,ASV_5,ASV_6,ASV_7,ASV_8,ASV_9,ASV_10,...,ASV_14539,ASV_14540,ASV_14541,ASV_14542,ASV_14543,ASV_14544,ASV_14545,ASV_14546,ASV_14547,tot_reads
G9r-NOAA17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,724.0
E-G2-NOAA29,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1197.0
E-G13-NOAA23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1606.0
E-G14-ROS20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2030.0
E-G3-DAL6,33.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2947.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
evenMock_04_0049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6575.0,122301.0
bloomMock_01_0049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14507.0,133951.0
bloomMock_03_0049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15011.0,138323.0
bloomMock_05_0049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16280.0,151914.0


## Export Tables

In [72]:
# write the taxonomy-limited tables to csv files and report each path
tables_by_name = {'asv': otu_new, 'taxa': taxa_new}
for name, table in tables_by_name.items():
    out_path = directory + prefix + '_' + name + '_limitByTaxa.csv'
    table.to_csv(out_path)
    print(out_path)

../data/GLOMICON_asv_limitByTaxa.csv
../data/GLOMICON_taxa_limitByTaxa.csv
