# Check Arthropoda MBARI annotations in the GLOMICON dataset



## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import glob
import os

# Show where the notebook is running; the relative paths used below resolve
# against this directory.
print('current directory:', os.getcwd())

# For Illustrator import: write fonts as Type 42 (TrueType) in both PDF and
# PostScript output so that figure text stays editable.
plt.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

# NOTE(review): not referenced in the visible cells -- presumably the output
# location for result files; confirm against the rest of the notebook.
results_directory = '../test/'

current directory: /Users/kpitz/github/GLOMICON/intercomparison/Merged_Datasets/scripts


### File Locations

In [2]:
# Common prefix of the merged input files (e.g. <prefix>_asv_merged.csv)
prefix = 'GLOMICON'

#Data Directory (relative path; the input CSV names are appended to it directly)
directory = '../data/'

#Directory for saving Figures (relative path)
plot_dir = '../figures/test/'
print(plot_dir)

../figures/test/


### Functions

In [3]:
# Dada2 Banzai Output Functions
# Taxonomic ranks, ordered from broadest to most specific; these are the
# column names of the taxa table loaded below.
levels = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']

def make_metadata(infile):
    """Load a sample metadata CSV indexed by sample name.

    The first CSV column becomes the index and is labelled ``sample_name``.
    The number of rows read is printed as a quick sanity check.

    Parameters
    ----------
    infile : str, path or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per sample. No columns are converted after loading (a
        'date' column, if present, is not parsed to datetime here).
    """
    metadata = pd.read_csv(infile, index_col=0).rename_axis('sample_name')
    print('Number samples:', metadata.shape[0])
    return metadata

def make_taxa_asv(infile):
    """Load an ASV-keyed CSV (counts, taxonomy or sequences) as-is.

    The file is read with the default integer index; no column is dropped,
    renamed or promoted to the index here, so callers do that themselves
    (the visible cells call ``set_index('ASV')`` on the result). The number
    of rows read is printed as a quick sanity check.

    Parameters
    ----------
    infile : str, path or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The table exactly as parsed by ``pandas.read_csv``.
    """
    table = pd.read_csv(infile)
    print('Number ASVs:', table.shape[0])
    return table

#from metadata file, limit OTU table and taxa table to those present in those samples
def from_metadata_to_taxareads(meta_data, otu_table, taxa_table):
    """Restrict the OTU and taxa tables to the samples listed in *meta_data*.

    Parameters
    ----------
    meta_data : pandas.DataFrame
        Sample metadata with sample names as the index.
    otu_table : pandas.DataFrame
        Read counts with OTU/ASV ids as the index and sample names as columns.
    taxa_table : pandas.DataFrame
        Taxonomy with OTU/ASV ids as the index.

    Returns
    -------
    (otu_lim, taxa_lim) : tuple of pandas.DataFrame
        ``otu_lim`` keeps only the samples present in both ``meta_data`` and
        ``otu_table`` (in ``meta_data`` order) and only the rows whose summed
        reads over those samples are greater than zero (missing values are
        skipped by the sum). ``taxa_lim`` holds the ``taxa_table`` rows for
        the ids remaining in ``otu_lim``.

    Notes
    -----
    Helper columns are removed by position and row totals are kept in a
    separate Series, so a sample literally named ``'Total'`` -- or a
    metadata/taxa column whose label equals an ASV id or sample name -- is no
    longer overwritten or dropped by accident.
    """
    #standard M6 output; sample_names as index; OTUs as index
    # The inner join keeps only samples found in both tables; metadata columns
    # come first in the result, so they can be sliced off by position.
    n_meta_cols = meta_data.shape[1]
    merged = pd.concat([meta_data, otu_table.T], join='inner', axis=1)
    counts = merged.iloc[:, n_meta_cols:].T

    # Drop OTUs with no reads in the retained samples.
    totals = counts.sum(axis=1)
    otu_lim = counts.loc[totals > 0].copy()

    # The inner join keeps only ids still present in otu_lim; the sample
    # columns come first and are sliced off by position.
    n_sample_cols = otu_lim.shape[1]
    merged = pd.concat([otu_lim, taxa_table], axis=1, join='inner')
    taxa_lim = merged.iloc[:, n_sample_cols:].copy()
    return otu_lim, taxa_lim

# MBARI

In [4]:
# Institute whose analysis is examined below; it is compared against the
# 'Analyzing_Institute' metadata column. NOTE: the variable name is spelled
# 'Analizing' (sic) and is referenced with that spelling later in the file.
Analizing_Institute = 'MBARI'

## Load Data

In [5]:
marker = '18S'

print('#####{}#####'.format(marker))

# otu table: ASV ids as the index, one column per sample
file = '{}_asv_merged.csv'.format(prefix)
print(directory + file)
df = make_taxa_asv(directory + file).set_index('ASV')
otu_all = df.copy()

# taxa table: discard the leftover 'Unnamed: 0' column, then index by ASV id
file = '{}_taxa_merged_updated.csv'.format(prefix)
#file = prefix +'_taxa_merged.csv'
print(directory + file)
df = (
    make_taxa_asv(directory + file)
    .drop(columns='Unnamed: 0')
    .set_index('ASV')
)
taxa_all = df.copy()

# metadata: already indexed by sample_name by make_metadata
file = '{}_meta_merged.csv'.format(prefix)
print(directory + file)
df = make_metadata(directory + file)
meta_all = df.copy()

# sequence table: indexed by ASV id
file = '{}_seq_merged.csv'.format(prefix)
print(directory + file)
df = make_taxa_asv(directory + file).set_index('ASV')
seq_all = df.copy()

seq_all.head()

#####18S#####
../data/GLOMICON_asv_merged.csv
Number ASVs: 13666
../data/GLOMICON_taxa_merged_updated.csv
Number ASVs: 13666
../data/GLOMICON_meta_merged.csv
Number samples: 134
../data/GLOMICON_seq_merged.csv
Number ASVs: 13666


Unnamed: 0_level_0,sequence,Analyzing_Institute
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1
ASV_1,GCACCTACCGATTGAATGGTCCGGTGAAGACTCGGGATTGTGGTCT...,MBARI
ASV_2,GCACCTACCGATTGAATGGTCCGGTGAGGCCTCGGGATCGTGGCGA...,MBARI
ASV_3,GCACCTACCGATTGAATGGTCCGGTGAAGCCTCGGGATTGTGGTTG...,MBARI
ASV_4,GCTCCTACCGATTGAGTGATCCGGTGAATAATTCGGACTGCAGCAG...,MBARI
ASV_5,GCACCTACCGATTGAATGGTCCGGTGAAGACTCGGGATTGTGGTCT...,NOAA


In [6]:
# List the taxonomy column names, then preview the first rows of the taxa table
print(list(taxa_all))
taxa_all.head()

['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']


Unnamed: 0_level_0,Kingdom,Phylum,Class,Order,Family,Genus,Species
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ASV_1,,Bacillariophyta,Coscinodiscophyceae,Thalassiosirales,Thalassiosiraceae,Thalassiosira,
ASV_10,,,,,,,uncultured marine eukaryote
ASV_100,,Bacillariophyta,Coscinodiscophyceae,Thalassiosirales,Thalassiosiraceae,,
ASV_1000,,,Bigyra,,,,
ASV_10000,,,Bigyra,,,,


In [7]:
# Preview the merged sample metadata (indexed by sample_name)
meta_all.head()

Unnamed: 0_level_0,Analyzing_Institute,Collecting_Institute
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Davenport_02_0008,AWI,MBARI
Davenport_06_0008,AWI,MBARI
Davenport_11_0008,AWI,MBARI
Davenport_15_0008,AWI,MBARI
Davenport_19_0008,AWI,MBARI


In [8]:
# Preview the ASV table (rows: ASV ids, columns: sample names)
otu_all.head()

Unnamed: 0_level_0,Davenport_02_0008,Davenport_06_0008,Davenport_11_0008,Davenport_15_0008,Davenport_19_0008,Framstrait_01_0008,Framstrait_05_0008,Framstrait_09_0008,Framstrait_13_0008,Framstrait_17_0008,...,GLOMICON_NOC_10,GLOMICON_NOC_16,GLOMICON_NOC_22,GLOMICON_NOC_3,GLOMICON_NOC_32,GLOMICON_Roscoff_11,GLOMICON_Roscoff_15,GLOMICON_Roscoff_19,GLOMICON_Roscoff_3,GLOMICON_Roscoff_7
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ASV_1,,,,,,,,,,,...,,,,,,,,,,
ASV_2,,,,,,,,,,,...,,,,,,,,,,
ASV_3,,,,,,,,,,,...,,,,,,,,,,
ASV_4,,,,,,,,,,,...,,,,,,,,,,
ASV_5,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Per-ASV read totals for every ASV annotated as genus Calanus, summed over
# each (Analyzing_Institute, Collecting_Institute) combination.
df = (
    pd.concat(
        # Species label of each Calanus ASV alongside its per-sample reads
        [taxa_all.loc[taxa_all['Genus'] == 'Calanus', ['Species']], otu_all],
        axis=1,
        join='inner',
    )
    .reset_index()
    .set_index(['ASV', 'Species'])
    .T  # samples become rows so the metadata can be joined on sample name
)
df = (
    pd.concat([df, meta_all], join='inner', axis=1)
    .groupby(['Analyzing_Institute', 'Collecting_Institute'])
    .sum()
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,"(ASV_10412, Calanus helgolandicus)","(ASV_10851, Calanus helgolandicus)","(ASV_10929, Calanus helgolandicus)","(ASV_11113, Calanus helgolandicus)","(ASV_11528, nan)","(ASV_12268, Calanus finmarchicus)","(ASV_12395, Calanus helgolandicus)","(ASV_12563, nan)","(ASV_12932, Calanus helgolandicus)","(ASV_13394, Calanus helgolandicus)","(ASV_9804, Calanus helgolandicus)"
Analyzing_Institute,Collecting_Institute,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AWI,AWI,237.0,68070.0,140.0,225.0,0.0,0.0,6.0,0.0,0.0,12.0,2.0
AWI,BLOOMMOCK,0.0,42.0,10.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
AWI,EVENMOCK,0.0,105.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AWI,MBARI,2.0,267.0,49892.0,0.0,9.0,0.0,0.0,4.0,49.0,5.0,159.0
AWI,NOAA,0.0,2.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AWI,NOC,0.0,1.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AWI,UDalhousie,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MBARI,BLOOMMOCK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MBARI,EVENMOCK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MBARI,MBARI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Limit data by metadata parameters

- Run each Analyzing Institute separately

In [9]:
# Keep only the samples analysed by the institute selected in Analizing_Institute
df = meta_all.loc[meta_all['Analyzing_Institute'] == Analizing_Institute].copy()
#print(df['depth'].max())
meta_lim = df.copy()
df.head()

Unnamed: 0_level_0,Analyzing_Institute,Collecting_Institute
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Davenport_02_0008,AWI,MBARI
Davenport_06_0008,AWI,MBARI
Davenport_11_0008,AWI,MBARI
Davenport_15_0008,AWI,MBARI
Davenport_19_0008,AWI,MBARI


In [10]:
# Restrict the ASV and taxa tables to the samples kept in meta_lim
otu_lim, taxa_lim = from_metadata_to_taxareads(meta_lim, otu_all, taxa_all)