# Merge individual datasets

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import glob

In [6]:
#From fasta file create pandas df of ASV and sequence
def from_fasta_to_df(file):
    print(file)
    with open(file) as f:
        Ids=[]
        seqs =[]
        for strline in f:
            if strline[0]=='>':
                Ids.append(strline[1:].strip())
            else:
                seqs.append(strline.strip())
    print('Number of Ids:',len(Ids))
    print('Number of Seqs:',len(seqs))
    seq_dict = dict(zip(Ids, seqs))
    #make pandas df
    df= pd.DataFrame.from_dict(seq_dict,orient='index', columns=['sequence'])
    return df

## Import Original Data Files

### NOAA

In [2]:
# NOAA
file = '../../NOAA/NOAA_GLOMICON_results/asv_taxa_sample_table.tsv'
df = pd.read_csv(file, sep='\t')
df.set_index('sequence', inplace=True)
# expand taxonomy column ; these levels may not be correct for each ASV because some levels may be missing sporadically - will fix later
tax_cols = ['Kingdom','Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
for i in range(len(tax_cols)):
    df[tax_cols[i]] = df['taxonomy'].str.split(';').str[i]

# for now limit to just sequence, taxonomy columns, and sample data
df = df.drop(['featureid', 'Confidence'], axis=1)

NOAA = df.copy()
NOAA.head()

Unnamed: 0_level_0,taxonomy,GLOMICON_AWI_12,GLOMICON_AWI_16,GLOMICON_AWI_20,GLOMICON_AWI_4,GLOMICON_AWI_8,GLOMICON_BloomMock_1,GLOMICON_BloomMock_10,GLOMICON_BloomMock_30,GLOMICON_BloomMock_4,...,GLOMICON_Roscoff_3,GLOMICON_Roscoff_7,Kingdom,Domain,Phylum,Class,Order,Family,Genus,Species
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCACCTACCGATTGAATGGTCCGGTGAAGACTCGGGATTGTGGTCTGGCTCCTTCATTGGGGCCAGACCGTGAGAACTTGTCTGAACCTTATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC,Eukaryota;TSAR;Stramenopiles;Gyrista;Mediophyc...,0,0,0,0,0,19548,20058,20015,7929,...,0,0,Eukaryota,TSAR,Stramenopiles,Gyrista,Mediophyceae,Thalassiosirales,Thalassiosiraceae,Conticribra
GCACCTACCGATTGAATGGTCCGGTGAGGCCTCGGGATCGTGGCGAACTTTCTTCATTGGAGGTGAGCTGTGAGAACTTGTCCAAATCTTATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC,Eukaryota;TSAR;Stramenopiles;Gyrista;Coscinodi...,29,5,16,12,13,17878,18175,22740,7652,...,0,0,Eukaryota,TSAR,Stramenopiles,Gyrista,Coscinodiscophyceae,Melosirales,Melosiraceae,Melosira
GCACCTACCGATTGAATGGTCCGGTGAAGCCTCGGGATTGTGGTTGGTTTCCTTTATTGGAATCTGACCACGAGAACCTGTCTAAACCTTATCATTTAGAGGAAGGTGAAGTCGTAACAAGGTTTCC,Eukaryota;TSAR;Stramenopiles;Gyrista;Bacillari...,28,36,46,26,16,4273,3661,3187,1778,...,0,0,Eukaryota,TSAR,Stramenopiles,Gyrista,Bacillariophyceae,Bacillariales,Bacillariaceae,Pseudo-nitzschia
GCTCCTACCGATTGAGTGATCCGGTGAATAATTCGGACTGCAGCAGTGTTCGGTTCCTGAACGTTGCAGCGGAAAGTTTAGTGAACCTTATCACTTAGAGGAAGGAGAAGTCGTAACAAGGTTTCC,Eukaryota;TSAR;Alveolata;Dinoflagellata;Dinoph...,10951,11244,15241,15572,8666,0,0,0,0,...,378,313,Eukaryota,TSAR,Alveolata,Dinoflagellata,Dinophyceae,,,
AAACCATCTTAGTTGGGGGTGGGTGAGGCTGCGCTTTATGGCGTATTCGAGCCTGCCTTCGACAAGGAGGGTTAAGTCGTAACAAGGTATCTGT,Eukaryota,176,157,207,232,70,16,0,0,0,...,5153,6053,Eukaryota,,,,,,,


### AWI

In [3]:
# AWI
# PR2
file  = '../../AWI/Glomicon-AWI-310124/GLOMICON-INTERCOMP_R-4.3.2_seqtab.merged.nochim.PR2-500.csv'
df = pd.read_csv(file)
df = df.rename(columns={'Unnamed: 0':'sequence'})
df.set_index('sequence', inplace=True)
print(list(df))
df = df.rename(columns={'Division':'Phylum'})
# remove some samples which had lab errors:
errors = ['evenMock_A', 'evenMock_B','evenMock_C','evenMock_D','evenMock_E','evenMock_F','evenMock_G','evenMock_H']
df = df.T
df = df.loc[df.index.str.contains('|'.join(errors))==False]
df = df.T
print(list(df))
AWI = df.copy()
df.head()

['Davenport_02_0008', 'Davenport_06_0008', 'Davenport_11_0008', 'Davenport_15_0008', 'Davenport_19_0008', 'Framstrait_01_0008', 'Framstrait_05_0008', 'Framstrait_09_0008', 'Framstrait_13_0008', 'Framstrait_17_0008', 'Bedford_01_0049', 'Bedford_07_0049', 'Bedford_13_0049', 'Bedford_19_0049', 'Bedford_25_0049', 'blank_01_0049', 'bloomMock_01_0049', 'bloomMock_02_0049', 'bloomMock_03_0049', 'bloomMock_04_0049', 'bloomMock_05_0049', 'Davenport_02_0049', 'Davenport_06_0049', 'Davenport_11_0049', 'Davenport_15_0049', 'Davenport_19_0049', 'evenMock_01_0049', 'evenMock_02_0049', 'evenMock_03_0049', 'evenMock_04_0049', 'evenMock_05_0049', 'evenMock_A_0049', 'evenMock_B_0049', 'evenMock_C_0049', 'evenMock_D_0049', 'evenMock_E_0049', 'evenMock_F_0049', 'evenMock_G_0049', 'evenMock_H_0049', 'Framstrait_01_0049', 'Framstrait_05_0049', 'Framstrait_09_0049', 'Framstrait_13_0049', 'Framstrait_17_0049', 'LaJolla_03_0049', 'LaJolla_09_0049', 'LaJolla_15_0049', 'LaJolla_21_0049', 'LaJolla_27_0049', 'Plym

Unnamed: 0_level_0,Davenport_02_0008,Davenport_06_0008,Davenport_11_0008,Davenport_15_0008,Davenport_19_0008,Framstrait_01_0008,Framstrait_05_0008,Framstrait_09_0008,Framstrait_13_0008,Framstrait_17_0008,...,Plymouth_30_0049,Domain,Kingdom,Supergroup,Phylum,Class,Order,Family,Genus,Species
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TAGCGTATATTTAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGGATTTCGGTTGAGAACGGCCGGTCCGCCGTTTGGTGTGCACTGGCTGGTCTCAACTTCCTGTAGAGGACGCGCTCTGGGTTAACGCTCGGACGCGGAGTCTACGTGGTTACTTTGAAAAAATTAGAGTGTTCAAAGCGGGCTTACGCTTGAATATTTCAGCATGGAATAACACTATAGGACTCCTGTCCTATTTCGTTGGTCTCGGGACGGGAGTAATGATTAAGAGGAACAGTTGGGGGCATTCGTATTTCATTGTCAGAGGTGAAATTCTTGGATTTATGAAAGACGAACTTCTGCGAAAGCATTTGCCAAGGATGTTTTCATTA,0,0,5,7,16,8611,7354,2066,4845,4061,...,7,Eukaryota,Archaeplastida,Chlorophyta,Chlorophyta_X,Mamiellophyceae,Mamiellales,Mamiellaceae,Micromonas,Micromonas_polaris
TAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTCGGATTTCGGGTCGGGCCGAGCGGTCTGCCGATGGGTATGCACTGTTTGGCGCGGCCTTCTTTCCGGAGACCGCGGCTACTCTTAACTGAGCGGGCGTGGGAGACGGATCGTTTACTTTGAAAAAATCAGAGTGTTTCTAGCAGGCAGCTCGCTCTTGCATAGGTTAGCATGGGATAATTTAATAGGACTCTGGTGCTATTTTGTTGGTTTCGAACACCGGAGTAATGATTAAAAGGGGCAGTCAGGGGCACTCGTATTCCGTCGAGAGAGGTGAAATTCTCAGACCAATGGAAGACGAACCACTGCGAAAGCATTTGCCAGGGATGTTTTCACTG,9,4,5,12,13,1277,1243,182,327,397,...,64,Eukaryota,Haptista,Haptophyta,Haptophyta_X,Prymnesiophyceae,Phaeocystales,Phaeocystaceae,Phaeocystis,Phaeocystis_pouchetii
TAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGGATTTGTGGTGCGACAGACCGGTCCGACCTTTGGTGGGTACTCGGTGTTGTTGCGCCATCCTTGAGAGGTACGTTCTGGCATTAAGTTGTCGGGGCGGTGTCCGCTCATCGTTTACTGTGAGAAAATTAGAGTGTTCAAAGCAGGCTTATGCCGTTGAATATGCTAGCATGGAATAATAAGATAGGACCTCGGTACTATTTTGTTGGTTTGAGAACCAAGGTAATGATCAATAGGGACAGTTGGGGGTATTCGTATTCAGTTGTCAGAGGTGAAATTCTTAGATTTACGGAAGACGAACTACTGCGAAAGCATTTACCAAGGATGTTTTCATTA,0,0,0,0,0,0,0,0,0,0,...,7,Eukaryota,TSAR,Stramenopiles,Gyrista,Mediophyceae,Chaetocerotales,Chaetocerotaceae,Chaetoceros,
TAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGAATTTTTGGTATAGGTGGCCGACCTTCCACTTTGTGGATTACGCGGTAGCCTTTGCCTTTTTTGAGTAGAGTTTGGGTGGCATTAATTTGTTGCTCTTACAATACTCATCGTTTACTGTGAAGAAATTAGAGTGTTCAAAGCAGGCGTTAGCTGTGAATACATTAGCATGGAATAATAGAATAGGACTTGGGACTATTTTGTTGGTTTACAGACCGAAGTAATGATTAATAGGGACAGTTGGGGGTATTCGTATTTCAGTGTCAGAGGTGAAATTCTTGGATTTCTGAAAGACGAACTACTGCGAAAGCATTTATCAAGGATGTTTTCATTA,0,0,0,0,0,0,0,0,0,0,...,3,Eukaryota,TSAR,Stramenopiles,Gyrista,Coscinodiscophyceae,Melosirales,Melosiraceae,Melosira,Melosira_arctica
TAGCGTATATTAAAGTTGTTGCGGTTAAAAAGCTCGTAGTTGGAGTTCTGCCAGGTGCCACCTGTCCGCCCCAGTGGTGAGTACGTGGTGCGCATTTGGCCCTTTCAAGGGGAGCGTATTTGCACTTAATTGTGTGGTGCGGGATCCTTGACTTTTACTTTGAGGAAATAAGAGTGTTCCAAGCAGGCTCTCGTCGTGCATAGCTCAGCATGGAATAATAGCATTGGACCTCGATTCTAAGCTGTTGGTTGCCAGAAGCGAGGTAATGATGAAGAGGGATAGTTGGGGGCATTCGTATTTAACTGTCAGAGGTGAAATTCTTGGATTTGTTAAAGACGGACTACTGCGAAAGCATCTGCCATGGATGTTTTCATTG,5594,5747,7195,11881,14624,2,0,0,0,1,...,1168,Eukaryota,TSAR,Alveolata,Dinoflagellata,Syndiniales,Dino-Group-I,Dino-Group-I-Clade-1,Dino-Group-I-Clade-1_X,Dino-Group-I-Clade-1_X_sp.


### SBR

In [59]:
# SBR
file  = '../../SBR/SBR_original_files/outputs/asv_table/18s_dada2_v1.0.filtered.table.with.taxo.vsearch_BH.tsv'

df = pd.read_csv(file, sep='\t')
# pare down to otu table, taxonomy table
print(list(df))
df = df.drop('amplicon', axis=1)
df.set_index('sequence', inplace=True)
#df = df[['taxonomy']]
tax_cols = ['Kingdom','Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
for i in range(len(tax_cols)):
    df[tax_cols[i]] = df['taxonomy'].str.split(',').str[i]
    df[tax_cols[i]] = df[tax_cols[i]].str.replace('[dpcofgs]:','',regex=True)
    df[tax_cols[i]] = df[tax_cols[i]].str.strip()
#df = df.drop('taxonomy', axis=1)
df = df.drop(['total', 'spread', 'identity'], axis=1)
#df = df.str.replace(r'[d,p,c,o,f,g,s]:','')
#Eukaryota,d:Hacrobia,p:Haptophyta,c:Prymnesiophyceae,o:Phaeocystales,f:Phaeocystaceae,g:Phaeocystis,s:Phaeocystis_pouchetii
SBR = df.copy()
df.head()

['amplicon', 'sequence', 'total', 'spread', 'cj-BMk13', 'cj-BMk15', 'cj-BMk17', 'cj-BMk21', 'cj-BMk25', 'cj-DAL03', 'cj-DAL09', 'cj-DAL15', 'cj-DAL21', 'cj-DAL27', 'cj-EMk03', 'cj-EMk08', 'cj-EMk09', 'cj-EMk15', 'cj-EMk17', 'cj-MBA03', 'cj-MBA07', 'cj-MBA12', 'cj-MBA16', 'cj-MBA20', 'cj-NOAA07', 'cj-NOAA13', 'cj-NOAA18', 'cj-NOAA24', 'cj-NOAA30', 'cj-ROS01', 'cj-ROS05', 'cj-ROS09', 'cj-ROS13', 'cj-ROS17', 'cj-SOC05', 'cj-SOC12', 'cj-SOC18', 'cj-SOC24', 'cj-SOC28', 'identity', 'taxonomy']


Unnamed: 0_level_0,cj-BMk13,cj-BMk15,cj-BMk17,cj-BMk21,cj-BMk25,cj-DAL03,cj-DAL09,cj-DAL15,cj-DAL21,cj-DAL27,...,cj-SOC28,taxonomy,Kingdom,Domain,Phylum,Class,Order,Family,Genus,Species
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AGCTCCAATAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTCGGATTTCGGGTCGGGCCGAGCGGTCTGCCGATGGGTATGCACTGTTTGGCGCGGCCTTCTTTCCGGAGACCGCGGCTACTCTTAACTGAGCGGGCGTGGGAGACGGATCGTTTACTTTGAAAAAATCAGAGTGTTTCTAGCAGGCAGCTCGCTCTTGCATAGGTTAGCATGGGATAATTTAATAGGACTCTGGTGCTATTTTGTTGGTTTCGAACACCGGAGTAATGATTAAAAGGGGCAGTCAGGGGCACTCGTATTCCGTCGAGAGAGGTGAAATTCTCAGACCAATGGAAGACGAACCACTGCGAAAGCATTTGCCAGGGATGTTTTCA,11080,11583,11988,10976,7834,188,104,61,129,152,...,37,"Eukaryota,d:Hacrobia,p:Haptophyta,c:Prymnesiop...",Eukaryota,Hacrobia,Haptophyta,Prymnesiophyceae,Phaeocystales,Phaeocystaceae,Phaeocystis,Phaeocystis_pouchetii
AGCTCCAATAGCGTATATTAAAGTTGTTGCGGTTAAAAAGCTCGTAGTTGGATTTCTGCCGAGGACGACCGGTCCGCCCTCTGGGTGAGTATCTGGCTCGGCCTGGGCATCTTCTTGGAGAACGTAGCTGCACTTGACTGTGTGGTGCGGTATCCAGGACTTTTACTTTGAGGAAATTAGAGTGTTTCAAGCAGGCTTACGCCTTGAATACATTAGCATGGAATAATAAGATAGGACCTCGGTTCTATTTTGTTGGTTTCTAGAGCTGAGGTAATGATTAATAGGGATAGTTGGGGGCATTCGTATTTAACTGTCAGAGGTGAAATTCTTGGATTTGTTAAAGACGGACTACTGCGAAAGCATTTGCCAAGGATGTTTTCA,5935,6643,6312,6181,4398,26,20,41,24,36,...,4,"Eukaryota,d:Alveolata,p:Dinoflagellata,c:Dinop...",Eukaryota,Alveolata,Dinoflagellata,Dinophyceae,Prorocentrales,Prorocentraceae,Prorocentrum,Prorocentrum_micans
AGCTCCAATAGCGTATATTAAAGTTGTTGTGGTTAAAAAGCTCGTAGTTGGATTTCGGCGGGCATAGGTCGGTTTGAATCGCTTCAACACTGACTTTTTTGCCCGTATGTTTTGCCAGAATCCAGCGGGTGATCTTTACCGATTGTCCGTTGGGGCTGGTAGGTTTACTTTGAAAAAATTAGAGTGCTCAAAGCAAGCTTGATTGCTTGAATATTCGTGCATGGAATAATAGAATAGGAAGTCGTTTCTATTTTGTTGGTTTTCGGAGATTGACTTAATGATTAATAGGGATAGCCGGGGGCATTTGTATTCAAACGACAGAGGTGAAATTCTTGGACCGTTTGAAGACAAACTACTGCGAAAGCATTTGCCAAGAATGTTTTCA,0,0,0,0,0,0,0,0,0,0,...,0,"Eukaryota,d:Opisthokonta,p:Metazoa,c:Arthropod...",Eukaryota,Opisthokonta,Metazoa,Arthropoda,Crustacea,Maxillopoda,Anomalocera,Anomalocera_patersoni
AGCTCCAATAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGGATCCTAGACTTTCCGGGCGAGCGGTCCGCCTCGCTGGCGGACGTCGGCGTCGCCCGGGTCTGAGTTGCTGGGGAGGCCGGTGGTGCCCTTTACCGGGTGCCGTCGGCTGCCCGCAAGTTTTACTTTGAAAAAATCAGAGTGCTCAAAGCAGGCCTTTCATGCCTGAATGCAATTTCATGGAATAAGCGAACAAGACCTCCGTTCCGTTGGGTTCCGGTTCTGGAGGTAATGATCAAGAGGGCCTGACGGGGGCATACGTATTGCAGGGCGAGAGGTGAAATTCAGTGACCCTTGCAAGACGAACTAAAGCGAAAGCATTTGCCAAGAATGGTTTCC,0,0,0,0,0,0,0,0,0,0,...,0,"Eukaryota,d:Opisthokonta,p:Metazoa,c:Annelida,...",Eukaryota,Opisthokonta,Metazoa,Annelida,Annelida_X,Annelida_XX,Capitella,Capitella_teleta
AGCTCCAATAGCGTATATTAAAGTTGTTGCAGTTAAAACGCTCGTAGTCGGATTTCGGGGCGGGGCCGCCGGTCTGCCGATGGGTATGCACTGGCGGGCGCGTCCTTCCTTCCGGAGACTGGCCCTACTCTTAACTGAGCGGGGTCGGGAATCGGATCGTTTACTTTGAAAAAATCAGAGTGTTTCAAGCAGGCAGCTCGCTCTTGCATGGATTAGCATGGGATAATGAAATAGGACTTTGGTGCTATTTTGTTGGTTTCGAACACCGAAGTAATGATTAACAGGGACAGTCAGGGGCACTCGTATTCCGCAGAGAGAGGTGAAATTCTCAGACCCGCGGAAGACGAACCACTGCGAAAGCATTTGCCAGGGATGTTTTCA,426,422,396,412,292,5,7,9,3,4,...,4,"Eukaryota,d:Hacrobia,p:Haptophyta,c:Prymnesiop...",Eukaryota,Hacrobia,Haptophyta,Prymnesiophyceae,Prymnesiales,Prymnesiaceae,Prymnesium,Prymnesium_nemamethecum


### MBARI

In [60]:
# MBARI
directory = '../../MBARI/Analysis_In_Progress/data/filtered_seq_data/'
#GLOMICON_18S_otu_Filtered.csv
# otu table
file = 'GLOMICON_18S_otu_Filtered.csv'
print(directory+file)
df = pd.read_csv(directory + file)
df = df.rename(columns={'Unnamed: 0':'ASV'})
df.set_index('ASV', inplace=True)
otu_mbari = df.copy()
print('Number ASVs:', len(df.index))

# taxa table
file = 'GLOMICON_18S_taxa_Filtered.csv'
df = pd.read_csv(directory+file)
df = df.rename(columns={'Unnamed: 0':'ASV'})
df.set_index('ASV', inplace=True)
taxa_mbari = df.copy()
print('Number ASVs:', len(df.index))

# metadata
file = 'GLOMICON_18S_meta_Filtered.csv'
df = pd.read_csv(directory+file)
df.set_index('sample_name', inplace=True)
meta_all = df.copy()
print('Number samples:', len(df.index))

# sequence table
file = 'GLOMICON_18S_seq_Filtered.csv'
df = pd.read_csv(directory +file)
df = df.rename(columns={'Unnamed: 0':'ASV'})
df.set_index('ASV', inplace=True)
seq_mbari = df.copy()
print('Number ASVs:', len(df.index))
df.head()


# Join to get otu table + taxa table, indexed by sequence:
df = pd.concat([seq_mbari, taxa_mbari, otu_mbari], axis=1)
df.set_index('sequence', inplace=True)
MBARI = df.copy()
df

../../MBARI/Analysis_In_Progress/data/filtered_seq_data/GLOMICON_18S_otu_Filtered.csv
Number ASVs: 3701
Number ASVs: 3701
Number samples: 28
Number ASVs: 3701


Unnamed: 0_level_0,Kingdom,Phylum,Class,Order,Family,Genus,Species,CN18Sc37_12_Rep_Stdy5_AO,CN18Sc37_12_Rep_Stdy10_AO,CN18Sc37_12_Rep_Stdy14_AO,...,UDalhousie2_AO,UDalhousie8_AO,UDalhousie14_AO,UDalhousie20_AO,UDalhousie26_AO,AWIMOCKEVEN1_AO,AWIMOCKEVEN2_AO,AWIMOCKEVEN3_AO,AWIMOCKEVEN4_AO,AWIMOCKEVEN5_AO
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCTACTACCGATTGAACATTTTAGTGAGGTCCTCGGACTGTGAGCCAGGCGGGTCGCCCTGCCTGGTCTACGGGAAGACGACCAAACTGTAGTGTTTAGAGGAAGTAAAAGTCGTAACAAGGTTTCC,Eukaryota,Arthropoda,Hexanauplia,Calanoida,Paracalanidae,Paracalanus,unassigned,368,1985,3774,...,0,2,4,7,0,0,0,1,1,2
GCTACTACCGATTGAACGTTTTAGTGAGGTCCTCGGACTGTTTGCCTGGCGGATTACTCTGCCTGGCTGGCGGGAAGACGACCAAACTGTAGCGTTTAGAGGAAGTAAAAGTCGTAACAAGGTTTCC,Eukaryota,Arthropoda,Hexanauplia,Calanoida,Calanidae,unassigned,unassigned,226,16579,9178,...,0,2,10,3,7,60,55,85,103,84
GCTCCTACCGATTGAGTGATCCGGTGAATAATTCGGACTGCAGCAGTGTTCAGTTCCTGAACGTTGCAGTGGAAAGTTTAGTGAACCTTATCACTTAGAGGAAGGAGAAGTCGTAACAAGGTTTCC,Eukaryota,unknown,Dinophyceae,Thoracosphaerales,Thoracosphaeraceae,Ensiculifera,Ensiculifera imariensis,22534,25588,20551,...,2085,599,1699,2218,926,23,21,10,11,22
GCTCCTACCGATTGGATGATTCGGTAAGCTCTTGGGATTGATTGACGACCTGCATGTCAGACGGATGTTGACAACTTGATCAAACCTAATCATCTAGAGGAAGGAGAAGTCGTAACAAGGTTTCC,Eukaryota,unknown,unknown,unknown,unknown,unknown,Acantharian sp. 6201,116,97,90,...,190,196,156,141,142,264,199,163,204,262
GCTCCTACCGATTGAGTGATCCGGTGAATAATTCGGACTGCAGCAGTGTTCAGTTCCTGAACGTTGCAGCGGAAAGTTTAGTGAACCTTATCACTTAGAGGAAGGAGAAGTCGTAACAAGGTTTCC,Eukaryota,unknown,Dinophyceae,unassigned,unassigned,unassigned,unassigned,374,440,407,...,663,1219,1109,750,900,1745,1832,1696,1793,1939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCTACCATCGGTGGACGGCCTAGCGAGGTCTTTCGGACCGGAACGGGGGCGGCTCGCGCCGTCCCTACCGGGAAGAGGCCCCAACTCGGTCGTTCGGAGATAGTAAAAGTCGTAACAAGGTTTCC,Eukaryota,Mollusca,Gastropoda,Nudibranchia,Dotidae,Doto,unassigned,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCTACTACCGATTGAATGACTTAGTGATTACAGGGGATGACGCCTTGCGGCTGGCGACAGCTGCTTGGCATCCAAACCTGGGCAAACTTGGTCATTTAGAGGAAGTAAAAGTCGTAACAAGGTTTCC,no_hit,no_hit,no_hit,no_hit,no_hit,g_,s_,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCTACTACCGATGGAACGATTTAGTGAGTTGCACGGACCAGTGCTCTCGATTGGTTTCCAGTCTTGAGCACTAGGGAAGTGACGCAAACTTGACCGTTTAGAGGAAGTAAAAGTCGTAACAAGGTTTCC,no_hit,no_hit,no_hit,no_hit,no_hit,g_,s_,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCATCTATTGATTAGGTTGTATCATGAGTTAGGCGGAGGTTGCATGGTCGAAGCAATTCTTTCATGCTCCCAAAGCTTTGCAAATGGTGCTACCCAGAAACAGATAAAGTCATAACACGGCAGCT,Eukaryota,unassigned,unassigned,unassigned,unassigned,g_,s_,0,0,0,...,0,0,2,0,0,0,0,0,0,0


### UDalhousie

In [61]:
# UDalhousie
directory = '../../UDalhousie/UDalhousie_GLOMICON_data/'
# otu table, taxa table by ASV hash
file = 'GLOMICON_UDAL_f20_feature-table_w_tax.txt'
print(directory+file)
df1 = pd.read_csv(directory + file, sep='\t', skiprows=1)
df1.set_index('#OTU ID', inplace=True)

# fasta file
file = 'GLOMICON_UDAL_f20_dna-sequences.fasta'
print(directory+file)
df2 = from_fasta_to_df(directory+file)

# join together
df = pd.concat([df1,df2], axis=1)
df = df.set_index('sequence')

# split out taxonomy
tax_cols = ['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
for i in range(len(tax_cols)):
    df[tax_cols[i]] = df['taxonomy'].str.split(';').str[i]
    df[tax_cols[i]] = df[tax_cols[i]].str.replace('[dpcofgs]__','',regex=True)
    df[tax_cols[i]] = df[tax_cols[i]].str.strip()

#df = df.drop('taxonomy', axis=1)

Udalhousie = df.copy()
Udalhousie.head()

../../UDalhousie/UDalhousie_GLOMICON_data/GLOMICON_UDAL_f20_feature-table_w_tax.txt
../../UDalhousie/UDalhousie_GLOMICON_data/GLOMICON_UDAL_f20_dna-sequences.fasta
../../UDalhousie/UDalhousie_GLOMICON_data/GLOMICON_UDAL_f20_dna-sequences.fasta
Number of Ids: 881
Number of Seqs: 881


Unnamed: 0_level_0,G8r-AWI19,G19r-AWI7,G12r-NOC23,E-G2-NOAA29,G22r-DAL24,E-G16-NOC11,E-G27-eMock2-2,G4r-ROS16,G7r-NOC20,E-G14-ROS20,...,E-G1-AWI11,E-G33-bMock9,taxonomy,Domain,Phylum,Class,Order,Family,Genus,Species
sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CAATAGCGTATATTAAAGTTGTTGCGGTTAAAAAGCTCGTAGTTGGATTTCTGCTGAGGACGACCGGTCCGCCCTCCGGGTGAGCATCTGGTTCGGCCTTGGCATCTTCTTGGTGAACGTATCTGCACTTGACTGTGTGGTGCGGTACCCAGGACTTTTACTTTGAGGAAATTAGAGTGTTTCAAGCAGGCATACGCCTTGAATACATTAGCATGGAATAATAAGATAGGACCTCGGTTCTATTTTGTTGGTTTCTAGAGCTGAGGTAATGATTAATAGGGATAGTTGGGGGCATTCGTATTTAACTGTCAGAGGTGAAATTCTTGGATTTGTTAAAGACGGACTACTGC,2541.0,2061.0,283.0,0.0,0.0,213.0,0.0,32.0,359.0,34.0,...,2043.0,0.0,d__Eukaryota; p__Dinoflagellata; c__Dinophycea...,Eukaryota,Dinoflagellata,Dinophyceae,Gymnodiniphycidae,Gymnodiniphycidae,Gyrodinium,
CAATAGCGTATATTAAAGTTGTTGCGGTTAAAAAGCTCGTAGTTGGATTTCTGCTGAGGACGACCGGTCCGCCCTCTGGGTGAGTATCTGGCTTGGCCTTGGCATCTTCTTGGAGAACGTAGCTGCACTTGACTGTGTGGTGCGGTATCCAGGACTTTTACTTTGAGGAAATTAGAGTGTTTCAAGCAGGCACACGCCTTGAATACATTAGCATGGAATAATAAGATAGGACCTTGGTTCTATTTTGTTGGTTTCTAGAGCTGAGGTAATGATTAATAGGGATAGTTGGGGGCATTCGTATTTAACTGTCAGAGGTGAAATTCTTGGATTTGTTAAAGACGGACTACTGC,1555.0,2450.0,19.0,33.0,558.0,36.0,0.0,0.0,54.0,0.0,...,2309.0,0.0,d__Eukaryota; p__Dinoflagellata; c__Dinophyceae,Eukaryota,Dinoflagellata,Dinophyceae,,,,
CAATAGCGTATATTAAAGTTGTTGCGGTTAAAAAGCTCGTAGTTGGATTTCTGTTGAGGACGACCGGTCCGCCCTCTGGGTGAGTATCTGGCTCGGCCTTGGCATCTTCTTGGAGAACGTAACTGCACTTGACTGTGTGGTGCGGTATCCAGGACTTTTACTTTGAGGAAATTAGAGTGTTTCAAGCAGGCGTACGCCTTGAATACATTAGCATGGAATAATGAGATAGGACCTTGGTTCTATTTTGTTGGTTTCTAGAGCTGAGGTAATGATTAATAGGGATAGTTGGGGGCATTCGTATTTAATTGTCAGAGGTGAAATTCTTGGATTTATTAAAGACGGACTACTGC,823.0,1107.0,31.0,0.0,0.0,12.0,0.0,0.0,12.0,0.0,...,840.0,0.0,d__Eukaryota; p__Dinoflagellata; c__Dinophycea...,Eukaryota,Dinoflagellata,Dinophyceae,Gymnodiniphycidae,Gymnodinium_clade,Gymnodinium,uncultured_eukaryote
CAATAGCGTATATTTAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGGATTTCGGTTGAGAACGGCCGGTCCGCCGTTTGGTGTGCACTGGCTGGTCTCAACTTCCTGTAGAGGACGCGCTCTGGGTTAACGCTCGGACGCGGAGTCTACGTGGTTACTTTGAAAAAATTAGAGTGTTCAAAGCGGGCTTACGCTTGAATATTTCAGCATGGAATAACACTATAGGACTCCTGTCCTATTTCGTTGGTCTCGGGACGGGAGTAATGATTAAGAGGAACAGTTGGGGGCATTCGTATTTCATTGTCAGAGGTGAAATTCTTGGATTTATGAAAGACGAACTTCTGCGAAAGCA,729.0,1494.0,0.0,0.0,0.0,0.0,408.0,0.0,0.0,0.0,...,469.0,1438.0,d__Eukaryota; p__Chlorophyta; c__Mamiellophyce...,Eukaryota,Chlorophyta,Mamiellophyceae,Mamiellales,Mamiellales,Micromonas,uncultured_eukaryote
CAATAGCGTATATTAAAGTTGTTGCGGTTAAAAAGCTCGTAGTTGGATTTCTGCTGAGGACGACCGGTCCGCCCTCTGGGTGAGCATCTGGTTCGGCCTTGGCATCTTCTTGGTGAACGTATCTGCACTTGACTGTGTGGTGCGGTGCCCAGGACTTTTACTTTGAGGAAATTAGAGTGTTTCAAGCAGGCATACGCCTTGAATACATTAGCATGGAATAATAAGATAGGACCTCGGTTCTATTTTGTTGGTTTCTAGAGCTGAGGTAATGATTAATAGGGATAGTTGGGGGCATTCGTATTTAACTGTCAGAGGTGAAATTCTTGGATTTGTTAAAGACGGACTACTGC,530.0,565.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,690.0,0.0,d__Eukaryota; p__Dinoflagellata; c__Dinophycea...,Eukaryota,Dinoflagellata,Dinophyceae,Gymnodiniphycidae,,,


## Join data files together

- create merged otu table, taxa table, seq table, and metadata table
- will leave all taxonomy columns in table for now
- create simplified 'ASV' ID based on read abundance, ASV_1 = highest number of reads

### ASV Table, taxonomy table, sequence table

In [62]:
df = pd.concat([AWI, SBR, MBARI, NOAA, Udalhousie], axis=0, keys=['AWI', 'SBR', 'MBARI', 'NOAA', 'UDAL'])

# taxonomy column names across partners
tax_cols = ['Kingdom','Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'taxonomy','Supergroup']

print(list(df))

df['total'] = df.sum(axis=1, numeric_only=True)
df= df.sort_values('total', ascending=False)
df = df.reset_index()
df['ASV'] = df.index + 1
df['ASV'] = 'ASV_' + df['ASV'].astype(str)

df = df.rename(columns={'level_0':'Analyzing_Institute'})
df.set_index('ASV', inplace=True)

# sequence table:
seq_all = df[['sequence', 'Analyzing_Institute']].copy()
# taxonomy table:
taxa_all = df[tax_cols].copy()
# simplify taxonomy table
for i in ['no_hit', 'g_', 's_']:
    taxa_all = taxa_all.replace(i, 'unassigned')
taxa_all = taxa_all.fillna('not_provided')

df = df.drop(tax_cols, axis=1)
df = df.drop(['sequence', 'Analyzing_Institute', 'total'], axis=1)


# asv_table
asv_all = df.copy()

['Davenport_02_0008', 'Davenport_06_0008', 'Davenport_11_0008', 'Davenport_15_0008', 'Davenport_19_0008', 'Framstrait_01_0008', 'Framstrait_05_0008', 'Framstrait_09_0008', 'Framstrait_13_0008', 'Framstrait_17_0008', 'Bedford_01_0049', 'Bedford_07_0049', 'Bedford_13_0049', 'Bedford_19_0049', 'Bedford_25_0049', 'blank_01_0049', 'bloomMock_01_0049', 'bloomMock_02_0049', 'bloomMock_03_0049', 'bloomMock_04_0049', 'bloomMock_05_0049', 'Davenport_02_0049', 'Davenport_06_0049', 'Davenport_11_0049', 'Davenport_15_0049', 'Davenport_19_0049', 'evenMock_01_0049', 'evenMock_02_0049', 'evenMock_03_0049', 'evenMock_04_0049', 'evenMock_05_0049', 'Framstrait_01_0049', 'Framstrait_05_0049', 'Framstrait_09_0049', 'Framstrait_13_0049', 'Framstrait_17_0049', 'LaJolla_03_0049', 'LaJolla_09_0049', 'LaJolla_15_0049', 'LaJolla_21_0049', 'LaJolla_27_0049', 'Plymouth_06_0049', 'Plymouth_13_0049', 'Plymouth_19_0049', 'Plymouth_29_0049', 'Plymouth_30_0049', 'Domain', 'Kingdom', 'Supergroup', 'Phylum', 'Class', 'Or

### Metadata table

In [63]:
#metadata table (just which samples belong to which partner)
# easier to just remerge tables
df = pd.concat([AWI, SBR, MBARI, NOAA, Udalhousie], axis=1, keys=['AWI', 'SBR', 'MBARI', 'NOAA', 'UDAL'])
# df = df.drop(tax_cols, axis=1)
df = df.T
df['count'] = 1
df = df[['count']]
df = df.reset_index()
df = df.rename(columns={'level_0':'Analyzing_Institute',
                       'level_1':'sample_name'})

# drop taxonomy columns
df = df.loc[df['sample_name'].isin(tax_cols)==False]

# pull out some quick labels:
df['Collecting_Institute'] = ''

df.loc[df['sample_name'].str.contains('Davenport|MBA|CN18S'), 'Collecting_Institute'] = 'MBARI'
df.loc[df['sample_name'].str.contains('Framstrait|AWI'), 'Collecting_Institute'] = 'AWI'
df.loc[df['sample_name'].str.contains('MOCKBLOOM|bloomMock|BMk|BloomMock|bMock'), 'Collecting_Institute'] = 'BLOOMMOCK'
df.loc[df['sample_name'].str.contains('MOCKEVEN|evenMock|EMk|EvenMock|eMock'), 'Collecting_Institute'] = 'EVENMOCK'
df.loc[df['sample_name'].str.contains('Bedford|DAL|UDalhousie'), 'Collecting_Institute'] = 'UDalhousie'
df.loc[df['sample_name'].str.contains('LaJolla|NOAA'), 'Collecting_Institute'] = 'NOAA'
df.loc[df['sample_name'].str.contains('Plymouth|NOC'), 'Collecting_Institute'] = 'NOC'
df.loc[df['sample_name'].str.contains('ROS|Roscoff'), 'Collecting_Institute'] = 'SBR'
df.loc[df['sample_name'].str.contains('SOC'), 'Collecting_Institute'] = 'NOC'   # Should be NOC but double check.

df.set_index('sample_name', inplace=True)

#look at total number of samples:
print(df.groupby(['Analyzing_Institute', 'Collecting_Institute']).sum(numeric_only=True))

df = df.drop('count', axis=1)

#metadata table:
meta_all = df[['Analyzing_Institute','Collecting_Institute']].copy()

df = df.loc[df['Collecting_Institute']=='']
df

sequence                                  count
Analyzing_Institute Collecting_Institute       
AWI                                           1
                    AWI                      10
                    BLOOMMOCK                 5
                    EVENMOCK                  5
                    MBARI                    10
                    NOAA                      5
                    NOC                       5
                    UDalhousie                5
MBARI               BLOOMMOCK                 5
                    EVENMOCK                  5
                    MBARI                     5
                    NOAA                      3
                    NOC                       5
                    UDalhousie                5
NOAA                AWI                       5
                    BLOOMMOCK                 5
                    EVENMOCK                  5
                    NOC                       5
                    SBR                 

sequence,Analyzing_Institute,Collecting_Institute
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1
blank_01_0049,AWI,


## Export and Save Files

In [64]:
folder = '../data/'
prefix = 'GLOMICON'

### Create Fasta File of ASV Sequences

In [65]:
df = seq_all.copy()
df = df.reset_index()
df['label'] = df['ASV'] +'|' + df['Analyzing_Institute']

outfile = folder + prefix+'_merged.fasta'

seqs = df['sequence'].tolist()
otus = df.label.tolist()

##Write File
resultsFile = open(outfile, "w") #open resultsfile

for i in range(len(otus)):
    resultsFile.write('>' + otus[i])
    resultsFile.write('\n')
    resultsFile.write(seqs[i])
    resultsFile.write('\n')       
print("End")
resultsFile.close()

df.head()

End


Unnamed: 0,ASV,sequence,Analyzing_Institute,label
0,ASV_1,CAATAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTCGG...,UDAL,ASV_1|UDAL
1,ASV_2,GCACCTACCGATTGAATGGTCCGGTGAAGACTCGGGATTGTGGTCT...,MBARI,ASV_2|MBARI
2,ASV_3,GCACCTACCGATTGAATGGTCCGGTGAGGCCTCGGGATCGTGGCGA...,MBARI,ASV_3|MBARI
3,ASV_4,GCACCTACCGATTGAATGGTCCGGTGAAGCCTCGGGATTGTGGTTG...,MBARI,ASV_4|MBARI
4,ASV_5,GCTCCTACCGATTGAGTGATCCGGTGAATAATTCGGACTGCAGCAG...,MBARI,ASV_5|MBARI


### Export CSV files of tables

In [66]:
#export to csv files
dfs = [asv_all, taxa_all, seq_all, meta_all]
names = ['asv', 'taxa', 'seq', 'meta']
for df, name in zip(dfs,names):
    df.to_csv(folder + prefix + '_' +name+'_merged.csv')
    print(folder + prefix + '_' +name+'_merged.csv')

../data/GLOMICON_asv_merged.csv
../data/GLOMICON_taxa_merged.csv
../data/GLOMICON_seq_merged.csv
../data/GLOMICON_meta_merged.csv


### Write fasta file of unique sequences in project:

In [83]:
# drop duplicates
df = seq_all.copy()
df = df.drop_duplicates('sequence')
outfile = folder + prefix+'_seq_merged_unique.fasta'

seqs = df['sequence'].tolist()
otus = df.index.tolist()

##Write File
resultsFile = open(outfile, "w") #open resultsfile

for i in range(len(otus)):
    resultsFile.write('>' + otus[i])
    resultsFile.write('\n')
    resultsFile.write(seqs[i])
    resultsFile.write('\n')       
print("End")
resultsFile.close()

df.head()

End


Unnamed: 0_level_0,sequence,Analyzing_Institute
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1
ASV_1,CAATAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTCGG...,UDAL
ASV_2,GCACCTACCGATTGAATGGTCCGGTGAAGACTCGGGATTGTGGTCT...,MBARI
ASV_3,GCACCTACCGATTGAATGGTCCGGTGAGGCCTCGGGATCGTGGCGA...,MBARI
ASV_4,GCACCTACCGATTGAATGGTCCGGTGAAGCCTCGGGATTGTGGTTG...,MBARI
ASV_5,GCTCCTACCGATTGAGTGATCCGGTGAATAATTCGGACTGCAGCAG...,MBARI


In [82]:
df = seq_all.copy()
print(len(df.index))
df = df.drop_duplicates('sequence')
print(len(df.index))
df

14547
13542


Unnamed: 0_level_0,sequence,Analyzing_Institute
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1
ASV_1,CAATAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTCGG...,UDAL
ASV_2,GCACCTACCGATTGAATGGTCCGGTGAAGACTCGGGATTGTGGTCT...,MBARI
ASV_3,GCACCTACCGATTGAATGGTCCGGTGAGGCCTCGGGATCGTGGCGA...,MBARI
ASV_4,GCACCTACCGATTGAATGGTCCGGTGAAGCCTCGGGATTGTGGTTG...,MBARI
ASV_5,GCTCCTACCGATTGAGTGATCCGGTGAATAATTCGGACTGCAGCAG...,MBARI
...,...,...
ASV_14543,TAGCGTATATTAATGTTGTTGCAGTTAAAAAGCTCGTAGTTGGATT...,AWI
ASV_14544,TAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGAATT...,AWI
ASV_14545,TAGCGTATATTAAAGTTGTTGCAGTTAAAAAGCTCGTAGTTGGATT...,AWI
ASV_14546,TAGCGTATACTAATGTTGTTGCAGTTAAAAAGCTCGTAGTCGGATT...,AWI


## Export unique taxonomy to query through WORMS tool
- limited to 1500 names at a time

In [75]:
pat_list = [':nucl','_sp\\.','_sp1', '_Group-1','_Clade.*','_Group.*','-Group.*','-lineage','_[1-9][a-z]*', '_[a-z][1-9]', 'Basal']

In [77]:
df = taxa_all.copy()
df = df.loc[df['Species'].str.contains('X')]
df['Species'] = df['Species'].str.strip()
df['Species'] = df['Species'].str.replace('_X*_sp\\.', '', regex=True)
df['Species'] = df['Species'].str.replace('_X*_sp\\.', '', regex=True)
df['Species'] = df['Species'].str.replace('_X*_', '', regex=True)
df['Species'] = df['Species'].str.replace(' X* ', '', regex=True)
df['Species'] = df['Species'].str.replace('_X*', '', regex=True)
for pat in pat_list:
    df['Species'] = df['Species'].str.replace(pat, '', regex=True)
#df['Species'] = df['Species'].str.replace('_',' ')
df['Species'] = df['Species'].str.replace(' sp.','')

df = df.drop_duplicates('Species')
df = df.sort_values('Species')
df.to_csv('../data/species_list.csv')

In [70]:
print(df['Species'].tolist())

['Abedinium dasypus', 'Abeoformidae Group MAIP 1', 'Abeoformidae Group MAIP 2', 'Abeoformidae Group MAIP 4', 'Abollifer globosa', 'Abollifer prolabens', 'Acanthamoeba', 'Acantharea', 'Acantharea 1', 'Acantharea 2b', 'Acantharea 3b', 'Acantharea 4d', 'Acantharea 4d X', 'Acantharea E', 'Acantharian 6201', 'Acanthochiasma', 'Acanthocorbis unguiculata', 'Acanthoeca SIOpierAcanth1', 'Acanthoecida', 'Acanthoecidae Group G', 'Acanthometra', 'Acanthometra 3 ICG-2009', 'Acanthometra pellucida', 'Acanthoplegma', 'Acartia hudsonica', 'Acartia negligens', 'Achnanthes bongranii', 'Achradina pulchra', 'Acipenser fulvescens', 'Acrocalanus', 'Acrochaete leptochaete', 'Acrostichus', 'Actinocyclus', 'Actinocyclus curvatulus', 'Actinomma boreale', 'Actinoptychus', 'Adenoides eludens', 'Aglantha digitale', 'Aglaura hemistoma', 'Akashiwo sanguinea', 'Alcyonidioides mytili', 'Alexandrium', 'Alexandrium catenella', 'Alexandrium hiranoi', 'Alexandrium insuetum', 'Alexandrium minutum', 'Alexandrium ostenfeldii

In [56]:
test = df.loc[df['Species']=='Acartia hudsonica ']
test


Unnamed: 0_level_0,Kingdom,Domain,Phylum,Class,Order,Family,Genus,Species,taxonomy,Supergroup
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
