In [44]:
import pandas as pd
import requests
from urllib.parse import quote_plus
import os

#### Reading in and Deduplicating the Primary File by Taxa (probably downloaded from Laminar)

In [45]:
# Specify the input and output files (include the entire path name)
original_file = input("Enter the original CSV file name (with extension). The file path is relative from where this program is run: ")

# Save the original file within a variable called df
original_df = pd.read_csv(original_file)

Enter the original CSV file name (with extension). The file path is relative from where this program is run: ../input.csv


In [46]:
# Run this block if you want to view the read-in original file
original_df

Unnamed: 0,Cruise,ISO_DateTime_UTC,ISO_DateTime_PDT,Line,St,Lat,Long,Tow_Depth,Mesh Size,Larva_ID,Growth_Stage,Species,SL_(mm),BD_(mm),HL_(mm),Prey_Taxa,Prey_Stage,Length_(mm),Width_(mm)
0,BBB2101,2021-01-24T20:24Z,2021-01-24T12:24,90.0,30.0,33.41795,-117.904683,30.00,505,RF101,Postflexion,Sebates semicinctus,10.1106,1.7712,3.6162,Cyclopoida,Nauplii,0.2220,0.0740
1,BBB2101,2021-01-24T20:24Z,2021-01-24T12:24,90.0,30.0,33.41795,-117.904683,30.00,505,RF101,Postflexion,Sebates semicinctus,10.1106,1.7712,3.6162,Calanoida,Copepodite,0.7030,0.2960
2,BBB2101,2021-01-24T20:24Z,2021-01-24T12:24,90.0,30.0,33.41795,-117.904683,30.00,505,RF101,Postflexion,Sebates semicinctus,10.1106,1.7712,3.6162,Cyclopoida,Copepodite,0.4625,0.1850
3,BBB2101,2021-01-24T20:24Z,2021-01-24T12:24,90.0,30.0,33.41795,-117.904683,30.00,505,RF101,Postflexion,Sebates semicinctus,10.1106,1.7712,3.6162,Calanoida,Copepodite,0.5920,0.3145
4,BBB2101,2021-01-24T20:24Z,2021-01-24T12:24,90.0,30.0,33.41795,-117.904683,30.00,505,RF101,Postflexion,Sebates semicinctus,10.1106,1.7712,3.6162,Calanoida,Nauplii,0.1480,0.1295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3076,SW2104,2021-04-15T20:46Z,2021-04-15T13:46,85.0,42.9,33.85000,-119.370000,32.42,505,RF220,Flexion,Sebates unidentified,6.1992,0.6642,1.5498,Cyclopoida,Nauplii,0.2220,0.1480
3077,SW2104,2021-04-15T20:46Z,2021-04-15T13:46,85.0,42.9,33.85000,-119.370000,32.42,505,RF220,Flexion,Sebates unidentified,6.1992,0.6642,1.5498,Copepoda,Nauplii,0.2220,0.1480
3078,SW2104,2021-04-15T20:46Z,2021-04-15T13:46,85.0,42.9,33.85000,-119.370000,32.42,505,RF220,Flexion,Sebates unidentified,6.1992,0.6642,1.5498,Cyclopoida,Nauplii,0.1850,0.1295
3079,SW2104,2021-04-15T20:46Z,2021-04-15T13:46,85.0,42.9,33.85000,-119.370000,32.42,505,RF220,Flexion,Sebates unidentified,6.1992,0.6642,1.5498,Calanoida,Nauplii,0.2405,0.1295


In [9]:
# Identify the column containing species names in the original file
while True:
        column_name = input("Enter the name of the column that contains the scientific names: ")
        if column_name in df.columns:
            break  # Exit the loop if the column name is valid
        else:
            print(f"Column '{column_name}' not found. Please try again. (make sure capitalization pattern is accurate.)")

Enter the name of the column that contains the scientific names: Spe
Column 'Spe' not found. Please try again. (make sure capitalization pattern is accurate.)
Enter the name of the column that contains the scientific names: test
Column 'test' not found. Please try again. (make sure capitalization pattern is accurate.)
Enter the name of the column that contains the scientific names: Species


In [11]:
# Read out a list of deduplicated taxa from df
unique_values = get_unique_values(df, column_name)
print(f"Unique values in column '{column_name}':\n{unique_values}")

Unique values in column 'Species':
['Sebates semicinctus' 'Sebates jordani' 'Sebates levis' 'Sebates goodei'
 'Sebates wilsoni' 'Sebates miniatus' 'Sebates paucispinis'
 'Sebates unidentified' 'Sebates hopkinsi' 'Sebates saxicola'
 'Sebates simulator' 'Sebates moseri' 'Sebates rufus'
 'Sebates melanostomus' 'Sebates helvomaculatus'
 'Sebates chlorosticus/rosenblatti' 'Sebates diploproa'
 'Sebates macdonaldi' 'Sebates ensifer' 'Sebates rufinanus'
 'Sebates mystinus/entomelas' 'Sebates aurora']


#### Fetching data from the WoRMs API

In [12]:
# Step 5: Fetch data from the API

def fetch_api_data(unique_values):
    """Fetch data from the WoRMS API for a list of unique scientific names."""
    base_url = 'https://www.marinespecies.org/rest/AphiaRecordsByMatchNames?scientificnames%5B%5D='
    api_results = []

    for each in unique_values:
        try:
            url_sp = quote_plus(each)
            url = f'{base_url}{url_sp}&marine_only=true'
            print("Next api query to run:")
            print(url)

            response = requests.get(url)
            response.raise_for_status()  # Raise an error for bad responses
            data = response.json()

            for i in data:
                for y in i:
                    y['PI_entered_name'] = each  # Add the scientific name to the result
                    api_results.append(y)

        except requests.exceptions.HTTPError as e:
            print(f"HTTP error: {e}")
            api_results.append({'PI_entered_name': each, 'error_message': 'HTTP error'})
        except Exception as e:
            print(f"Error fetching data: {e}. Likely this means no close match was found.")
            api_results.append({'PI_entered_name': each, 'error_message': 'Error fetching data'})
        print("The program finished processing and printing all API results.")

    return api_results

api_results = fetch_api_data(unique_values)

Next api query to run:
https://www.marinespecies.org/rest/AphiaRecordsByMatchNames?scientificnames%5B%5D=Sebates+semicinctus&marine_only=true
Next api query to run:
https://www.marinespecies.org/rest/AphiaRecordsByMatchNames?scientificnames%5B%5D=Sebates+jordani&marine_only=true
Next api query to run:
https://www.marinespecies.org/rest/AphiaRecordsByMatchNames?scientificnames%5B%5D=Sebates+levis&marine_only=true
Next api query to run:
https://www.marinespecies.org/rest/AphiaRecordsByMatchNames?scientificnames%5B%5D=Sebates+goodei&marine_only=true
Next api query to run:
https://www.marinespecies.org/rest/AphiaRecordsByMatchNames?scientificnames%5B%5D=Sebates+wilsoni&marine_only=true
Next api query to run:
https://www.marinespecies.org/rest/AphiaRecordsByMatchNames?scientificnames%5B%5D=Sebates+miniatus&marine_only=true
Next api query to run:
https://www.marinespecies.org/rest/AphiaRecordsByMatchNames?scientificnames%5B%5D=Sebates+paucispinis&marine_only=true
Next api query to run:
https

In [47]:
# Convert the API results into a dataframe
api_results_df = pd.DataFrame(api_results)

# Display the API results dataframe within the Jupyter Notebook
## This DF represents all of the WoRMS metadata from the deduplicated Taxa in the original file")
api_results_df

Unnamed: 0,AphiaID,url,scientificname,authority,status,unacceptreason,taxonRankID,rank,valid_AphiaID,valid_name,...,lsid,isMarine,isBrackish,isFreshwater,isTerrestrial,isExtinct,match_type,modified,PI_entered_name,error_message
0,274851.0,https://www.marinespecies.org/aphia.php?p=taxd...,Sebastes semicinctus,"(Gilbert, 1897)",accepted,,220.0,Species,274851.0,Sebastes semicinctus,...,urn:lsid:marinespecies.org:taxname:274851,1.0,0.0,0.0,0.0,,near_1,2008-01-15T17:27:08.177Z,Sebates semicinctus,
1,274806.0,https://www.marinespecies.org/aphia.php?p=taxd...,Sebastes jordani,"(Gilbert, 1896)",accepted,,220.0,Species,274806.0,Sebastes jordani,...,urn:lsid:marinespecies.org:taxname:274806,1.0,0.0,0.0,0.0,,near_1,2008-01-15T17:27:08.177Z,Sebates jordani,
2,274812.0,https://www.marinespecies.org/aphia.php?p=taxd...,Sebastes levis,"(Eigenmann & Eigenmann, 1889)",accepted,,220.0,Species,274812.0,Sebastes levis,...,urn:lsid:marinespecies.org:taxname:274812,1.0,0.0,0.0,0.0,,near_1,2008-01-15T17:27:08.177Z,Sebates levis,
3,274798.0,https://www.marinespecies.org/aphia.php?p=taxd...,Sebastes goodei,"(Eigenmann & Eigenmann, 1890)",accepted,,220.0,Species,274798.0,Sebastes goodei,...,urn:lsid:marinespecies.org:taxname:274798,1.0,1.0,0.0,0.0,,near_1,2008-01-15T17:27:08.177Z,Sebates goodei,
4,274868.0,https://www.marinespecies.org/aphia.php?p=taxd...,Sebastes wilsoni,"(Gilbert, 1915)",accepted,,220.0,Species,274868.0,Sebastes wilsoni,...,urn:lsid:marinespecies.org:taxname:274868,1.0,0.0,0.0,0.0,,near_1,2008-01-15T17:27:08.177Z,Sebates wilsoni,
5,274820.0,https://www.marinespecies.org/aphia.php?p=taxd...,Sebastes miniatus,"(Jordan & Gilbert, 1880)",accepted,,220.0,Species,274820.0,Sebastes miniatus,...,urn:lsid:marinespecies.org:taxname:274820,1.0,0.0,0.0,0.0,,near_1,2008-01-15T17:27:08.177Z,Sebates miniatus,
6,274833.0,https://www.marinespecies.org/aphia.php?p=taxd...,Sebastes paucispinis,"Ayres, 1854",accepted,,220.0,Species,274833.0,Sebastes paucispinis,...,urn:lsid:marinespecies.org:taxname:274833,1.0,0.0,0.0,0.0,,near_1,2008-01-15T17:27:08.177Z,Sebates paucispinis,
7,,,,,,,,,,,...,,,,,,,,,Sebates unidentified,Error fetching data
8,274800.0,https://www.marinespecies.org/aphia.php?p=taxd...,Sebastes hopkinsi,"(Cramer, 1895)",accepted,,220.0,Species,274800.0,Sebastes hopkinsi,...,urn:lsid:marinespecies.org:taxname:274800,1.0,0.0,0.0,0.0,,near_1,2008-01-15T17:27:08.177Z,Sebates hopkinsi,
9,274848.0,https://www.marinespecies.org/aphia.php?p=taxd...,Sebastes saxicola,"(Gilbert, 1890)",accepted,,220.0,Species,274848.0,Sebastes saxicola,...,urn:lsid:marinespecies.org:taxname:274848,1.0,0.0,0.0,0.0,,near_1,2008-01-15T17:27:08.177Z,Sebates saxicola,


#### Saving the deduplicated and merged WoRMs results to a CSV

In [48]:
# File name handling
deduplicated_worms_output_file = f'{original_file[:-4]}_deduplicated_worms_taxa_results.csv'
deduplicated_worms_output_file_name = str(deduplicated_worms_output_file)
save_results_to_csv(api_results, deduplicated_worms_output_file)
print(deduplicated_worms_output_file_name + " was saved in the same folder as the input file")


../input_deduplicated_worms_taxa_results.csv was saved in the same folder as the input file


#### Joining the API results to the original file


In [49]:
# Specify columns to keep from the API df after join is completed

merge_columns = [
        'PI_entered_name', 'AphiaID', 'scientificname',
        'status', 'rank', 'valid_name', 'lsid', 'match_type'
    ]

merged_df = original_df.merge(results_df[merge_columns],
                                  left_on=column_name,
                                  right_on='PI_entered_name',
                                  how='left')

## This df shows the merge_columns from the API results df joined to the original_df
merged_df

Unnamed: 0,Cruise,ISO_DateTime_UTC,ISO_DateTime_PDT,Line,St,Lat,Long,Tow_Depth,Mesh Size,Larva_ID,...,Length_(mm),Width_(mm),PI_entered_name,AphiaID,scientificname,status,rank,valid_name,lsid,match_type
0,BBB2101,2021-01-24T20:24Z,2021-01-24T12:24,90.0,30.0,33.41795,-117.904683,30.00,505,RF101,...,0.2220,0.0740,Sebates semicinctus,274851.0,Sebastes semicinctus,accepted,Species,Sebastes semicinctus,urn:lsid:marinespecies.org:taxname:274851,near_1
1,BBB2101,2021-01-24T20:24Z,2021-01-24T12:24,90.0,30.0,33.41795,-117.904683,30.00,505,RF101,...,0.7030,0.2960,Sebates semicinctus,274851.0,Sebastes semicinctus,accepted,Species,Sebastes semicinctus,urn:lsid:marinespecies.org:taxname:274851,near_1
2,BBB2101,2021-01-24T20:24Z,2021-01-24T12:24,90.0,30.0,33.41795,-117.904683,30.00,505,RF101,...,0.4625,0.1850,Sebates semicinctus,274851.0,Sebastes semicinctus,accepted,Species,Sebastes semicinctus,urn:lsid:marinespecies.org:taxname:274851,near_1
3,BBB2101,2021-01-24T20:24Z,2021-01-24T12:24,90.0,30.0,33.41795,-117.904683,30.00,505,RF101,...,0.5920,0.3145,Sebates semicinctus,274851.0,Sebastes semicinctus,accepted,Species,Sebastes semicinctus,urn:lsid:marinespecies.org:taxname:274851,near_1
4,BBB2101,2021-01-24T20:24Z,2021-01-24T12:24,90.0,30.0,33.41795,-117.904683,30.00,505,RF101,...,0.1480,0.1295,Sebates semicinctus,274851.0,Sebastes semicinctus,accepted,Species,Sebastes semicinctus,urn:lsid:marinespecies.org:taxname:274851,near_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3141,SW2104,2021-04-15T20:46Z,2021-04-15T13:46,85.0,42.9,33.85000,-119.370000,32.42,505,RF220,...,0.2220,0.1480,Sebates unidentified,,,,,,,
3142,SW2104,2021-04-15T20:46Z,2021-04-15T13:46,85.0,42.9,33.85000,-119.370000,32.42,505,RF220,...,0.2220,0.1480,Sebates unidentified,,,,,,,
3143,SW2104,2021-04-15T20:46Z,2021-04-15T13:46,85.0,42.9,33.85000,-119.370000,32.42,505,RF220,...,0.1850,0.1295,Sebates unidentified,,,,,,,
3144,SW2104,2021-04-15T20:46Z,2021-04-15T13:46,85.0,42.9,33.85000,-119.370000,32.42,505,RF220,...,0.2405,0.1295,Sebates unidentified,,,,,,,


#### Saving the merged API results with the original data file as a new CSV

In [50]:
# File name handling
merged_worms_output_file = f'{original_file[:-4]}_merged_worms_bcodmo.csv'
merged_worms_output_file_name = str(merged_worms_output_file)
save_results_to_csv(merged_df, merged_worms_output_file)
print(merged_worms_output_file_name + " was saved in the same folder as the input file.")    

../input_merged_worms_bcodmo.csv was saved in the same folder as the input file.
