## CCFRP data conversion

In [42]:
## Imports

import pandas as pd
import numpy as np
import random

from datetime import datetime # for handline dates
import pytz # for handling time zones

import urllib.request, urllib.parse, json # for dealing with WoRMS API and output - consider using requests library instead of urllib?

import warnings # for including warning messages in function

In [4]:
## Load csv

path = 'C:\\Users\\dianalg\\Documents\\Work\\MBARI\\MPA Data Integration\\CCFRP\\'
filename = 'Counts.csv'
data = pd.read_csv(path+filename)

data.head()

Unnamed: 0,Area,Site,Lat Center Point,Lon Center Point,Year,Barred Sand Bass,Bat Ray,Bigmouth Sole,Black-and-Yellow Rockfish,Black Rockfish,...,Vermilion Rockfish,White Croaker,White Seabass,Widow Rockfish,Wolf Eel,Yelloweye Rockfish,Yellowfin Croaker,Yellowtail Jack,Yellowtail Rockfish,Total
0,Trinidad,REF,41.115,-124.173,2018,0,0,0,0,708,...,2,0,0,0,0,0,0,0,22,898
1,Trinidad,REF,41.115,-124.173,2019,0,0,0,0,384,...,1,0,0,0,0,1,0,0,16,504
2,Cape Mendocino,MPA,40.426,-124.478,2017,0,0,0,0,113,...,9,0,0,0,0,3,0,0,4,229
3,Cape Mendocino,MPA,40.426,-124.478,2018,0,0,0,0,58,...,20,0,0,0,0,10,0,0,7,300
4,Cape Mendocino,MPA,40.426,-124.478,2019,0,0,0,0,52,...,15,0,0,0,0,6,0,0,10,234


Patrick suggested that I start by mapping all species names to a WoRMS ID.

In [5]:
## Get species names

common_names = data.columns[5:-1].to_series()
common_names.reset_index(drop=True, inplace=True)

# pd.set_option('display.max_rows', None) # See all rows
pd.set_option('display.max_rows', 60) # Default
common_names

0              Barred Sand Bass
1                       Bat Ray
2                 Bigmouth Sole
3     Black-and-Yellow Rockfish
4                Black Rockfish
                ...            
85                     Wolf Eel
86           Yelloweye Rockfish
87            Yellowfin Croaker
88              Yellowtail Jack
89          Yellowtail Rockfish
Length: 90, dtype: object

These are all common names, and based on the WoRMS REST API documentation, it's unclear how to request a species' data using its common name. Instead, Patrick and I downloaded a species table from the CCFRP submission on the MLML digital commons.

### Get scientific names and lookup in WoRMS

In [6]:
## Load species table

path = 'C:\\Users\\dianalg\\Documents\\Work\\MBARI\\MPA Data Integration\\CCFRP\\'
filename = 'Fish_species_table.csv'
species = pd.read_csv(path+filename)

species.head()

Unnamed: 0,Species Code,Common Name,Genus,Species,Rockfish
0,ANC,Northern Anchovy,Engraulis,mordax,False
1,BAR,Pacific Barracuda,Sphyraena,argentea,False
2,BAY,Black-and-Yellow Rockfish,Sebastes,chrysomelas,True
3,BCO,Bocaccio,Sebastes,paucispinis,True
4,BID,UnID Blue Rockfish,Sebastes,mystinus,True


Note that there are some rows where both species and genus are missing. In this case, the family name is included in the common name field.

In [7]:
## Join genus and species into a single column

scientificName = []

for i in range(0, species.shape[0]):
    
    # get genus and species names from file
    genus = species['Genus'].iloc[i]
    sp = species['Species'].iloc[i]
    
    # catch instances where species is spp.
    if sp == 'spp.': sp = ''
    
    # catch instances where only species is nan
    if type(genus) == str and type(sp) != str: sp = ''
        
    # catch instances where both genus and species are nan
    if type(genus) != str and type(sp) != str:
        genus = ''
        sp = ''
    
    # create scientific name and save to list
    name = ' '.join([genus.strip(), sp.strip()])
    scientificName.append(name.strip())
    
scientificName[0:10]

['Engraulis mordax',
 'Sphyraena argentea',
 'Sebastes chrysomelas',
 'Sebastes paucispinis',
 'Sebastes mystinus',
 'Sebastes melanops',
 'Chromis punctipinnus',
 'Sebastes mystinus',
 'Paralabrax nebulifer',
 'Embiotoca jacksoni']

In [8]:
## Which entries have no genus or species names?

blank_idx = [i for i, x in enumerate(scientificName) if x == '']
species['Common Name'].iloc[blank_idx]

37           Mackerel (Family Scombridae)
63    Silversides (Family Atherinopsidae)
79                                Unknown
Name: Common Name, dtype: object

In [9]:
## Enter family name for Mackerel and Silversides, enter nan for Unknown

missing = species['Common Name'].iloc[blank_idx]
missing_names = []

for name in missing:
    
    try: missing_names.append(name[name.find("(")+1:name.find(")")].split()[1])
    except: missing_names.append(np.nan)
        

In [10]:
## Replace missing values in scientificName

i = 0

for idx, val in enumerate(scientificName):
    if val == '':
        scientificName[idx] = missing_names[i]
        i += 1
    
scientificName[30:40]

['Trachurus symmetricus',
 'Atherinopsis californiensis',
 'Hexagrammos decagrammus',
 'Sebastes atrovirens',
 'Paralabrax clathratus',
 'Ophiodon elongatus',
 'Synodus lucioceps',
 'Scombridae',
 'Sebastes',
 'Sebastes serranoides']

In [11]:
## Create a dictionary with common names as keys and scientific names as values

sp_dict = dict(zip(species['Common Name'], scientificName)) 

In [12]:
## Create a new row with common name values, and then use dictionary to replace with scientific names

species['scientificName'] = species['Common Name']
species.replace({'scientificName':sp_dict}, inplace=True)
species.head()

Unnamed: 0,Species Code,Common Name,Genus,Species,Rockfish,scientificName
0,ANC,Northern Anchovy,Engraulis,mordax,False,Engraulis mordax
1,BAR,Pacific Barracuda,Sphyraena,argentea,False,Sphyraena argentea
2,BAY,Black-and-Yellow Rockfish,Sebastes,chrysomelas,True,Sebastes chrysomelas
3,BCO,Bocaccio,Sebastes,paucispinis,True,Sebastes paucispinis
4,BID,UnID Blue Rockfish,Sebastes,mystinus,True,Sebastes mystinus


In [13]:
## Are all of the species in the data also in the species table?

common_names[common_names.isin(species['Common Name']) == False]


2        Bigmouth Sole
37     Longfin Sanddab
47     Pacific Halibut
51    Pelagic Stingray
dtype: object

#### Looked the following species up by hand:
- Bigmouth Sole - Hippoglossina stomata
- Longfin Sanddab - Citharichthys xanthostigma
- Pacific Halibut - Hippoglossus stenolepis
- Pelagic Stingray - Pteroplatytrygon violacea

Ok, so that was easy enough. However, the WoRMS help folks just got back to me with instructions on how to query common names on the WoRMS API. So let me try to find the same scientific names by querying WoRMS.

#### Obtain scientific names from common names using the WoRMS REST API

In [58]:
test_names = common_names[common_names.isin(species['Common Name']) == False]
test_names

2        Bigmouth Sole
37     Longfin Sanddab
47     Pacific Halibut
51    Pelagic Stingray
dtype: object

In [91]:
## Create function to query WoRMS REST api by common name

def get_worms_from_common_name(common_name):
    """
    Using the WoRMS REST API, retrieve WoRMS ID, scientific name and taxon ID given a common name.
    
    Dependencies:
        import urllib.request, urllib.parse, json, warnings
    
    Usage:
        worms_from_common_name(common_name)
        
    Inputs:
        The common name of interest as a string, e.g. 'Bigmouth sole'
        
    Outputs:
        1. scientificName: WoRMS specified scientific name
        2. scientificNameID: WoRMS specified ID
        3. taxonID: WoRMS specified taxon ID
        
    Diana LaScala-Gruenewald
    Based on worms_from_scientific_name by Patrick Daniels
    2020-04-20
    Python 3.7
    """
    
    # Ensure name is lower case, has no trailing whitespace
    common_name = common_name.lower().strip()
    
    # Create url to query
    name_url = urllib.parse.quote(common_name)
    _url = 'http://www.marinespecies.org/rest/AphiaRecordsByVernacular/'+ name_url + '?like=false&offset=1'
    
    # Try query
    try:
        with urllib.request.urlopen(_url) as url:
            data = json.loads(url.read().decode())
            
            # If more than one match is found, warn and return first match with status 'accepted'
            if len(data) > 1:
                warnings.warn('More than one match found for ' + common_name + '. Returning data from first match with status \'accepted\'.')
                
                for record in data:
                    if record['status'] == 'accepted':
                        return(record['scientificname'], record['lsid'], record['AphiaID'])
            else:
                return(data[0]['scientificname'], data[0]['lsid'], data[0]['AphiaID'])
            
    except Exception as e:
        print('Query wasn\'t successful, check name: ', common_name)
       

In [92]:
## Run function on test names

com_sci_dict = {}

for name in test_names:
    try:
        sci_name, sci_name_id, tax_id = get_worms_from_common_name(name)
        com_sci_dict[name] = sci_name
    except Exception as e:
        print('Could not match ' + name + '. Function output: ' + str(e))
     



In [64]:
## Add content of com_sci_dict to sp_dict

sp_dict = {**com_sci_dict, **sp_dict}

In [81]:
## Save species conversion information in csv

sp_df = pd.DataFrame(sp_dict.keys())
sp_df.rename(columns={0:'common_names'}, inplace=True)
sp_df['scientific_names'] = sp_dict.keys()
sp_df['scientific_names'] = sp_df['scientific_names'].replace(sp_dict)

sp_df.to_csv('CCFRP_common_to_scientific.csv', index=False, na_rep='NaN')

In [90]:
## Get scientific_names for lookup in WoRMS

scientific_names = sp_df['scientific_names']
scientific_names

0          Hippoglossina stomata
1     Citharichthys xanthostigma
2        Hippoglossus stenolepis
3      Pteroplatytrygon violacea
4               Engraulis mordax
                 ...            
89          Atractoscion nobilis
90              Umbrina roncador
91           Sebastes ruberrimus
92               Seriola lalandi
93             Sebastes flavidus
Name: scientific_names, Length: 94, dtype: object

In [93]:
## Function to query WoRMS REST api by scientific name

def get_worms_from_scientific_name(sci_name):
    """
    Using the WoRMS REST API, retrieve WoRMS ID and taxon ID given a scientific name.
    
    Dependencies:
        import urllib.request, urllib.parse, json
    
    Usage:
        get_worms_from_scientific_name(sci_name)
        
    Inputs:
        The scientific name of interest as a string, e.g. 'Dosidicus gigas'
        
    Outputs:
        1. scientificName: WoRMS specified scientific name that matched to sci_name
        2. scientificNameID: WoRMS specified ID
        3. taxonID: WoRMS specified taxon ID
        
    Patrick Daniels
    Small changes from Diana LaScala-Gruenewald
    2020-04-20
    Python 3.7
    """
    
    # Create url to query
    sci_name_url = urllib.parse.quote(sci_name)
    _url = 'http://www.marinespecies.org/rest/AphiaRecordsByNames?scientificnames%5B%5D='+ sci_name_url + '&like=false&marine_only=false'
    
    # Try query
    try:
        with urllib.request.urlopen(_url) as url:
            data = json.loads(url.read().decode())
            return (data[0][0]['scientificname'], data[0][0]['lsid'], data[0][0]['AphiaID'])
    
    # If it fails, try searching for just the genus
    except Exception as e:
        if len(sci_name_url.split('%20')) > 1: 
            # If species is unknown and listed as spp. or sp.
            return get_worms_from_scientific_name(sci_name_url.split('%20')[0])
        else:
            print("Url didn't work, check name, ", sci_name)

In [96]:
%%time

## Get scientific name id and taxon id and store in dictionaries
name_id_dic = {}
name_dic = {}
id_dic = {}

for sci_name in scientific_names:
    print(sci_name)
    sci_name = sci_name.strip()
    try:
        sname, sname_id, id = get_worms_from_scientific_name(sci_name)
        name_id_dic[sci_name] = sname_id
        name_dic[sci_name] = sname
        id_dic[sci_name] = id
    except:
        pass # very hacky 

Hippoglossina stomata
Citharichthys xanthostigma
Hippoglossus stenolepis
Pteroplatytrygon violacea
Engraulis mordax
Sphyraena argentea
Sebastes chrysomelas
Sebastes paucispinis
Sebastes mystinus
Sebastes melanops
Chromis punctipinnus
Sebastes mystinus
Paralabrax nebulifer
Embiotoca jacksoni
Enophrys bison
Enophrys taurina
Sebastes auriculatus
Sebastes dalli
Scorpaenichthys marmoratus
Sebastes nebulosus
Sebastes goodei
Sebastes pinniger
Sebastes caurinus
Oncorhynchus tshawytscha
Scorpaena guttata
Sebastes diaconus
Xystreurys liolepis
Sebastes lentiginosus
Hypsypops rubicundus
Sebastes carnatus
Sebastes rastrelliger
Paralichthys californicus
Sebastes umbrosus
Medialuna californiensis
Trachurus symmetricus
Atherinopsis californiensis
Hexagrammos decagrammus
Sebastes atrovirens
Paralabrax clathratus
Ophiodon elongatus
Synodus lucioceps
Scombridae
Sebastes
Sebastes serranoides
Girella nigricans
Sebastes serranoides or flavidus
Sarda chiliensis
Oxylebius pictus
Rhacochilus vacca
Scomber japo

AttributeError: 'float' object has no attribute 'strip'

**Next steps:**
- need to get rid of nan in species list
- also a problem with Alopius vulpinus
- clarify "Sebastes serranoides or flavidus" - are these different names for the same fish, or two fishes that can't be distinguished, or what?