## CCFRP data conversion

In [53]:
## Imports

import pandas as pd
import numpy as np
import random

import re # for extracting family names when needed

from datetime import datetime # for handline dates
import pytz # for handling time zones

import urllib.request, urllib.parse, json # for dealing with WoRMS API and output - consider using requests library instead of urllib?

In [2]:
## Load csv

path = 'C:\\Users\\dianalg\\Documents\\Work\\MBARI\\MPA Data Integration\\CCFRP\\'
filename = 'Counts.csv'
data = pd.read_csv(path+filename)

data.head()

Unnamed: 0,Area,Site,Lat Center Point,Lon Center Point,Year,Barred Sand Bass,Bat Ray,Bigmouth Sole,Black-and-Yellow Rockfish,Black Rockfish,...,Vermilion Rockfish,White Croaker,White Seabass,Widow Rockfish,Wolf Eel,Yelloweye Rockfish,Yellowfin Croaker,Yellowtail Jack,Yellowtail Rockfish,Total
0,Trinidad,REF,41.115,-124.173,2018,0,0,0,0,708,...,2,0,0,0,0,0,0,0,22,898
1,Trinidad,REF,41.115,-124.173,2019,0,0,0,0,384,...,1,0,0,0,0,1,0,0,16,504
2,Cape Mendocino,MPA,40.426,-124.478,2017,0,0,0,0,113,...,9,0,0,0,0,3,0,0,4,229
3,Cape Mendocino,MPA,40.426,-124.478,2018,0,0,0,0,58,...,20,0,0,0,0,10,0,0,7,300
4,Cape Mendocino,MPA,40.426,-124.478,2019,0,0,0,0,52,...,15,0,0,0,0,6,0,0,10,234


Patrick suggested that I start by mapping all species names to a WoRMS ID.

In [101]:
## Get species names

common_names = data.columns[5:-1].to_series()
common_names.reset_index(drop=True, inplace=True)

# pd.set_option('display.max_rows', None) # See all rows
pd.set_option('display.max_rows', 60) # Default
common_names

0              Barred Sand Bass
1                       Bat Ray
2                 Bigmouth Sole
3     Black-and-Yellow Rockfish
4                Black Rockfish
                ...            
85                     Wolf Eel
86           Yelloweye Rockfish
87            Yellowfin Croaker
88              Yellowtail Jack
89          Yellowtail Rockfish
Length: 90, dtype: object

These are all common names, and based on the WoRMS REST API documentation, it's unclear how to request a species' data using its common name. Instead, Patrick and I downloaded a species table from the CCFRP submission on the MLML digital commons.

### Get scientific names and lookup in WoRMS

In [5]:
## Load species table

path = 'C:\\Users\\dianalg\\Documents\\Work\\MBARI\\MPA Data Integration\\CCFRP\\'
filename = 'Fish_species_table.csv'
species = pd.read_csv(path+filename)

species.head()

Unnamed: 0,Species Code,Common Name,Genus,Species,Rockfish
0,ANC,Northern Anchovy,Engraulis,mordax,False
1,BAR,Pacific Barracuda,Sphyraena,argentea,False
2,BAY,Black-and-Yellow Rockfish,Sebastes,chrysomelas,True
3,BCO,Bocaccio,Sebastes,paucispinis,True
4,BID,UnID Blue Rockfish,Sebastes,mystinus,True


Note that there are some rows where both species and genus are missing. In this case, the family name is included in the common name field.

In [66]:
## Join genus and species into a single column

scientificName = []

for i in range(0, species.shape[0]):
    
    # get genus and species names from file
    genus = species['Genus'].iloc[i]
    sp = species['Species'].iloc[i]
    
    # catch instances where species is spp.
    if sp == 'spp.': sp = ''
    
    # catch instances where only species is nan
    if type(genus) == str and type(sp) != str: sp = ''
        
    # catch instances where both genus and species are nan
    if type(genus) != str and type(sp) != str:
        genus = ''
        sp = ''
    
    # create scientific name and save to list
    name = ' '.join([genus.strip(), sp.strip()])
    scientificName.append(name.strip())
    
scientificName[0:10]

['Engraulis mordax',
 'Sphyraena argentea',
 'Sebastes chrysomelas',
 'Sebastes paucispinis',
 'Sebastes mystinus',
 'Sebastes melanops',
 'Chromis punctipinnus',
 'Sebastes mystinus',
 'Paralabrax nebulifer',
 'Embiotoca jacksoni']

In [68]:
## Which entries have no genus or species names?

blank_idx = [i for i, x in enumerate(scientificName) if x == '']
species['Common Name'].iloc[blank_idx]

37           Mackerel (Family Scombridae)
63    Silversides (Family Atherinopsidae)
79                                Unknown
Name: Common Name, dtype: object

In [72]:
## Enter family name for Mackerel and Silversides, enter nan for Unknown

missing = species['Common Name'].iloc[blank_idx]
missing_names = []

for name in missing:
    
    try: missing_names.append(name[name.find("(")+1:name.find(")")].split()[1])
    except: missing_names.append(np.nan)
        

In [74]:
## Replace missing values in scientificName

i = 0

for idx, val in enumerate(scientificName):
    if val == '':
        scientificName[idx] = missing_names[i]
        i += 1
    
scientificName[30:40]

['Trachurus symmetricus',
 'Atherinopsis californiensis',
 'Hexagrammos decagrammus',
 'Sebastes atrovirens',
 'Paralabrax clathratus',
 'Ophiodon elongatus',
 'Synodus lucioceps',
 'Scombridae',
 'Sebastes',
 'Sebastes serranoides']

In [78]:
## Create a dictionary with common names as keys and scientific names as values

sp_dict = dict(zip(species['Common Name'], scientificName)) 

In [83]:
## Create a new row with common name values, and then use dictionary to replace with scientific names

species['scientificName'] = species['Common Name']
species.replace({'scientificName':sp_dict}, inplace=True)
species.head()

Unnamed: 0,Species Code,Common Name,Genus,Species,Rockfish,scientificName
0,ANC,Northern Anchovy,Engraulis,mordax,False,Engraulis mordax
1,BAR,Pacific Barracuda,Sphyraena,argentea,False,Sphyraena argentea
2,BAY,Black-and-Yellow Rockfish,Sebastes,chrysomelas,True,Sebastes chrysomelas
3,BCO,Bocaccio,Sebastes,paucispinis,True,Sebastes paucispinis
4,BID,UnID Blue Rockfish,Sebastes,mystinus,True,Sebastes mystinus


In [105]:
## Are all of the species in the data also in the species table?

common_names[common_names.isin(species['Common Name']) == False]


2        Bigmouth Sole
37     Longfin Sanddab
47     Pacific Halibut
51    Pelagic Stingray
dtype: object

#### Need to look the following species up by hand:
- Bigmouth Sole
- Longfin Sanddab
- Pacific Halibut
- Pelagic Stingray

-------------------------------------

In [90]:
## Once I've done that, I can use replace to create a series of scientific names to look up in WoRMS

# pd.set_option('display.max_rows', None) # See all rows
pd.set_option('display.max_rows', 60)
scientific_names = common_names.replace(sp_dict)
scientific_names

0        Paralabrax nebulifer
1      Myliobatis californica
2               Bigmouth Sole
3        Sebastes chrysomelas
4           Sebastes melanops
               ...           
85    Anarrhichthys ocellatus
86        Sebastes ruberrimus
87           Umbrina roncador
88            Seriola lalandi
89          Sebastes flavidus
Length: 90, dtype: object

Ok, so generally that worked, but there are definitely a few species in this list that were not in the species table. How to identify them?

In [96]:
for name in common_names:
    
    if species['Common Name'].isin([name])

any(species['Common Name'].isin(['Bigmouth Sole']))

False

In [100]:
species['Common Name'].isin(common_names)

0     False
1     False
2     False
3     False
4     False
      ...  
85    False
86    False
87    False
88    False
89    False
Length: 90, dtype: bool

In [None]:
def worms_from_common_name(common_name):
    """
    Using the WoRMS REST API, retrieve WoRMS ID, scientific name and taxon ID given a common name.
    
    Dependencies:
        import urllib.request, urllib.parse, json
    
    Usage:
        worms_from_common_name(common_name)
        
    Inputs:
        The common name of interest as a string.
        
    Outputs:
        1. scientificName: WoRMS specified scientific name
        2. scientificNameID: WoRMS specified ID
        3. taxonID: WoRMS specified taxon ID
    """
    
    name_url = urllib.parse.quote(common_name)
    _url = 'http://www.marinespecies.org/rest/AphiaRecordsByNames?scientificnames%5B%5D='+ name_url + '&like=false&marine_only=false'
    
    try:
        with urllib.request.urlopen(_url) as url:
            data = json.loads(url.read().decode())
            return (data[0][0]['scientificname'], data[0][0]['lsid'], data[0][0]['AphiaID'])
    except Exception as e:
        # Try passing just the genus if there the species name is unrecognized
        if len(sci_name_url.split('%20')) > 1: #If species is unknown and listed as spp. or sp.
            return get_worms_from_scientific_name(sci_name_url.split('%20')[0])
        else:
            print("Url didn't work, check name, ", sci_name)

In [42]:
test = subset.iloc[3]
test

'Kelp Rockfish'

In [43]:
test_url = urllib.parse.quote(test)

In [45]:
 _url = 'http://www.marinespecies.org/rest/AphiaRecordsByNames?commonnames%5B%5D='+ test_url + '&like=false&marine_only=false'
_url

'http://www.marinespecies.org/rest/AphiaRecordsByNames?commonnames%5B%5D=Kelp%20Rockfish&like=false&marine_only=false'

In [34]:
with urllib.request.urlopen(_url) as url:
            data = json.loads(url.read().decode())
            print(data[0][0]['scientificname'], data[0][0]['lsid'], data[0][0]['AphiaID'])

JSONDecodeError: Expecting value: line 1 column 1 (char 0)