# CCFRP data: convert grid-level data to DwC, separate MeasurementOrFact file

Draws code from earlier conversion draft: CCFRP_grid-level_conversion.ipynb

### Resources
- https://dwc.tdwg.org/terms/
- https://tools.gbif.org/dwca-validator/extension.do?id=dwc:Occurrence
- https://www.gbif.org/data-quality-requirements-occurrences

### Preprocessing
CCFRP data were originally shared as an excel file (CPUE_IDcell_Summary_Tables_2019.xls) with multiple sheets. Saved each sheet as a .csv:
1. CPUE.per.IDcell_2019 --> Grid-level_CPUE.csv
2. Counts.per.IDcell_2019 --> Grid-level_Counts.csv

This could be automated if desired.


In [1]:
## Imports

import pandas as pd
import numpy as np
import random

from datetime import datetime # for handline dates

In [2]:
## Ensure my general functions for the MPA data integration project can be imported, and import them

import sys
sys.path.insert(0, "C:\\Users\\dianalg\\PycharmProjects\\PythonScripts\\MPA data integration")

import WoRMS # functions for querying WoRMS REST API

## Load data

In [191]:
## Load grid-level count data

path = 'C:\\Users\\dianalg\\Documents\\Work\\MBARI\\MPA Data Integration\\CCFRP\\'
filename = 'Grid-level_Counts.csv'
data = pd.read_csv(path+filename)

data.head()

Unnamed: 0,ID.Cell.per.Trip,Date,Area,Site,Year,Total.Angler.Hours,Grid.Cell.ID,Lat Center Point,Lon Center Point,Lat 1,...,Vermilion Rockfish,White Croaker,White Seabass,Widow Rockfish,Wolf Eel,Yelloweye Rockfish,Yellowfin Croaker,Yellowtail Jack,Yellowtail Rockfish,Total
0,AIM09181901,9/18/2019,Anacapa Island,MPA,2019,5.75,AI01,34.0215,-119.3668,34.0189,...,0,0,0,0,0,0,0,0,0,123
1,AIM09191901,9/19/2019,Anacapa Island,MPA,2019,6.033333,AI01,34.0215,-119.3668,34.0189,...,0,0,0,0,0,0,0,0,0,121
2,AIM10251701,10/25/2017,Anacapa Island,MPA,2017,7.329722,AI01,34.0215,-119.3668,34.0189,...,0,0,0,0,0,0,0,0,0,161
3,AIM10291801,10/29/2018,Anacapa Island,MPA,2018,4.416667,AI01,34.0215,-119.3668,34.0189,...,0,0,0,0,0,0,0,0,0,33
4,AIM10181802,10/18/2018,Anacapa Island,MPA,2018,4.916667,AI02,34.0204,-119.3723,34.022,...,0,0,0,0,0,0,0,0,0,59


In [192]:
## Check for duplicate records

any(data.duplicated())

False

In [193]:
## Load scientific names

path = 'C:\\Users\\dianalg\\PycharmProjects\\PythonScripts\\MPA data integration\\CCFRP\\'
filename = 'CCFRP_common_to_scientific.csv'
species = pd.read_csv(path+filename)

species.head()

Unnamed: 0,common_names,scientific_names
0,Bigmouth Sole,Hippoglossina stomata
1,Longfin Sanddab,Citharichthys xanthostigma
2,Pacific Halibut,Hippoglossus stenolepis
3,Pelagic Stingray,Pteroplatytrygon violacea
4,Northern Anchovy,Engraulis mordax


In [194]:
## Load CPUE data

path = 'C:\\Users\\dianalg\\Documents\\Work\\MBARI\\MPA Data Integration\\CCFRP\\'
filename = 'Grid-level_CPUE.csv'
cpue = pd.read_csv(path+filename)

cpue.head()

Unnamed: 0,ID.Cell.per.Trip,Date,Area,Site,Year,Total.Angler.Hours,Grid.Cell.ID,Lat Center Point,Lon Center Point,Lat 1,...,Vermilion Rockfish,White Croaker,White Seabass,Widow Rockfish,Wolf Eel,Yelloweye Rockfish,Yellowfin Croaker,Yellowtail Jack,Yellowtail Rockfish,Total
0,AIM09181901,9/18/2019,Anacapa Island,MPA,2019,5.75,AI01,34.0215,-119.3668,34.0189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.391304
1,AIM09191901,9/19/2019,Anacapa Island,MPA,2019,6.033333,AI01,34.0215,-119.3668,34.0189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.055249
2,AIM10251701,10/25/2017,Anacapa Island,MPA,2017,7.329722,AI01,34.0215,-119.3668,34.0189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.965362
3,AIM10291801,10/29/2018,Anacapa Island,MPA,2018,4.416667,AI01,34.0215,-119.3668,34.0189,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.471698
4,AIM10181802,10/18/2018,Anacapa Island,MPA,2018,4.916667,AI02,34.0204,-119.3723,34.022,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0


### Convert from wide to long format

In [195]:
## Drop Total column

data.drop('Total', axis=1, inplace=True)

In [196]:
## Melt data

data_long = pd.melt(data, id_vars=data.columns[0:17].tolist(), var_name='species_common_name', value_name='count')
data_long.head()

Unnamed: 0,ID.Cell.per.Trip,Date,Area,Site,Year,Total.Angler.Hours,Grid.Cell.ID,Lat Center Point,Lon Center Point,Lat 1,Lon 1,Lat 2,Lon 2,Lat 3,Lon 3,Lat 4,Lon 4,species_common_name,count
0,AIM09181901,9/18/2019,Anacapa Island,MPA,2019,5.75,AI01,34.0215,-119.3668,34.0189,-119.3689,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,Barred Sand Bass,0
1,AIM09191901,9/19/2019,Anacapa Island,MPA,2019,6.033333,AI01,34.0215,-119.3668,34.0189,-119.3689,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,Barred Sand Bass,0
2,AIM10251701,10/25/2017,Anacapa Island,MPA,2017,7.329722,AI01,34.0215,-119.3668,34.0189,-119.3689,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,Barred Sand Bass,0
3,AIM10291801,10/29/2018,Anacapa Island,MPA,2018,4.416667,AI01,34.0215,-119.3668,34.0189,-119.3689,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,Barred Sand Bass,0
4,AIM10181802,10/18/2018,Anacapa Island,MPA,2018,4.916667,AI02,34.0204,-119.3723,34.022,-119.3757,34.0232,-119.3705,34.0189,-119.369,34.0176,-119.3742,Barred Sand Bass,0


### Join to obtain scientific names

In [197]:
## Merge

data_long = data_long.merge(species, how='left', left_on='species_common_name', right_on='common_names')
data_long.head()

Unnamed: 0,ID.Cell.per.Trip,Date,Area,Site,Year,Total.Angler.Hours,Grid.Cell.ID,Lat Center Point,Lon Center Point,Lat 1,...,Lat 2,Lon 2,Lat 3,Lon 3,Lat 4,Lon 4,species_common_name,count,common_names,scientific_names
0,AIM09181901,9/18/2019,Anacapa Island,MPA,2019,5.75,AI01,34.0215,-119.3668,34.0189,...,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,Barred Sand Bass,0,Barred Sand Bass,Paralabrax nebulifer
1,AIM09191901,9/19/2019,Anacapa Island,MPA,2019,6.033333,AI01,34.0215,-119.3668,34.0189,...,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,Barred Sand Bass,0,Barred Sand Bass,Paralabrax nebulifer
2,AIM10251701,10/25/2017,Anacapa Island,MPA,2017,7.329722,AI01,34.0215,-119.3668,34.0189,...,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,Barred Sand Bass,0,Barred Sand Bass,Paralabrax nebulifer
3,AIM10291801,10/29/2018,Anacapa Island,MPA,2018,4.416667,AI01,34.0215,-119.3668,34.0189,...,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,Barred Sand Bass,0,Barred Sand Bass,Paralabrax nebulifer
4,AIM10181802,10/18/2018,Anacapa Island,MPA,2018,4.916667,AI02,34.0204,-119.3723,34.022,...,34.0232,-119.3705,34.0189,-119.369,34.0176,-119.3742,Barred Sand Bass,0,Barred Sand Bass,Paralabrax nebulifer


In [198]:
## Drop unnecessary columns

data_long.drop(['species_common_name', 'common_names'], axis=1, inplace=True)
data_long.head()

Unnamed: 0,ID.Cell.per.Trip,Date,Area,Site,Year,Total.Angler.Hours,Grid.Cell.ID,Lat Center Point,Lon Center Point,Lat 1,Lon 1,Lat 2,Lon 2,Lat 3,Lon 3,Lat 4,Lon 4,count,scientific_names
0,AIM09181901,9/18/2019,Anacapa Island,MPA,2019,5.75,AI01,34.0215,-119.3668,34.0189,-119.3689,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,0,Paralabrax nebulifer
1,AIM09191901,9/19/2019,Anacapa Island,MPA,2019,6.033333,AI01,34.0215,-119.3668,34.0189,-119.3689,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,0,Paralabrax nebulifer
2,AIM10251701,10/25/2017,Anacapa Island,MPA,2017,7.329722,AI01,34.0215,-119.3668,34.0189,-119.3689,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,0,Paralabrax nebulifer
3,AIM10291801,10/29/2018,Anacapa Island,MPA,2018,4.416667,AI01,34.0215,-119.3668,34.0189,-119.3689,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,0,Paralabrax nebulifer
4,AIM10181802,10/18/2018,Anacapa Island,MPA,2018,4.916667,AI02,34.0204,-119.3723,34.022,-119.3757,34.0232,-119.3705,34.0189,-119.369,34.0176,-119.3742,0,Paralabrax nebulifer


In [200]:
## Groupby to handle the fact that Unknown rockfish and Olive or yellowtail rockfish both match to 'Sebastes'; UnID blue rockfish and Blue rockfish both match to 'Sebastes mystinus'

data_long = data_long.groupby(['ID.Cell.per.Trip', 'Date', 'Area', 'Site', 'Year',
                               'Total.Angler.Hours', 'Grid.Cell.ID', 'Lat Center Point',
                               'Lon Center Point', 'Lat 1', 'Lon 1', 'Lat 2', 'Lon 2', 'Lat 3',
                               'Lon 3', 'Lat 4', 'Lon 4', 'scientific_names']).sum()
data_long.reset_index(inplace=True)

print(data_long.shape)
data_long.head()

(189112, 19)


Unnamed: 0,ID.Cell.per.Trip,Date,Area,Site,Year,Total.Angler.Hours,Grid.Cell.ID,Lat Center Point,Lon Center Point,Lat 1,Lon 1,Lat 2,Lon 2,Lat 3,Lon 3,Lat 4,Lon 4,scientific_names,count
0,AIM09171903,9/17/2019,Anacapa Island,MPA,2019,5.216667,AI03,34.0177,-119.3795,34.0189,-119.3831,34.0206,-119.3781,34.0165,-119.376,34.0148,-119.381,Alopias vulpinus,0
1,AIM09171903,9/17/2019,Anacapa Island,MPA,2019,5.216667,AI03,34.0177,-119.3795,34.0189,-119.3831,34.0206,-119.3781,34.0165,-119.376,34.0148,-119.381,Anarrhichthys ocellatus,0
2,AIM09171903,9/17/2019,Anacapa Island,MPA,2019,5.216667,AI03,34.0177,-119.3795,34.0189,-119.3831,34.0206,-119.3781,34.0165,-119.376,34.0148,-119.381,Artedius harringtoni,0
3,AIM09171903,9/17/2019,Anacapa Island,MPA,2019,5.216667,AI03,34.0177,-119.3795,34.0189,-119.3831,34.0206,-119.3781,34.0165,-119.376,34.0148,-119.381,Atherinops affinis,0
4,AIM09171903,9/17/2019,Anacapa Island,MPA,2019,5.216667,AI03,34.0177,-119.3795,34.0189,-119.3831,34.0206,-119.3781,34.0165,-119.376,34.0148,-119.381,Atherinopsidae,0


## Assemble occurrence file

In [201]:
### Build eventID and put it in a new data frame

eventID = data_long['ID.Cell.per.Trip']
converted = pd.DataFrame({'eventID':eventID})
converted.head()

Unnamed: 0,eventID
0,AIM09171903
1,AIM09171903
2,AIM09171903
3,AIM09171903
4,AIM09171903


In [202]:
## Format dates and add eventDate

eventDate = [datetime.strptime(dt, '%m/%d/%Y').date().isoformat() for dt in data_long['Date']]
converted['eventDate'] = eventDate
converted.head()

Unnamed: 0,eventID,eventDate
0,AIM09171903,2019-09-17
1,AIM09171903,2019-09-17
2,AIM09171903,2019-09-17
3,AIM09171903,2019-09-17
4,AIM09171903,2019-09-17


In [12]:
## Add datasetID

converted['datasetID'] = 'CCFRP'
converted.head()

Unnamed: 0,eventID,eventDate,datasetID
0,AIM09181901,2019-09-18,CCFRP
1,AIM09191901,2019-09-19,CCFRP
2,AIM10251701,2017-10-25,CCFRP
3,AIM10291801,2018-10-29,CCFRP
4,AIM10181802,2018-10-18,CCFRP


In [13]:
## Add locality and locationRemarks

converted['locality'] = data_long['Area']
converted['locationRemarks'] = data_long['Site']

# Change MPA and REF to something more interpretable
habitat_dict = {
    'REF':'fished area',
    'MPA':'marine protected area'
}
converted['locationRemarks'].replace(habitat_dict, inplace=True)
converted.head()

Unnamed: 0,eventID,eventDate,datasetID,locality,locationRemarks
0,AIM09181901,2019-09-18,CCFRP,Anacapa Island,marine protected area
1,AIM09191901,2019-09-19,CCFRP,Anacapa Island,marine protected area
2,AIM10251701,2017-10-25,CCFRP,Anacapa Island,marine protected area
3,AIM10291801,2018-10-29,CCFRP,Anacapa Island,marine protected area
4,AIM10181802,2018-10-18,CCFRP,Anacapa Island,marine protected area


In [14]:
## Add countryCode

converted['countryCode'] = 'US'
converted.head()

Unnamed: 0,eventID,eventDate,datasetID,locality,locationRemarks,countryCode
0,AIM09181901,2019-09-18,CCFRP,Anacapa Island,marine protected area,US
1,AIM09191901,2019-09-19,CCFRP,Anacapa Island,marine protected area,US
2,AIM10251701,2017-10-25,CCFRP,Anacapa Island,marine protected area,US
3,AIM10291801,2018-10-29,CCFRP,Anacapa Island,marine protected area,US
4,AIM10181802,2018-10-18,CCFRP,Anacapa Island,marine protected area,US


In [15]:
%%time

## Add the bounding box of the grid cell as footprintWKT

bb = ['POLYGON ((' + str(data_long['Lat 3'].iloc[i]) + ' ' + str(data_long['Lon 3'].iloc[i]) + ', ' + \
        str(data_long['Lat 1'].iloc[i]) + ' ' + str(data_long['Lon 1'].iloc[i]) + ', ' + \
        str(data_long['Lat 2'].iloc[i]) + ' ' + str(data_long['Lon 2'].iloc[i]) + ', ' + \
        str(data_long['Lat 4'].iloc[i]) + ' ' + str(data_long['Lon 4'].iloc[i]) + ', ' + \
        str(data_long['Lat 3'].iloc[i]) + ' ' + str(data_long['Lon 3'].iloc[i]) + '))' for i in range(data_long.shape[0])]
converted['footprintWKT'] = bb
converted.head()

Wall time: 18.6 s


Unnamed: 0,eventID,eventDate,datasetID,locality,locationRemarks,countryCode,footprintWKT
0,AIM09181901,2019-09-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689..."
1,AIM09191901,2019-09-19,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689..."
2,AIM10251701,2017-10-25,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689..."
3,AIM10291801,2018-10-29,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689..."
4,AIM10181802,2018-10-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0189 -119.369, 34.022 -119.3757, ..."


In [16]:
## Add decimal latitude and decimal longitude

converted['decimalLatitude'] = data_long['Lat Center Point']
converted['decimalLongitude'] = data_long['Lon Center Point']
converted.head()

Unnamed: 0,eventID,eventDate,datasetID,locality,locationRemarks,countryCode,footprintWKT,decimalLatitude,decimalLongitude
0,AIM09181901,2019-09-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668
1,AIM09191901,2019-09-19,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668
2,AIM10251701,2017-10-25,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668
3,AIM10291801,2018-10-29,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668
4,AIM10181802,2018-10-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0189 -119.369, 34.022 -119.3757, ...",34.0204,-119.3723


In [17]:
## Add coordinateUncertaintyInMeters

converted['coordinateUncertaintyInMeters'] = 354 
converted.head()

Unnamed: 0,eventID,eventDate,datasetID,locality,locationRemarks,countryCode,footprintWKT,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters
0,AIM09181901,2019-09-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354
1,AIM09191901,2019-09-19,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354
2,AIM10251701,2017-10-25,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354
3,AIM10291801,2018-10-29,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354
4,AIM10181802,2018-10-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0189 -119.369, 34.022 -119.3757, ...",34.0204,-119.3723,354


In [18]:
## Add samplingEffort

converted['samplingEffort'] = '30-45 minutes of timed fishing activity conducted along 1-3 drifts by between 4 and 18 volunteer anglers (mode of 12)'
converted.head()

Unnamed: 0,eventID,eventDate,datasetID,locality,locationRemarks,countryCode,footprintWKT,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters,samplingEffort
0,AIM09181901,2019-09-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...
1,AIM09191901,2019-09-19,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...
2,AIM10251701,2017-10-25,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...
3,AIM10291801,2018-10-29,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...
4,AIM10181802,2018-10-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0189 -119.369, 34.022 -119.3757, ...",34.0204,-119.3723,354,30-45 minutes of timed fishing activity conduc...


In [19]:
## Add occurrenceID

converted['occurrenceID'] = converted.groupby('eventID').cumcount()+1
converted['occurrenceID'] = converted['eventID'] + '_occ' + converted['occurrenceID'].astype(str)
converted.head()

Unnamed: 0,eventID,eventDate,datasetID,locality,locationRemarks,countryCode,footprintWKT,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters,samplingEffort,occurrenceID
0,AIM09181901,2019-09-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM09181901_occ1
1,AIM09191901,2019-09-19,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM09191901_occ1
2,AIM10251701,2017-10-25,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM10251701_occ1
3,AIM10291801,2018-10-29,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM10291801_occ1
4,AIM10181802,2018-10-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0189 -119.369, 34.022 -119.3757, ...",34.0204,-119.3723,354,30-45 minutes of timed fishing activity conduc...,AIM10181802_occ1


### Access WoRMS to add species information

In [20]:
## Get unique scientific names, remove nan's

sci_names = data_long['scientific_names'].dropna().unique()

In [21]:
## Replace NaN values in scientific_names with Teleostei

data_long[data_long['scientific_names'].isnull() == True] = 'Teleostei'

In [22]:
## Chromus punctipinnis is misspelled in the data as Chromus punctipinnus. Correcting:

sci_names[sci_names == ['Chromis punctipinnus']] = 'Chromis punctipinnis'

In [23]:
%%time

## Call run_get_worms_from_scientific_name

name_id_dict, name_name_dict, name_taxid_dict = WoRMS.run_get_worms_from_scientific_name(sci_names, verbose_flag=True)

Wall time: 57.4 s


In [24]:
## Add scientific name-related columns

converted['scientificName'] = data_long['scientific_names']

converted['scientificNameID'] = data_long['scientific_names']
converted['scientificNameID'].replace(name_id_dict, inplace=True)

converted['taxonID'] = data_long['scientific_names']
converted['taxonID'].replace(name_taxid_dict, inplace=True)
converted.head()

Unnamed: 0,eventID,eventDate,datasetID,locality,locationRemarks,countryCode,footprintWKT,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters,samplingEffort,occurrenceID,scientificName,scientificNameID,taxonID
0,AIM09181901,2019-09-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM09181901_occ1,Paralabrax nebulifer,urn:lsid:marinespecies.org:taxname:282059,282059
1,AIM09191901,2019-09-19,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM09191901_occ1,Paralabrax nebulifer,urn:lsid:marinespecies.org:taxname:282059,282059
2,AIM10251701,2017-10-25,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM10251701_occ1,Paralabrax nebulifer,urn:lsid:marinespecies.org:taxname:282059,282059
3,AIM10291801,2018-10-29,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM10291801_occ1,Paralabrax nebulifer,urn:lsid:marinespecies.org:taxname:282059,282059
4,AIM10181802,2018-10-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0189 -119.369, 34.022 -119.3757, ...",34.0204,-119.3723,354,30-45 minutes of timed fishing activity conduc...,AIM10181802_occ1,Paralabrax nebulifer,urn:lsid:marinespecies.org:taxname:282059,282059


In [25]:
## Create identificationQualifier to handle Sebastes serranoides or flavidus species name

identificationQualifier = ['Sebastes serranoides or Sebastes flavidus' if name == 'Sebastes serranoides or flavidus' else np.nan for name in converted['scientificName']]

In [26]:
## Replace scientificName using name_name_dict

converted['scientificName'].replace(name_name_dict, inplace=True)

In [27]:
## Add final name-related columns

converted['nameAccordingTo'] = 'WoRMS'
converted['identificationQualifier'] = identificationQualifier
converted['occurrenceStatus'] = 'present'
converted['basisOfRecord'] = 'HumanObservation'

converted.head()

Unnamed: 0,eventID,eventDate,datasetID,locality,locationRemarks,countryCode,footprintWKT,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters,samplingEffort,occurrenceID,scientificName,scientificNameID,taxonID,nameAccordingTo,identificationQualifier,occurrenceStatus,basisOfRecord
0,AIM09181901,2019-09-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM09181901_occ1,Paralabrax nebulifer,urn:lsid:marinespecies.org:taxname:282059,282059,WoRMS,,present,HumanObservation
1,AIM09191901,2019-09-19,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM09191901_occ1,Paralabrax nebulifer,urn:lsid:marinespecies.org:taxname:282059,282059,WoRMS,,present,HumanObservation
2,AIM10251701,2017-10-25,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM10251701_occ1,Paralabrax nebulifer,urn:lsid:marinespecies.org:taxname:282059,282059,WoRMS,,present,HumanObservation
3,AIM10291801,2018-10-29,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM10291801_occ1,Paralabrax nebulifer,urn:lsid:marinespecies.org:taxname:282059,282059,WoRMS,,present,HumanObservation
4,AIM10181802,2018-10-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0189 -119.369, 34.022 -119.3757, ...",34.0204,-119.3723,354,30-45 minutes of timed fishing activity conduc...,AIM10181802_occ1,Paralabrax nebulifer,urn:lsid:marinespecies.org:taxname:282059,282059,WoRMS,,present,HumanObservation


### Add count data

In [28]:
## Add count data

converted['individualCount'] = data_long['count']
converted.head()

Unnamed: 0,eventID,eventDate,datasetID,locality,locationRemarks,countryCode,footprintWKT,decimalLatitude,decimalLongitude,coordinateUncertaintyInMeters,samplingEffort,occurrenceID,scientificName,scientificNameID,taxonID,nameAccordingTo,identificationQualifier,occurrenceStatus,basisOfRecord,individualCount
0,AIM09181901,2019-09-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM09181901_occ1,Paralabrax nebulifer,urn:lsid:marinespecies.org:taxname:282059,282059,WoRMS,,present,HumanObservation,0
1,AIM09191901,2019-09-19,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM09191901_occ1,Paralabrax nebulifer,urn:lsid:marinespecies.org:taxname:282059,282059,WoRMS,,present,HumanObservation,0
2,AIM10251701,2017-10-25,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM10251701_occ1,Paralabrax nebulifer,urn:lsid:marinespecies.org:taxname:282059,282059,WoRMS,,present,HumanObservation,0
3,AIM10291801,2018-10-29,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0242 -119.3647, 34.0189 -119.3689...",34.0215,-119.3668,354,30-45 minutes of timed fishing activity conduc...,AIM10291801_occ1,Paralabrax nebulifer,urn:lsid:marinespecies.org:taxname:282059,282059,WoRMS,,present,HumanObservation,0
4,AIM10181802,2018-10-18,CCFRP,Anacapa Island,marine protected area,US,"POLYGON ((34.0189 -119.369, 34.022 -119.3757, ...",34.0204,-119.3723,354,30-45 minutes of timed fishing activity conduc...,AIM10181802_occ1,Paralabrax nebulifer,urn:lsid:marinespecies.org:taxname:282059,282059,WoRMS,,present,HumanObservation,0


In [29]:
## Update occurrenceStatus based on count

converted.loc[converted['individualCount'] == 0, ['occurrenceStatus']] = 'absent'

## Save occurrence file

In [30]:
## Save

converted.to_csv('CCFRP_grid-level_occurrence.csv', index=False, na_rep='NaN')

## Create MeasurementOrFact file

### Load CPUE data

In [31]:
## Perform initial processing steps and convert to long-form

# Drop species 'Total'
cpue.drop('Total', axis=1, inplace=True)

## Melt data
cpue_long = pd.melt(cpue, id_vars=data.columns[0:17].tolist(), var_name='species_common_name', value_name='cpue')
cpue_long.head()

Unnamed: 0,ID.Cell.per.Trip,Date,Area,Site,Year,Total.Angler.Hours,Grid.Cell.ID,Lat Center Point,Lon Center Point,Lat 1,Lon 1,Lat 2,Lon 2,Lat 3,Lon 3,Lat 4,Lon 4,species_common_name,cpue
0,AIM09181901,9/18/2019,Anacapa Island,MPA,2019,5.75,AI01,34.0215,-119.3668,34.0189,-119.3689,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,Barred Sand Bass,0.0
1,AIM09191901,9/19/2019,Anacapa Island,MPA,2019,6.033333,AI01,34.0215,-119.3668,34.0189,-119.3689,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,Barred Sand Bass,0.0
2,AIM10251701,10/25/2017,Anacapa Island,MPA,2017,7.329722,AI01,34.0215,-119.3668,34.0189,-119.3689,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,Barred Sand Bass,0.0
3,AIM10291801,10/29/2018,Anacapa Island,MPA,2018,4.416667,AI01,34.0215,-119.3668,34.0189,-119.3689,34.0233,-119.37,34.0242,-119.3647,34.0197,-119.3636,Barred Sand Bass,0.0
4,AIM10181802,10/18/2018,Anacapa Island,MPA,2018,4.916667,AI02,34.0204,-119.3723,34.022,-119.3757,34.0232,-119.3705,34.0189,-119.369,34.0176,-119.3742,Barred Sand Bass,0.0


### Get total angler hours

In [99]:
test = pd.DataFrame(converted['eventID'])
out = test.merge(data[['ID.Cell.per.Trip', 'Total.Angler.Hours']], how='left', left_on='eventID', right_on='ID.Cell.per.Trip')
out

Unnamed: 0,eventID,ID.Cell.per.Trip,Total.Angler.Hours
0,AIM09181901,AIM09181901,5.750000
1,AIM09191901,AIM09191901,6.033333
2,AIM10251701,AIM10251701,7.329722
3,AIM10291801,AIM10291801,4.416667
4,AIM10181802,AIM10181802,4.916667
...,...,...,...
193585,TMM07081935,TMM07081935,3.000000
193586,TMM07301735,TMM07301735,3.000000
193587,TMM08061835,TMM08061835,3.200000
193588,TMM08121935,TMM08121935,2.900000


In [103]:
len(test['eventID'].unique())

2148

In [105]:
len(data['ID.Cell.per.Trip'].unique())

2148

In [109]:
len(data['Total.Angler.Hours'])

2149

In [145]:
data[data['ID.Cell.per.Trip'] == 'BLM082012MM']

Unnamed: 0,ID.Cell.per.Trip,Date,Area,Site,Year,Total.Angler.Hours,Grid.Cell.ID,Lat Center Point,Lon Center Point,Lat 1,...,Unknown rockfish,Vermilion Rockfish,White Croaker,White Seabass,Widow Rockfish,Wolf Eel,Yelloweye Rockfish,Yellowfin Croaker,Yellowtail Jack,Yellowtail Rockfish
551,BLM082012MM,8/20/2012,Piedras Blancas,MPA,2012,5.0,BL07,35.6936,-121.3309,35.6959,...,0,1,0,0,0,0,0,0,0,0
634,BLM082012MM,8/20/2012,Piedras Blancas,MPA,2012,5.0,BL19,35.6754,-121.3191,35.6776,...,0,0,0,0,0,0,0,0,0,1


In [156]:
pd.set_option('display.max_rows', 350)
data[(data['Area'] == 'Piedras Blancas') & (data['Date'] == '8/20/2012')]

Unnamed: 0,ID.Cell.per.Trip,Date,Area,Site,Year,Total.Angler.Hours,Grid.Cell.ID,Lat Center Point,Lon Center Point,Lat 1,...,Unknown rockfish,Vermilion Rockfish,White Croaker,White Seabass,Widow Rockfish,Wolf Eel,Yelloweye Rockfish,Yellowfin Croaker,Yellowtail Jack,Yellowtail Rockfish
507,BLM08201201,8/20/2012,Piedras Blancas,MPA,2012,4.902778,BL01,35.7079,-121.3323,35.7101,...,0,1,0,0,0,0,0,0,0,10
550,BLM08201207,8/20/2012,Piedras Blancas,MPA,2012,4.991667,BL07,35.6936,-121.3309,35.6959,...,0,3,0,0,0,0,0,0,0,1
551,BLM082012MM,8/20/2012,Piedras Blancas,MPA,2012,5.0,BL07,35.6936,-121.3309,35.6959,...,0,1,0,0,0,0,0,0,0,0
557,BLM08201208,8/20/2012,Piedras Blancas,MPA,2012,7.677778,BL08,35.6936,-121.3246,35.6958,...,0,4,0,0,0,0,0,0,0,1
633,BLM08201219,8/20/2012,Piedras Blancas,MPA,2012,5.0,BL19,35.6754,-121.3191,35.6776,...,0,2,0,0,0,0,0,0,0,0
634,BLM082012MM,8/20/2012,Piedras Blancas,MPA,2012,5.0,BL19,35.6754,-121.3191,35.6776,...,0,0,0,0,0,0,0,0,0,1


In [117]:
len(data_long['scientific_names'].unique())

88

In [140]:
90*2149

193410

In [162]:
test = converted[converted['eventID'] == 'AIM09181901']

In [177]:
data_long[data_long.duplicated()]

Unnamed: 0,ID.Cell.per.Trip,Date,Area,Site,Year,Total.Angler.Hours,Grid.Cell.ID,Lat Center Point,Lon Center Point,Lat 1,Lon 1,Lat 2,Lon 2,Lat 3,Lon 3,Lat 4,Lon 4,count,scientific_names
167622,AIM09181901,9/18/2019,Anacapa Island,MPA,2019,5.75,AI01,34.0215,-119.367,34.0189,-119.369,34.0233,-119.37,34.0242,-119.365,34.0197,-119.364,0,Sebastes mystinus
167623,AIM09191901,9/19/2019,Anacapa Island,MPA,2019,6.03333,AI01,34.0215,-119.367,34.0189,-119.369,34.0233,-119.37,34.0242,-119.365,34.0197,-119.364,0,Sebastes mystinus
167624,AIM10251701,10/25/2017,Anacapa Island,MPA,2017,7.32972,AI01,34.0215,-119.367,34.0189,-119.369,34.0233,-119.37,34.0242,-119.365,34.0197,-119.364,0,Sebastes mystinus
167625,AIM10291801,10/29/2018,Anacapa Island,MPA,2018,4.41667,AI01,34.0215,-119.367,34.0189,-119.369,34.0233,-119.37,34.0242,-119.365,34.0197,-119.364,0,Sebastes mystinus
167626,AIM10181802,10/18/2018,Anacapa Island,MPA,2018,4.91667,AI02,34.0204,-119.372,34.022,-119.376,34.0232,-119.371,34.0189,-119.369,34.0176,-119.374,0,Sebastes mystinus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174064,TMM07081935,7/8/2019,Ten Mile,MPA,2019,3,TM35,39.5765,-123.784,39.5788,-123.787,39.5788,-123.781,39.5743,-123.781,39.5742,-123.787,0,Sebastes
174065,TMM07301735,7/30/2017,Ten Mile,MPA,2017,3,TM35,39.5765,-123.784,39.5788,-123.787,39.5788,-123.781,39.5743,-123.781,39.5742,-123.787,0,Sebastes
174066,TMM08061835,8/6/2018,Ten Mile,MPA,2018,3.2,TM35,39.5765,-123.784,39.5788,-123.787,39.5788,-123.781,39.5743,-123.781,39.5742,-123.787,0,Sebastes
174067,TMM08121935,8/12/2019,Ten Mile,MPA,2019,2.9,TM35,39.5765,-123.784,39.5788,-123.787,39.5788,-123.781,39.5743,-123.781,39.5742,-123.787,0,Sebastes


In [188]:
test = data_long

In [189]:
out = test.groupby(['ID.Cell.per.Trip', 'Date', 'Area', 'Site', 'Year',
       'Total.Angler.Hours', 'Grid.Cell.ID', 'Lat Center Point',
       'Lon Center Point', 'Lat 1', 'Lon 1', 'Lat 2', 'Lon 2', 'Lat 3',
       'Lon 3', 'Lat 4', 'Lon 4', 'scientific_names']).sum()
out.reset_index(inplace=True)
out.shape

(189112, 19)

In [190]:
88*2149

189112

### Assemble MeasurementOrFact file

In [63]:
## Add occurrenceID

mof = converted['occurrenceID']
mof = pd.concat([mof, converted['occurrenceID'], converted['occurrenceID']], ignore_index=True)
mof = pd.DataFrame({'occurrenceID':mof})
mof.head()

Unnamed: 0,occurrenceID
0,AIM09181901_occ1
1,AIM09191901_occ1
2,AIM10251701_occ1
3,AIM10291801_occ1
4,AIM10181802_occ1


In [77]:
## Add measurementType

measurementType = ['individual count']*converted.shape[0]
measurementType.extend(['effort']*converted.shape[0])
measurementType.extend(['catch-per-unit-effort']*converted.shape[0])

mof['measurementType'] = measurementType
mof.head()

Unnamed: 0,occurrenceID,measurementType
0,AIM09181901_occ1,individual count
1,AIM09191901_occ1,individual count
2,AIM10251701_occ1,individual count
3,AIM10291801_occ1,individual count
4,AIM10181802_occ1,individual count


In [82]:
## Add measurementValue

measurementValue = pd.concat([converted['individualCount'], data['Total.Angler.Hours'], cpue_long['cpue']])
measurementValue

0         0
1         0
2         0
3         0
4         0
         ..
193405    0
193406    0
193407    0
193408    0
193409    0
Length: 388969, dtype: object

In [84]:
mof.shape

(580230, 2)

In [86]:
data['Total.Angler.Hours']

0       5.750000
1       6.033333
2       7.329722
3       4.416667
4       4.916667
          ...   
2144    3.000000
2145    3.000000
2146    3.200000
2147    2.900000
2148    3.000000
Name: Total.Angler.Hours, Length: 2149, dtype: float64

In [87]:
data

Unnamed: 0,ID.Cell.per.Trip,Date,Area,Site,Year,Total.Angler.Hours,Grid.Cell.ID,Lat Center Point,Lon Center Point,Lat 1,...,Unknown rockfish,Vermilion Rockfish,White Croaker,White Seabass,Widow Rockfish,Wolf Eel,Yelloweye Rockfish,Yellowfin Croaker,Yellowtail Jack,Yellowtail Rockfish
0,AIM09181901,9/18/2019,Anacapa Island,MPA,2019,5.750000,AI01,34.0215,-119.3668,34.0189,...,0,0,0,0,0,0,0,0,0,0
1,AIM09191901,9/19/2019,Anacapa Island,MPA,2019,6.033333,AI01,34.0215,-119.3668,34.0189,...,0,0,0,0,0,0,0,0,0,0
2,AIM10251701,10/25/2017,Anacapa Island,MPA,2017,7.329722,AI01,34.0215,-119.3668,34.0189,...,0,0,0,0,0,0,0,0,0,0
3,AIM10291801,10/29/2018,Anacapa Island,MPA,2018,4.416667,AI01,34.0215,-119.3668,34.0189,...,0,0,0,0,0,0,0,0,0,0
4,AIM10181802,10/18/2018,Anacapa Island,MPA,2018,4.916667,AI02,34.0204,-119.3723,34.0220,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,TMM07081935,7/8/2019,Ten Mile,MPA,2019,3.000000,TM35,39.5765,-123.7842,39.5788,...,0,0,0,0,0,0,0,0,0,0
2145,TMM07301735,7/30/2017,Ten Mile,MPA,2017,3.000000,TM35,39.5765,-123.7842,39.5788,...,0,0,0,0,0,0,0,0,0,0
2146,TMM08061835,8/6/2018,Ten Mile,MPA,2018,3.200000,TM35,39.5765,-123.7842,39.5788,...,0,0,0,0,0,0,0,0,0,0
2147,TMM08121935,8/12/2019,Ten Mile,MPA,2019,2.900000,TM35,39.5765,-123.7842,39.5788,...,0,1,0,0,0,0,0,0,0,0
