# Processing zooplankton data to DWC compliant files

In [1]:
#get coding environment ready
#bring in pandas for manipulating columns, put no limit on the amount of columns in display
import pandas as pd
pd.options.display.max_columns = None

#bring in numpy to manipulate numbers
import numpy as np

#I need random numbers
import random

#I need to manipulate dates
import datetime

# Let start with data: read in existing csv

## take a look at the file

In [2]:
#Make a dataframe, parse dates
raw = pd.read_csv('https://raw.githubusercontent.com/ioos/bio_data_guide/main/datasets/AMBON_zooplankton/data/raw/AMBON2017150.csv', parse_dates=['Date_Time'])

raw.head()



Unnamed: 0,Cruise,Station,Type,Date_Time,Longitude_[decimal_degrees_east],Latitude_[decimal _degrees_north],Bottom_Depth_[m],Cast_Number,Depth_[m],Accepted_Organism_Identification,Life_Stage,APHIA_ID,Abundance_[#/m3],Biomass_[mg dw/m3]
0,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Pisces,larvae,11676.0,1.172,
1,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Isopoda,,1131.0,1.172,
2,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Evadne nordmanni,,106273.0,1.172,0.0086
3,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Pseudocalanus newmani,,157679.0,4.686,0.0121
4,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Pseudocalanus acuspes,,104514.0,4.686,0.0335


In [3]:
raw.tail()

Unnamed: 0,Cruise,Station,Type,Date_Time,Longitude_[decimal_degrees_east],Latitude_[decimal _degrees_north],Bottom_Depth_[m],Cast_Number,Depth_[m],Accepted_Organism_Identification,Life_Stage,APHIA_ID,Abundance_[#/m3],Biomass_[mg dw/m3]
2931,AMBON2017,DBO3.6,TWINRING_150UM_MICROSCOPY,2017-08-22 03:47:00,-168.2368,67.897,59,,56,Parasagitta elegans,,105440.0,0.998,2.0528
2932,AMBON2017,DBO3.6,TWINRING_150UM_MICROSCOPY,2017-08-22 03:47:00,-168.2368,67.897,59,,56,Calanus glacialis/marshallae,,196770.0,10.112,2.8979
2933,AMBON2017,DBO3.6,TWINRING_150UM_MICROSCOPY,2017-08-22 03:47:00,-168.2368,67.897,59,,56,Neocalanus plumchrus,,196772.0,7.983,5.0574
2934,AMBON2017,DBO3.6,TWINRING_150UM_MICROSCOPY,2017-08-07 01:39:00,-168.2368,67.897,59,,56,Decapoda,megalopa,1130.0,2.195,5.0972
2935,AMBON2017,DBO3.6,TWINRING_150UM_MICROSCOPY,2017-08-07 01:39:00,-168.2368,67.897,59,,56,Neocalanus cristatus,,104470.0,1.064,5.0994


In [4]:
raw.describe()
#so it's count of 2936


Unnamed: 0,Longitude_[decimal_degrees_east],Latitude_[decimal _degrees_north],Bottom_Depth_[m],Cast_Number,Depth_[m],APHIA_ID,Abundance_[#/m3],Biomass_[mg dw/m3]
count,2936.0,2936.0,2936.0,0.0,2936.0,2934.0,2936.0,2909.0
mean,-164.283995,70.388515,42.691757,,39.691757,93961.67,223.325846,0.745213
std,2.515989,1.17715,8.678432,,8.678432,78012.04,1478.179158,4.116812
min,-168.9541,67.6664,15.0,,12.0,101.0,0.025,1e-05
25%,-166.4723,69.8214,41.0,,38.0,1268.0,0.42075,0.0068
50%,-163.9609,70.648,45.0,,42.0,104514.0,3.985,0.0462
75%,-162.2508,71.232,48.0,,45.0,117849.0,31.89525,0.2763
max,-159.3972,72.4964,59.0,,56.0,1434803.0,37568.289,99.1136


In [5]:
#let's check for non identified observations, just in case
raw["APHIA_ID"].isnull().sum()

2

In [6]:
raw.dropna(subset=['APHIA_ID'], inplace=True)

In [7]:
raw.tail()

Unnamed: 0,Cruise,Station,Type,Date_Time,Longitude_[decimal_degrees_east],Latitude_[decimal _degrees_north],Bottom_Depth_[m],Cast_Number,Depth_[m],Accepted_Organism_Identification,Life_Stage,APHIA_ID,Abundance_[#/m3],Biomass_[mg dw/m3]
2931,AMBON2017,DBO3.6,TWINRING_150UM_MICROSCOPY,2017-08-22 03:47:00,-168.2368,67.897,59,,56,Parasagitta elegans,,105440.0,0.998,2.0528
2932,AMBON2017,DBO3.6,TWINRING_150UM_MICROSCOPY,2017-08-22 03:47:00,-168.2368,67.897,59,,56,Calanus glacialis/marshallae,,196770.0,10.112,2.8979
2933,AMBON2017,DBO3.6,TWINRING_150UM_MICROSCOPY,2017-08-22 03:47:00,-168.2368,67.897,59,,56,Neocalanus plumchrus,,196772.0,7.983,5.0574
2934,AMBON2017,DBO3.6,TWINRING_150UM_MICROSCOPY,2017-08-07 01:39:00,-168.2368,67.897,59,,56,Decapoda,megalopa,1130.0,2.195,5.0972
2935,AMBON2017,DBO3.6,TWINRING_150UM_MICROSCOPY,2017-08-07 01:39:00,-168.2368,67.897,59,,56,Neocalanus cristatus,,104470.0,1.064,5.0994


In [8]:
# checked NaNs removed
raw["APHIA_ID"].isnull().sum()

0

In [9]:
#Ok, doublechecking that it was an ok thing
raw.describe()
#count of 2934, as expected
#raw.head()

Unnamed: 0,Longitude_[decimal_degrees_east],Latitude_[decimal _degrees_north],Bottom_Depth_[m],Cast_Number,Depth_[m],APHIA_ID,Abundance_[#/m3],Biomass_[mg dw/m3]
count,2934.0,2934.0,2934.0,0.0,2934.0,2934.0,2934.0,2907.0
mean,-164.281753,70.389463,42.68439,,39.68439,93961.67,223.476458,0.711606
std,2.51532,1.17664,8.675609,,8.675609,78012.04,1478.671789,3.691712
min,-168.9541,67.6664,15.0,,12.0,101.0,0.025,1e-05
25%,-166.4723,69.8214,41.0,,38.0,1268.0,0.42025,0.0068
50%,-163.9609,70.648,45.0,,42.0,104514.0,3.985,0.0462
75%,-162.2508,71.232,48.0,,45.0,117849.0,32.00375,0.2761
max,-159.3972,72.4964,59.0,,56.0,1434803.0,37568.289,82.354


## list existing column headers, match to DWC terms

In [10]:

# what are the existing column headers?
raw.columns

Index(['Cruise', 'Station', 'Type', 'Date_Time',
       'Longitude_[decimal_degrees_east]', 'Latitude_[decimal _degrees_north]',
       'Bottom_Depth_[m]', 'Cast_Number', 'Depth_[m]',
       'Accepted_Organism_Identification', 'Life_Stage', 'APHIA_ID',
       'Abundance_[#/m3]', 'Biomass_[mg dw/m3]'],
      dtype='object')

In [11]:
# dataframe processing
# make a copy
df = raw.copy()
df.head()


Unnamed: 0,Cruise,Station,Type,Date_Time,Longitude_[decimal_degrees_east],Latitude_[decimal _degrees_north],Bottom_Depth_[m],Cast_Number,Depth_[m],Accepted_Organism_Identification,Life_Stage,APHIA_ID,Abundance_[#/m3],Biomass_[mg dw/m3]
0,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Pisces,larvae,11676.0,1.172,
1,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Isopoda,,1131.0,1.172,
2,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Evadne nordmanni,,106273.0,1.172,0.0086
3,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Pseudocalanus newmani,,157679.0,4.686,0.0121
4,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Pseudocalanus acuspes,,104514.0,4.686,0.0335


In [12]:
#what I need is class(df[Station]) and levels. In Pandas that's...
#df['Station'].astype('category')
#hmm there are 78, that's, not going to work
df['APHIA_ID'].astype('category')
#hmmm 78

#OK, also, unique()
#df.describe(include='all', datetime_is_numeric=True)

  for val, m in zip(values.ravel(), mask.ravel())


0        11676.0
1         1131.0
2       106273.0
3       157679.0
4       104514.0
          ...   
2931    105440.0
2932    196770.0
2933    196772.0
2934      1130.0
2935    104470.0
Name: APHIA_ID, Length: 2934, dtype: category
Categories (99, float64): [101.0, 105.0, 883.0, 1082.0, ..., 346398.0, 353708.0, 355067.0, 1434803.0]

## Enough looking, let's get the two files as one

Zooplankton has two file system going on.
because of net size.
Let's rowbind the 150 and the 505 datasets together. luckily, their columns already look alike.

In [13]:
#Rename df to it's netsize

df150 = df

df150.head()

#read in the 505 data

#%ls ../Zooplankton_505

raw2 = pd.read_csv('https://raw.githubusercontent.com/ioos/bio_data_guide/main/datasets/AMBON_zooplankton/data/raw/AMBON2017505.csv', parse_dates=['Date_Time'])

raw2.tail()

Unnamed: 0,Cruise,Station,Type,Date_Time,Longitude_[decimal_degrees_east],Latitude_[decimal _degrees_north],Bottom_Depth_[m],Cast_Number,Depth_[m],Accepted_Organism_Identification,Life_Stage,APHIA_ID,Abundance_[#/m3],Biomass_[mg dw/m3]
1788,AMBON2017,DBO3.8,BONGO_505UM_MICROSCOPY,2017-08-06 19:33:00,-168.9554,67.6645,48,1,45,Beroe,,106331,0.047,1.0295
1789,AMBON2017,DBO3.8,BONGO_505UM_MICROSCOPY,2017-08-06 19:33:00,-168.9554,67.6645,48,1,45,Parasagitta elegans,,105440,5.985,2.8788
1790,AMBON2017,DBO3.8,BONGO_505UM_MICROSCOPY,2017-08-06 19:33:00,-168.9554,67.6645,48,1,45,Euphausiacea,juvenile,1128,10.259,5.4358
1791,AMBON2017,DBO3.8,BONGO_505UM_MICROSCOPY,2017-08-06 19:33:00,-168.9554,67.6645,48,1,45,Bolinopsis infundibulum,,106939,0.404,5.4557
1792,AMBON2017,DBO3.8,BONGO_505UM_MICROSCOPY,2017-08-06 19:33:00,-168.9554,67.6645,48,1,45,Pisces,larvae,11676,0.024,


In [14]:
#check for NaN
raw2["APHIA_ID"].isnull().sum()
#hooray, 0
#raw2[1200:1206]

0

In [15]:
raw2.describe()
#1793 observations

Unnamed: 0,Longitude_[decimal_degrees_east],Latitude_[decimal _degrees_north],Bottom_Depth_[m],Cast_Number,Depth_[m],APHIA_ID,Abundance_[#/m3],Biomass_[mg dw/m3]
count,1793.0,1793.0,1793.0,1793.0,1793.0,1793.0,1793.0,1748.0
mean,-159.825056,70.587651,44.101506,40.776352,41.101506,104628.774679,3.79943,1.341224
std,37.27822,1.212839,7.305878,24.323931,7.305878,70729.662967,30.159061,7.11591
min,-168.9554,67.6645,23.0,1.0,20.0,883.0,0.005,0.0
25%,-166.0658,70.0066,41.0,21.0,38.0,103256.0,0.028,0.0101
50%,-163.7979,70.9344,45.0,38.0,42.0,106673.0,0.102,0.0608
75%,-162.1516,71.449,48.0,62.0,45.0,117755.0,0.523,0.386325
max,159.4106,72.4969,59.0,85.0,56.0,355067.0,804.396,210.0


In [16]:
#rowbind these two csvs together
#first make the csvs raw copies
df505 = raw2.copy()

df505.head()



Unnamed: 0,Cruise,Station,Type,Date_Time,Longitude_[decimal_degrees_east],Latitude_[decimal _degrees_north],Bottom_Depth_[m],Cast_Number,Depth_[m],Accepted_Organism_Identification,Life_Stage,APHIA_ID,Abundance_[#/m3],Biomass_[mg dw/m3]
0,AMBON2017,ML1.2,BONGO_505UM_MICROSCOPY,2017-08-17 18:22:00,-163.0342,70.1434,23,56,20,Acartia longiremis,,104257,0.057,0.0002
1,AMBON2017,ML1.2,BONGO_505UM_MICROSCOPY,2017-08-17 18:22:00,-163.0342,70.1434,23,56,20,Eurytemora pacifica,,232028,0.057,0.0007
2,AMBON2017,ML1.2,BONGO_505UM_MICROSCOPY,2017-08-17 18:22:00,-163.0342,70.1434,23,56,20,Hyperia,,101796,0.014,0.0012
3,AMBON2017,ML1.2,BONGO_505UM_MICROSCOPY,2017-08-17 18:22:00,-163.0342,70.1434,23,56,20,Limacina helicina,,140223,0.014,0.0013
4,AMBON2017,ML1.2,BONGO_505UM_MICROSCOPY,2017-08-17 18:22:00,-163.0342,70.1434,23,56,20,Pseudocalanus acuspes,,104514,0.114,0.0015


In [17]:
df505.describe()

Unnamed: 0,Longitude_[decimal_degrees_east],Latitude_[decimal _degrees_north],Bottom_Depth_[m],Cast_Number,Depth_[m],APHIA_ID,Abundance_[#/m3],Biomass_[mg dw/m3]
count,1793.0,1793.0,1793.0,1793.0,1793.0,1793.0,1793.0,1748.0
mean,-159.825056,70.587651,44.101506,40.776352,41.101506,104628.774679,3.79943,1.341224
std,37.27822,1.212839,7.305878,24.323931,7.305878,70729.662967,30.159061,7.11591
min,-168.9554,67.6645,23.0,1.0,20.0,883.0,0.005,0.0
25%,-166.0658,70.0066,41.0,21.0,38.0,103256.0,0.028,0.0101
50%,-163.7979,70.9344,45.0,38.0,42.0,106673.0,0.102,0.0608
75%,-162.1516,71.449,48.0,62.0,45.0,117755.0,0.523,0.386325
max,159.4106,72.4969,59.0,85.0,56.0,355067.0,804.396,210.0


In [18]:
#so now with have df150 and df505, let's concat

df = pd.concat([df150, df505])
df.describe()
#4727 count total
#df[2930:2940]
df['Life_Stage'].unique()

array(['larvae', ' ', 'male', 'nauplii', 'cyprid', 'bipinnaria',
       'cyphonautes', 'calytopsis', 'zoea', 'furcilia', 'juvenile',
       'megalopa', 'eudoxid', '0'], dtype=object)

In [19]:
# Yay for one big dataframe

df.reset_index(drop=True, inplace=True)
df.tail()

Unnamed: 0,Cruise,Station,Type,Date_Time,Longitude_[decimal_degrees_east],Latitude_[decimal _degrees_north],Bottom_Depth_[m],Cast_Number,Depth_[m],Accepted_Organism_Identification,Life_Stage,APHIA_ID,Abundance_[#/m3],Biomass_[mg dw/m3]
4722,AMBON2017,DBO3.8,BONGO_505UM_MICROSCOPY,2017-08-06 19:33:00,-168.9554,67.6645,48,1.0,45,Beroe,,106331.0,0.047,1.0295
4723,AMBON2017,DBO3.8,BONGO_505UM_MICROSCOPY,2017-08-06 19:33:00,-168.9554,67.6645,48,1.0,45,Parasagitta elegans,,105440.0,5.985,2.8788
4724,AMBON2017,DBO3.8,BONGO_505UM_MICROSCOPY,2017-08-06 19:33:00,-168.9554,67.6645,48,1.0,45,Euphausiacea,juvenile,1128.0,10.259,5.4358
4725,AMBON2017,DBO3.8,BONGO_505UM_MICROSCOPY,2017-08-06 19:33:00,-168.9554,67.6645,48,1.0,45,Bolinopsis infundibulum,,106939.0,0.404,5.4557
4726,AMBON2017,DBO3.8,BONGO_505UM_MICROSCOPY,2017-08-06 19:33:00,-168.9554,67.6645,48,1.0,45,Pisces,larvae,11676.0,0.024,


## EventDate as ISO format

In [20]:
#let's get the eventDate looking ISO_perfect
#df.head()

list = df['Date_Time']
list

0      2017-08-20 22:48:00
1      2017-08-20 22:48:00
2      2017-08-20 22:48:00
3      2017-08-20 22:48:00
4      2017-08-20 22:48:00
               ...        
4722   2017-08-06 19:33:00
4723   2017-08-06 19:33:00
4724   2017-08-06 19:33:00
4725   2017-08-06 19:33:00
4726   2017-08-06 19:33:00
Name: Date_Time, Length: 4727, dtype: datetime64[ns]

In [21]:
#iso format the thing? from datetime package
test = list[1]
test
#est.isoformat()
test = test.isoformat()
#est
print(test)
test

2017-08-20T22:48:00


'2017-08-20T22:48:00'

In [22]:
#Make a list, build it with iso format, add 'Z' for utc timezone

storage = []
for x in list:
    x=x.isoformat()
    x=x+'Z'
    storage.append(x)

    #I have no idea why I can't write over in place in the list, but dumping the output into another list works for me

In [23]:
storage[1:6]

['2017-08-20T22:48:00Z',
 '2017-08-20T22:48:00Z',
 '2017-08-20T22:48:00Z',
 '2017-08-20T22:48:00Z',
 '2017-08-20T22:48:00Z']

In [24]:
#done
#storage
df['eventDate']=storage
df.head()

Unnamed: 0,Cruise,Station,Type,Date_Time,Longitude_[decimal_degrees_east],Latitude_[decimal _degrees_north],Bottom_Depth_[m],Cast_Number,Depth_[m],Accepted_Organism_Identification,Life_Stage,APHIA_ID,Abundance_[#/m3],Biomass_[mg dw/m3],eventDate
0,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Pisces,larvae,11676.0,1.172,,2017-08-20T22:48:00Z
1,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Isopoda,,1131.0,1.172,,2017-08-20T22:48:00Z
2,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Evadne nordmanni,,106273.0,1.172,0.0086,2017-08-20T22:48:00Z
3,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Pseudocalanus newmani,,157679.0,4.686,0.0121,2017-08-20T22:48:00Z
4,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Pseudocalanus acuspes,,104514.0,4.686,0.0335,2017-08-20T22:48:00Z


---
# eventID and datasetID and OccurenceID for all the tables

So my understanding is.....
One occurenceID for each row as is here, in the untidy, because 2 measurements were taken off a given species from the 'station stop' on the cruise. And then reference that occurenceID in the long tidy measurementorFact table


In [25]:
# added by mmb 2021-10-04
# cast number should be a string not a number, so let's replace NaN with empty and convert number to string as integer.
df['Cast_Number_conv'] = df['Cast_Number'].astype('Int64').astype(str).replace('<NA>','?')


# find where lifeStage is either 'male' or 'female' and return that to another column called 'sex'
df['sex'] = df.loc[ (df['Life_Stage'] == 'male') | (df['Life_Stage'] == 'female'), ['Life_Stage'] ]
df['sex'].replace(np.nan, '?', inplace=True)
#df.loc[ (df['Life_Stage'] == 'male'), ['Life_Stage']] = ''
#df.loc[ (df['Life_Stage'] == 'female'), ['Life_Stage']] = ''
        
# Adjust life stage to not have null values but empty cells
df['Life_Stage_conv'] = df['Life_Stage'].replace(' ','?')

# adjust Accepted_Organism_Identification to separate by underscores, replace / with _ as well
df['Accepted_Organism_Identification_conv'] = df['Accepted_Organism_Identification'].str.replace(' ','_').str.replace('/','_')


df[['Life_Stage_conv','Cast_Number_conv','Accepted_Organism_Identification_conv','sex']]

Unnamed: 0,Life_Stage_conv,Cast_Number_conv,Accepted_Organism_Identification_conv,sex
0,larvae,?,Pisces,?
1,?,?,Isopoda,?
2,?,?,Evadne_nordmanni,?
3,?,?,Pseudocalanus_newmani,?
4,?,?,Pseudocalanus_acuspes,?
...,...,...,...,...
4722,?,1,Beroe,?
4723,?,1,Parasagitta_elegans,?
4724,juvenile,1,Euphausiacea,?
4725,?,1,Bolinopsis_infundibulum,?


In [26]:
df['sex'].unique()

array(['?', 'male'], dtype=object)

In [27]:
#let's do two eggs in one nest with the datasetID and occurenceID stuff
#first, datasetID
df['datasetID'] = 'AMBON_Zooplankton_2017'

#occurenceID -- I'll have to tackle that separately
#df['occurrenceID'] =df['datasetID']+'_'+df['Station']+'_'

# mmb 2021-10-04 
# join string columns together as id
# need to convert biomass to a string to join
df['depth_str'] = df['Depth_[m]'].astype(str)
df['date_str'] = df['Date_Time'].dt.strftime('%Y-%m-%dT%H:%M:%S%Z')
df['biomass_str'] = df['Biomass_[mg dw/m3]'].astype(str)

df['occurrenceID'] = df[['datasetID',
                         'Station',
                         'Cast_Number_conv',
                         'Accepted_Organism_Identification_conv',
                         'Life_Stage_conv',
                         'sex',
                         'Type',
                         #'biomass_str',
                         'depth_str',
                         'date_str',
                        ]].agg('_'.join, axis=1)

#then eventID
df['eventID'] = df['datasetID']+'_'+df['Station']+'_'+df['eventDate']

In [28]:
#How many rows are in this dataframe?
#df.describe()
df['occurrenceID'].tail().tolist() # edited mmb 2021-07-23
#4726

['AMBON_Zooplankton_2017_DBO3.8_1_Beroe_?_?_BONGO_505UM_MICROSCOPY_45_2017-08-06T19:33:00',
 'AMBON_Zooplankton_2017_DBO3.8_1_Parasagitta_elegans_?_?_BONGO_505UM_MICROSCOPY_45_2017-08-06T19:33:00',
 'AMBON_Zooplankton_2017_DBO3.8_1_Euphausiacea_juvenile_?_BONGO_505UM_MICROSCOPY_45_2017-08-06T19:33:00',
 'AMBON_Zooplankton_2017_DBO3.8_1_Bolinopsis_infundibulum_?_?_BONGO_505UM_MICROSCOPY_45_2017-08-06T19:33:00',
 'AMBON_Zooplankton_2017_DBO3.8_1_Pisces_larvae_?_BONGO_505UM_MICROSCOPY_45_2017-08-06T19:33:00']

Return a sorted list of the duplicate occurrenceID's. 

In [29]:
print('Number of duplicates found = %s\n' % df[df.duplicated(subset='occurrenceID', keep=False)].shape[0])

print('Duplicate occurrenceIDs:')
sorted(df[df.duplicated(subset='occurrenceID', keep=False)]['occurrenceID'].tolist()) # added by mmb 2021-10-04

Number of duplicates found = 410

Duplicate occurrenceIDs:


['AMBON_Zooplankton_2017_BBL10_?_Calanus_glacialis_marshallae_?_?_TWINRING_150UM_MICROSCOPY_48_2017-08-22T03:47:00',
 'AMBON_Zooplankton_2017_BBL10_?_Calanus_glacialis_marshallae_?_?_TWINRING_150UM_MICROSCOPY_48_2017-08-22T03:47:00',
 'AMBON_Zooplankton_2017_BBL10_?_Parasagitta_elegans_?_?_TWINRING_150UM_MICROSCOPY_48_2017-08-22T03:47:00',
 'AMBON_Zooplankton_2017_BBL10_?_Parasagitta_elegans_?_?_TWINRING_150UM_MICROSCOPY_48_2017-08-22T03:47:00',
 'AMBON_Zooplankton_2017_BBL11_?_Calanus_glacialis_marshallae_?_?_TWINRING_150UM_MICROSCOPY_46_2017-08-22T03:47:00',
 'AMBON_Zooplankton_2017_BBL11_?_Calanus_glacialis_marshallae_?_?_TWINRING_150UM_MICROSCOPY_46_2017-08-22T03:47:00',
 'AMBON_Zooplankton_2017_BBL11_?_Parasagitta_elegans_?_?_TWINRING_150UM_MICROSCOPY_46_2017-08-22T03:47:00',
 'AMBON_Zooplankton_2017_BBL11_?_Parasagitta_elegans_?_?_TWINRING_150UM_MICROSCOPY_46_2017-08-22T03:47:00',
 'AMBON_Zooplankton_2017_BBL11_?_Polychaeta_larvae_?_TWINRING_150UM_MICROSCOPY_46_2017-08-22T03:47:0

Show me a duplicate record

In [30]:
df.loc[df['occurrenceID'] == 'AMBON_Zooplankton_2017_BBL10_?_Calanus_glacialis_marshallae_?_?_TWINRING_150UM_MICROSCOPY_48_2017-08-22T03:47:00'].T

Unnamed: 0,2692,2700
Cruise,AMBON2017,AMBON2017
Station,BBL10,BBL10
Type,TWINRING_150UM_MICROSCOPY,TWINRING_150UM_MICROSCOPY
Date_Time,2017-08-22 03:47:00,2017-08-22 03:47:00
Longitude_[decimal_degrees_east],-167.3646,-167.3646
Latitude_[decimal _degrees_north],70.1925,70.1925
Bottom_Depth_[m],51,51
Cast_Number,,
Depth_[m],48,48
Accepted_Organism_Identification,Calanus glacialis/marshallae,Calanus glacialis/marshallae


To resolve the duplicate record, add the biomass (or abundance) value to make `occurrenceID` unique.

In [31]:
df['occurrenceID'] = df[['datasetID',
                         'Station',
                         'Cast_Number_conv',
                         'Accepted_Organism_Identification_conv',
                         'Life_Stage_conv',
                         'sex',
                         'Type',
                         'biomass_str',
                         'depth_str',
                         'date_str',
                        ]].agg('_'.join, axis=1)

Check for duplicate IDs.

In [32]:
print('Number of duplicates found = %s\n' % df[df.duplicated(subset='occurrenceID', keep=False)].shape[0])

print('Duplicate occurrenceIDs:')
sorted(df[df.duplicated(subset='occurrenceID', keep=False)]['occurrenceID'].tolist()) # added by mmb 2021-10-04

Number of duplicates found = 0

Duplicate occurrenceIDs:


[]

# END of ID generation
---

In [33]:
#OK, rename and then add new columns
#rename columns as necessary

df.rename(columns={'Depth_[m]': 'minimumDepthInMeters',
                   'Life_Stage': 'lifeStage',
                   'Type': 'samplingProtocol',
                   'Longitude_[decimal_degrees_east]': 'decimalLongitude',
                   'Latitude_[decimal _degrees_north]': 'decimalLatitude',
                   'Accepted_Organism_Identification': 'scientificName',
                   'APHIA_ID': 'taxonID' }, inplace=True)

# add new parameters

df['scientificNameID'] = 'urn:lsid:marinespecies.org:taxname:'+df['taxonID'].astype(str)
df['identificationReferences'] = 'WoRMS'
df['basisOfRecord'] = 'HumanObservation' #nospace!
df['occurenceStatus'] = 'present'

df.head()

Unnamed: 0,Cruise,Station,samplingProtocol,Date_Time,decimalLongitude,decimalLatitude,Bottom_Depth_[m],Cast_Number,minimumDepthInMeters,scientificName,lifeStage,taxonID,Abundance_[#/m3],Biomass_[mg dw/m3],eventDate,Cast_Number_conv,sex,Life_Stage_conv,Accepted_Organism_Identification_conv,datasetID,depth_str,date_str,biomass_str,occurrenceID,eventID,scientificNameID,identificationReferences,basisOfRecord,occurenceStatus
0,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Pisces,larvae,11676.0,1.172,,2017-08-20T22:48:00Z,?,?,larvae,Pisces,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,,AMBON_Zooplankton_2017_BBL1_?_Pisces_larvae_?_...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:11676.0,WoRMS,HumanObservation,present
1,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Isopoda,,1131.0,1.172,,2017-08-20T22:48:00Z,?,?,?,Isopoda,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,,AMBON_Zooplankton_2017_BBL1_?_Isopoda_?_?_TWIN...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:1131.0,WoRMS,HumanObservation,present
2,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Evadne nordmanni,,106273.0,1.172,0.0086,2017-08-20T22:48:00Z,?,?,?,Evadne_nordmanni,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,0.0086,AMBON_Zooplankton_2017_BBL1_?_Evadne_nordmanni...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:106273.0,WoRMS,HumanObservation,present
3,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Pseudocalanus newmani,,157679.0,4.686,0.0121,2017-08-20T22:48:00Z,?,?,?,Pseudocalanus_newmani,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,0.0121,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ne...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:157679.0,WoRMS,HumanObservation,present
4,AMBON2017,BBL1,TWINRING_150UM_MICROSCOPY,2017-08-20 22:48:00,-163.5095,69.3443,15,,12,Pseudocalanus acuspes,,104514.0,4.686,0.0335,2017-08-20T22:48:00Z,?,?,?,Pseudocalanus_acuspes,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,0.0335,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ac...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:104514.0,WoRMS,HumanObservation,present


In [34]:
df['maximumDepthInMeters'] = df['minimumDepthInMeters'].copy()

In [35]:
# Remove unnecessary columns

df = df.drop(columns=['Cruise', 'Station', 'Bottom_Depth_[m]', 'Date_Time','Cast_Number'])

#but not the measurements!
df.head()

Unnamed: 0,samplingProtocol,decimalLongitude,decimalLatitude,minimumDepthInMeters,scientificName,lifeStage,taxonID,Abundance_[#/m3],Biomass_[mg dw/m3],eventDate,Cast_Number_conv,sex,Life_Stage_conv,Accepted_Organism_Identification_conv,datasetID,depth_str,date_str,biomass_str,occurrenceID,eventID,scientificNameID,identificationReferences,basisOfRecord,occurenceStatus,maximumDepthInMeters
0,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,Pisces,larvae,11676.0,1.172,,2017-08-20T22:48:00Z,?,?,larvae,Pisces,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,,AMBON_Zooplankton_2017_BBL1_?_Pisces_larvae_?_...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:11676.0,WoRMS,HumanObservation,present,12
1,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,Isopoda,,1131.0,1.172,,2017-08-20T22:48:00Z,?,?,?,Isopoda,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,,AMBON_Zooplankton_2017_BBL1_?_Isopoda_?_?_TWIN...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:1131.0,WoRMS,HumanObservation,present,12
2,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,Evadne nordmanni,,106273.0,1.172,0.0086,2017-08-20T22:48:00Z,?,?,?,Evadne_nordmanni,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,0.0086,AMBON_Zooplankton_2017_BBL1_?_Evadne_nordmanni...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:106273.0,WoRMS,HumanObservation,present,12
3,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,Pseudocalanus newmani,,157679.0,4.686,0.0121,2017-08-20T22:48:00Z,?,?,?,Pseudocalanus_newmani,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,0.0121,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ne...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:157679.0,WoRMS,HumanObservation,present,12
4,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,Pseudocalanus acuspes,,104514.0,4.686,0.0335,2017-08-20T22:48:00Z,?,?,?,Pseudocalanus_acuspes,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,0.0335,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ac...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:104514.0,WoRMS,HumanObservation,present,12


In [36]:

#taxonid needs to not have trailing .0
taxons = df[['taxonID']].astype('string', errors='ignore')
t=taxons['taxonID'].convert_dtypes()
t=t.str.strip('.0')
t
df['taxonID']=t
df.head()



Unnamed: 0,samplingProtocol,decimalLongitude,decimalLatitude,minimumDepthInMeters,scientificName,lifeStage,taxonID,Abundance_[#/m3],Biomass_[mg dw/m3],eventDate,Cast_Number_conv,sex,Life_Stage_conv,Accepted_Organism_Identification_conv,datasetID,depth_str,date_str,biomass_str,occurrenceID,eventID,scientificNameID,identificationReferences,basisOfRecord,occurenceStatus,maximumDepthInMeters
0,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,Pisces,larvae,11676,1.172,,2017-08-20T22:48:00Z,?,?,larvae,Pisces,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,,AMBON_Zooplankton_2017_BBL1_?_Pisces_larvae_?_...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:11676.0,WoRMS,HumanObservation,present,12
1,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,Isopoda,,1131,1.172,,2017-08-20T22:48:00Z,?,?,?,Isopoda,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,,AMBON_Zooplankton_2017_BBL1_?_Isopoda_?_?_TWIN...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:1131.0,WoRMS,HumanObservation,present,12
2,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,Evadne nordmanni,,106273,1.172,0.0086,2017-08-20T22:48:00Z,?,?,?,Evadne_nordmanni,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,0.0086,AMBON_Zooplankton_2017_BBL1_?_Evadne_nordmanni...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:106273.0,WoRMS,HumanObservation,present,12
3,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,Pseudocalanus newmani,,157679,4.686,0.0121,2017-08-20T22:48:00Z,?,?,?,Pseudocalanus_newmani,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,0.0121,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ne...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:157679.0,WoRMS,HumanObservation,present,12
4,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,Pseudocalanus acuspes,,104514,4.686,0.0335,2017-08-20T22:48:00Z,?,?,?,Pseudocalanus_acuspes,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,0.0335,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ac...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:104514.0,WoRMS,HumanObservation,present,12


In [37]:
df['kingdom']='Animalia'

In [38]:
#finally!
#nope it's scientificNameID that needs this
sciids = df[['scientificNameID']].astype('string', errors='ignore')
s = sciids['scientificNameID'].convert_dtypes()
s=s.str.strip('.0')
df['scientificNameID']=s
df.head()

Unnamed: 0,samplingProtocol,decimalLongitude,decimalLatitude,minimumDepthInMeters,scientificName,lifeStage,taxonID,Abundance_[#/m3],Biomass_[mg dw/m3],eventDate,Cast_Number_conv,sex,Life_Stage_conv,Accepted_Organism_Identification_conv,datasetID,depth_str,date_str,biomass_str,occurrenceID,eventID,scientificNameID,identificationReferences,basisOfRecord,occurenceStatus,maximumDepthInMeters,kingdom
0,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,Pisces,larvae,11676,1.172,,2017-08-20T22:48:00Z,?,?,larvae,Pisces,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,,AMBON_Zooplankton_2017_BBL1_?_Pisces_larvae_?_...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:11676,WoRMS,HumanObservation,present,12,Animalia
1,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,Isopoda,,1131,1.172,,2017-08-20T22:48:00Z,?,?,?,Isopoda,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,,AMBON_Zooplankton_2017_BBL1_?_Isopoda_?_?_TWIN...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:1131,WoRMS,HumanObservation,present,12,Animalia
2,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,Evadne nordmanni,,106273,1.172,0.0086,2017-08-20T22:48:00Z,?,?,?,Evadne_nordmanni,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,0.0086,AMBON_Zooplankton_2017_BBL1_?_Evadne_nordmanni...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:106273,WoRMS,HumanObservation,present,12,Animalia
3,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,Pseudocalanus newmani,,157679,4.686,0.0121,2017-08-20T22:48:00Z,?,?,?,Pseudocalanus_newmani,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,0.0121,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ne...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:157679,WoRMS,HumanObservation,present,12,Animalia
4,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,Pseudocalanus acuspes,,104514,4.686,0.0335,2017-08-20T22:48:00Z,?,?,?,Pseudocalanus_acuspes,AMBON_Zooplankton_2017,12,2017-08-20T22:48:00,0.0335,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ac...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,urn:lsid:marinespecies.org:taxname:104514,WoRMS,HumanObservation,present,12,Animalia


In [39]:
#double checking
df.columns

Index(['samplingProtocol', 'decimalLongitude', 'decimalLatitude',
       'minimumDepthInMeters', 'scientificName', 'lifeStage', 'taxonID',
       'Abundance_[#/m3]', 'Biomass_[mg dw/m3]', 'eventDate',
       'Cast_Number_conv', 'sex', 'Life_Stage_conv',
       'Accepted_Organism_Identification_conv', 'datasetID', 'depth_str',
       'date_str', 'biomass_str', 'occurrenceID', 'eventID',
       'scientificNameID', 'identificationReferences', 'basisOfRecord',
       'occurenceStatus', 'maximumDepthInMeters', 'kingdom'],
      dtype='object')

In [40]:
df['lifeStage'].unique()

array(['larvae', ' ', 'male', 'nauplii', 'cyprid', 'bipinnaria',
       'cyphonautes', 'calytopsis', 'zoea', 'furcilia', 'juvenile',
       'megalopa', 'eudoxid', '0'], dtype=object)

In [41]:
df['lifeStage'].replace(to_replace='male', value=None, inplace= True, method='pad')
df['lifeStage'].replace(to_replace='female', value=None, inplace= True, method='pad')

df['lifeStage'].unique()

array(['larvae', ' ', 'nauplii', 'cyprid', 'bipinnaria', 'cyphonautes',
       'calytopsis', 'zoea', 'furcilia', 'juvenile', 'megalopa',
       'eudoxid', '0'], dtype=object)

# Now for table 1 - event table

## Event table needs:

Event Table  

eventID == 'eventID'   
EventDate== 'eventDate'  
Depth == 'minimumDepthInMeters' , 'maximumDepthInMeters'
decimal Lat== 'decimalLatitude'  
decimal Long == 'decimalLongitude'  
~~basisofRecord== 'basisOfRecord'  ~~
geodeticDatum == 'WGS84'  
countryCode == 'US'  
coordinateUncertaintyInMeters ==~~'NA'~~ Nope, if it's blank do not include.  


Without repeating or duplicate events. So, filtered by unique. IN PANDAS that's...

In [42]:
eventdf = df[['eventID', 'eventDate', 'maximumDepthInMeters','minimumDepthInMeters' , 'decimalLatitude', 'decimalLongitude']]
eventdf.head()
eventdf['eventID'].describe()

#and, that's that? Check!

count                                                  4727
unique                                                  154
top       AMBON_Zooplankton_2017_DBO3.8_2017-08-06T18:14...
freq                                                     54
Name: eventID, dtype: object

In [43]:
#add columns of data needed
eventdf['geodeticDatum'] = 'WGS84'
eventdf['countryCode'] = 'US'
#eventdf['coordinateUncertaintyInMeters'] ='NA'

eventdf.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eventdf['geodeticDatum'] = 'WGS84'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eventdf['countryCode'] = 'US'


Unnamed: 0,eventID,eventDate,maximumDepthInMeters,minimumDepthInMeters,decimalLatitude,decimalLongitude,geodeticDatum,countryCode
0,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,2017-08-20T22:48:00Z,12,12,69.3443,-163.5095,WGS84,US
1,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,2017-08-20T22:48:00Z,12,12,69.3443,-163.5095,WGS84,US
2,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,2017-08-20T22:48:00Z,12,12,69.3443,-163.5095,WGS84,US
3,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,2017-08-20T22:48:00Z,12,12,69.3443,-163.5095,WGS84,US
4,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,2017-08-20T22:48:00Z,12,12,69.3443,-163.5095,WGS84,US


In [44]:
#The event table needs to only hold those 135 unique events, so....
#eventdf.head()
eventdf = eventdf.drop_duplicates(subset=['eventID'])

eventdf['eventID'].describe()

#win

count                                                  154
unique                                                 154
top       AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z
freq                                                     1
Name: eventID, dtype: object

# Now for table 2 - Occurence

Occurence table needs
- Occurence/organism quantity table with the following fields, minimum
    - ~~eventDate
    - ~~eventID
    - ~~dec lat
    - ~~dec long~~
    - kingdom
    - scientificName 
    - basisofRecord (according to controlled vocab, probably HumanObservation)
    - occurenceID
    - datasetid (a shortname for the OPT occurence table ( more at  https://tools.gbif.org/dwca-validator/extension.do?id=dwc:event )
    - organismQuantity (as applicable)
    - organismQuantityType (as applicable)
    - lifeStage (if present)
    - occurrenceRemarks (if present)
    - occurrenceStatus = present
    - identificationReference

The tricky part here is the occurrence ID. Each occurence ID should be unique to each observation in the occurence table. It's recommended out of eventID and TAXON id. Or just make a UUID type deal.
It's also, a unique occurence observation but NOT a unique measurement. SO, for abundance and biomass, they're off the same occurence.

So I'm not tidy-ing the dataset yet, in case I need to brute-force something by row.


In [45]:
#build the dataframe

occdf = df[['datasetID','eventID','occurrenceID', 'kingdom', 'scientificName', 'scientificNameID', 'taxonID','identificationReferences', 
            'lifeStage','basisOfRecord', 'samplingProtocol']]

occdf.describe().T



Unnamed: 0,count,unique,top,freq
datasetID,4727,1,AMBON_Zooplankton_2017,4727
eventID,4727,154,AMBON_Zooplankton_2017_DBO3.8_2017-08-06T18:14...,54
occurrenceID,4727,4727,AMBON_Zooplankton_2017_BBL1_?_Pisces_larvae_?_...,1
kingdom,4727,1,Animalia,4727
scientificName,4727,108,Calanus glacialis/marshallae,227
scientificNameID,4727,111,urn:lsid:marinespecies.org:taxname:1082,216
taxonID,4727,111,1082,216
identificationReferences,4727,1,WoRMS,4727
lifeStage,4727,13,,3444
basisOfRecord,4727,1,HumanObservation,4727


In [46]:
occdf.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4727 entries, 0 to 4726
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   datasetID                 4727 non-null   object
 1   eventID                   4727 non-null   object
 2   occurrenceID              4727 non-null   object
 3   kingdom                   4727 non-null   object
 4   scientificName            4727 non-null   object
 5   scientificNameID          4727 non-null   string
 6   taxonID                   4727 non-null   string
 7   identificationReferences  4727 non-null   object
 8   lifeStage                 4727 non-null   object
 9   basisOfRecord             4727 non-null   object
 10  samplingProtocol          4727 non-null   object
dtypes: object(9), string(2)
memory usage: 406.4+ KB


In [47]:
#where are the messy values in scinameid and taxon id?
n = occdf['scientificNameID'] == 'urn:lsid:marinespecies.org:taxname:nan'
occdf[n]

Unnamed: 0,datasetID,eventID,occurrenceID,kingdom,scientificName,scientificNameID,taxonID,identificationReferences,lifeStage,basisOfRecord,samplingProtocol


In [48]:
#add columns still needed
#occdf.insert(0,'ocurrenceStatus', 'present')
occdf['ocurrenceStatus'] = 'present'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  occdf['ocurrenceStatus'] = 'present'


In [49]:
occdf.head()

Unnamed: 0,datasetID,eventID,occurrenceID,kingdom,scientificName,scientificNameID,taxonID,identificationReferences,lifeStage,basisOfRecord,samplingProtocol,ocurrenceStatus
0,AMBON_Zooplankton_2017,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,AMBON_Zooplankton_2017_BBL1_?_Pisces_larvae_?_...,Animalia,Pisces,urn:lsid:marinespecies.org:taxname:11676,11676,WoRMS,larvae,HumanObservation,TWINRING_150UM_MICROSCOPY,present
1,AMBON_Zooplankton_2017,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,AMBON_Zooplankton_2017_BBL1_?_Isopoda_?_?_TWIN...,Animalia,Isopoda,urn:lsid:marinespecies.org:taxname:1131,1131,WoRMS,,HumanObservation,TWINRING_150UM_MICROSCOPY,present
2,AMBON_Zooplankton_2017,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,AMBON_Zooplankton_2017_BBL1_?_Evadne_nordmanni...,Animalia,Evadne nordmanni,urn:lsid:marinespecies.org:taxname:106273,106273,WoRMS,,HumanObservation,TWINRING_150UM_MICROSCOPY,present
3,AMBON_Zooplankton_2017,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ne...,Animalia,Pseudocalanus newmani,urn:lsid:marinespecies.org:taxname:157679,157679,WoRMS,,HumanObservation,TWINRING_150UM_MICROSCOPY,present
4,AMBON_Zooplankton_2017,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ac...,Animalia,Pseudocalanus acuspes,urn:lsid:marinespecies.org:taxname:104514,104514,WoRMS,,HumanObservation,TWINRING_150UM_MICROSCOPY,present


## Make the measurementorfact table long and tidy

So the occurence table is now, bereft of actual specimen information? it references out to this table.
(it could hold one measurement of the information, but in this particular case study, it made more sense not to)

What I need is melt, so in pandas that's....
melt. ^_^

So, Abundance, and biomass, stacked on top of eachother, as 'organismQuantity' and 'organismQuantitytype' containing 'biomass' or abundance dry weight.

MoFTable needs:
- MeasurementOrFact table for additional attributes/variables, But ONLY those in the [OBIS mof guide](https://tools.gbif.org/dwca-validator/extension.do?id=dwc:MeasurementOrFact)
    - eventID
    - occurenceID
    - measurementType
    - measurementTypeID
    - measurementValue
    - measurementUnit
    - measurementUnitID


In [50]:
#OK, let's make this a tidy dataset of as much of the information as possible, THEN manipulate it into a MOF dataframe. So I have a tidy dataset if I need it.
#DataFrame.melt(id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None, ignore_index=True)


tidydf = df.melt(id_vars=['datasetID','occurrenceID','eventID','eventDate','samplingProtocol','decimalLongitude','decimalLatitude','maximumDepthInMeters', 'minimumDepthInMeters','scientificName','lifeStage','taxonID','scientificNameID','identificationReferences','basisOfRecord'],
        value_vars=['Abundance_[#/m3]', 'Biomass_[mg dw/m3]'])

tidydf.head()

Unnamed: 0,datasetID,occurrenceID,eventID,eventDate,samplingProtocol,decimalLongitude,decimalLatitude,maximumDepthInMeters,minimumDepthInMeters,scientificName,lifeStage,taxonID,scientificNameID,identificationReferences,basisOfRecord,variable,value
0,AMBON_Zooplankton_2017,AMBON_Zooplankton_2017_BBL1_?_Pisces_larvae_?_...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,2017-08-20T22:48:00Z,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,12,Pisces,larvae,11676,urn:lsid:marinespecies.org:taxname:11676,WoRMS,HumanObservation,Abundance_[#/m3],1.172
1,AMBON_Zooplankton_2017,AMBON_Zooplankton_2017_BBL1_?_Isopoda_?_?_TWIN...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,2017-08-20T22:48:00Z,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,12,Isopoda,,1131,urn:lsid:marinespecies.org:taxname:1131,WoRMS,HumanObservation,Abundance_[#/m3],1.172
2,AMBON_Zooplankton_2017,AMBON_Zooplankton_2017_BBL1_?_Evadne_nordmanni...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,2017-08-20T22:48:00Z,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,12,Evadne nordmanni,,106273,urn:lsid:marinespecies.org:taxname:106273,WoRMS,HumanObservation,Abundance_[#/m3],1.172
3,AMBON_Zooplankton_2017,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ne...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,2017-08-20T22:48:00Z,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,12,Pseudocalanus newmani,,157679,urn:lsid:marinespecies.org:taxname:157679,WoRMS,HumanObservation,Abundance_[#/m3],4.686
4,AMBON_Zooplankton_2017,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ac...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,2017-08-20T22:48:00Z,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,12,Pseudocalanus acuspes,,104514,urn:lsid:marinespecies.org:taxname:104514,WoRMS,HumanObservation,Abundance_[#/m3],4.686


In [51]:
tidydf.describe()

Unnamed: 0,decimalLongitude,decimalLatitude,maximumDepthInMeters,minimumDepthInMeters,value
count,9454.0,9454.0,9454.0,9454.0,9382.0
mean,-162.591282,70.464638,40.221917,40.221917,71.083501
std,23.140421,1.194188,8.210705,8.210705,833.285075
min,-168.9554,67.6645,12.0,12.0,0.0
25%,-166.4237,69.9114,38.0,38.0,0.027
50%,-163.9288,70.7718,42.0,42.0,0.18
75%,-162.2024,71.3222,45.0,45.0,1.908825
max,159.4106,72.4969,56.0,56.0,37568.289


In [52]:
tidydf['occurrenceID'].describe()

count                                                  9454
unique                                                 4727
top       AMBON_Zooplankton_2017_BBL1_?_Pisces_larvae_?_...
freq                                                      2
Name: occurrenceID, dtype: object

In [53]:
# I will take that, let's rename 'variable ' and 'value'
tidydf.rename(columns={'variable': 'measurementType',
                   'value': 'measurementValue'}, inplace= True)
tidydf.head()

#tidydf.columns


Unnamed: 0,datasetID,occurrenceID,eventID,eventDate,samplingProtocol,decimalLongitude,decimalLatitude,maximumDepthInMeters,minimumDepthInMeters,scientificName,lifeStage,taxonID,scientificNameID,identificationReferences,basisOfRecord,measurementType,measurementValue
0,AMBON_Zooplankton_2017,AMBON_Zooplankton_2017_BBL1_?_Pisces_larvae_?_...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,2017-08-20T22:48:00Z,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,12,Pisces,larvae,11676,urn:lsid:marinespecies.org:taxname:11676,WoRMS,HumanObservation,Abundance_[#/m3],1.172
1,AMBON_Zooplankton_2017,AMBON_Zooplankton_2017_BBL1_?_Isopoda_?_?_TWIN...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,2017-08-20T22:48:00Z,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,12,Isopoda,,1131,urn:lsid:marinespecies.org:taxname:1131,WoRMS,HumanObservation,Abundance_[#/m3],1.172
2,AMBON_Zooplankton_2017,AMBON_Zooplankton_2017_BBL1_?_Evadne_nordmanni...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,2017-08-20T22:48:00Z,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,12,Evadne nordmanni,,106273,urn:lsid:marinespecies.org:taxname:106273,WoRMS,HumanObservation,Abundance_[#/m3],1.172
3,AMBON_Zooplankton_2017,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ne...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,2017-08-20T22:48:00Z,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,12,Pseudocalanus newmani,,157679,urn:lsid:marinespecies.org:taxname:157679,WoRMS,HumanObservation,Abundance_[#/m3],4.686
4,AMBON_Zooplankton_2017,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ac...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,2017-08-20T22:48:00Z,TWINRING_150UM_MICROSCOPY,-163.5095,69.3443,12,12,Pseudocalanus acuspes,,104514,urn:lsid:marinespecies.org:taxname:104514,WoRMS,HumanObservation,Abundance_[#/m3],4.686


In [54]:
tidydf.describe()
tidydf['occurrenceID'].describe()

count                                                  9454
unique                                                 4727
top       AMBON_Zooplankton_2017_BBL1_?_Pisces_larvae_?_...
freq                                                      2
Name: occurrenceID, dtype: object

In [55]:
tidydf.columns

Index(['datasetID', 'occurrenceID', 'eventID', 'eventDate', 'samplingProtocol',
       'decimalLongitude', 'decimalLatitude', 'maximumDepthInMeters',
       'minimumDepthInMeters', 'scientificName', 'lifeStage', 'taxonID',
       'scientificNameID', 'identificationReferences', 'basisOfRecord',
       'measurementType', 'measurementValue'],
      dtype='object')

In [56]:
#MOF table needs columns dropped, added, renamed etc
mofdf = tidydf.drop(columns=['datasetID','eventDate','samplingProtocol','decimalLongitude','decimalLatitude','maximumDepthInMeters', 'minimumDepthInMeters','scientificName','lifeStage','taxonID','scientificNameID','identificationReferences','basisOfRecord',])

#add static values from DWC vocabs etc
mofdf['measurementUnit'] = 'Number per m3'
mofdf['measurementUnitID'] = 'http://vocab.nerc.ac.uk/collection/P06/current/UPMM/'
#mofdf['measurementTypeID'] = '' #Do not include things that are blank values
mofdf.head()

Unnamed: 0,occurrenceID,eventID,measurementType,measurementValue,measurementUnit,measurementUnitID
0,AMBON_Zooplankton_2017_BBL1_?_Pisces_larvae_?_...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,Abundance_[#/m3],1.172,Number per m3,http://vocab.nerc.ac.uk/collection/P06/current...
1,AMBON_Zooplankton_2017_BBL1_?_Isopoda_?_?_TWIN...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,Abundance_[#/m3],1.172,Number per m3,http://vocab.nerc.ac.uk/collection/P06/current...
2,AMBON_Zooplankton_2017_BBL1_?_Evadne_nordmanni...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,Abundance_[#/m3],1.172,Number per m3,http://vocab.nerc.ac.uk/collection/P06/current...
3,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ne...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,Abundance_[#/m3],4.686,Number per m3,http://vocab.nerc.ac.uk/collection/P06/current...
4,AMBON_Zooplankton_2017_BBL1_?_Pseudocalanus_ac...,AMBON_Zooplankton_2017_BBL1_2017-08-20T22:48:00Z,Abundance_[#/m3],4.686,Number per m3,http://vocab.nerc.ac.uk/collection/P06/current...


In [57]:
mofdf['measurementTypeID'] = ''

In [58]:
#for abundance mofdf['measurementTypeID'] = 'http://vocab.nerc.ac.uk/collection/S06/current/S0600002/'
#for biomass mofdf['measurementTypeID'] = 'http://vocab.nerc.ac.uk/collection/S06/current/S0600086/'

mofdf.loc[mofdf['measurementType'] == 'Abundance_[#/m3]', 'measurementTypeID']= 'http://vocab.nerc.ac.uk/collection/S06/current/S0600002/'
mofdf.loc[mofdf['measurementType'] == 'Biomass_[mg dw/m3]', 'measurementTypeID']= 'http://vocab.nerc.ac.uk/collection/S06/current/S0600086/'

In [59]:
mofdf[2100:2110]

Unnamed: 0,occurrenceID,eventID,measurementType,measurementValue,measurementUnit,measurementUnitID,measurementTypeID
2100,AMBON_Zooplankton_2017_ML1.12_?_Brachyura_zoea...,AMBON_Zooplankton_2017_ML1.12_2017-08-19T01:25...,Abundance_[#/m3],0.59,Number per m3,http://vocab.nerc.ac.uk/collection/P06/current...,http://vocab.nerc.ac.uk/collection/S06/current...
2101,AMBON_Zooplankton_2017_ML1.12_?_Pseudocalanus_...,AMBON_Zooplankton_2017_ML1.12_2017-08-19T01:25...,Abundance_[#/m3],18.872,Number per m3,http://vocab.nerc.ac.uk/collection/P06/current...,http://vocab.nerc.ac.uk/collection/S06/current...
2102,AMBON_Zooplankton_2017_ML1.12_?_Euphausiacea_f...,AMBON_Zooplankton_2017_ML1.12_2017-08-19T01:25...,Abundance_[#/m3],0.843,Number per m3,http://vocab.nerc.ac.uk/collection/P06/current...,http://vocab.nerc.ac.uk/collection/S06/current...
2103,AMBON_Zooplankton_2017_ML1.12_?_Fritillaria_bo...,AMBON_Zooplankton_2017_ML1.12_2017-08-19T01:25...,Abundance_[#/m3],949.011,Number per m3,http://vocab.nerc.ac.uk/collection/P06/current...,http://vocab.nerc.ac.uk/collection/S06/current...
2104,AMBON_Zooplankton_2017_ML1.12_?_Cirripedia_cyp...,AMBON_Zooplankton_2017_ML1.12_2017-08-19T01:25...,Abundance_[#/m3],9.436,Number per m3,http://vocab.nerc.ac.uk/collection/P06/current...,http://vocab.nerc.ac.uk/collection/S06/current...
2105,AMBON_Zooplankton_2017_ML1.12_?_Paguridae_zoea...,AMBON_Zooplankton_2017_ML1.12_2017-08-19T01:25...,Abundance_[#/m3],0.337,Number per m3,http://vocab.nerc.ac.uk/collection/P06/current...,http://vocab.nerc.ac.uk/collection/S06/current...
2106,AMBON_Zooplankton_2017_ML1.12_?_Polychaeta_lar...,AMBON_Zooplankton_2017_ML1.12_2017-08-19T01:25...,Abundance_[#/m3],121.322,Number per m3,http://vocab.nerc.ac.uk/collection/P06/current...,http://vocab.nerc.ac.uk/collection/S06/current...
2107,AMBON_Zooplankton_2017_ML1.12_?_Euphausiacea_j...,AMBON_Zooplankton_2017_ML1.12_2017-08-19T01:25...,Abundance_[#/m3],0.758,Number per m3,http://vocab.nerc.ac.uk/collection/P06/current...,http://vocab.nerc.ac.uk/collection/S06/current...
2108,AMBON_Zooplankton_2017_ML1.12_?_Calanus_glacia...,AMBON_Zooplankton_2017_ML1.12_2017-08-19T01:25...,Abundance_[#/m3],0.927,Number per m3,http://vocab.nerc.ac.uk/collection/P06/current...,http://vocab.nerc.ac.uk/collection/S06/current...
2109,AMBON_Zooplankton_2017_ML1.12_?_Oithona_simili...,AMBON_Zooplankton_2017_ML1.12_2017-08-19T01:25...,Abundance_[#/m3],399.016,Number per m3,http://vocab.nerc.ac.uk/collection/P06/current...,http://vocab.nerc.ac.uk/collection/S06/current...


## write out dataframe to new csv file

In [60]:
#Event Table
eventdf.to_csv('data/processed/AMBON2017_zooplankton_Event_DWC_mb.csv', index=False)
#occurence table
occdf.to_csv('data/processed/AMBON2017_zooplankton_Occurrence_DWC_mb.csv', index=False)
#MoF table
mofdf.to_csv('data/processed/AMBON2017_zooplankton_MoF_DWC_mb.csv', index=False)

In [61]:
!ls data/processed

AMBON2017_zooplankton_Event_DWC_mb.csv
AMBON2017_zooplankton_MoF_DWC_mb.csv
AMBON2017_zooplankton_Occurrence_DWC_mb.csv


# *Fin*