Code to get the data set for the OBIS workshop

In [1]:
# load required modules 
import xarray as xr
import numpy as np
import pandas as pd
import ifcb_api_access
import pyworms

In [2]:
import requests
import json
# step 1 URL for Santa Cruz
# Solution was curtesy of Joe Futrelle, author of the IFCB API
urlapi='https://ifcb.caloos.org/api/'
furl=urlapi+'filter_options'
response=requests.get(furl)
content=response.content
content=json.loads(content)
options=pd.DataFrame([content])
ifcbs=options['dataset_options'][0]
print(ifcbs)
ifcbtouse=ifcbs[12]

['bloofinz-io', 'bodega-marine-lab', 'calcofi-cruises-ctd', 'calcofi-cruises-underway', 'cal-poly-humboldt-hioc', 'cce-lter-process-cruises-ifcb-151', 'del-mar-mooring', 'mbari-power-buoy', 'newport-beach-pier', 'plumes-and-blooms-cruises', 'san-francisco-bay-cruises', 'san-francisco-pier-17', 'santa-cruz-municipal-wharf', 'scripps-pier-ifcb-151', 'scripps-pier-ifcb-158', 'scripps-pier-ifcb-183', 'stearns-wharf']


In [3]:
# So we can now select which IFCB we want from the list of available ones
url='https://ifcb.caloos.org/'+ifcbtouse

In [4]:
# Moving cells around so that could run as a loop for various sample files
# Moving 1 time loads etc outside of potential loop

In [5]:
# load Patrick's names and the AphiaID he matched to those names
# This unfortunately specific to the model.
class_names_file='c:/users/flbahr/class_names_matched_to_WoRMS_cencoos_classifier_20240412.csv'
class_names=pd.read_csv(class_names_file)

In [6]:
# list of class name table for comparison
class_names.head(3)

Unnamed: 0,Class Name,Class ID,Common Name,AphiaID,URN,Grouping,HAB,Description,Manual Classifier Notes,Example Images
0,Akashiwo,0,Akashiwo sanguinea,232546.0,urn:lsid:marinespecies.org:taxname:232546,Dinoflagellate,Y,"Monophyletic, marine dinoflagellate","large single cells, kidney-shaped when viewed ...",
1,Alexandrium_singlet,1,Alexandium sp.,109470.0,urn:lsid:marinespecies.org:taxname:109470,Dinoflagellate,Y,A. catenella and others,"Originally class was broken out for singles, d...",http://akashiwo.oceandatacenter.ucsc.edu:8000/...
2,Amy_Gony_Protoc,2,"Amylax, Gonyaulax or Protoceratium",109428.0,urn:lsid:marinespecies.org:taxname:109428,Dinoflagellate,Y,Descision was made not to distinguish between ...,“Feet” and points visible.,


In [7]:
# Get the size of Patrick's class name file so we can loop through AphiaIDs and lookup the worms information
[nr,nc]=class_names.shape

In [8]:
# run through AphiaID to get scientific name, rank, kingdom
nlooks=np.arange(0,nr)
for pi in nlooks:
    # get Patrick's class name associated with that AphiaID so we can add it back into the 
    # data frame for later when we need to match things up based upon the class name he used
    cnid=class_names['Class Name'][pi]
    # Set up to catch cases that are not valid AphiaIDs (i.e. NaN is used for unclassified)
    try:
        myaid=int(class_names['AphiaID'][pi])
        wmentry=pyworms.aphiaRecordByAphiaID(myaid)
        mywp=pd.DataFrame([wmentry])
        bool=mywp['status']=='accepted'
        # if it has accepted status then create a dataframe to add to a larger dataframe of all values
        if bool[0]:
            newdf=mywp[["AphiaID","scientificname","lsid","rank","kingdom"]]
            [n,m]=newdf.shape
            # check to see if we get more than one value (probably shouldn't but I'm not sure)
            # note this fails if the first AphiaID is NaN
            if pi==0:
                # add Patrick's class name to the end
                newdf.insert(5,"Class Name",cnid,True)
                wormsdf=newdf
            else:
                newdf.insert(5,"Class Name",cnid,True)
                wormsdf=pd.concat([wormsdf,newdf])
        else:
            print("The record found is not \"accepted\"")
    except:
        print("Found NaN at "+str(pi))

Found NaN at 51


In [9]:
# display dataframe of worms information base upon Patrick's file
wormsdf.head(3)

Unnamed: 0,AphiaID,scientificname,lsid,rank,kingdom,Class Name
0,232546,Akashiwo sanguinea,urn:lsid:marinespecies.org:taxname:232546,Species,Chromista,Akashiwo
0,109470,Alexandrium,urn:lsid:marinespecies.org:taxname:109470,Genus,Chromista,Alexandrium_singlet
0,109428,Gonyaulacaceae,urn:lsid:marinespecies.org:taxname:109428,Family,Chromista,Amy_Gony_Protoc


In [10]:
# display the value that has AphiaID as NaN
class_names.loc[51]

Class Name                                                      unclassified
Class ID                                                                  51
Common Name                                                     Unclassified
AphiaID                                                                  NaN
URN                                      urn:lsid:marinespecies.org:taxname:
Grouping                                                               Other
HAB                                                                      NaN
Description                Classification lacked confidence to asssign a ...
Manual Classifier Notes                                                  NaN
Example Images                                                           NaN
Name: 51, dtype: object

In [11]:
# need to load the class threshold file also
# another file from Patrick that has his threshold values
# again this is model dependent 
threshold=pd.read_csv('c:/users/flbahr/class_thresholds_cencoos_classifier_20240412.csv')

In [12]:
# example of threshold table
threshold.head(3)

Unnamed: 0,Class Name,Threshold
0,Akashiwo,0.95
1,Alexandrium_singlet,0.49
2,Amy_Gony_Protoc,0.35


In [13]:
# find the size of the threshold size 
[l1,l2]=threshold.shape

Files that are unique to the deployments are the class name file and the threshold file</p>
These can NOT be extracted from the api as far as we know and have to be known apriori

In [43]:
# okay now we have most of the pieces 
# start to construct the tables?
# don't know if the random bin id I picked has the species we are looking for above the threshold
# Species of interest are Dinophysis, Alexandrium, and Pseudo-nitzschia
# do we have other aliases we need to keep track of?
# There are no alias we need to look for this particular case!
target_species=['Dinophysis','Alexandrium_singlet','Pseudo-nitzschia']

In [15]:
# Lets get some bin_id's 
bin_ids=ifcb_api_access.get_bins_in_range("2023-07-17","2023-07-18")
bin_ids=bin_ids.str.split('/')
bin_ids.head(3)
#print(bin_ids[0:3]) # print limited number of names for an idea

0    [uz-municipal-wharf, D20230717T000942_IFCB104]
1    [uz-municipal-wharf, D20230717T003329_IFCB104]
2    [uz-municipal-wharf, D20230717T005715_IFCB104]
Name: pid, dtype: object

In [16]:
# Setting up code so you could potentially loop through the flles start with what is returned above

In [17]:
# get the meta data for a bin ID
# In this case we are just loading one file
# Note need to remove all the values prior to "D"
# This is a better, more generic way to do this
# Need to do this for each bin_id used 
bin_id=bin_ids[0][1]
metadatavals=ifcb_api_access.get_ifcb_metadata(bin_ids[0][1])

In [18]:
# Do we have autoclass data for this file
# Not all files have autoclass files.
# Check to see if "True"
# this can now be called as part of the loop
# if result is "False" then skip all the rest 
haveautoclass=ifcb_api_access.bin_has_autoclass(bin_id)
print(haveautoclass)

True


In [19]:
# show the file name prefix we will be using for reference
bin_id

'D20230717T000942_IFCB104'

In [21]:
# get the api autoclass file for the data
# This file will use the names from the model file listed above
bin_url=url+'/'+bin_id+'_class_scores.csv' # defined above
autoclass=pd.read_csv(bin_url)
autoclass.head(3)

Unnamed: 0,pid,Akashiwo,Alexandrium_singlet,Amy_Gony_Protoc,Asterionellopsis,Boreadinium,Centric,Ceratium,Chaetoceros,Ciliates,...,Scrip_Het,Skeletonema,Thalassionema,Thalassiosira,Tiarina,Tintinnid,Tontonia,Torodinium,Tropidoneis,Vicicitus
0,D20230717T000942_IFCB104_00002,1.9e-05,7.09e-05,0.0,1e-06,0.0,6e-06,0.0,0.000354,5.4e-07,...,2.4e-07,4e-06,0.0,0.000644,0.0,0.0,2e-07,0.0,0.0,0.0
1,D20230717T000942_IFCB104_00003,5e-06,1e-07,8.3e-07,2e-05,6e-08,3.1e-05,3.6e-05,0.000559,1.526e-05,...,3.7e-06,2e-06,2e-07,3.9e-05,0.0,6e-08,6.56e-06,0.001355,0.0,0.0
2,D20230717T000942_IFCB104_00004,1e-06,6e-07,6e-08,0.0,4.05e-06,2.3e-05,1.4e-05,4e-06,0.0,...,0.0,2e-06,0.0,2e-06,0.0,0.0,7e-07,4e-07,2e-07,8e-06


In [22]:
# get detail file information
details=ifcb_api_access.get_bin_details(bin_id)

In [23]:
# need to drop the ml from the value below
# This needs to be called for each file read as it changes 
voltmp=details['ml_analyzed'].split()
analyzed_volume=float(voltmp[0])
# we now have the volume sampled as a float so we can use it in equations

Now have metadata values and details.  We also have the autoclass for a particular bin</p>
bin_id="D20230717T003329_IFCB104"</p>
We should have enough details to work through the example.

In [24]:
# get the features file
feature_filename=bin_id+'_features.csv'
features=pd.read_csv(url+'/'+feature_filename)

In [25]:
# Show what the file contains with its header
features.head(3)

Unnamed: 0,roi_number,Area,B180,B90,Bflip,Biovolume,BoundingBox_xwidth,BoundingBox_ywidth,ConvexArea,ConvexPerimeter,...,HOG79,HOG80,HOG81,Area_over_PerimeterSquared,Area_over_Perimeter,H90_over_Hflip,H90_over_H180,Hflip_over_H180,summedConvexPerimeter_over_Perimeter,rotated_BoundingBox_solidity
0,2,3496.0,0.863143,0.800571,0.840857,170807.2,76,78,4155.436392,235.872784,...,0.384116,0.18996,0.367227,0.041825,12.092127,1.18866,1.334921,1.123047,0.815848,0.609756
1,3,2432.0,0.925576,0.392681,0.900493,161885.6,124,34,2835.403637,260.807274,...,0.272964,0.235927,0.315059,0.0309,8.668835,11.166438,13.717263,1.228437,0.929644,0.57685
2,4,35772.0,0.198306,0.133238,0.954282,1505260.0,408,334,86633.314481,1144.628962,...,0.328164,0.404628,0.406124,0.014081,22.443554,35.462351,1.220652,0.034421,0.718147,0.330039


In [29]:
# okay we have the class name but not the intended_worms_taxon
# do I need intended_worms_taxon?  We have the AphialID for the class from another file
a2=autoclass
a2=a2.set_index(['pid'],append=False)
a2.index.name=None

In [30]:
a3=a2.stack(future_stack=True)

In [31]:
ik=np.arange(0,len(a3))
for r in ik:
    junk=str(a3[r:r+1]).split()
    if r==0:
        pid=[junk[0]]
        myclass=[junk[1]]
        score=[float(junk[2])]
    else:
        pid.append(junk[0])
        myclass.append(junk[1])
        score.append(float(junk[2]))

In [32]:
bigtable=pd.DataFrame({'pid':pid,'class':myclass,'score':score})

In [33]:
bigtable.head(3)

Unnamed: 0,pid,class,score
0,D20230717T000942_IFCB104_00002,Akashiwo,1.9e-05
1,D20230717T000942_IFCB104_00002,Alexandrium_singlet,7.1e-05
2,D20230717T000942_IFCB104_00002,Amy_Gony_Protoc,0.0


In [34]:
# It is possible that there could be winners for multiple values
# Dinophysis, Alexandrium, Pseudo-Nitchia
# Note we keep track of full name with ROIs so we can check after this runs to see if we have 
# multiple winners for a class.
# 
# if the pid is duplicated we can make note about that
# So far we have not restricted to our target species
# we will do that after this so we can catch possible duplicate ROI use 
# If that does happen we have to decide which we will declare to be the winner
ix=np.arange(0,l1)
zz=0
for kk in ix:
    id1=bigtable['class']==threshold['Class Name'][kk] # find the same names between bigtable and the threshold list
    subtable=bigtable[['pid','class','score']][id1] # subsample the table to only those with that name
    id2=subtable['score'] >= threshold['Threshold'][kk] # find all values in the subsample that have a value greater than the threshold for that name
    smalltable=subtable[['pid','class','score']][id2] #subsample the subsampled table and keep the pid so we can see if we duplicate
    # put results into what I'm calling a class table
    if smalltable.size > 0:
        # we don't have a classtable yet so create it
        if zz==0:
            classtable=smalltable
            zz=zz+1
        else:
            # let the classtable grow
            classtable=pd.concat([classtable, smalltable],ignore_index=True)

In [35]:
# Example of output
classtable.head(3)

Unnamed: 0,pid,class,score
0,D20230717T000942_IFCB104_00416,Akashiwo,1.0
1,D20230717T000942_IFCB104_01428,Akashiwo,1.0
2,D20230717T000942_IFCB104_01640,Akashiwo,0.9946


In [36]:
autoclass.head(3)

Unnamed: 0,pid,Akashiwo,Alexandrium_singlet,Amy_Gony_Protoc,Asterionellopsis,Boreadinium,Centric,Ceratium,Chaetoceros,Ciliates,...,Scrip_Het,Skeletonema,Thalassionema,Thalassiosira,Tiarina,Tintinnid,Tontonia,Torodinium,Tropidoneis,Vicicitus
0,D20230717T000942_IFCB104_00002,1.9e-05,7.09e-05,0.0,1e-06,0.0,6e-06,0.0,0.000354,5.4e-07,...,2.4e-07,4e-06,0.0,0.000644,0.0,0.0,2e-07,0.0,0.0,0.0
1,D20230717T000942_IFCB104_00003,5e-06,1e-07,8.3e-07,2e-05,6e-08,3.1e-05,3.6e-05,0.000559,1.526e-05,...,3.7e-06,2e-06,2e-07,3.9e-05,0.0,6e-08,6.56e-06,0.001355,0.0,0.0
2,D20230717T000942_IFCB104_00004,1e-06,6e-07,6e-08,0.0,4.05e-06,2.3e-05,1.4e-05,4e-06,0.0,...,0.0,2e-06,0.0,2e-06,0.0,0.0,7e-07,4e-07,2e-07,8e-06


In [37]:
# Now check to see if full pid with ROI has been assigned to multiple class names
# This is very likely to happen and we need to keep track of it
# find common pid with different class names
la=len(autoclass)
for o in np.arange(0,la):
    p=autoclass['pid'][o]
    zc=classtable['pid']==p
    test=classtable[zc]
    if test.empty:
        # do nothing
        i=0
    else:
        if len(test) > 1:
            # in this case multiple classes have been assigned to a single ROI
            # we need to determine a winner
            print(test)

                                 pid              class   score
198   D20230717T000942_IFCB104_01481  Clusterflagellate  0.5930
1701  D20230717T000942_IFCB104_01481      Thalassiosira  0.3804


In [None]:
# This is an example we want to use later commenting out and cleaning up code
#pidsbyclass=classtable['class']=='Centric'
#mytest=classtable[pidsbyclass]
##mytest['pid']
## print(bin_id)
#rois=mytest['pid'].str.replace(bin_id+'_','')
#rois=rois.astype(int)
##print(rois)

In [96]:
summarytable=classtable.groupby('class').count()
summarytable=summarytable.rename(columns={'score':'occurrences'}).drop('pid',axis=1)

In [97]:
summarytable.head(3)

Unnamed: 0_level_0,occurrences
class,Unnamed: 1_level_1
Akashiwo,4
Alexandrium_singlet,15
Amy_Gony_Protoc,8


In [98]:
## test by removing Alexandrium_singlet
#toremove=summarytable.index[:]=='Alexandrium_singlet'
#summarytable=summarytable.drop('Alexandrium_singlet')

In [99]:
# reduce the summary table to only the values that we want...
ns=len(target_species)
ni=0
mvv=[]
for ts in np.arange(0,ns):
    si=summarytable.index[:]==target_species[ts]
    tmp=summarytable[si]
    print(len(tmp))
    if tmp.empty:
        mvv=np.append(mvv,int(ts))
    if ni==0:
        newsum=tmp
        ni=1
    else:
        newsum=pd.concat([newsum,tmp])

1
0
1


In [100]:
summarytable=newsum

In [101]:
mvv=mvv.astype(int)

In [126]:
# add missing values
lv=len(mvv)
for v in np.arange(0,lv):
    blank={'class':target_species[mvv[v]],'occurrences':0}
    tmp=pd.DataFrame.from_dict([blank])
    tmp.set_index('class',inplace=True) # set the first column to index
    summarytable=pd.concat([summarytable,tmp])

In [114]:
# create event table
# want to create this as a module which we pass information to but for now mostly static...
## event table
# datasetName="santa-cruz-municipal-wharf"
# eventID=bin_id
# eventDate=details['timestamp_iso'] # need to truncate to just the date
# decimalLongitude=details['lng']
# decimalLatitude=details['lat']
# countryCode='US'
# geodeticDatum='WGS84'
# minimumDepthInMeters=
# maximumDepthInMeters=
# sampleSizeValue=details['ml_analyzed']
# sampleSizeUnit='milliliter'
########
# sampling protocol NEED TO ADD
#########
eventdict={'datasetName':'https://ifcb.caloos.org/santa-cruz-municipal-wharf',
           'eventID':bin_id,
           'eventDate':details['timestamp_iso'],
           'institutionCode':'UCSC',
           'decimalLongitude':details['lng'],
           'decimalLatitude':details['lat'],
           'countryCode':'US',
           'geodeticDatum':'WGS84',
           'minimumDepthInMeters':1,
           'maximumDepthInMeters':3,
           'samplingProtocol':'https://doi.org/10.1002/lno.11443',
           'sampleSizeValue':analyzed_volume,
           #'sampleSizeValue':details['ml_analyzed'], # need to remove the ml from the name
           'sampleSizeUnit':'milliliter'}
eventdf=pd.DataFrame.from_dict([eventdict])

In [115]:
eventdf

Unnamed: 0,datasetName,eventID,eventDate,institutionCode,decimalLongitude,decimalLatitude,countryCode,geodeticDatum,minimumDepthInMeters,maximumDepthInMeters,sampleSizeValue,sampleSizeUnit
0,https://ifcb.caloos.org/santa-cruz-municipal-w...,D20230717T000942_IFCB104,2023-07-17T00:09:42+00:00,UCSC,-122.021868,36.961491,US,WGS84,1,3,4.01 ml,milliliter


In [128]:
summarytable['occurrences'].iloc[0]

3

In [139]:
# 
# size of summary table
#
ls=len(summarytable)
xworms=wormsdf['Class Name'].reset_index()
# need to find classtable to match class name
# note this doesn't create a table/dataframe but needs to be adjusted as we have one to many needs to fill in columns
#for s in np.arange(0,1):
# so we only want summary table and not individuals as this point
for s in np.arange(0,ls):
    eventID=details["bin_id"]
    z=xworms['Class Name'][:]==summarytable.index[s]
    zi=np.where(z)
    iz=wormsdf.iloc[zi[0][0]]
    scientificName=iz['scientificname']
    scientificID=iz['lsid']
    taxonRank=iz['rank']
    kingdom=iz['kingdom']
    zc=classtable['class']==summarytable.index[s] # find the class names
    ci=np.where(zc)
    basisOfRecord='MachineObservation'
    if summarytable['occurrences'].iloc[s]>0:
        occurrenceStatus='Present' # need to deal with adding zeros for absent values but for now this is that case
    else:
        occurrenceStatus='Abscent'
    verbatimIdentification=classtable['class'][ci[0][0]]
    identifiedBy=''
    identificationVerificationStatus='PredictedByMachine'
    identificationReferences="Trained machine learning model: Daniel, P. (2023-02) phytoClassUCSC - A phytoplankton classifier for IFCB data. Version 1.0. Hugging Face repository. https://huggingface.co/patcdaniel/phytoClassUCSC | Software to run the trained machine learning model: Sosik, H., J. Futrelle, E. Peacock,  T. Golden, J. Lopez (2023-11-13) ifcb-analysis. GitHub repository. https://github.com/tsgolden/ifcb-analysis/commit/9e228c9f616edd85b57aefc0792125ec92f20e31 forked from https://github.com/yosoyjay/ifcb-analysis forked from https://github.com/hsosik/ifcb-analysis | Software to interpret autoclass scores: https://github.com/CeNCOOS/OBIS_workshop_2024_IFCB [Add which script or function] | Input parameters to interpret autoclass scores: https://github.com/CeNCOOS/OBIS_workshop_2024_IFCB/blob/main/class_thresholds_cencoos_classifier_20240412.csv"    
    # Needs to be put in
    # associatedMedia 
    # this is for individuals, for now we are just working with summary values
    # 
    # This is NOT proper it needs to be fixed to do it correctly...
    occurrenceID=details["bin_id"]+':Taxon:'+str(iz['AphiaID'])+'_1'
    
    tmpdict={'eventID':eventID,'occurrenceID':occurrenceID,'scienfificName':scientificName,'scientificID':scientificID,
             'taxonRank':taxonRank,'kingdom':kingdom,'basisOfRecord':basisOfRecord,'occurrenceStatus':occurrenceStatus,
             'verbatimIdentification':summarytable.index[s],'identifiedBy':'','identificationVerificationStatus':identificationVerificationStatus,
             'identificationReferences':identificationReferences}
    occframe=pd.DataFrame.from_dict([tmpdict])
    if s==0:
        occurrencedf=occframe
    else:
        occurrencedf=pd.concat([occurrencedf,occframe])

occurrencedf=occurrencedf.reset_index(drop=True)

In [140]:
occurrencedf.head(3)

Unnamed: 0,eventID,occurrenceID,scienfificName,scientificID,taxonRank,kingdom,basisOfRecord,occurrenceStatus,verbatimIdentification,identifiedBy,identificationVerificationStatus,identificationReferences
0,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:19542_1,Dinophyceae,urn:lsid:marinespecies.org:taxname:19542,Class,Chromista,MachineObservation,Present,Dinophysis,,PredictedByMachine,"Trained machine learning model: Daniel, P. (20..."
1,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:149151_1,Pseudo-nitzschia,urn:lsid:marinespecies.org:taxname:149151,Genus,Chromista,MachineObservation,Present,Pseudo-nitzschia,,PredictedByMachine,"Trained machine learning model: Daniel, P. (20..."
2,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:109470_1,Alexandrium,urn:lsid:marinespecies.org:taxname:109470,Genus,Chromista,MachineObservation,Abscent,Alexandrium_singlet,,PredictedByMachine,"Trained machine learning model: Daniel, P. (20..."


In [135]:
# This is test code to be removed when sure code runs
#zc=classtable['class']==summarytable.index[1]
#mytest=classtable[zc]
#del(rois)
#rois=mytest['pid'].str.replace(bin_id+'_','')
#rois=rois.astype(int).reset_index(drop=True)
#lr=len(rois)
#bv=0
#for i in np.arange(0,lr):
#    if i==0:
#        bv=features.loc[features['roi_number']==rois[0],'Biovolume'].item()
#    else:
#        bv=bv+features.loc[features['roi_number']==rois[0],'Biovolume'].item()
#print(bv)
##features.loc[features['roi_number']==rois[0],'Biovolume'].item()
##bv=features['roi_number'].apply(lambda x: rois[x])

In [142]:
# EMoF table
# every row must have an eventID
# not every column has to be filled 
# occurrenceID is NOT in every row
# this is for a single eventID so could have an outer loop
# row 1
row1={'eventID':details['bin_id'],
      'occurrenceID':'',
      'measurementType':'Sampling Instrument Name',
      'measurementTypeID':'http://vocab.nerc.ac.uk/collection/Q01/current/Q0100002/',
      'measurementValue':'McLane Research Laboratories Imaging FlowCytobot imaging sensor',
      'measurementValueID':'http://vocab.nerc.ac.uk/collection/L22/current/TOOL1588/',
      'measurementUnit':'',
      'measurementUnitID':'',
      'measurementRemarks':''}
emof=pd.DataFrame([row1])
row2={'eventID':details['bin_id'],
      'occurrenceID':'',
      'measurementType':'Sample Volume',
      'measurementTypeID':'http://vocab.nerc.ac.uk/collection/P01/current/VOLXXXXX/',
      'measurementValue':analyzed_volume,
      'measurementValueID':'',
      'measurementUnit':'milliliters',
      'measurementUnitID':'http://vocab.nerc.ac.uk/collection/P06/current/VVML/',
      'measurementRemarks':''}
emof=pd.concat([emof,pd.DataFrame([row2])])
ls=len(summarytable)
for s in np.arange(0,ls):
    z=xworms['Class Name'][:]==summarytable.index[s]
    zi=np.where(z)
    iz=wormsdf.iloc[zi[0][0]]
    zc=classtable['class']==summarytable.index[s] # find the class names
    occurrenceID=details["bin_id"]+':Taxon:'+str(iz['AphiaID'])+'_1'
    # get the ROIs so we can get the biovolumes
    mytest=classtable[zc]
    rois=mytest['pid'].str.replace(bin_id+'_','')
    rois=rois.astype(int).reset_index(drop=True) # will this work with feature dataframe?
    lr=len(rois)
    print(lr)
    bv=0
    if summarytable['occurrences'].iloc[s] > 0:
        for i in np.arange(0,lr):
            if i==0:
                bv=features.loc[features['roi_number']==rois[i],'Biovolume'].item()
            else:
                bv=bv+features.loc[features['roi_number']==rois[i],'Biovolume'].item()
    # now bv is the biovolume in pixels
    bv=bv/2.7 # this is the biovolume in microns?   
    # Now need to get biovolume based on roi number
    # conversion factor is 2.7 pixels per micron
    # Do we need these values if the species is abscent?
    row3={'eventID':details['bin_id'],
          'occurrenceID':occurrenceID,
          'measurementType':'Abundance of biological entity specified elsewhere per unit volume of the water body',
          'measurementTypeID':'http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL01/',
          'measurementValue':float(summarytable['occurrences'][s])/analyzed_volume,
          'measurementValueID':'',
          'measurementUnit':'Number per millilitre', # Don't we need to divide ?
          'measurementUnitID':'http://vocab.nerc.ac.uk/collection/P06/current/UCML/',
          'measurementRemarks':'number is inclusive of single cells and chains'}
    emof=pd.concat([emof,pd.DataFrame([row3])])
    row4={'eventID':details['bin_id'],
          'occurrenceID':occurrenceID,
          'measurementType':'Biovolume of biological entity specified elsewhere per unit volume of the water by calculation using Moberg and Sosik (2012) doi: 10.4319/lom.2012.10.278',
          'measurementTypeID':'',
          'measurementValue':bv, # NEED TO COMPUTE
          'measurementValueID':'',
          'measurementUnit':'cubic microns per millilitre',
          'measurementUnitID':'http://vocab.nerc.ac.uk/collection/P06/current/UCUM/',
          'measurementRemarks':'number is inclusive of single cells and chains'}
    emof=pd.concat([emof,pd.DataFrame([row4])])    
# eventID
# eventID=details['bin_id']
# measurementType='Sampling Instrument Name'
# measurementTypeID='http://vocab.nerc.ac.uk/collection/Q01/current/Q0100002/'
# measurementValue='McLane Research Laboratories Imaging FlowCytobot imaging sensor'
# measurementValueID='http://vocab.nerc.ac.uk/collection/L22/current/TOOL1588/'
# measurementType='Sample Volume'
# measurementTypeID='http://vocab.nerc.ac.uk/collection/P01/current/VOLXXXXX/'
# measurementValue=analyzed_volume
# measurementUnit='Millilitres'
# measurementUnitID='http://vocab.nerc.ac.uk/collection/P06/current/VVML/'
# occurrenceID
# measurementRemarks='abundance per milliliter, inclusive of single cells and chains'
# measurementValueID
# measurementUnit
# measurementUnitID
# measurementID
#           'sampleSizeValue':details['ml_analyzed'], # need to remove the ml from the name
#           'sampleSizeUnit':'milliliter'}
# okay need code to loop through 
#ls=len(summarytable)
# xworms=wormsdf['Class Name'].reset_index() # not sure we need this here
#for s in np.arange(0,1):
#for s in np.arange(0,ls):
#    zc=classtable['class']==summarytable.index[s] # find
# This is an example we want to use later commenting out and cleaning up code
#    mytest=classtable[zc]
#    rois=mytest['pid'].str.replace(bin_id+'_','')
#    rois=rois.astype(int) # will this work with feature dataframe?
#    # I don't think this will work since index 0 is roi_number 2
#    # does not work
#    #roin=features['roi_number'][rois]
#    #biov=features['Biovolume'][rois]
###mytest['pid']
#### print(bin_id)
##rois=mytest['pid'].str.replace(bin_id+'_','')
##rois=rois.astype(int)
###print(rois)
## summarytable.index[s] is Patrick's class name


3
3
15


  'measurementValue':float(summarytable['occurrences'][s])/analyzed_volume,
  'measurementValue':float(summarytable['occurrences'][s])/analyzed_volume,
  'measurementValue':float(summarytable['occurrences'][s])/analyzed_volume,


In [143]:
emof=emof.reset_index(drop=True)
emof

Unnamed: 0,eventID,occurrenceID,measurementType,measurementTypeID,measurementValue,measurementValueID,measurementUnit,measurementUnitID,measurementRemarks
0,D20230717T000942_IFCB104,,Sampling Instrument Name,http://vocab.nerc.ac.uk/collection/Q01/current...,McLane Research Laboratories Imaging FlowCytob...,http://vocab.nerc.ac.uk/collection/L22/current...,,,
1,D20230717T000942_IFCB104,,Sample Volume,http://vocab.nerc.ac.uk/collection/P01/current...,4.01,,milliliters,http://vocab.nerc.ac.uk/collection/P06/current...,
2,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:19542_1,Abundance of biological entity specified elsew...,http://vocab.nerc.ac.uk/collection/P01/current...,0.74813,,Number per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
3,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:19542_1,Biovolume of biological entity specified elsew...,,847675.224303,,cubic microns per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
4,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:149151_1,Abundance of biological entity specified elsew...,http://vocab.nerc.ac.uk/collection/P01/current...,0.74813,,Number per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
5,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:149151_1,Biovolume of biological entity specified elsew...,,280178.920664,,cubic microns per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
6,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:109470_1,Abundance of biological entity specified elsew...,http://vocab.nerc.ac.uk/collection/P01/current...,0.0,,Number per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
7,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:109470_1,Biovolume of biological entity specified elsew...,,0.0,,cubic microns per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
