# Format Santa Cruz Wharf IFCB Data for OBIS #


In [79]:
import numpy as np
import pandas as pd
import sys
import pyworms
import requests
import json
import os
import math

sys.path.append('../')
from utilities import ifcb_api_access

In [80]:
# define pi for use later in bio-volume computation
pi=math.pi

Using the IFCB DB API, request the names of all of the datasets on a dashboard instance.

In [2]:
base_url='https://ifcb.caloos.org/'

def get_datasets(dashboard_url):
    """Return a list dashboard datasets from the API

    Args:
        dashboard_url (str): base url of an IFCB dashboard (V2)
    
    Returns:
        list: list of dataset names
    """
    
    request_rul = os.path.join(base_url, 'api/filter_options')

    response=requests.get(request_rul)
    content=response.content
    content=json.loads(content)
    # options=pd.DataFrame([content])
    return content['dataset_options']

dataset_names = get_datasets(base_url)
scw_datset = dataset_names[12]
dataset_names

['bloofinz-io',
 'bodega-marine-lab',
 'calcofi-cruises-ctd',
 'calcofi-cruises-underway',
 'cal-poly-humboldt-hioc',
 'cce-lter-process-cruises-ifcb-151',
 'del-mar-mooring',
 'mbari-power-buoy',
 'newport-beach-pier',
 'plumes-and-blooms-cruises',
 'san-francisco-bay-cruises',
 'san-francisco-pier-17',
 'santa-cruz-municipal-wharf',
 'scripps-pier-ifcb-151',
 'scripps-pier-ifcb-158',
 'scripps-pier-ifcb-183',
 'stearns-wharf']

__Load AphiaID to ClassificationID mapping spreadsheet__

Each classification class is mapped to an AphiaID in the Aphia table.

In [3]:

class_names_file = "../data/class_names_matched_to_WoRMS_cencoos_classifier_20240412.csv"
class_names = pd.read_csv(class_names_file)
class_names.head()

Unnamed: 0,Class Name,Class ID,Common Name,AphiaID,URN,Grouping,HAB,Description,Manual Classifier Notes,Example Images
0,Akashiwo,0,Akashiwo sanguinea,232546.0,urn:lsid:marinespecies.org:taxname:232546,Dinoflagellate,Y,"Monophyletic, marine dinoflagellate","large single cells, kidney-shaped when viewed ...",
1,Alexandrium_singlet,1,Alexandium sp.,109470.0,urn:lsid:marinespecies.org:taxname:109470,Dinoflagellate,Y,A. catenella and others,"Originally class was broken out for singles, d...",http://akashiwo.oceandatacenter.ucsc.edu:8000/...
2,Amy_Gony_Protoc,2,"Amylax, Gonyaulax or Protoceratium",109428.0,urn:lsid:marinespecies.org:taxname:109428,Dinoflagellate,Y,Descision was made not to distinguish between ...,“Feet” and points visible.,
3,Asterionellopsis,3,Asterionellopsis,149138.0,urn:lsid:marinespecies.org:taxname:149138,Diatom,N,"Common species include A. glacialis, A. kariana",,
4,Boreadinium,4,Boredadinium pisiforme,110067.0,urn:lsid:marinespecies.org:taxname:110067,Dinoflagellate,N,Slightly compressed thecate cells,<45 um,


In [4]:
scientificnames = []
lsids = []
ranks = []
kingdoms = []

for i, aphiaid in enumerate(class_names['AphiaID'].values):
    
    # Check if AphiaID is NaN
    if ~np.isnan(aphiaid):
        # Call worms API service
        worms_response = wmentry = pyworms.aphiaRecordByAphiaID(int(aphiaid))
        scientificnames.append(worms_response['scientificname'])
        lsids.append(worms_response['lsid'])
        ranks.append(worms_response['rank'])
        kingdoms.append(worms_response['kingdom'])
        
    else:
        scientificnames.append(None)
        lsids.append(None)
        ranks.append(None)
        kingdoms.append(None)

In [6]:
worms_df = pd.DataFrame(data={
                            "AphiaID":class_names['AphiaID'].values,
                            "scientificname":scientificnames,
                            "lsid":lsids,
                            "rank":ranks,
                            "kingdom":kingdoms
                            })
worms_df['className'] = class_names['Class Name']
worms_df.tail()

Unnamed: 0,AphiaID,scientificname,lsid,rank,kingdom,className
47,101196.0,Tontonia,urn:lsid:marinespecies.org:taxname:101196,Genus,Chromista,Tontonia
48,109479.0,Torodinium,urn:lsid:marinespecies.org:taxname:109479,Genus,Chromista,Torodinium
49,149518.0,Tropidoneis,urn:lsid:marinespecies.org:taxname:149518,Genus,Chromista,Tropidoneis
50,707571.0,Vicicitus,urn:lsid:marinespecies.org:taxname:707571,Genus,Chromista,Vicicitus
51,,,,,,unclassified


__Load Class Specific Thredsholds__

In [7]:
thresholds = pd.read_csv('../data/class_thresholds_cencoos_classifier_20240412.csv')
thresholds.head(3)

Unnamed: 0,Class Name,Threshold
0,Akashiwo,0.95
1,Alexandrium_singlet,0.49
2,Amy_Gony_Protoc,0.35


Files that are unique to the deployments are the class name file and the threshold file</p>
These can NOT be extracted from the api as far as we know and have to be known apriori

In [8]:
# okay now we have most of the pieces 
# start to construct the tables?
# don't know if the random bin id I picked has the species we are looking for above the threshold
# Species of interest are Dinophysis, Alexandrium, and Pseudo-nitzschia
# do we have other aliases we need to keep track of?
# There are no alias we need to look for this particular case!
target_species=['Dinophysis','Alexandrium_singlet','Pseudo-nitzschia']

In [9]:
# Lets get some bin_id's 
bin_ids=ifcb_api_access.get_bins_in_range("2023-07-17","2023-07-18")
bin_ids=bin_ids.str.split('/')
bin_ids.head(3)
#print(bin_ids[0:3]) # print limited number of names for an idea

0    [uz-municipal-wharf, D20230717T000942_IFCB104]
1    [uz-municipal-wharf, D20230717T003329_IFCB104]
2    [uz-municipal-wharf, D20230717T005715_IFCB104]
Name: pid, dtype: object

In [11]:
def get_bins_in_range(start_date, end_date, dataset_name, base_dashboard_url='https://ifcb.caloos.org'):
    """ Given a start date and end date, request all of the ifcb sampled from a given instrument feed

    Args:
        start_date (str): Start date string in the form of yyyy-mm-dd
        end_date (str): End date string in the form of yyyy-mm-dd
    Returns: 
        (pd.DataFrame): dataframe with a series of bin ids 
    """
    # Dates should be of the 
    url = f"{base_dashboard_url}/{dataset_name}/api/feed/temperature/start/{start_date}/end/{end_date}"
    response=requests.get(url)
    
    if response.status_code==200:
        content=response.content
        content=json.loads(content)
        content=pd.DataFrame.from_dict(content)
        content["pid"]=content["pid"].map(lambda x: x.lstrip(f"{base_dashboard_url}/{dataset_name}/"))
        content=content["pid"]
        return(content)
    
    else:
        print('Failed to get all bins with range with code: '+response.status_code)
        return(response.status_code)
    
    
def get_ifcb_metadata(bin, base_dashboard_url='https://ifcb.caloos.org'):    
    """ Return metadata for a given bin using the dashboard API (V2)
    Args:
        bin (str): bin id to get metadata. Bins are in the form of DYYYYMMDDTHHmmss_IFCBNNN (ex. D20230717T000942_IFCB104)
        base_dashboard_url (str): base url of an IFCB dashboard (V2)
    Returns:
        dict: Returns a dictionary of metadata for a given bin
    """
    request_url = f"{base_dashboard_url}/api/metadata/{bin}"
    1
    response=requests.get(request_url)
    
    if response.status_code==200:
        content=response.content
        content=json.loads(content)
    
    else:
        print("Metadata GET request failed with code: "+str(response.status_code))
        content = response.status_code
        
    return content

def get_bin_details(bin, base_dashboard_url='https://ifcb.caloos.org'):
    """ Return information about a bin using the dashboard API (V2)

    Args:
        bin (str): bin id to get metadata. Bins are in the form of DYYYYMMDDTHHmmss_IFCBNNN (ex. D20230717T000942_IFCB104)
        base_dashboard_url (str, optional): _description_. Defaults to 'https://ifcb.caloos.org'.

    Returns:
        _type_: _description_
    """
    url = f"{base_dashboard_url}/api/bin/{bin}"
    
    response=requests.get(url)
    
    if response.status_code==200:
        content = response.content
        content = json.loads(content)
        # newdict = {"bin_id":bin}
        # newdict.update(content)
        content
    
    else:
        print("Bin neighbors GET request failed with code: "+str(response.status_code))
        content = None
    return content

def bin_has_autoclass(bin, base_dashboard_url='https://ifcb.caloos.org'):
    """return boolean if a bin has autoclassification results

    Args:
        bin (str): bin id to get metadata. Bins are in the form of DYYYYMMDDTHHmmss_IFCBNNN (ex. D20230717T000942_IFCB104)
        base_dashboard_url (str, optional): base url of an IFCB dashboard (V2). Defaults to 'https://ifcb.caloos.org'.

    Returns:
        boolean: True if dahsboard has autoclassification results for bin, False otherwise
    """
   
    url = f"{base_dashboard_url}/api/has_products/{bin}"
    response=requests.get(url)
    
    if response.status_code==200:
        content=response.content
        content=json.loads(content)       
        class_score = content['has_class_scores']
    
    else:
        print('Autclass GET faile with code: '+str(response.status_code))
        class_score = None
        
    return class_score

def get_autoclass_data(bin,dataset_name, base_url='https://ifcb.caloos.org'):
    """Return autoclassification data for a given bin

    Args:
        bin (str): bin id to get metadata. Bins are in the form of DYYYYMMDDTHHmmss_IFCBNNN (ex. D20230717T000942_IFCB104)
        base_url (str, optional): base url of an IFCB dashboard (V2). Defaults to 'https://ifcb.caloos.org'.

    Returns:
        dict: dictionary of autoclassification data
    """
    try:
        url = f"{base_url}/{dataset_name}/{bin}_class_scores.csv"
        df = pd.read_csv(url)
        
    except Exception as e:
        print(f"Failed to get autoclassification data for bin {bin} with error: {e}")
        df = None
        
    return df

def get_feature_file(bin, dataset_name, base_url='https://ifcb.caloos.org'):
    """Return feature file for a given bin

    Args:
        bin (str): bin id to get metadata. Bins are in the form of DYYYYMMDDTHHmmss_IFCBNNN (ex. D20230717T000942_IFCB104)
        base_url (str, optional): base url of an IFCB dashboard (V2). Defaults to 'https://ifcb.caloos.org'.

    Returns:
        pd.DataFrame: feature file for a given bin
    """
    try:
        url = f"{base_url}/{dataset_name}/{bin}_features.csv"
        df = pd.read_csv(url)
        
    except Exception as e:
        print(f"Failed to get feature file for bin {bin} with error: {e}")
        df = None
        
    return df


In [12]:

DATASET = scw_datset
start_date = "2023-07-17"
end_date = "2023-07-18"

range_response = get_bins_in_range(start_date=start_date, end_date=end_date, dataset_name=DATASET)
print(f"n files: {len(range_response)}\n{range_response.head()}")

n files: 58
0    D20230717T000942_IFCB104
1    D20230717T003329_IFCB104
2    D20230717T005715_IFCB104
3    D20230717T012101_IFCB104
4    D20230717T014447_IFCB104
Name: pid, dtype: object


In [13]:
metadatavals = get_ifcb_metadata(range_response.iloc[0])
metadatavals

{'metadata': {'FileComment': 'file comment',
  'SyringeSampleVolume': 5,
  'sampleVolume2skip': 0,
  'runTime': 1200.8840277777779,
  'inhibitTime': 238.50505208333334,
  'temperature': 23.13378271152819,
  'humidity': 16.3658518348974,
  'PMTAhighVoltage': 0.45,
  'PMTBhighVoltage': 0.55,
  'PMTAtriggerThreshold_DAQ_MCConly': 0.14,
  'PMTBtriggerThreshold_DAQ_MCConly': 0.14,
  'blobXgrowAmount': 20,
  'blobYgrowAmount': 5,
  'binarizeThreshold': 8,
  'minimumBlobArea': 2000,
  'runSampleFast': False,
  'context': 'SoftwareVersion: 2.3.0.0',
  'AnalogFirmware': 45,
  'HousekeepingFirmware': 32,
  'sampleNumber': 187,
  'sampleType': 'Normal',
  'triggerCount': 2876,
  'roiCount': 2951,
  'ADCFileFormat': 'trigger#, ADCtime, PMTA, PMTB, PMTC, PMTD, PeakA, PeakB, PeakC, PeakD, TimeOfFlight, GrabTimeStart, GrabTimeEnd, RoiX, RoiY, RoiWidth, RoiHeight, StartByte, ComparatorOut, StartPoint, SignalLength, Status, RunTime, InhibitTime',
  'DAQ_MCCserialPort_DAC_MCConly': '/dev/ttyS2',
  'auxP

In [74]:
if bin_has_autoclass(bin=range_response.iloc[0]):
    auto_class = get_autoclass_data(bin=range_response.iloc[0],dataset_name=DATASET)
    print(auto_class.head())
else:
    print('No autoclass file, need a different fileID')

                              pid  Akashiwo  Alexandrium_singlet  \
0  D20230717T000942_IFCB104_00002  0.000019         7.090000e-05   
1  D20230717T000942_IFCB104_00003  0.000005         1.000000e-07   
2  D20230717T000942_IFCB104_00004  0.000001         6.000000e-07   
3  D20230717T000942_IFCB104_00005  0.000023         0.000000e+00   
4  D20230717T000942_IFCB104_00006  0.000001         0.000000e+00   

   Amy_Gony_Protoc  Asterionellopsis   Boreadinium       Centric  Ceratium  \
0     0.000000e+00          0.000001  0.000000e+00  5.540000e-06  0.000000   
1     8.300000e-07          0.000020  6.000000e-08  3.135000e-05  0.000036   
2     6.000000e-08          0.000000  4.050000e-06  2.253000e-05  0.000014   
3     8.100000e-06          0.000000  0.000000e+00  2.980000e-05  0.001306   
4     1.670000e-06          0.000000  0.000000e+00  3.600000e-07  0.000090   

    Chaetoceros      Ciliates  ...     Scrip_Het   Skeletonema  Thalassionema  \
0  3.536000e-04  5.400000e-07  ...  2.400

### Pull __ml_analyzed__ from the IFCB database for each bin. ###

Volume analyzed (ml_analyzed) is the effective sample volume for a given sample. Sample volume can be limited by the processing speed of the camera/computer system and is caluclated as function of the inhibited time of the camera and the flow rate of the water through the camera.

In [15]:
bin_info = get_bin_details(range_response.iloc[0])
vol_analyzed = float(bin_info['ml_analyzed'].split()[0])
print(f"Volume Analyzed: {vol_analyzed} mL for bin {range_response.iloc[0]}")

Volume Analyzed: 4.01 mL for bin D20230717T000942_IFCB104


### Get Image Feature Information ###

Features are properties of the image that are extracted (ie circularity, aspect ratio, biovolume, esd...). These can used to classify the images into classes using random forest classifiers and are important for calculating biovolume and carbon mass.

In [16]:
features = get_feature_file(range_response.iloc[0],dataset_name=DATASET)
features.head()

Unnamed: 0,roi_number,Area,B180,B90,Bflip,Biovolume,BoundingBox_xwidth,BoundingBox_ywidth,ConvexArea,ConvexPerimeter,...,HOG79,HOG80,HOG81,Area_over_PerimeterSquared,Area_over_Perimeter,H90_over_Hflip,H90_over_H180,Hflip_over_H180,summedConvexPerimeter_over_Perimeter,rotated_BoundingBox_solidity
0,2,3496.0,0.863143,0.800571,0.840857,170807.2,76,78,4155.436392,235.872784,...,0.384116,0.18996,0.367227,0.041825,12.092127,1.18866,1.334921,1.123047,0.815848,0.609756
1,3,2432.0,0.925576,0.392681,0.900493,161885.6,124,34,2835.403637,260.807274,...,0.272964,0.235927,0.315059,0.0309,8.668835,11.166438,13.717263,1.228437,0.929644,0.57685
2,4,35772.0,0.198306,0.133238,0.954282,1505260.0,408,334,86633.314481,1144.628962,...,0.328164,0.404628,0.406124,0.014081,22.443554,35.462351,1.220652,0.034421,0.718147,0.330039
3,5,8955.0,0.767606,0.410127,0.917505,310105.0,222,104,14643.525539,523.051077,...,0.230965,0.555786,0.30078,0.021016,13.718427,12.932686,4.058272,0.3138,0.801277,0.469853
4,6,6239.0,0.769169,0.447225,0.929901,177397.7,173,85,9613.432291,419.864582,...,0.280826,0.446211,0.267132,0.020225,11.233139,15.129081,4.41241,0.291651,0.755954,0.540958


In [77]:
#xx=list(features.columns)
#xx[11]

'EquivDiameter'

Combine features with the class information to create a single table.

In [17]:
testing = auto_class.copy()
auto_class.head()


Unnamed: 0,pid,Akashiwo,Alexandrium_singlet,Amy_Gony_Protoc,Asterionellopsis,Boreadinium,Centric,Ceratium,Chaetoceros,Ciliates,...,Scrip_Het,Skeletonema,Thalassionema,Thalassiosira,Tiarina,Tintinnid,Tontonia,Torodinium,Tropidoneis,Vicicitus
0,D20230717T000942_IFCB104_00002,1.9e-05,7.09e-05,0.0,1e-06,0.0,5.54e-06,0.0,0.0003536,5.4e-07,...,2.4e-07,3.9e-06,0.0,0.000644,0.0,0.0,2e-07,0.0,0.0,0.0
1,D20230717T000942_IFCB104_00003,5e-06,1e-07,8.3e-07,2e-05,6e-08,3.135e-05,3.6e-05,0.0005593,1.526e-05,...,3.7e-06,1.7e-06,2e-07,3.934e-05,0.0,6e-08,6.56e-06,0.001355,0.0,0.0
2,D20230717T000942_IFCB104_00004,1e-06,6e-07,6e-08,0.0,4.05e-06,2.253e-05,1.4e-05,4.35e-06,0.0,...,0.0,2.3e-06,0.0,1.85e-06,0.0,0.0,7e-07,4e-07,2e-07,8e-06
3,D20230717T000942_IFCB104_00005,2.3e-05,0.0,8.1e-06,0.0,0.0,2.98e-05,0.001306,7.93e-05,4.3e-06,...,4e-07,1e-07,0.0,6e-08,0.0,6.6e-07,9e-07,2.4e-07,0.0,0.0
4,D20230717T000942_IFCB104_00006,1e-06,0.0,1.67e-06,0.0,0.0,3.6e-07,9e-05,5e-07,6e-08,...,0.0,0.0,0.0,0.0,0.0,6e-08,6e-08,4e-07,0.0,0.0


In [18]:
features['pid'] = range_response.iloc[0] + "_" + features['roi_number'].astype(str).str.zfill(5)
features.head()

Unnamed: 0,roi_number,Area,B180,B90,Bflip,Biovolume,BoundingBox_xwidth,BoundingBox_ywidth,ConvexArea,ConvexPerimeter,...,HOG80,HOG81,Area_over_PerimeterSquared,Area_over_Perimeter,H90_over_Hflip,H90_over_H180,Hflip_over_H180,summedConvexPerimeter_over_Perimeter,rotated_BoundingBox_solidity,pid
0,2,3496.0,0.863143,0.800571,0.840857,170807.2,76,78,4155.436392,235.872784,...,0.18996,0.367227,0.041825,12.092127,1.18866,1.334921,1.123047,0.815848,0.609756,D20230717T000942_IFCB104_00002
1,3,2432.0,0.925576,0.392681,0.900493,161885.6,124,34,2835.403637,260.807274,...,0.235927,0.315059,0.0309,8.668835,11.166438,13.717263,1.228437,0.929644,0.57685,D20230717T000942_IFCB104_00003
2,4,35772.0,0.198306,0.133238,0.954282,1505260.0,408,334,86633.314481,1144.628962,...,0.404628,0.406124,0.014081,22.443554,35.462351,1.220652,0.034421,0.718147,0.330039,D20230717T000942_IFCB104_00004
3,5,8955.0,0.767606,0.410127,0.917505,310105.0,222,104,14643.525539,523.051077,...,0.555786,0.30078,0.021016,13.718427,12.932686,4.058272,0.3138,0.801277,0.469853,D20230717T000942_IFCB104_00005
4,6,6239.0,0.769169,0.447225,0.929901,177397.7,173,85,9613.432291,419.864582,...,0.446211,0.267132,0.020225,11.233139,15.129081,4.41241,0.291651,0.755954,0.540958,D20230717T000942_IFCB104_00006


Reshape the Pandas dataframe by shifting columns and using the new version of stacking.  Make a copy of the dataframe so we can use the old version if needed later.

In [22]:
# Need to remove the index column header before we restructure the dataframe.  This is so we don't have 'pid' filling the
# columns and messing up how the restructure dataframe is put together.
a2=auto_class.copy()
a2=a2.set_index(['pid'],append=False)
a2.index.name=None
a2.head(3)

Unnamed: 0,Akashiwo,Alexandrium_singlet,Amy_Gony_Protoc,Asterionellopsis,Boreadinium,Centric,Ceratium,Chaetoceros,Ciliates,Clusterflagellate,...,Scrip_Het,Skeletonema,Thalassionema,Thalassiosira,Tiarina,Tintinnid,Tontonia,Torodinium,Tropidoneis,Vicicitus
D20230717T000942_IFCB104_00002,1.9e-05,7.09e-05,0.0,1e-06,0.0,6e-06,0.0,0.000354,5.4e-07,0.998,...,2.4e-07,4e-06,0.0,0.000644,0.0,0.0,2e-07,0.0,0.0,0.0
D20230717T000942_IFCB104_00003,5e-06,1e-07,8.3e-07,2e-05,6e-08,3.1e-05,3.6e-05,0.000559,1.526e-05,2e-06,...,3.7e-06,2e-06,2e-07,3.9e-05,0.0,6e-08,6.56e-06,0.001355,0.0,0.0
D20230717T000942_IFCB104_00004,1e-06,6e-07,6e-08,0.0,4.05e-06,2.3e-05,1.4e-05,4e-06,0.0,0.0,...,0.0,2e-06,0.0,2e-06,0.0,0.0,7e-07,4e-07,2e-07,8e-06


Reshape the dataframe so that "pid" is first column, second are the class names, and the last column are the thresholds for the dataset

In [23]:

a3=a2.stack(future_stack=True)
a3.head(3)

D20230717T000942_IFCB104_00002  Akashiwo               0.000019
                                Alexandrium_singlet    0.000071
                                Amy_Gony_Protoc        0.000000
dtype: float64

Sort out the "pid", class name, and score into a dictionary so we can then create a new dataframe with these as columns and that each class has a "pid" associated with it.  There must be an elegant way to do this, this code is a brute force way of doing it.

In [24]:
ik=np.arange(0,len(a3))
for r in ik:
    junk=str(a3[r:r+1]).split()
    if r==0:
        pid=[junk[0]]
        myclass=[junk[1]]
        score=[float(junk[2])]
    else:
        pid.append(junk[0])
        myclass.append(junk[1])
        score.append(float(junk[2]))

In [25]:
bigtable=pd.DataFrame({'pid':pid,'class':myclass,'score':score})

In [26]:
bigtable.head(3)

Unnamed: 0,pid,class,score
0,D20230717T000942_IFCB104_00002,Akashiwo,1.9e-05
1,D20230717T000942_IFCB104_00002,Alexandrium_singlet,7.1e-05
2,D20230717T000942_IFCB104_00002,Amy_Gony_Protoc,0.0


In [31]:
# It is possible that there could be winners for multiple values
# Dinophysis, Alexandrium, Pseudo-Nitchia
# Note we keep track of full name with ROIs so we can check after this runs to see if we have 
# multiple winners for a class.
# 
# if the pid is duplicated we can make note about that
# So far we have not restricted to our target species
# we will do that after this so we can catch possible duplicate ROI use 
# If that does happen we have to decide which we will declare to be the winner
[l1,l2]=thresholds.shape
ix=np.arange(0,l1)
zz=0
for kk in ix:
    id1=bigtable['class']==thresholds['Class Name'][kk] # find the same names between bigtable and the threshold list
    subtable=bigtable[['pid','class','score']][id1] # subsample the table to only those with that name
    id2=subtable['score'] >= thresholds['Threshold'][kk] # find all values in the subsample that have a value greater than the threshold for that name
    smalltable=subtable[['pid','class','score']][id2] #subsample the subsampled table and keep the pid so we can see if we duplicate
    # put results into what I'm calling a class table
    if smalltable.size > 0:
        # we don't have a classtable yet so create it
        if zz==0:
            classtable=smalltable
            zz=zz+1
        else:
            # let the classtable grow
            classtable=pd.concat([classtable, smalltable],ignore_index=True)

In [32]:
# Example of output
classtable.head(3)

Unnamed: 0,pid,class,score
0,D20230717T000942_IFCB104_00416,Akashiwo,1.0
1,D20230717T000942_IFCB104_01428,Akashiwo,1.0
2,D20230717T000942_IFCB104_01640,Akashiwo,0.9946


In [34]:
auto_class.head(3)

Unnamed: 0,pid,Akashiwo,Alexandrium_singlet,Amy_Gony_Protoc,Asterionellopsis,Boreadinium,Centric,Ceratium,Chaetoceros,Ciliates,...,Scrip_Het,Skeletonema,Thalassionema,Thalassiosira,Tiarina,Tintinnid,Tontonia,Torodinium,Tropidoneis,Vicicitus
0,D20230717T000942_IFCB104_00002,1.9e-05,7.09e-05,0.0,1e-06,0.0,6e-06,0.0,0.000354,5.4e-07,...,2.4e-07,4e-06,0.0,0.000644,0.0,0.0,2e-07,0.0,0.0,0.0
1,D20230717T000942_IFCB104_00003,5e-06,1e-07,8.3e-07,2e-05,6e-08,3.1e-05,3.6e-05,0.000559,1.526e-05,...,3.7e-06,2e-06,2e-07,3.9e-05,0.0,6e-08,6.56e-06,0.001355,0.0,0.0
2,D20230717T000942_IFCB104_00004,1e-06,6e-07,6e-08,0.0,4.05e-06,2.3e-05,1.4e-05,4e-06,0.0,...,0.0,2e-06,0.0,2e-06,0.0,0.0,7e-07,4e-07,2e-07,8e-06


In [35]:
# Now check to see if full pid with ROI has been assigned to multiple class names
# This is very likely to happen and we need to keep track of it
# find common pid with different class names
la=len(auto_class)
for o in np.arange(0,la):
    p=auto_class['pid'][o]
    zc=classtable['pid']==p
    test=classtable[zc]
    if test.empty:
        # do nothing
        i=0
    else:
        if len(test) > 1:
            # in this case multiple classes have been assigned to a single ROI
            # we need to determine a winner
            print(test)

                                 pid              class   score
198   D20230717T000942_IFCB104_01481  Clusterflagellate  0.5930
1701  D20230717T000942_IFCB104_01481      Thalassiosira  0.3804


In [34]:
# This is an example we want to use later commenting out and cleaning up code
#pidsbyclass=classtable['class']=='Centric'
#mytest=classtable[pidsbyclass]
##mytest['pid']
## print(bin_id)
#rois=mytest['pid'].str.replace(bin_id+'_','')
#rois=rois.astype(int)
##print(rois)

In [36]:
summarytable=classtable.groupby('class').count()
summarytable=summarytable.rename(columns={'score':'occurrences'}).drop('pid',axis=1)

In [37]:
summarytable.head(3)

Unnamed: 0_level_0,occurrences
class,Unnamed: 1_level_1
Akashiwo,4
Alexandrium_singlet,15
Amy_Gony_Protoc,8


In [37]:
## test by removing Alexandrium_singlet
#toremove=summarytable.index[:]=='Alexandrium_singlet'
#summarytable=summarytable.drop('Alexandrium_singlet')

In [38]:
# reduce the summary table to only the values that we want...
ns=len(target_species)
ni=0
mvv=[]
for ts in np.arange(0,ns):
    si=summarytable.index[:]==target_species[ts]
    tmp=summarytable[si]
    #print(len(tmp))
    if tmp.empty:
        mvv=np.append(mvv,int(ts))
    if ni==0:
        newsum=tmp
        ni=1
    else:
        newsum=pd.concat([newsum,tmp])

In [39]:
summarytable=newsum

Check if we are missing any of our classes of interest, if mvv is not empty then we are missing something

In [40]:
mvv

[]

Make sure we have an integer value if it is not empty and not some other type of variable.

In [41]:
if len(mvv) > 0:
    mvv=mvv.astype(int)

Add the missing class of interest to the summary table as not there so showing abscence.

In [42]:
# add missing values
lv=len(mvv)
for v in np.arange(0,lv):
    blank={'class':target_species[mvv[v]],'occurrences':0}
    tmp=pd.DataFrame.from_dict([blank])
    tmp.set_index('class',inplace=True) # set the first column to index
    summarytable=pd.concat([summarytable,tmp])

In [50]:
# create event table
# want to create this as a module which we pass information to but for now mostly static...
## event table
# datasetName="santa-cruz-municipal-wharf"
# eventID=bin_id range_response.iloc[0]?
# eventDate=details['timestamp_iso'] # need to truncate to just the date
# decimalLongitude=details['lng']
# decimalLatitude=details['lat']
# countryCode='US'
# geodeticDatum='WGS84'
# minimumDepthInMeters=
# maximumDepthInMeters=
# sampleSizeValue=details['ml_analyzed']
# sampleSizeUnit='milliliter'
########
# sampling protocol NEED TO ADD
#########
eventdict={'datasetName':'https://ifcb.caloos.org/santa-cruz-municipal-wharf',
           'eventID':range_response.iloc[0],
           #'eventID':bin_id,
           'eventDate':bin_info['timestamp_iso'],
           #'eventDate':details['timestamp_iso'],
           'institutionCode':'UCSC',
           'decimalLongitude':bin_info['lng'],
          # 'decimalLongitude':details['lng'],
           'decimalLatitude':bin_info['lat'],
          # 'decimalLatitude':details['lat'],
           'countryCode':'US',
           'geodeticDatum':'WGS84',
           'minimumDepthInMeters':1,
           'maximumDepthInMeters':3,
           'samplingProtocol':'https://doi.org/10.1002/lno.11443',
           'sampleSizeValue':vol_analyzed,
           #'sampleSizeValue':analyzed_volume,
           #'sampleSizeValue':details['ml_analyzed'], # need to remove the ml from the name
           'sampleSizeUnit':'milliliter'}
eventdf=pd.DataFrame.from_dict([eventdict])

In [51]:
eventdf

Unnamed: 0,datasetName,eventID,eventDate,institutionCode,decimalLongitude,decimalLatitude,countryCode,geodeticDatum,minimumDepthInMeters,maximumDepthInMeters,samplingProtocol,sampleSizeValue,sampleSizeUnit
0,https://ifcb.caloos.org/santa-cruz-municipal-w...,D20230717T000942_IFCB104,2023-07-17T00:09:42+00:00,UCSC,-122.021868,36.961491,US,WGS84,1,3,https://doi.org/10.1002/lno.11443,4.01,milliliter


In [52]:
summarytable['occurrences'].iloc[0]

3

In [61]:
classtable.head(3)

Unnamed: 0,pid,class,score
0,D20230717T000942_IFCB104_00416,Akashiwo,1.0
1,D20230717T000942_IFCB104_01428,Akashiwo,1.0
2,D20230717T000942_IFCB104_01640,Akashiwo,0.9946


In [65]:
# 
# size of summary table
#
# how to add associated data...
#
ls=len(summarytable)
xworms=worms_df['className'].reset_index()
# need to find classtable to match class name
# note this doesn't create a table/dataframe but needs to be adjusted as we have one to many needs to fill in columns
#for s in np.arange(0,1):
# so we only want summary table and not individuals as this point
for s in np.arange(0,ls):
    eventID=range_response.iloc[0]
    #eventID=details["bin_id"]
    z=xworms['className'][:]==summarytable.index[s]
    zi=np.where(z)
    iz=worms_df.iloc[zi[0][0]]
    scientificName=iz['scientificname']
    scientificID=iz['lsid']
    taxonRank=iz['rank']
    kingdom=iz['kingdom']
    zc=classtable['class']==summarytable.index[s] # find the class names
    ci=np.where(zc)
    basisOfRecord='MachineObservation'
    if summarytable['occurrences'].iloc[s]>0:
        occurrenceStatus='Present' # need to deal with adding zeros for absent values but for now this is that case
    else:
        occurrenceStatus='Abscent'
    verbatimIdentification=classtable['class'][ci[0][0]]
    identifiedBy=''
    identificationVerificationStatus='PredictedByMachine'
    identificationReferences="Trained machine learning model: Daniel, P. (2023-02) phytoClassUCSC - A phytoplankton classifier for IFCB data. Version 1.0. Hugging Face repository. https://huggingface.co/patcdaniel/phytoClassUCSC | Software to run the trained machine learning model: Sosik, H., J. Futrelle, E. Peacock,  T. Golden, J. Lopez (2023-11-13) ifcb-analysis. GitHub repository. https://github.com/tsgolden/ifcb-analysis/commit/9e228c9f616edd85b57aefc0792125ec92f20e31 forked from https://github.com/yosoyjay/ifcb-analysis forked from https://github.com/hsosik/ifcb-analysis | Software to interpret autoclass scores: https://github.com/CeNCOOS/OBIS_workshop_2024_IFCB [Add which script or function] | Input parameters to interpret autoclass scores: https://github.com/CeNCOOS/OBIS_workshop_2024_IFCB/blob/main/class_thresholds_cencoos_classifier_20240412.csv"    
    # Needs to be put in
    # associatedMedia 
    # get the ROIs so we can get reference the actual image in the dataset
    mytest=classtable[zc]
    # Take the extended "pid" and remove all leading string info to get the ROI number at the end of the string.
    # It was done this way so that it does not depend on the length of the first part of the string.
    rois=mytest['pid'].str.replace(range_response.iloc[0]+'_','')
    #rois=mytest['pid'].str.replace(bin_id+'_','')
    rois=rois.reset_index(drop=True)
    lr=len(rois)
    # image share link example is
    # https://ifcb.caloos.org/image?image=02075&bin=D20230717T0033229_IFCB104
    roistr=''
    strpre='https://ifcb.caloos.org/image?image='
    binstr='&bin='
    for ri in np.arange(0,lr):
        roistr=roistr+strpre+rois[ri]+binstr+range_response.iloc[0]+"|"
        #roistr=roistr+strpre+rois[ri]+binstr+bin_id+"|"
    associatedMedia=roistr
    
    # this is for individuals, for now we are just working with summary values
    # 
    # This is NOT proper it needs to be fixed to do it correctly...
    # This needs to be fixed to have multiple values
    occurrenceID=range_response.iloc[0]+':Taxon:'+str(iz['AphiaID'])+'_1'
    #occurrenceID=details["bin_id"]+':Taxon:'+str(iz['AphiaID'])+'_1'
    # check against table to see if it already is in the dataframe if it is we add and increment....
    # can we possibly have more than 2?
    if s > 0:
        zi=occurrencedf['occurrenceID']==occurrenceID
        ajunk=occurrencedf['occurrenceID'][zi]
        if len(ajunk) > 0:
            occurrenceID=range_response.iloc[0]+':Taxon'+str(iz['AphiaID'])+'_2'
            #occurrenceID=details["bin_id"]+':Taxon:'+str(iz['AphiaID'])+'_2'
    tmpdict={'eventID':eventID,'occurrenceID':occurrenceID,'scienfificName':scientificName,'scientificID':scientificID,
             'taxonRank':taxonRank,'kingdom':kingdom,'basisOfRecord':basisOfRecord,'occurrenceStatus':occurrenceStatus,
             'verbatimIdentification':summarytable.index[s],'identifiedBy':'','identificationVerificationStatus':identificationVerificationStatus,
             'identificationReferences':identificationReferences,'associatedMedia':associatedMedia}
    # now create a dataframe from the dict for a single one
    occframe=pd.DataFrame.from_dict([tmpdict])
    # either start the dataframe/occurrence table or append new data to the frame.
    if s==0:
        occurrencedf=occframe
    else:
        occurrencedf=pd.concat([occurrencedf,occframe])

occurrencedf=occurrencedf.reset_index(drop=True)

In [66]:
occurrencedf.head(3)

Unnamed: 0,eventID,occurrenceID,scienfificName,scientificID,taxonRank,kingdom,basisOfRecord,occurrenceStatus,verbatimIdentification,identifiedBy,identificationVerificationStatus,identificationReferences,associatedMedia
0,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:19542.0_1,Dinophyceae,urn:lsid:marinespecies.org:taxname:19542,Class,Chromista,MachineObservation,Present,Dinophysis,,PredictedByMachine,"Trained machine learning model: Daniel, P. (20...",https://ifcb.caloos.org/image?image=00108&bin=...
1,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:109470.0_1,Alexandrium,urn:lsid:marinespecies.org:taxname:109470,Genus,Chromista,MachineObservation,Present,Alexandrium_singlet,,PredictedByMachine,"Trained machine learning model: Daniel, P. (20...",https://ifcb.caloos.org/image?image=00022&bin=...
2,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:149151.0_1,Pseudo-nitzschia,urn:lsid:marinespecies.org:taxname:149151,Genus,Chromista,MachineObservation,Present,Pseudo-nitzschia,,PredictedByMachine,"Trained machine learning model: Daniel, P. (20...",https://ifcb.caloos.org/image?image=01081&bin=...


In [None]:
# This is test code to be removed when sure code runs
#zc=classtable['class']==summarytable.index[1]
#mytest=classtable[zc]
#del(rois)
#rois=mytest['pid'].str.replace(bin_id+'_','')
#rois=rois.astype(int).reset_index(drop=True)
#lr=len(rois)
#bv=0
#for i in np.arange(0,lr):
#    if i==0:
#        bv=features.loc[features['roi_number']==rois[0],'Biovolume'].item()
#    else:
#        bv=bv+features.loc[features['roi_number']==rois[0],'Biovolume'].item()
#print(bv)
##features.loc[features['roi_number']==rois[0],'Biovolume'].item()
##bv=features['roi_number'].apply(lambda x: rois[x])

In [83]:
# EMoF table
# every row must have an eventID
# not every column has to be filled 
# occurrenceID is NOT in every row
# this is for a single eventID so could have an outer loop
# row 1
row1={'eventID':range_response.iloc[0],
      #'eventID':details['bin_id'],
      'occurrenceID':'',
      'measurementType':'Sampling Instrument Name',
      'measurementTypeID':'http://vocab.nerc.ac.uk/collection/Q01/current/Q0100002/',
      'measurementValue':'McLane Research Laboratories Imaging FlowCytobot imaging sensor',
      'measurementValueID':'http://vocab.nerc.ac.uk/collection/L22/current/TOOL1588/',
      'measurementUnit':'',
      'measurementUnitID':'',
      'measurementRemarks':''}
emof=pd.DataFrame([row1])
row2={'eventID':range_response.iloc[0],
      #'eventID':details['bin_id'],
      'occurrenceID':'',
      'measurementType':'Sample Volume',
      'measurementTypeID':'http://vocab.nerc.ac.uk/collection/P01/current/VOLXXXXX/',
      'measurementValue':vol_analyzed,
      'measurementValueID':'',
      'measurementUnit':'milliliters',
      'measurementUnitID':'http://vocab.nerc.ac.uk/collection/P06/current/VVML/',
      'measurementRemarks':''}
emof=pd.concat([emof,pd.DataFrame([row2])])
ls=len(summarytable)
for s in np.arange(0,ls):
    z=xworms['className'][:]==summarytable.index[s]
    zi=np.where(z)
    iz=worms_df.iloc[zi[0][0]]
    zc=classtable['class']==summarytable.index[s] # find the class names
    occurrenceID=range_response.iloc[0]+':Taxon:'+str(iz['AphiaID'])+'_1'
    #occurrenceID=details["bin_id"]+':Taxon:'+str(iz['AphiaID'])+'_1'
    # get the ROIs so we can get the biovolumes
    mytest=classtable[zc]
    rois=mytest['pid'].str.replace(range_response.iloc[0]+'_','')
    #rois=mytest['pid'].str.replace(bin_id+'_','')
    rois=rois.astype(int).reset_index(drop=True) # will this work with feature dataframe?
    lr=len(rois)
    #print(lr)
    bv=0
    if summarytable['occurrences'].iloc[s] > 0:
        for i in np.arange(0,lr):
            if i==0:
                #bv=features.loc[features['roi_number']==rois[i],'Biovolume'].item()
                #area=features.loc[features['roi_number']==rois[i],'Area'].item()
                ecd=features.loc[features['roi_number']==rois[i],'EquivDiameter'].item()
                #ecd=2*math.sqrt(area/pi)
                #print("Equiv Diameter "+str(eqd)+" Ecd from area "+str(ecd))
                bvol=(1/6)*pi*(ecd**3)
                #print("Biovol "+str(bv)+" Bio-vol "+str(bvol))
            else:
                #bv=bv+features.loc[features['roi_number']==rois[i],'Biovolume'].item()
                #area=features.loc[features['roi_number']==rois[i],'Area'].item()
                ecd=features.loc[features['roi_number']==rois[i],'EquivDiameter'].item()
                #ecd=2*math.sqrt(area/pi)
                #print("Equiv Diameter "+str(eqd)+" Ecd from area "+str(ecd)) 
                bvol_tmp=(1/6)*pi*(ecd**3)
                bvol=bvol+bvol_tmp
                #print("Sum Biovol "+str(bv)+" Bio-vol "+str(bvol))
    # now bv is the biovolume in pixels
    #bv=bv/2.7 # this is the biovolume in microns?  
    bvol=bvol/2.7 # this is the biovolume in micros?
    # Now need to get biovolume based on roi number
    # conversion factor is 2.7 pixels per micron
    # Do we need these values if the species is abscent?
    row3={'eventID':range_response.iloc[0],
          #'eventID':details['bin_id'],
          'occurrenceID':occurrenceID,
          'measurementType':'Abundance of biological entity specified elsewhere per unit volume of the water body',
          'measurementTypeID':'http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL01/',
          'measurementValue':float(summarytable['occurrences'][s])/vol_analyzed,
          'measurementValueID':'',
          'measurementUnit':'Number per millilitre', # Don't we need to divide ?
          'measurementUnitID':'http://vocab.nerc.ac.uk/collection/P06/current/UCML/',
          'measurementRemarks':'number is inclusive of single cells and chains'}
    emof=pd.concat([emof,pd.DataFrame([row3])])
    row4={'eventID':range_response.iloc[0],
          #'eventID':details['bin_id'],
          'occurrenceID':occurrenceID,
          'measurementType':'Biovolume of biological entity specified elsewhere per unit volume of the water by calculation using Moberg and Sosik (2012) doi: 10.4319/lom.2012.10.278',
          'measurementTypeID':'',
          'measurementValue':bvol, # NEED TO COMPUTE change from bv to bvol for this case
          'measurementValueID':'',
          'measurementUnit':'cubic microns per millilitre',
          'measurementUnitID':'http://vocab.nerc.ac.uk/collection/P06/current/UCUM/',
          'measurementRemarks':'number is inclusive of single cells and chains'}
    emof=pd.concat([emof,pd.DataFrame([row4])])    
# eventID
# eventID=details['bin_id']
# measurementType='Sampling Instrument Name'
# measurementTypeID='http://vocab.nerc.ac.uk/collection/Q01/current/Q0100002/'
# measurementValue='McLane Research Laboratories Imaging FlowCytobot imaging sensor'
# measurementValueID='http://vocab.nerc.ac.uk/collection/L22/current/TOOL1588/'
# measurementType='Sample Volume'
# measurementTypeID='http://vocab.nerc.ac.uk/collection/P01/current/VOLXXXXX/'
# measurementValue=analyzed_volume
# measurementUnit='Millilitres'
# measurementUnitID='http://vocab.nerc.ac.uk/collection/P06/current/VVML/'
# occurrenceID
# measurementRemarks='abundance per milliliter, inclusive of single cells and chains'
# measurementValueID
# measurementUnit
# measurementUnitID
# measurementID
#           'sampleSizeValue':details['ml_analyzed'], # need to remove the ml from the name
#           'sampleSizeUnit':'milliliter'}
# okay need code to loop through 
#ls=len(summarytable)
# xworms=wormsdf['Class Name'].reset_index() # not sure we need this here
#for s in np.arange(0,1):
#for s in np.arange(0,ls):
#    zc=classtable['class']==summarytable.index[s] # find
# This is an example we want to use later commenting out and cleaning up code
#    mytest=classtable[zc]
#    rois=mytest['pid'].str.replace(bin_id+'_','')
#    rois=rois.astype(int) # will this work with feature dataframe?
#    # I don't think this will work since index 0 is roi_number 2
#    # does not work
#    #roin=features['roi_number'][rois]
#    #biov=features['Biovolume'][rois]
###mytest['pid']
#### print(bin_id)
##rois=mytest['pid'].str.replace(bin_id+'_','')
##rois=rois.astype(int)
###print(rois)
## summarytable.index[s] is Patrick's class name


Biovol 740659.920515 Bio-vol 628631.4489067243
Sum Biovol 1334533.05943 Bio-vol 1596554.6953039295
Sum Biovol 2288723.105619 Bio-vol 2393585.6551363934
Biovol 1524645.021717 Bio-vol 1478562.6564404662
Sum Biovol 2446276.517942 Bio-vol 2269734.0977864573
Sum Biovol 2583068.0070640002 Bio-vol 2398193.744286933
Sum Biovol 3099025.6439310005 Bio-vol 2906173.0681326133
Sum Biovol 3249423.6326350006 Bio-vol 3056296.974586997
Sum Biovol 3524783.0506250006 Bio-vol 3318279.0580752757
Sum Biovol 3662509.6871600007 Bio-vol 3443869.7933839834
Sum Biovol 4248038.816069 Bio-vol 4598088.698298707
Sum Biovol 5858432.823867001 Bio-vol 5729607.187569613
Sum Biovol 6258275.88726 Bio-vol 6017973.450489577
Sum Biovol 7391874.043214001 Bio-vol 6831163.263311316
Sum Biovol 8018677.017276001 Bio-vol 7348874.968531845
Sum Biovol 8622291.634162001 Bio-vol 7860918.508344438
Sum Biovol 8981330.383173002 Bio-vol 8122900.591832716
Sum Biovol 11632527.361221002 Bio-vol 10388967.845093107
Biovol 196862.824449 Bio-vol

  'measurementValue':float(summarytable['occurrences'][s])/vol_analyzed,
  'measurementValue':float(summarytable['occurrences'][s])/vol_analyzed,
  'measurementValue':float(summarytable['occurrences'][s])/vol_analyzed,


In [71]:
emof=emof.reset_index(drop=True)
emof

Unnamed: 0,eventID,occurrenceID,measurementType,measurementTypeID,measurementValue,measurementValueID,measurementUnit,measurementUnitID,measurementRemarks
0,D20230717T000942_IFCB104,,Sampling Instrument Name,http://vocab.nerc.ac.uk/collection/Q01/current...,McLane Research Laboratories Imaging FlowCytob...,http://vocab.nerc.ac.uk/collection/L22/current...,,,
1,D20230717T000942_IFCB104,,Sample Volume,http://vocab.nerc.ac.uk/collection/P01/current...,4.01,,milliliters,http://vocab.nerc.ac.uk/collection/P06/current...,
2,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:19542.0_1,Abundance of biological entity specified elsew...,http://vocab.nerc.ac.uk/collection/P01/current...,0.74813,,Number per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
3,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:19542.0_1,Biovolume of biological entity specified elsew...,,847675.224303,,cubic microns per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
4,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:109470.0_1,Abundance of biological entity specified elsew...,http://vocab.nerc.ac.uk/collection/P01/current...,3.740648,,Number per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
5,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:109470.0_1,Biovolume of biological entity specified elsew...,,4308343.467119,,cubic microns per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
6,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:149151.0_1,Abundance of biological entity specified elsew...,http://vocab.nerc.ac.uk/collection/P01/current...,0.74813,,Number per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
7,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:149151.0_1,Biovolume of biological entity specified elsew...,,280178.920664,,cubic microns per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains


In [72]:
eventdf.to_csv('ifcb_event.csv',index=False)
occurrencedf.to_csv('ifcb_occurrence.csv',index=False)
emof.to_csv('ifcb_emof.csv',index=False)