# Format Santa Cruz Wharf IFCB Data for OBIS #


In [5]:
# Install pyworms
%pip install pyworms


Note: you may need to restart the kernel to use updated packages.


In [11]:
import numpy as np
import pandas as pd
import pyworms
import requests
import json
import os
import pyworms

## Call WoRMS API ##

__Load AphiaID to ClassificationID mapping spreadsheet__

Each classification class is mapped to an AphiaID in the Aphia table.

In [12]:

class_names_file = "../data/class_names_matched_to_WoRMS_cencoos_classifier_20240412.csv"
class_names = pd.read_csv(class_names_file)
class_names.head()

Unnamed: 0,Class Name,Class ID,Common Name,AphiaID,URN,Grouping,HAB,Description,Manual Classifier Notes,Example Images
0,Akashiwo,0,Akashiwo sanguinea,232546.0,urn:lsid:marinespecies.org:taxname:232546,Dinoflagellate,Y,"Monophyletic, marine dinoflagellate","large single cells, kidney-shaped when viewed ...",
1,Alexandrium_singlet,1,Alexandium sp.,109470.0,urn:lsid:marinespecies.org:taxname:109470,Dinoflagellate,Y,A. catenella and others,"Originally class was broken out for singles, d...",http://akashiwo.oceandatacenter.ucsc.edu:8000/...
2,Amy_Gony_Protoc,2,"Amylax, Gonyaulax or Protoceratium",109428.0,urn:lsid:marinespecies.org:taxname:109428,Dinoflagellate,Y,Descision was made not to distinguish between ...,“Feet” and points visible.,
3,Asterionellopsis,3,Asterionellopsis,149138.0,urn:lsid:marinespecies.org:taxname:149138,Diatom,N,"Common species include A. glacialis, A. kariana",,
4,Boreadinium,4,Boredadinium pisiforme,110067.0,urn:lsid:marinespecies.org:taxname:110067,Dinoflagellate,N,Slightly compressed thecate cells,<45 um,


In [13]:
scientificnames = []
lsids = []
ranks = []
kingdoms = []

for i, aphiaid in enumerate(class_names['AphiaID'].values):
    
    # Check if AphiaID is NaN
    if ~np.isnan(aphiaid):
        # Call worms API service
        worms_response = wmentry = pyworms.aphiaRecordByAphiaID(int(aphiaid))
        scientificnames.append(worms_response['scientificname'])
        lsids.append(worms_response['lsid'])
        ranks.append(worms_response['rank'])
        kingdoms.append(worms_response['kingdom'])
        
    else:
        scientificnames.append(None)
        lsids.append(None)
        ranks.append(None)
        kingdoms.append(None)

In [14]:
worms_df = pd.DataFrame(data={
                            "AphiaID":class_names['AphiaID'].values,
                            "scientificname":scientificnames,
                            "lsid":lsids,
                            "rank":ranks,
                            "kingdom":kingdoms
                            })
worms_df['className'] = class_names['Class Name']
worms_df.tail()

Unnamed: 0,AphiaID,scientificname,lsid,rank,kingdom,className
47,101196.0,Tontonia,urn:lsid:marinespecies.org:taxname:101196,Genus,Chromista,Tontonia
48,109479.0,Torodinium,urn:lsid:marinespecies.org:taxname:109479,Genus,Chromista,Torodinium
49,149518.0,Tropidoneis,urn:lsid:marinespecies.org:taxname:149518,Genus,Chromista,Tropidoneis
50,707571.0,Vicicitus,urn:lsid:marinespecies.org:taxname:707571,Genus,Chromista,Vicicitus
51,,,,,,unclassified



## Load Class Specific Thredsholds

In [7]:
thresholds = pd.read_csv('../data/class_thresholds_cencoos_classifier_20240412.csv')
thresholds.head(3)

Unnamed: 0,Class Name,Threshold
0,Akashiwo,0.95
1,Alexandrium_singlet,0.49
2,Amy_Gony_Protoc,0.35


Files that are unique to the deployments are the class name file and the threshold file</p>
These can NOT be extracted from the api as far as we know and have to be known apriori

In [8]:
target_species=['Dinophysis','Alexandrium_singlet','Pseudo-nitzschia']

## Helper Functions ##

In [22]:
def get_datasets(dashboard_url):
    """Return a list dashboard datasets from the API

    Args:
        dashboard_url (str): base url of an IFCB dashboard (V2)
    
    Returns:
        list: list of dataset names
    """
    request_rul = os.path.join(dashboard_url, 'api/filter_options')
    response=requests.get(request_rul)
    content=response.content
    content=json.loads(content)
    
    return content['dataset_options']

def get_bins_in_range(start_date, end_date, dataset_name, base_dashboard_url='https://ifcb.caloos.org'):
    """ Given a start date and end date, request all of the ifcb sampled from a given instrument feed

    Args:
        start_date (str): Start date string in the form of yyyy-mm-dd
        end_date (str): End date string in the form of yyyy-mm-dd
    Returns: 
        (pd.DataFrame): dataframe with a series of bin ids 
    """
    # Dates should be of the 
    url = f"{base_dashboard_url}/{dataset_name}/api/feed/temperature/start/{start_date}/end/{end_date}"
    response=requests.get(url)
    
    if response.status_code==200:
        content=response.content
        content=json.loads(content)
        content=pd.DataFrame.from_dict(content)
        content["pid"]=content["pid"].map(lambda x: x.lstrip(f"{base_dashboard_url}/{dataset_name}/"))
        content=content["pid"]
        return(content)
    
    else:
        print('Failed to get all bins with range with code: '+response.status_code)
        return(response.status_code)
    
    
def get_ifcb_metadata(bin, base_dashboard_url='https://ifcb.caloos.org'):    
    """ Return metadata for a given bin using the dashboard API (V2)
    Args:
        bin (str): bin id to get metadata. Bins are in the form of DYYYYMMDDTHHmmss_IFCBNNN (ex. D20230717T000942_IFCB104)
        base_dashboard_url (str): base url of an IFCB dashboard (V2)
    Returns:
        dict: Returns a dictionary of metadata for a given bin
    """
    request_url = f"{base_dashboard_url}/api/metadata/{bin}"
    1
    response=requests.get(request_url)
    
    if response.status_code==200:
        content=response.content
        content=json.loads(content)
    
    else:
        print("Metadata GET request failed with code: "+str(response.status_code))
        content = response.status_code
        
    return content

def get_bin_details(bin, base_dashboard_url='https://ifcb.caloos.org'):
    """ Return information about a bin using the dashboard API (V2)

    Args:
        bin (str): bin id to get metadata. Bins are in the form of DYYYYMMDDTHHmmss_IFCBNNN (ex. D20230717T000942_IFCB104)
        base_dashboard_url (str, optional): _description_. Defaults to 'https://ifcb.caloos.org'.

    Returns:
        _type_: _description_
    """
    url = f"{base_dashboard_url}/api/bin/{bin}"
    
    response=requests.get(url)
    
    if response.status_code==200:
        content = response.content
        content = json.loads(content)
        # newdict = {"bin_id":bin}
        # newdict.update(content)
        content
    
    else:
        print("Bin neighbors GET request failed with code: "+str(response.status_code))
        content = None
    return content

def bin_has_autoclass(bin, base_dashboard_url='https://ifcb.caloos.org'):
    """return boolean if a bin has autoclassification results

    Args:
        bin (str): bin id to get metadata. Bins are in the form of DYYYYMMDDTHHmmss_IFCBNNN (ex. D20230717T000942_IFCB104)
        base_dashboard_url (str, optional): base url of an IFCB dashboard (V2). Defaults to 'https://ifcb.caloos.org'.

    Returns:
        boolean: True if dahsboard has autoclassification results for bin, False otherwise
    """
   
    url = f"{base_dashboard_url}/api/has_products/{bin}"
    response=requests.get(url)
    
    if response.status_code==200:
        content=response.content
        content=json.loads(content)       
        class_score = content['has_class_scores']
    
    else:
        print('Autclass GET faile with code: '+str(response.status_code))
        class_score = None
        
    return class_score

def get_autoclass_data(bin,dataset_name, base_url='https://ifcb.caloos.org'):
    """Return autoclassification data for a given bin

    Args:
        bin (str): bin id to get metadata. Bins are in the form of DYYYYMMDDTHHmmss_IFCBNNN (ex. D20230717T000942_IFCB104)
        base_url (str, optional): base url of an IFCB dashboard (V2). Defaults to 'https://ifcb.caloos.org'.

    Returns:
        dict: dictionary of autoclassification data
    """
    try:
        url = f"{base_url}/{dataset_name}/{bin}_class_scores.csv"
        df = pd.read_csv(url)
        
    except Exception as e:
        print(f"Failed to get autoclassification data for bin {bin} with error: {e}")
        df = None
        
    return df

def get_feature_file(bin, dataset_name, base_url='https://ifcb.caloos.org'):
    """Return feature file for a given bin

    Args:
        bin (str): bin id to get metadata. Bins are in the form of DYYYYMMDDTHHmmss_IFCBNNN (ex. D20230717T000942_IFCB104)
        base_url (str, optional): base url of an IFCB dashboard (V2). Defaults to 'https://ifcb.caloos.org'.

    Returns:
        pd.DataFrame: feature file for a given bin
    """
    try:
        url = f"{base_url}/{dataset_name}/{bin}_features.csv"
        df = pd.read_csv(url)
        
    except Exception as e:
        print(f"Failed to get feature file for bin {bin} with error: {e}")
        df = None
        
    return df


## Call Dashboard API ##
Using the IFCB DB API, request the names of all of the datasets on a dashboard instance.

### Get available datasets from Dashboard

Using the IFCB DB API, request the names of all of the datasets on a dashboard instance.

In [23]:
base_url='https://ifcb.caloos.org/'

dataset_names = get_datasets(base_url)
scw_datset = dataset_names[12]
dataset_names

['bloofinz-io',
 'bodega-marine-lab',
 'calcofi-cruises-ctd',
 'calcofi-cruises-underway',
 'cal-poly-humboldt-hioc',
 'cce-lter-process-cruises-ifcb-151',
 'del-mar-mooring',
 'mbari-power-buoy',
 'newport-beach-pier',
 'plumes-and-blooms-cruises',
 'san-francisco-bay-cruises',
 'san-francisco-pier-17',
 'santa-cruz-municipal-wharf',
 'scripps-pier-ifcb-151',
 'scripps-pier-ifcb-158',
 'scripps-pier-ifcb-183',
 'stearns-wharf']

### Get bin files with a date range ###

The dashboard API allows requests for the name of bin files between a daterange. This is useful for getting the bin files for a specific deployment.

In [24]:
DATASET = scw_datset
start_date = "2023-07-17"
end_date = "2023-07-18"

range_response = get_bins_in_range(start_date=start_date, end_date=end_date, dataset_name=DATASET)
print(f"n files: {len(range_response)}\n{range_response.head()}")

n files: 58
0    D20230717T000942_IFCB104
1    D20230717T003329_IFCB104
2    D20230717T005715_IFCB104
3    D20230717T012101_IFCB104
4    D20230717T014447_IFCB104
Name: pid, dtype: object


### Get bin Metadata File ###

This file contains the metadata file for each bin file, which includes information about the bin file.

In [30]:
metadatavals = get_ifcb_metadata(range_response.iloc[0])
metadatavals['metadata'].keys()

dict_keys(['FileComment', 'SyringeSampleVolume', 'sampleVolume2skip', 'runTime', 'inhibitTime', 'temperature', 'humidity', 'PMTAhighVoltage', 'PMTBhighVoltage', 'PMTAtriggerThreshold_DAQ_MCConly', 'PMTBtriggerThreshold_DAQ_MCConly', 'blobXgrowAmount', 'blobYgrowAmount', 'binarizeThreshold', 'minimumBlobArea', 'runSampleFast', 'context', 'AnalogFirmware', 'HousekeepingFirmware', 'sampleNumber', 'sampleType', 'triggerCount', 'roiCount', 'ADCFileFormat', 'DAQ_MCCserialPort_DAC_MCConly', 'auxPower1', 'autoStart', 'InteractiveAutoStart', 'autoShutdown', 'HumidityAlarmThreshold(%)', 'ValveHumidityHysteresis_DAC_MCConly', 'FanTemperatureThreshold_DAC_MCConly', 'FanTemperatureHysteresis_DAC_MCConly', 'valveDelay', 'FlashlampControlVoltage', 'HKTRIGGERtoFlashlampDelayTime_x434ns_DAC_MCConly', 'FlashlampPulseLength_x434ns_DAC_MCConly', 'CameraPulseLength_x434ns_DAC_MCConly', 'cameraGain', 'minimumGapBewtweenAdjacentBlobs', 'laserState', 'runningCamera', 'pump1State', 'pump2State', 'stirrer', 'vi

### Check if bin file has an autoclass file on Dashboard and load file into a pandas dataframe ###

Using the
dashboard API check if an autoclass file exists for a specific bin file.

In [230]:
if bin_has_autoclass(bin=range_response.iloc[0]):
    auto_class = get_autoclass_data(bin=range_response.iloc[0],dataset_name=DATASET)

### get __ml_analyzed__ from the IFCB database for each bin. ###

Volume analyzed (ml_analyzed) is the effective sample volume for a given sample. Sample volume can be limited by the processing speed of the camera/computer system and is caluclated as function of the inhibited time of the camera and the flow rate of the water through the camera.

In [17]:
bin_info = get_bin_details(range_response.iloc[0])
vol_analyzed = float(bin_info['ml_analyzed'].split()[0])
print(f"Volume Analyzed: {vol_analyzed} mL for bin {range_response.iloc[0]}")

Volume Analyzed: 4.01 mL for bin D20230717T000942_IFCB104


### Get Image Feature Information ###

Features are properties of the image that are extracted (ie circularity, aspect ratio, biovolume, esd...). These can used to classify the images into classes using random forest classifiers and are important for calculating biovolume and carbon mass.

In [18]:
features = get_feature_file(range_response.iloc[0],dataset_name=DATASET)
features.head()

Unnamed: 0,roi_number,Area,B180,B90,Bflip,Biovolume,BoundingBox_xwidth,BoundingBox_ywidth,ConvexArea,ConvexPerimeter,...,HOG79,HOG80,HOG81,Area_over_PerimeterSquared,Area_over_Perimeter,H90_over_Hflip,H90_over_H180,Hflip_over_H180,summedConvexPerimeter_over_Perimeter,rotated_BoundingBox_solidity
0,2,3496.0,0.863143,0.800571,0.840857,170807.2,76,78,4155.436392,235.872784,...,0.384116,0.18996,0.367227,0.041825,12.092127,1.18866,1.334921,1.123047,0.815848,0.609756
1,3,2432.0,0.925576,0.392681,0.900493,161885.6,124,34,2835.403637,260.807274,...,0.272964,0.235927,0.315059,0.0309,8.668835,11.166438,13.717263,1.228437,0.929644,0.57685
2,4,35772.0,0.198306,0.133238,0.954282,1505260.0,408,334,86633.314481,1144.628962,...,0.328164,0.404628,0.406124,0.014081,22.443554,35.462351,1.220652,0.034421,0.718147,0.330039
3,5,8955.0,0.767606,0.410127,0.917505,310105.0,222,104,14643.525539,523.051077,...,0.230965,0.555786,0.30078,0.021016,13.718427,12.932686,4.058272,0.3138,0.801277,0.469853
4,6,6239.0,0.769169,0.447225,0.929901,177397.7,173,85,9613.432291,419.864582,...,0.280826,0.446211,0.267132,0.020225,11.233139,15.129081,4.41241,0.291651,0.755954,0.540958


Combine features with the class information to create a single table.

## Select top Class for each roi ##

Using the class specific thresholds select the top class for each roi.

- Return the top class
- Return the probability of the top class
- Return "Unknown" if no class is above the threshold

This code is a little complicated but the basics are:
- Generate a matrix of the class specific thresholds (51 classes x N rois)
- Use the threshold matrix to create a boolean matrix (ie a mask)
- Sum by the rows of the boolean matrix to get the number of classes above the threshold
- Look for where the sum is greater than 0
    - If sum is 1, findwhere the True value is and return the class name and probability
    - If sum is greater than 1, loop through the classes and find the class with the highest probability and return the class name and probability


In [219]:
threshold_array = np.tile(thresholds['Threshold'].values, (auto_class.shape[0], 1))
top_class = np.repeat("Unclassified", auto_class.shape[0])
top_prob = np.repeat(0.0, auto_class.shape[0])

# Mask autoclass data (class and top probability)
auto_class_masked = auto_class[auto_class.columns[1:]] > threshold_array
max_probs = auto_class[auto_class.columns[1:]][auto_class_masked].max(axis=1)

# Where values are greater than the threshold, add the class name to the top_class array
top_ix = np.where(auto_class_masked.sum(axis=1) > 0)
top_class[top_ix] = auto_class_masked.idxmax(axis=1).values[top_ix]
# Collect the top probabilities for each class
top_prob[top_ix] = max_probs.values[top_ix]

# Sum the number of classes that are above the threshold to check if class has more than one. If so loop through and slect the larger value.
multiple_top_ix = np.where(auto_class_masked.sum(axis=1) > 1)[0]
if len(multiple_top_ix > 0):
    for k in range(len(multiple_top_ix)):
    # Make an array that is all unclassified for best guess
        ixs = np.argwhere(auto_class_masked.iloc[multiple_top_ix[k]].values).flatten() # get the indexes of the classes that are above threshold based on boolean mask

        print(f"Multiple Over Threshold Prop: for image {auto_class['pid'].iloc[multiple_top_ix].values[0]}")
        multiples = auto_class[auto_class.columns[ixs+1]].iloc[multiple_top_ix[k]] # add one to the column index to get the class name because the first column is the image name

        for i in range(len(multiples)):
            if i == 0:
                top_mult_class = multiples.index[i]
                top_mult_prob = multiples.iloc[i]

            print(f"Class: {multiples.index[i]}  {multiples.iloc[i]}, Threshold: {thresholds[thresholds['Class Name'] == multiples.index[i]]['Threshold'].values[0]}")
            
            if multiples.iloc[i] > top_mult_prob:
                top_mult_class = multiples.index[i]
                top_mult_prob = multiples.iloc[i]
        
        top_class[multiple_top_ix[k]] = top_mult_class
        top_prob[multiple_top_ix[k]] = top_mult_prob


Multiple Over Threshold Prop: for image D20230717T000942_IFCB104_01481
Class: Clusterflagellate  0.593, Threshold: 0.55
Class: Thalassiosira  0.3804, Threshold: 0.36


# Generate a summary table #

- Generate a summary table for Dinophysis, Alexandrium_singlet, Pseudo-nitzschia	

In [249]:
import pandas as pd
features['pid'] = range_response.iloc[0] + "_" + features['roi_number'].astype(str).str.zfill(5)
export_table = pd.DataFrame({'roi_number': features['roi_number'].astype(str).str.zfill(5),'pid':features['pid'],'class':top_class,'score':top_prob})
# Only keep the classes we are interested in ['Dinophysis','Alexandrium_singlet','Pseudo-nitzschia']
export_table = export_table[export_table['class'].isin(['Dinophysis','Alexandrium_singlet','Pseudo-nitzschia'])]

# Use the worms_df dataframe as a lookup table to get the scientific name, AphiaID, and rank for each class
worms_df.index = worms_df['className']
export_table['scientificName'] = export_table['class'].map(worms_df['scientificname'])
export_table['AphiaID'] = export_table['class'].map(worms_df['AphiaID'])
export_table['scientificID'] = "urn:lsid:marinespecies.org:taxname:" + export_table['AphiaID'].astype(int).astype(str)
export_table['eventID'] = range_response.iloc[0]
export_table['occurrenceID'] = export_table['eventID'] + ":Taxon:" + export_table['AphiaID'].astype(int).astype(str) + "_1"
export_table['taxonRank'] = export_table['class'].map(worms_df['rank'])
export_table['kingdom'] = export_table['class'].map(worms_df['rank'])
export_table.rename(columns={'class':'verbatimIdentification'}, inplace=True)
export_table['identiedBy'] = 'MachineObservation'
export_table['identificationVerificationStatus'] = 'Present'
export_table['identificationReferences'] = "Trained machine learning model: Daniel, P. (2023-02) phytoClassUCSC - A phytoplankton classifier for IFCB data. Version 1.0. Hugging Face repository. https://huggingface.co/patcdaniel/phytoClassUCSC | Software to run the trained machine learning model: Sosik, H., J. Futrelle, E. Peacock,  T. Golden, J. Lopez (2023-11-13) ifcb-analysis. GitHub repository. https://github.com/tsgolden/ifcb-analysis/commit/9e228c9f616edd85b57aefc0792125ec92f20e31 forked from https://github.com/yosoyjay/ifcb-analysis forked from https://github.com/hsosik/ifcb-analysis | Software to interpret autoclass scores: https://github.com/CeNCOOS/OBIS_workshop_2024_IFCB [Add which script or function] | Input parameters to interpret autoclass scores: https://github.com/CeNCOOS/OBIS_workshop_2024_IFCB/blob/main/class_thresholds_cencoos_classifier_20240412.csv"    
export_table['associatedMedia'] = base_url + "image?image=" + export_table['roi_number'] + "&dataset=" + DATASET + "&bin=" + export_table['eventID']
export_table.head()

Unnamed: 0,roi_number,pid,verbatimIdentification,score,scientificName,AphiaID,scientificID,eventID,occurrenceID,taxonRank,kingdom,identiedBy,identificationVerificationStatus,identificationReferences,associatedMedia
105,108,D20230717T000942_IFCB104_00108,Dinophysis,0.9795,Dinophyceae,19542.0,urn:lsid:marinespecies.org:taxname:19542,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:19542_1,Class,Class,MachineObservation,Present,"Trained machine learning model: Daniel, P. (20...",https://ifcb.caloos.org/image?image=00108&data...
488,492,D20230717T000942_IFCB104_00492,Dinophysis,0.946,Dinophyceae,19542.0,urn:lsid:marinespecies.org:taxname:19542,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:19542_1,Class,Class,MachineObservation,Present,"Trained machine learning model: Daniel, P. (20...",https://ifcb.caloos.org/image?image=00492&data...
1872,1877,D20230717T000942_IFCB104_01877,Dinophysis,0.9995,Dinophyceae,19542.0,urn:lsid:marinespecies.org:taxname:19542,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:19542_1,Class,Class,MachineObservation,Present,"Trained machine learning model: Daniel, P. (20...",https://ifcb.caloos.org/image?image=01877&data...


# Generate Event Table #

In [254]:
metadatavals['metadata'].keys()

dict_keys(['FileComment', 'SyringeSampleVolume', 'sampleVolume2skip', 'runTime', 'inhibitTime', 'temperature', 'humidity', 'PMTAhighVoltage', 'PMTBhighVoltage', 'PMTAtriggerThreshold_DAQ_MCConly', 'PMTBtriggerThreshold_DAQ_MCConly', 'blobXgrowAmount', 'blobYgrowAmount', 'binarizeThreshold', 'minimumBlobArea', 'runSampleFast', 'context', 'AnalogFirmware', 'HousekeepingFirmware', 'sampleNumber', 'sampleType', 'triggerCount', 'roiCount', 'ADCFileFormat', 'DAQ_MCCserialPort_DAC_MCConly', 'auxPower1', 'autoStart', 'InteractiveAutoStart', 'autoShutdown', 'HumidityAlarmThreshold(%)', 'ValveHumidityHysteresis_DAC_MCConly', 'FanTemperatureThreshold_DAC_MCConly', 'FanTemperatureHysteresis_DAC_MCConly', 'valveDelay', 'FlashlampControlVoltage', 'HKTRIGGERtoFlashlampDelayTime_x434ns_DAC_MCConly', 'FlashlampPulseLength_x434ns_DAC_MCConly', 'CameraPulseLength_x434ns_DAC_MCConly', 'cameraGain', 'minimumGapBewtweenAdjacentBlobs', 'laserState', 'runningCamera', 'pump1State', 'pump2State', 'stirrer', 'vi

In [None]:
eventdict={'datasetName':f'{}/santa-cruz-municipal-wharf',
           'eventID':range_response.iloc[0],
           'eventDate':details['timestamp_iso'],
           'institutionCode':'UCSC',
           'decimalLongitude':details['lng'],
           'decimalLatitude':details['lat'],
           'countryCode':'US',
           'geodeticDatum':'WGS84',
           'minimumDepthInMeters':1,
           'maximumDepthInMeters':3,
           'samplingProtocol':'https://doi.org/10.1002/lno.11443',
           'sampleSizeValue':analyzed_volume,
           #'sampleSizeValue':details['ml_analyzed'], # need to remove the ml from the name
           'sampleSizeUnit':'milliliter'}
eventdf=pd.DataFrame.from_dict([eventdict])

In [251]:
# create event table
# want to create this as a module which we pass information to but for now mostly static...
## event table
# datasetName="santa-cruz-municipal-wharf"
# eventID=bin_id
# eventDate=details['timestamp_iso'] # need to truncate to just the date
# decimalLongitude=details['lng']
# decimalLatitude=details['lat']
# countryCode='US'
# geodeticDatum='WGS84'
# minimumDepthInMeters=
# maximumDepthInMeters=
# sampleSizeValue=details['ml_analyzed']
# sampleSizeUnit='milliliter'
########
# sampling protocol NEED TO ADD
#########
eventdict={'datasetName':'https://ifcb.caloos.org/santa-cruz-municipal-wharf',
           'eventID':bin_id,
           'eventDate':details['timestamp_iso'],
           'institutionCode':'UCSC',
           'decimalLongitude':details['lng'],
           'decimalLatitude':details['lat'],
           'countryCode':'US',
           'geodeticDatum':'WGS84',
           'minimumDepthInMeters':1,
           'maximumDepthInMeters':3,
           'samplingProtocol':'https://doi.org/10.1002/lno.11443',
           'sampleSizeValue':analyzed_volume,
           #'sampleSizeValue':details['ml_analyzed'], # need to remove the ml from the name
           'sampleSizeUnit':'milliliter'}
eventdf=pd.DataFrame.from_dict([eventdict])

{'metadata': {'FileComment': 'file comment',
  'SyringeSampleVolume': 5,
  'sampleVolume2skip': 0,
  'runTime': 1200.8840277777779,
  'inhibitTime': 238.50505208333334,
  'temperature': 23.13378271152819,
  'humidity': 16.3658518348974,
  'PMTAhighVoltage': 0.45,
  'PMTBhighVoltage': 0.55,
  'PMTAtriggerThreshold_DAQ_MCConly': 0.14,
  'PMTBtriggerThreshold_DAQ_MCConly': 0.14,
  'blobXgrowAmount': 20,
  'blobYgrowAmount': 5,
  'binarizeThreshold': 8,
  'minimumBlobArea': 2000,
  'runSampleFast': False,
  'context': 'SoftwareVersion: 2.3.0.0',
  'AnalogFirmware': 45,
  'HousekeepingFirmware': 32,
  'sampleNumber': 187,
  'sampleType': 'Normal',
  'triggerCount': 2876,
  'roiCount': 2951,
  'ADCFileFormat': 'trigger#, ADCtime, PMTA, PMTB, PMTC, PMTD, PeakA, PeakB, PeakC, PeakD, TimeOfFlight, GrabTimeStart, GrabTimeEnd, RoiX, RoiY, RoiWidth, RoiHeight, StartByte, ComparatorOut, StartPoint, SignalLength, Status, RunTime, InhibitTime',
  'DAQ_MCCserialPort_DAC_MCConly': '/dev/ttyS2',
  'auxP

In [45]:
eventdf

Unnamed: 0,datasetName,eventID,eventDate,institutionCode,decimalLongitude,decimalLatitude,countryCode,geodeticDatum,minimumDepthInMeters,maximumDepthInMeters,samplingProtocol,sampleSizeValue,sampleSizeUnit
0,https://ifcb.caloos.org/santa-cruz-municipal-w...,D20230717T000942_IFCB104,2023-07-17T00:09:42+00:00,UCSC,-122.021868,36.961491,US,WGS84,1,3,https://doi.org/10.1002/lno.11443,4.01,milliliter


In [46]:
summarytable['occurrences'].iloc[0]

3

In [52]:
# 
# size of summary table
#
# how to add associated data...
#
ls=len(summarytable)
xworms=wormsdf['Class Name'].reset_index()
# need to find classtable to match class name
# note this doesn't create a table/dataframe but needs to be adjusted as we have one to many needs to fill in columns
#for s in np.arange(0,1):
# so we only want summary table and not individuals as this point
for s in np.arange(0,ls):
    eventID=details["bin_id"]
    z=xworms['Class Name'][:]==summarytable.index[s]
    zi=np.where(z)
    iz=wormsdf.iloc[zi[0][0]]
    scientificName=iz['scientificname']
    scientificID=iz['lsid']
    taxonRank=iz['rank']
    kingdom=iz['kingdom']
    zc=classtable['class']==summarytable.index[s] # find the class names
    ci=np.where(zc)
    basisOfRecord='MachineObservation'
    if summarytable['occurrences'].iloc[s]>0:
        occurrenceStatus='Present' # need to deal with adding zeros for absent values but for now this is that case
    else:
        occurrenceStatus='Abscent'
    verbatimIdentification=classtable['class'][ci[0][0]]
    identifiedBy=''
    identificationVerificationStatus='PredictedByMachine'
    identificationReferences="Trained machine learning model: Daniel, P. (2023-02) phytoClassUCSC - A phytoplankton classifier for IFCB data. Version 1.0. Hugging Face repository. https://huggingface.co/patcdaniel/phytoClassUCSC | Software to run the trained machine learning model: Sosik, H., J. Futrelle, E. Peacock,  T. Golden, J. Lopez (2023-11-13) ifcb-analysis. GitHub repository. https://github.com/tsgolden/ifcb-analysis/commit/9e228c9f616edd85b57aefc0792125ec92f20e31 forked from https://github.com/yosoyjay/ifcb-analysis forked from https://github.com/hsosik/ifcb-analysis | Software to interpret autoclass scores: https://github.com/CeNCOOS/OBIS_workshop_2024_IFCB [Add which script or function] | Input parameters to interpret autoclass scores: https://github.com/CeNCOOS/OBIS_workshop_2024_IFCB/blob/main/class_thresholds_cencoos_classifier_20240412.csv"    
    # Needs to be put in
    # associatedMedia 
    # get the ROIs so we can get reference the actual image in the dataset
    mytest=classtable[zc]
    rois=mytest['pid'].str.replace(bin_id+'_','')
    rois=rois.reset_index(drop=True)
    lr=len(rois)
    # image share link example is
    # https://ifcb.caloos.org/image?image=02075&bin=D20230717T0033229_IFCB104
    roistr=''
    strpre='https://ifcb.caloos.org/image?image='
    binstr='&bin='
    for ri in np.arange(0,lr):
        roistr=roistr+strpre+rois[ri]+binstr+bin_id+"|"
    associatedMedia=roistr
    
    # this is for individuals, for now we are just working with summary values
    # 
    # This is NOT proper it needs to be fixed to do it correctly...
    # This needs to be fixed to have multiple values
    occurrenceID=details["bin_id"]+':Taxon:'+str(iz['AphiaID'])+'_1'
    # check against table to see if it already is in the dataframe if it is we add and increment....
    # can we possibly have more than 2?
    if s > 0:
        zi=occurrencedf['occurrenceID']==occurrenceID
        ajunk=occurrencedf['occurrenceID'][zi]
        if len(ajunk) > 0:
            occurrenceID=details["bin_id"]+':Taxon:'+str(iz['AphiaID'])+'_2'
    tmpdict={'eventID':eventID,'occurrenceID':occurrenceID,'scienfificName':scientificName,'scientificID':scientificID,
             'taxonRank':taxonRank,'kingdom':kingdom,'basisOfRecord':basisOfRecord,'occurrenceStatus':occurrenceStatus,
             'verbatimIdentification':summarytable.index[s],'identifiedBy':'','identificationVerificationStatus':identificationVerificationStatus,
             'identificationReferences':identificationReferences,'associatedMedia':associatedMedia}
    occframe=pd.DataFrame.from_dict([tmpdict])
    if s==0:
        occurrencedf=occframe
    else:
        occurrencedf=pd.concat([occurrencedf,occframe])

occurrencedf=occurrencedf.reset_index(drop=True)

In [53]:
occurrencedf.head(3)


Unnamed: 0,eventID,occurrenceID,scienfificName,scientificID,taxonRank,kingdom,basisOfRecord,occurrenceStatus,verbatimIdentification,identifiedBy,identificationVerificationStatus,identificationReferences,associatedMedia
0,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:19542_1,Dinophyceae,urn:lsid:marinespecies.org:taxname:19542,Class,Chromista,MachineObservation,Present,Dinophysis,,PredictedByMachine,"Trained machine learning model: Daniel, P. (20...",https://ifcb.caloos.org/image?image=00108&bin=...
1,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:109470_1,Alexandrium,urn:lsid:marinespecies.org:taxname:109470,Genus,Chromista,MachineObservation,Present,Alexandrium_singlet,,PredictedByMachine,"Trained machine learning model: Daniel, P. (20...",https://ifcb.caloos.org/image?image=00022&bin=...
2,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:149151_1,Pseudo-nitzschia,urn:lsid:marinespecies.org:taxname:149151,Genus,Chromista,MachineObservation,Present,Pseudo-nitzschia,,PredictedByMachine,"Trained machine learning model: Daniel, P. (20...",https://ifcb.caloos.org/image?image=01081&bin=...


In [None]:
# This is test code to be removed when sure code runs
#zc=classtable['class']==summarytable.index[1]
#mytest=classtable[zc]
#del(rois)
#rois=mytest['pid'].str.replace(bin_id+'_','')
#rois=rois.astype(int).reset_index(drop=True)
#lr=len(rois)
#bv=0
#for i in np.arange(0,lr):
#    if i==0:
#        bv=features.loc[features['roi_number']==rois[0],'Biovolume'].item()
#    else:
#        bv=bv+features.loc[features['roi_number']==rois[0],'Biovolume'].item()
#print(bv)
##features.loc[features['roi_number']==rois[0],'Biovolume'].item()
##bv=features['roi_number'].apply(lambda x: rois[x])

In [50]:
# EMoF table
# every row must have an eventID
# not every column has to be filled 
# occurrenceID is NOT in every row
# this is for a single eventID so could have an outer loop
# row 1
row1={'eventID':details['bin_id'],
      'occurrenceID':'',
      'measurementType':'Sampling Instrument Name',
      'measurementTypeID':'http://vocab.nerc.ac.uk/collection/Q01/current/Q0100002/',
      'measurementValue':'McLane Research Laboratories Imaging FlowCytobot imaging sensor',
      'measurementValueID':'http://vocab.nerc.ac.uk/collection/L22/current/TOOL1588/',
      'measurementUnit':'',
      'measurementUnitID':'',
      'measurementRemarks':''}
emof=pd.DataFrame([row1])
row2={'eventID':details['bin_id'],
      'occurrenceID':'',
      'measurementType':'Sample Volume',
      'measurementTypeID':'http://vocab.nerc.ac.uk/collection/P01/current/VOLXXXXX/',
      'measurementValue':analyzed_volume,
      'measurementValueID':'',
      'measurementUnit':'milliliters',
      'measurementUnitID':'http://vocab.nerc.ac.uk/collection/P06/current/VVML/',
      'measurementRemarks':''}
emof=pd.concat([emof,pd.DataFrame([row2])])
ls=len(summarytable)
for s in np.arange(0,ls):
    z=xworms['Class Name'][:]==summarytable.index[s]
    zi=np.where(z)
    iz=wormsdf.iloc[zi[0][0]]
    zc=classtable['class']==summarytable.index[s] # find the class names
    occurrenceID=details["bin_id"]+':Taxon:'+str(iz['AphiaID'])+'_1'
    # get the ROIs so we can get the biovolumes
    mytest=classtable[zc]
    rois=mytest['pid'].str.replace(bin_id+'_','')
    rois=rois.astype(int).reset_index(drop=True) # will this work with feature dataframe?
    lr=len(rois)
    #print(lr)
    bv=0
    if summarytable['occurrences'].iloc[s] > 0:
        for i in np.arange(0,lr):
            if i==0:
                bv=features.loc[features['roi_number']==rois[i],'Biovolume'].item()
            else:
                bv=bv+features.loc[features['roi_number']==rois[i],'Biovolume'].item()
    # now bv is the biovolume in pixels
    bv=bv/2.7 # this is the biovolume in microns?   
    # Now need to get biovolume based on roi number
    # conversion factor is 2.7 pixels per micron
    # Do we need these values if the species is abscent?
    row3={'eventID':details['bin_id'],
          'occurrenceID':occurrenceID,
          'measurementType':'Abundance of biological entity specified elsewhere per unit volume of the water body',
          'measurementTypeID':'http://vocab.nerc.ac.uk/collection/P01/current/SDBIOL01/',
          'measurementValue':float(summarytable['occurrences'][s])/analyzed_volume,
          'measurementValueID':'',
          'measurementUnit':'Number per millilitre', # Don't we need to divide ?
          'measurementUnitID':'http://vocab.nerc.ac.uk/collection/P06/current/UCML/',
          'measurementRemarks':'number is inclusive of single cells and chains'}
    emof=pd.concat([emof,pd.DataFrame([row3])])
    row4={'eventID':details['bin_id'],
          'occurrenceID':occurrenceID,
          'measurementType':'Biovolume of biological entity specified elsewhere per unit volume of the water by calculation using Moberg and Sosik (2012) doi: 10.4319/lom.2012.10.278',
          'measurementTypeID':'',
          'measurementValue':bv, # NEED TO COMPUTE
          'measurementValueID':'',
          'measurementUnit':'cubic microns per millilitre',
          'measurementUnitID':'http://vocab.nerc.ac.uk/collection/P06/current/UCUM/',
          'measurementRemarks':'number is inclusive of single cells and chains'}
    emof=pd.concat([emof,pd.DataFrame([row4])])    
# eventID
# eventID=details['bin_id']
# measurementType='Sampling Instrument Name'
# measurementTypeID='http://vocab.nerc.ac.uk/collection/Q01/current/Q0100002/'
# measurementValue='McLane Research Laboratories Imaging FlowCytobot imaging sensor'
# measurementValueID='http://vocab.nerc.ac.uk/collection/L22/current/TOOL1588/'
# measurementType='Sample Volume'
# measurementTypeID='http://vocab.nerc.ac.uk/collection/P01/current/VOLXXXXX/'
# measurementValue=analyzed_volume
# measurementUnit='Millilitres'
# measurementUnitID='http://vocab.nerc.ac.uk/collection/P06/current/VVML/'
# occurrenceID
# measurementRemarks='abundance per milliliter, inclusive of single cells and chains'
# measurementValueID
# measurementUnit
# measurementUnitID
# measurementID
#           'sampleSizeValue':details['ml_analyzed'], # need to remove the ml from the name
#           'sampleSizeUnit':'milliliter'}
# okay need code to loop through 
#ls=len(summarytable)
# xworms=wormsdf['Class Name'].reset_index() # not sure we need this here
#for s in np.arange(0,1):
#for s in np.arange(0,ls):
#    zc=classtable['class']==summarytable.index[s] # find
# This is an example we want to use later commenting out and cleaning up code
#    mytest=classtable[zc]
#    rois=mytest['pid'].str.replace(bin_id+'_','')
#    rois=rois.astype(int) # will this work with feature dataframe?
#    # I don't think this will work since index 0 is roi_number 2
#    # does not work
#    #roin=features['roi_number'][rois]
#    #biov=features['Biovolume'][rois]
###mytest['pid']
#### print(bin_id)
##rois=mytest['pid'].str.replace(bin_id+'_','')
##rois=rois.astype(int)
###print(rois)
## summarytable.index[s] is Patrick's class name


3
15
3


  'measurementValue':float(summarytable['occurrences'][s])/analyzed_volume,
  'measurementValue':float(summarytable['occurrences'][s])/analyzed_volume,
  'measurementValue':float(summarytable['occurrences'][s])/analyzed_volume,


In [51]:
emof=emof.reset_index(drop=True)
emof

Unnamed: 0,eventID,occurrenceID,measurementType,measurementTypeID,measurementValue,measurementValueID,measurementUnit,measurementUnitID,measurementRemarks
0,D20230717T000942_IFCB104,,Sampling Instrument Name,http://vocab.nerc.ac.uk/collection/Q01/current...,McLane Research Laboratories Imaging FlowCytob...,http://vocab.nerc.ac.uk/collection/L22/current...,,,
1,D20230717T000942_IFCB104,,Sample Volume,http://vocab.nerc.ac.uk/collection/P01/current...,4.01,,milliliters,http://vocab.nerc.ac.uk/collection/P06/current...,
2,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:19542_1,Abundance of biological entity specified elsew...,http://vocab.nerc.ac.uk/collection/P01/current...,0.74813,,Number per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
3,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:19542_1,Biovolume of biological entity specified elsew...,,847675.224303,,cubic microns per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
4,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:109470_1,Abundance of biological entity specified elsew...,http://vocab.nerc.ac.uk/collection/P01/current...,3.740648,,Number per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
5,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:109470_1,Biovolume of biological entity specified elsew...,,4308343.467119,,cubic microns per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
6,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:149151_1,Abundance of biological entity specified elsew...,http://vocab.nerc.ac.uk/collection/P01/current...,0.74813,,Number per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
7,D20230717T000942_IFCB104,D20230717T000942_IFCB104:Taxon:149151_1,Biovolume of biological entity specified elsew...,,280178.920664,,cubic microns per millilitre,http://vocab.nerc.ac.uk/collection/P06/current...,number is inclusive of single cells and chains
