# convertBCODMOtoCMAP
Krista Longnecker, 2 July 2025\
Run this after running getBCODMOinfo.ipynb\
Once you have the information from the BCO-DMO dataset in hand, use this notebook to convert it into the format required by CMAP.\
Note: this version puts most of the code into a separate python script (convert.py) to make it easier to run through multiple data files.

In [1]:
#some of these are residual from assembling the data file, keep for now.
import pandas as pd
import requests
import os
import json
import re
import pdb

from datetime import datetime, timedelta, timezone
from SPARQLWrapper import SPARQLWrapper, POST, JSON
from frictionless import describe, Package

In [22]:
# Make a function that searches for bcodmo:name and returns bcodmo:description and bcodmo:units
# input: md --> the list of parameters for one dataset
def getDetails(md,bcodmo_name):
    """
    Look up one parameter in the BCO-DMO parameter list and return its details.

    Parameters:
        md: list of dicts, one per parameter of a dataset; each dict is read
            through the keys 'bcodmo:name', 'bcodmo:description' and 'bcodmo:units'
        bcodmo_name: the parameter name to search for (compared to 'bcodmo:name')

    Returns:
        (description, units) for the matching parameter. If the name occurs more
        than once, the last matching entry is used.

    Raises:
        KeyError: if no entry in md has the requested name.
    """
    match = None
    for item in md:
        if item['bcodmo:name'] == bcodmo_name:
            #keep going after a hit so the last matching entry is the one returned
            match = item

    if match is None:
        #fail with a message that names the missing parameter
        raise KeyError("no BCO-DMO parameter named '%s' in the metadata list" % bcodmo_name)

    return match['bcodmo:description'], match['bcodmo:units']

In [3]:
#read in the package that was already made (using getBCODMOinfo.ipynb)
#later cells read the individual datasets from biosscope.resources
biosscope = Package('datapackage.json')
# biosscope = Package(JSON_FILE)

In [4]:
#collect the name of every resource in the package
#a comprehension is used so no loop variable is left behind: the notebook uses
#a global called idx to select a dataset, and a loop index here would overwrite it
out = [resource.name for resource in biosscope.resources]

out #this will be a list, iterate through the files on this list
#leave here so I see the filenames might not need the actual list

['zooscan_images_bats_ae1614',
 'zooscan_images_bats_ae1712',
 'zooscan_images_bats_ae1819',
 'zooscan_images_bats_ae1830',
 'zooscan_bats_biovolume',
 'zooscan_mocness_output',
 'survey_biogeochemical',
 '920443_v1_biosscope_in_situ_pump_chemical_data',
 '964684_v1_amino_acids_biosscope_2021',
 '964801_v1_pump_carbohydrates_biosscope_2021',
 '964826_v1_pump_poc_pon_biosscope']

In [5]:
# Most datasets need additional processing to get depth and/or time.
# Each set below groups dataset short names by the extra step they need;
# the dispatch loop later checks a file's short name against these sets.

# these actually will not match below bc do not end in csv
toSkip = {
    'zooscan_images_bats_ae1614',
    'zooscan_images_bats_ae1712',
    'zooscan_images_bats_ae1819',
    'zooscan_images_bats_ae1830',
}

addDepth = {
    '920443_v1_biosscope_in_situ_pump_chemical_data',
    '964684_v1_amino_acids_biosscope_2021',
    '964801_v1_pump_carbohydrates_biosscope_2021',
    '964826_v1_pump_poc_pon_biosscope',
}

toMerge = {
    'zooscan_bats_biovolume',
    'zooscan_mocness_output',
}

useAsIs = {
    'survey_biogeochemical',
}

# Start working with a data file

In [7]:
# import convert #this is the function, run this cell if I have made an edit to convert.py
# import convert_pumpData

In [50]:
for idx in range(len(biosscope.resources)):
    data_url = biosscope.resources[idx].path
    if data_url.endswith('.csv'):
        checkFile = re.split('/',data_url).pop().replace('.csv','')
        #print(data_url)
        #have a few options and trying to group these based on added steps needed to make the data file ready
        if checkFile in toSkip:
            print('skip ' + checkFile)
        elif checkFile in addDepth:
            %run convert_pumpData.py {idx}   
        elif checkFile in toMerge:
            print('toMerge ' + checkFile)
        elif checkFile in useAsIs:
            %run convert.py {idx}   
        else:
            print('no match ' + checkFile)

toMerge zooscan_bats_biovolume
toMerge zooscan_mocness_output
Data will go here: C:\Users\klongnecker\Documents\Dropbox\GitHub_espresso\BWrepos\data\survey_biogeochemical
Data will go here: C:\Users\klongnecker\Documents\Dropbox\GitHub_espresso\BWrepos\data\920443_v1_biosscope_in_situ_pump_chemical_data
Data will go here: C:\Users\klongnecker\Documents\Dropbox\GitHub_espresso\BWrepos\data\964684_v1_amino_acids_biosscope_2021
Data will go here: C:\Users\klongnecker\Documents\Dropbox\GitHub_espresso\BWrepos\data\964801_v1_pump_carbohydrates_biosscope_2021
Data will go here: C:\Users\klongnecker\Documents\Dropbox\GitHub_espresso\BWrepos\data\964826_v1_pump_poc_pon_biosscope


In [37]:
# choose one dataset by its position in the package and display its short file name
idx = 10
data_url = biosscope.resources[idx].path
url_parts = data_url.split('/')
checkFile = url_parts[-1].replace('.csv','')
checkFile

'964826_v1_pump_poc_pon_biosscope'

In [41]:
 #to do: figure out a better way to do this so I am not reading in the json file every time
biosscope = Package('datapackage.json')

# idx is set in an earlier cell and selects which resource of the package is converted
data_url = biosscope.resources[idx].path
md = biosscope.resources[idx].custom['bcodmo:parameters'] #this is a list, don't forget 'custom' (!!)

#make a short name out of the data_url, will use this as part of the name for the final Excel file
exportFile = re.split('/',data_url).pop().replace('.csv','')

#super easy to work with the CSV file once I have the URL
#pdb.set_trace()
#'nd' entries are read as missing values
bcodmo = pd.read_csv(data_url,na_values = ['nd']) #now I have NaN...but they get dropped when writing the file

# Required variables are time, lat, lon, depth
df = pd.DataFrame(columns=['time','lat','lon','depth'])


# time --> CMAP requirement is this: #< Format  %Y-%m-%dT%H:%M:%S,  Time-Zone:  UTC,  example: 2014-02-28T14:25:55 >
# Do this in two steps so I can check the output more easily
#no time available in particle data
temp = bcodmo.copy()
temp['date'] = pd.to_datetime(temp['Date'])
temp['date_cmap'] = temp['date'].dt.strftime("%Y-%m-%dT%H:%M:%S")
df['time'] = temp['date_cmap']

# lat (-90 to 90) and lon (-180 to 180); use variable names at BCO-DMO
df['lat'] = bcodmo['Latitude']
df['lon'] = bcodmo['Longitude']  #BCO-DMO already has this as negative

#depth in the UM data can be either 'Depth' or 'Depth_m' --> figure out which one
#'Depth' is preferred when both are present; stop with a clear message when neither
#is there, otherwise useD would be undefined in the lines below
depth_candidates = [name for name in ('Depth', 'Depth_m') if name in bcodmo.columns]
if not depth_candidates:
    raise KeyError("no depth column found in " + exportFile + "; expected 'Depth' or 'Depth_m'")
useD = depth_candidates[0]

df['depth'] = bcodmo[useD]

# all remaining columns in bcodmo can be considered data
#remember: bcodmo_trim will have the list of variables that I will use later to get metadata about the variables
bcodmo_trim = bcodmo.drop(columns=['Latitude', 'Longitude', useD])
nVariables = bcodmo_trim.shape[1] #remember in Python indexing starts with 0 (rows, 1 is the columns)
# and then add to the datafile I am assembling (essentially re-order columns)
df = pd.concat([df, bcodmo_trim], axis=1)

# work on the second sheet: metadata about the variables; use the CMAP dataset template to setup the dataframe so I get the column headers right
templateName = 'datasetTemplate.xlsx'
sheet_name = 'vars_meta_data'
#named vars_template (not vars) so the Python builtin vars() is not shadowed
vars_template = pd.read_excel(templateName, sheet_name=sheet_name)
cols = vars_template.columns.tolist()
#df2 will be the dataframe with the metadata about the variables, set it up empty here
df2 = pd.DataFrame(columns=cols,index = pd.RangeIndex(start=0,stop=nVariables)) #remember, Python is 0 indexed

#the variables I need to search for are here: bcodmo_trim.columns, put them in the first column
df2['var_short_name'] = bcodmo_trim.columns
#Need the information from BCO-DMO to fill in the metadata about the variables.
#md = biosscope.resources[idx].custom['bcodmo:parameters'] #this is a list, don't forget 'custom' (!!)

In [43]:
#fill in the long name and the units for each variable using the BCO-DMO parameter list
#the loop variable is not called idx on purpose: idx is the notebook-wide selector for
#the resource being converted, and reusing it here would leave it pointing at the last row
for row_label in df2.index:
    long_name, unit = getDetails(md,df2.loc[row_label,'var_short_name']) #getDetails is the function I wrote (see above)
    #pdb.set_trace()
    df2.loc[row_label,'var_long_name'] = long_name
    df2.loc[row_label,'var_unit'] = unit
    print(long_name)

Internal sample ID
BIOSSCOPE cruise identifier
<p>Date of sampling</p>
Particle size range for water fraction
Filter type used for particle fraction (Nitex, GF/C ,GF75)
standard deviation of the depth
PN; average bulk particulate nitrogen (one sample filter analyzed, replicate blanks used to incorporate uncertainty)
standard deviation for bulk particulate nitrogen
POC; average bulk particulate organic carbon (one sample filter analyzed, replicate blanks used to incorporate uncertainty)
standard deviation for bulk particulate organic carbon
Carbon to nitrogen ratio of acidified POC to non-acidified PN
standard deviation of CN ratio (propagated error from POC and PN)


In [44]:
#show the first rows of the variable metadata table to check what was filled in
df2.head()

Unnamed: 0,var_short_name,var_long_name,var_sensor,var_unit,var_spatial_res,var_temporal_res,var_discipline,visualize,var_keywords,var_comment
0,Sample_ID,Internal sample ID,,unitless,,,,,,
1,Cruise,BIOSSCOPE cruise identifier,,unitless,,,,,,
2,Date,<p>Date of sampling</p>,,unitless,,,,,,
3,Size_fraction,Particle size range for water fraction,,micrometers (um),,,,,,
4,Filter_type,"Filter type used for particle fraction (Nitex,...",,unitless,,,,,,


In [16]:
# NOTE(review): sets the resource selector back to the first dataset in the package;
# presumably a manual override used while testing -- confirm it is still needed
idx=0


In [45]:
#metadata about the project
# finally gather up the dataset_meta_data: for now I just wrote the information here, I might setup in a separate text file later
#pdb.set_trace()
df3 = pd.DataFrame({
    'dataset_short_name': ['BIOSSCOPE_v1'],
    'dataset_long_name': ['BIOS-SCOPE_' + exportFile],
    'dataset_version': ['1.0'],
    'dataset_release_date': ['2025-06-25'],
    'dataset_make': ['observation'],
    'dataset_source': ['Hilary Close, University of Miami Rosenstiel School of Marine and Atmospheric Science'],
    'dataset_distributor': ['Hilary Close, University of Miami Rosenstiel School of Marine and Atmospheric Science'],
    'dataset_acknowledgement': ['We thank the BIOS-SCOPE project team and the BATS team for assistance with sample collection, processing, and analysis. The efforts of the captains, crew, and marine technicians of the R/V Atlantic Explorer are a key aspect of the success of this project. This work supported by funding from the Simons Foundation International.'],
    'dataset_history': [''],
    # NOTE(review): placeholder text, replace with the real description before submitting
    'dataset_description': ['blah blah'],
    # NOTE(review): the closing parenthesis after the year looks to be missing in this citation
    'dataset_references': ['Henderson, L.C., Wittmers, F., Carlson, C. A., Worden, A.Z., & Close, H. G. (2024. Variable carbon isotope fractionation of photosynthetic communities over depth in an open-ocean euphotic zone. Proceedings of the National Academy of Sciences, 121(10). https://doi.org/10.1073/pnas.2304613121'],
    'climatology': [0]
    })

#get the list of cruise names from the bcodmo data file
t = pd.DataFrame(bcodmo['Cruise'].unique())
t.columns = ['cruise_names']
#put the cruise names next to the metadata as one more column
#do not pass ignore_index=True here: with axis=1 it renames every column to 0,1,2,...
#and the sheet would be written without the column headers
df3 = pd.concat([df3,t],axis=1)

#export the result as an Excel file with three tabs
#make the data folder if it is not already there (it is in .gitignore, so it will not end up at GitHub)
folder = "data"

In [47]:
#make the output folder when it is missing; exist_ok means nothing happens if it is already there
os.makedirs(folder, exist_ok=True)

#build the path from the folder variable so the file always lands in the folder made above
fName_CMAP = os.path.join(folder, 'BIOSSCOPE_' + exportFile + '.xlsx')
#report the file that is actually written, whether or not the folder existed before
print("Data will go here: %s" % os.path.abspath(fName_CMAP))

#one tab per dataframe; the keys are the sheet names used in the Excel file
dataset_names = {'data': df, 'dataset_meta_data': df3, 'vars_meta_data': df2}
with pd.ExcelWriter(fName_CMAP) as writer:
    for tab_name, frame in dataset_names.items():
        frame.to_excel(writer, sheet_name=tab_name, index=False)

Data will go here: C:\Users\klongnecker\Documents\Dropbox\GitHub_espresso\BWrepos\data\964826_v1_pump_poc_pon_biosscope


In [46]:
#write the same three tables to a scratch Excel file in the current folder
fName_CMAP = 'test.xlsx'
dataset_names = dict(data=df, dataset_meta_data=df3, vars_meta_data=df2)
with pd.ExcelWriter(fName_CMAP) as writer:
    #each key becomes a tab name, each dataframe the content of that tab
    for tab_name, frame in dataset_names.items():
        frame.to_excel(writer, sheet_name=tab_name, index=False)

In [None]:
# Make a function that searches for bcodmo:name and returns bcodmo:description and bcodmo:units
# input: md --> the list of parameters for one dataset
def getDetails(md,bcodmo_name):
    """
    Look up one parameter in the BCO-DMO parameter list and return its details.

    Parameters:
        md: list of dicts, one per parameter of a dataset; each dict is read
            through the keys 'bcodmo:name', 'bcodmo:description' and 'bcodmo:units'
        bcodmo_name: the parameter name to search for (compared to 'bcodmo:name')

    Returns:
        (description, units) for the matching parameter. If the name occurs more
        than once, the last matching entry is used.

    Raises:
        KeyError: if no entry in md has the requested name.
    """
    match = None
    for item in md:
        if item['bcodmo:name'] == bcodmo_name:
            #keep going after a hit so the last matching entry is the one returned
            match = item

    if match is None:
        #fail with a message that names the missing parameter
        raise KeyError("no BCO-DMO parameter named '%s' in the metadata list" % bcodmo_name)

    return match['bcodmo:description'], match['bcodmo:units']

In [None]:
#raise UserWarning('Stopping and leave code below for historical reasons, code will not run')

In [None]:
#was heading down this path and using out as a dictionary, leave for now in case I come back to this.
# out = {};
# for idx,item in enumerate(biosscope.resources):
#     justFile = item.name;
#     #pdb.set_trace()
#     out.update({'file':justFile})
#     if justFile == '920443_v1_biosscope_in_situ_pump_chemical_data':
#         print('yes')
#         out.update({'toDo':'addDepth'})
        
#     #out.append(justFile)

# out #this will be a list, iterate through the files on this list