# convertBCODMOtoCMAP
Krista Longnecker, 18 July 2025\
Run this after running ```getBCODMOinfo.ipynb```\
This script will make the format required by CMAP.\
Note: this version puts most of the code into separate python scripts (convert*.py) to make it easier to run through multiple data files.\
Also note that you end up running through this twice in order to get all the metadata needed for the _variables_. There are too many details needed by CMAP to automate that step.

In [1]:
%reset -f

In [None]:
#some of these are residual from assembling the data file, keep for now.
import json
import os
import pdb
import re
import sys
from datetime import datetime, timedelta, timezone

import pandas as pd
import requests
from frictionless import describe, Package
from SPARQLWrapper import SPARQLWrapper, POST, JSON

In [None]:
# Make a function that searches for bcodmo:name and returns bcodmo:description and bcodmo:units
# input: md --> the list of parameters for one dataset
def getDetails(md, bcodmo_name):
    """
    Search the BCO-DMO parameter list for a name and return its description and units.

    Parameters:
        md: iterable of dicts, one per parameter of a dataset. Each dict is read
            through its 'bcodmo:name' and 'bcodmo:description' keys;
            'bcodmo:units' is optional.
        bcodmo_name: the value to look for under 'bcodmo:name'.

    Returns:
        (description, units) for the matching entry. units is 'not applicable'
        when the entry has no 'bcodmo:units' key. If several entries share the
        name, the last one in md is used.

    Raises:
        ValueError: if no entry in md has the requested name.
    """
    match = None
    for item in md:
        if item['bcodmo:name'] == bcodmo_name:
            # no break on purpose: a later entry with the same name overrides an earlier one
            match = item

    if match is None:
        # without this check the return below would fail with an unhelpful UnboundLocalError
        raise ValueError(f"No parameter named {bcodmo_name!r} in the BCO-DMO metadata")

    description = match['bcodmo:description']
    units = match.get('bcodmo:units', 'not applicable')
    return description, units

In [None]:
#Check that an Excel file exists for the variable metadata.
#You cannot proceed without the file, so stop the script if it's not found.
#(sys.exit needs the sys module, which is imported with the other modules at the top.)
if not os.path.exists('CMAP_variableMetadata_additions.xlsx'):
    print("No Excel file with metadata found")
    sys.exit(1)

print('Found Excel file with metadata')
    

In [None]:
# Read in the data package that was already made (using getBCODMOinfo.ipynb).
# The descriptor is loaded from 'datapackage.json' in the current working directory;
# the resources of this package are what the cells below iterate over.
biosscope = Package('datapackage.json')
# Alternative kept for reference (NOTE: JSON_FILE is not defined in the cells shown here):
# biosscope = Package(JSON_FILE)

In [None]:
# Collect the name of every resource in the package.
out = [item.name for item in biosscope.resources]

out #this will be a list, leave here so I see the filenames

In [None]:
# Most datasets need additional processing to get depth and/or time.
# The names are grouped here by the kind of processing they need, and the groups
# are used later to spin each file off to the matching script.
# This essentially splits by PI, since each PI has different project metadata.

# Datasets that are not converted.
toSkip = {
    # the zooscan_images_* entries will not actually match below because they do not end in csv
    'zooscan_images_bats_ae1614',
    'zooscan_images_bats_ae1712',
    'zooscan_images_bats_ae1819',
    'zooscan_images_bats_ae1830',
    # this one has data, but no time or location information
    'zooscan_bats_biovolume',
}

# In situ pump datasets.
pumpData = {
    '920443_v1_biosscope_in_situ_pump_chemical_data',
    '964684_v1_amino_acids_biosscope_2021',
    '964801_v1_pump_carbohydrates_biosscope_2021',
    '964826_v1_pump_poc_pon_biosscope',
}

# Zooplankton datasets.
zoopData = {
    'zooscan_mocness_output',
}

# Discrete sample datasets.
discreteData = {
    'survey_biogeochemical',
}

# Work through all of the data files

In [None]:
for idx in range(len(biosscope.resources)):
    data_url = biosscope.resources[idx].path
    if data_url.endswith('.csv'):
        checkFile = re.split('/',data_url).pop().replace('.csv','')
        #print(data_url)
        #have a few options and trying to group these based on added steps needed to make the data file ready
        if checkFile in toSkip:
            print('skip ' + checkFile)
        elif checkFile in pumpData:
            %run convert_pumpData.py {idx}   
        elif checkFile in zoopData:
            %run convert_zoopData.py {idx}   
        elif checkFile in discreteData:
            %run convert.py {idx}   
        else:
            print('no match ' + checkFile)