# BCODMOtoCMAP
Setup a script to get data from BCO-DMO to CMAP format\
Krista Longnecker, 25 June 2025\
Working in Python, though I could do this in any language.\
API at BCD-DMO is listed as in progress, so seems like I will have to gather the data manually.

In [121]:
import pandas as pd
#might not need both of these, but import them for now
from pandas import ExcelFile
from openpyxl import Workbook
pd.options.mode.copy_on_write = True #will be default, may as well set to true

In [122]:
#BIOS-SCOPE discrete data are here: 
wPage = 'https://datadocs.bco-dmo.org/file/KAAGNEBc2V61jx/survey_biogeochemical.csv'

#multiple other options (though I manually found these names)
#https://datadocs.bco-dmo.org/file/m7zBJ4GTjvNzDY/964826_v1_pump_poc_pon_biosscope.csv
#https://datadocs.bco-dmo.org/file/XYG3xXqfkkjG5x/964684_v1_amino_acids_biosscope_2021.csv
#https://datadocs.bco-dmo.org/file/7Dxl3PMCmEl8lX/964801_v1_pump_carbohydrates_biosscope_2021.csv

In [123]:
#super easy once I have the URL:
bcodmo = pd.read_csv(wPage,na_values = ['nd']) #now I have NaN...but they get dropped when writing the file

In [124]:
bcodmo.head()

Unnamed: 0,Program,Cruise_ID,Cast,Niskin,decy,ISO_DateTime_UTC,Latitude,Longitude,Depth,Nominal_Depth,...,V1V2_ID,V4_18s_ID,Sunrise,Sunset,MLD_dens125,MLD_bvfrq,MLD_densT2,DCM,VertZone,Season
0,BIOSSCOPE,AE1614,1,1,2016.5207,2016-07-09T14:04:00Z,32.1647,-64.5009,10.432,10,...,,,922,2322,19.8589,19.8589,15.8872,95.3077,0.0,3
1,BIOSSCOPE,AE1614,1,2,2016.5207,2016-07-09T14:04:00Z,32.1647,-64.5009,10.327,10,...,,,922,2322,19.8589,19.8589,15.8872,95.3077,0.0,3
2,BIOSSCOPE,AE1614,1,3,2016.5207,2016-07-09T14:04:00Z,32.1647,-64.5009,10.506,10,...,,,922,2322,19.8589,19.8589,15.8872,95.3077,0.0,3
3,BIOSSCOPE,AE1614,1,4,2016.5207,2016-07-09T14:04:00Z,32.1647,-64.5009,10.174,10,...,,,922,2322,19.8589,19.8589,15.8872,95.3077,0.0,3
4,BIOSSCOPE,AE1614,1,5,2016.5207,2016-07-09T14:04:00Z,32.1647,-64.5009,10.365,10,...,,,922,2322,19.8589,19.8589,15.8872,95.3077,0.0,3


### Setup the data into the CMAP format, generating a multi-page Excel file

### data

In [125]:
# Required variables are time, lat, lon, depth
df = pd.DataFrame(columns=['time','lat','lon','depth'])

In [126]:
# time --> CMAP requirement is this: #< Format  %Y-%m-%dT%H:%M:%S,  Time-Zone:  UTC,  example: 2014-02-28T14:25:55 >
# Do this in two steps so I can check the output more easily
temp = bcodmo.copy()
temp['date'] = pd.to_datetime(temp['decy'], unit='D', origin='1970-01-01')
temp['date_cmap'] = temp['date'].dt.strftime("%Y-%m-%dT%H:%M:%S")
df['time'] = temp['date_cmap']

In [127]:
# lat (-90 to 90) and lon (-180 to 180); use variable names at BCO-DMO 
df['lat'] = bcodmo['Latitude']
df['lon'] = bcodmo['Longitude']  #BCO-DMO already has this as negative
df['depth'] = bcodmo['Depth']

In [128]:
# all remaining columns in bcodmo can be considered data (may do some trimming later for times)
bcodmo_trim = bcodmo.drop(columns=['Latitude', 'Longitude', 'Depth'])
nVariables = bcodmo_trim.shape[1] #@Python, so indexing starts with 0 (rows, 1 is the columns)
# and then add to the datafile I am assembling (essentially re-order columns
df = pd.concat([df, bcodmo_trim], axis=1)

### metadata about the variables

In [129]:
# work on the second sheet: metadata about the variables; use the CMAP dataset template to setup the dataframe so I get the column headers right
fName = 'datasetTemplate.xlsx'
sheet_name = 'vars_meta_data'
vars = pd.read_excel(fName, sheet_name=sheet_name)
cols = vars.columns.tolist()
#df2 will be the dataframe with the metadat about the variables, set it up empty here
df2 = pd.DataFrame(columns=cols,index = pd.RangeIndex(1,nVariables,1))


In [130]:
#original plan read in the second sheet of the BIOS-SCOPE discrete data file...but that is not at BCO-DMO so I need another plan.

# this is only a partial list of variables for the moment
#df2['var_short_name'] = wbVar['Header']
#df2['var_long_name'] = wbVar['Description']
#df2[,'var_sensor'] = 'need this'
#df2['var_unit'] = wbVar['Unit']
df2.loc[:,('var_spatial_res')] = 'irregular'
df2.loc[:, ('var_temporal_res')] = 'irregular'

In [131]:
df2.head()

Unnamed: 0,var_short_name,var_long_name,var_sensor,var_unit,var_spatial_res,var_temporal_res,var_discipline,visualize,var_keywords,var_comment
1,,,,,irregular,irregular,,,,
2,,,,,irregular,irregular,,,,
3,,,,,irregular,irregular,,,,
4,,,,,irregular,irregular,,,,
5,,,,,irregular,irregular,,,,


### metadata about the project

In [132]:
# finally gather up the dataset_meta_data
# assemble the details here, might setup in a separate text file later
df3 = pd.DataFrame({
    'dataset_short_name': ['BIOSSCOPE_v1'],
    'dataset_long_name': ['BIOS-SCOPE discrete sample data'],
    'dataset_version': ['1.0'],
    'dataset_release_date': ['2025-06-25'],
    'dataset_make': ['observation'],
    'dataset_source': ['Craig Carlson, Bermuda Institute of Ocean Sciences'],
    'dataset_distributor': ['Craig Carlson, Bermuda Institute of Ocean Sciences'],
    'dataset_acknowledgement': ['We thank the BIOS-SCOPE project team and the BATS team for assistance with sample collection, processing, and analysis. The efforts of the captains, crew, and marine technicians of the R/V Atlantic Explorer are a key aspect of the success of this project. This work supported by funding from the Simons Foundation International.'],
    'dataset_history': [''],
    'dataset_description': ['This dataset includes analyses from Niskin bottle samples collected on R/V Atlantic Explorer cruises as part of the BIOS-SCOPE campaign in the time period from 2016 until 2025. Included are CTD data, and survey biogeochemical samples including inorganic nutrients, particulate organic carbon and nitrogen, dissolved organic carbon, dissolved organic nitrogen, total dissolved amino acids, bacterial abundance and production.'],
    'dataset_references': ['Carlson, C. A., Giovannoni, S., Liu, S., Halewood, E. (2025) BIOS-SCOPE survey biogeochemical data as collected on Atlantic Explorer cruises (AE1614, AE1712, AE1819, AE1916) from 2016 through 2019. Biological and Chemical Oceanography Data Management Office (BCO-DMO). (Version 1) Version Date 2021-10-17. doi:10.26008/1912/bco-dmo.861266.1 [25 June 2025]'],
    'climatology': [0]
    })

#get the list of cruise names from the bcodmo data file
t = pd.DataFrame(bcodmo['Cruise_ID'].unique())
t.columns = ['cruise_names']
df3 = pd.concat([df3,t],axis=1,ignore_index = True)

In [133]:
df3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,BIOSSCOPE_v1,BIOS-SCOPE discrete sample data,1.0,2025-06-25,observation,"Craig Carlson, Bermuda Institute of Ocean Scie...","Craig Carlson, Bermuda Institute of Ocean Scie...",We thank the BIOS-SCOPE project team and the B...,,This dataset includes analyses from Niskin bot...,"Carlson, C. A., Giovannoni, S., Liu, S., Halew...",0.0,AE1614
1,,,,,,,,,,,,,AE1712
2,,,,,,,,,,,,,AE1819
3,,,,,,,,,,,,,AE1916


In [134]:
fName_CMAP = 'data/forCMAPfromBCODMOwithPython.xlsx'
dataset_names = {'data': df, 'dataset_meta_data': df3, 'vars_meta_data': df2}
with pd.ExcelWriter(fName_CMAP) as writer:
    for sheet_name, data in dataset_names.items():
        data.to_excel(writer, sheet_name=sheet_name, index=False)


Next cell allows me to store code that is *not* to run below this cell

In [135]:
raise "SystemExit(\"Stop execution here\")

SyntaxError: unterminated string literal (detected at line 1); perhaps you escaped the end quote? (321358534.py, line 1)