# BCODMOtoCMAP
Setup a script to get data from BCO-DMO to CMAP format\
Krista Longnecker, 25 June 2025\
Working in Python, though I could do this in any language.\
API at BCD-DMO is listed as in progress, so seems like I will have to gather the data manually.

In [121]:
import pandas as pd
#might not need both of these, but import them for now
from pandas import ExcelFile
from openpyxl import Workbook
pd.options.mode.copy_on_write = True #will be default, may as well set to true

In [122]:
#BIOS-SCOPE discrete data are here: 
wPage = 'https://datadocs.bco-dmo.org/file/KAAGNEBc2V61jx/survey_biogeochemical.csv'

#multiple other options (though I manually found these names)
#https://datadocs.bco-dmo.org/file/m7zBJ4GTjvNzDY/964826_v1_pump_poc_pon_biosscope.csv
#https://datadocs.bco-dmo.org/file/XYG3xXqfkkjG5x/964684_v1_amino_acids_biosscope_2021.csv
#https://datadocs.bco-dmo.org/file/7Dxl3PMCmEl8lX/964801_v1_pump_carbohydrates_biosscope_2021.csv

In [123]:
#super easy once I have the URL:
bcodmo = pd.read_csv(wPage,na_values = ['nd']) #now I have NaN...but they get dropped when writing the file

In [124]:
bcodmo.head()

Unnamed: 0,Program,Cruise_ID,Cast,Niskin,decy,ISO_DateTime_UTC,Latitude,Longitude,Depth,Nominal_Depth,...,V1V2_ID,V4_18s_ID,Sunrise,Sunset,MLD_dens125,MLD_bvfrq,MLD_densT2,DCM,VertZone,Season
0,BIOSSCOPE,AE1614,1,1,2016.5207,2016-07-09T14:04:00Z,32.1647,-64.5009,10.432,10,...,,,922,2322,19.8589,19.8589,15.8872,95.3077,0.0,3
1,BIOSSCOPE,AE1614,1,2,2016.5207,2016-07-09T14:04:00Z,32.1647,-64.5009,10.327,10,...,,,922,2322,19.8589,19.8589,15.8872,95.3077,0.0,3
2,BIOSSCOPE,AE1614,1,3,2016.5207,2016-07-09T14:04:00Z,32.1647,-64.5009,10.506,10,...,,,922,2322,19.8589,19.8589,15.8872,95.3077,0.0,3
3,BIOSSCOPE,AE1614,1,4,2016.5207,2016-07-09T14:04:00Z,32.1647,-64.5009,10.174,10,...,,,922,2322,19.8589,19.8589,15.8872,95.3077,0.0,3
4,BIOSSCOPE,AE1614,1,5,2016.5207,2016-07-09T14:04:00Z,32.1647,-64.5009,10.365,10,...,,,922,2322,19.8589,19.8589,15.8872,95.3077,0.0,3


### Setup the data into the CMAP format, generating a multi-page Excel file

### data

In [125]:
# Required variables are time, lat, lon, depth
df = pd.DataFrame(columns=['time','lat','lon','depth'])

In [126]:
# time --> CMAP requirement is this: #< Format  %Y-%m-%dT%H:%M:%S,  Time-Zone:  UTC,  example: 2014-02-28T14:25:55 >
# Do this in two steps so I can check the output more easily
temp = bcodmo.copy()
temp['date'] = pd.to_datetime(temp['decy'], unit='D', origin='1970-01-01')
temp['date_cmap'] = temp['date'].dt.strftime("%Y-%m-%dT%H:%M:%S")
df['time'] = temp['date_cmap']

In [127]:
# lat (-90 to 90) and lon (-180 to 180); use variable names at BCO-DMO 
df['lat'] = bcodmo['Latitude']
df['lon'] = bcodmo['Longitude']  #BCO-DMO already has this as negative
df['depth'] = bcodmo['Depth']

In [128]:
# all remaining columns in bcodmo can be considered data (may do some trimming later for times)
bcodmo_trim = bcodmo.drop(columns=['Latitude', 'Longitude', 'Depth'])
nVariables = bcodmo_trim.shape[1] #@Python, so indexing starts with 0 (rows, 1 is the columns)
# and then add to the datafile I am assembling (essentially re-order columns
df = pd.concat([df, bcodmo_trim], axis=1)

### metadata about the variables

In [129]:
# work on the second sheet: metadata about the variables; use the CMAP dataset template to setup the dataframe so I get the column headers right
fName = 'datasetTemplate.xlsx'
sheet_name = 'vars_meta_data'
vars = pd.read_excel(fName, sheet_name=sheet_name)
cols = vars.columns.tolist()
#df2 will be the dataframe with the metadat about the variables, set it up empty here
df2 = pd.DataFrame(columns=cols,index = pd.RangeIndex(1,nVariables,1))


In [130]:
#original plan read in the second sheet of the BIOS-SCOPE discrete data file...but that is not at BCO-DMO so I need another plan.

# this is only a partial list of variables for the moment
#df2['var_short_name'] = wbVar['Header']
#df2['var_long_name'] = wbVar['Description']
#df2[,'var_sensor'] = 'need this'
#df2['var_unit'] = wbVar['Unit']
df2.loc[:,('var_spatial_res')] = 'irregular'
df2.loc[:, ('var_temporal_res')] = 'irregular'

In [131]:
df2.head()

Unnamed: 0,var_short_name,var_long_name,var_sensor,var_unit,var_spatial_res,var_temporal_res,var_discipline,visualize,var_keywords,var_comment
1,,,,,irregular,irregular,,,,
2,,,,,irregular,irregular,,,,
3,,,,,irregular,irregular,,,,
4,,,,,irregular,irregular,,,,
5,,,,,irregular,irregular,,,,


### metadata about the project

In [132]:
# finally gather up the dataset_meta_data
# assemble the details here, might setup in a separate text file later
df3 = pd.DataFrame({
    'dataset_short_name': ['BIOSSCOPE_v1'],
    'dataset_long_name': ['BIOS-SCOPE discrete sample data'],
    'dataset_version': ['1.0'],
    'dataset_release_date': ['2025-06-25'],
    'dataset_make': ['observation'],
    'dataset_source': ['Craig Carlson, Bermuda Institute of Ocean Sciences'],
    'dataset_distributor': ['Craig Carlson, Bermuda Institute of Ocean Sciences'],
    'dataset_acknowledgement': ['We thank the BIOS-SCOPE project team and the BATS team for assistance with sample collection, processing, and analysis. The efforts of the captains, crew, and marine technicians of the R/V Atlantic Explorer are a key aspect of the success of this project. This work supported by funding from the Simons Foundation International.'],
    'dataset_history': [''],
    'dataset_description': ['This dataset includes analyses from Niskin bottle samples collected on R/V Atlantic Explorer cruises as part of the BIOS-SCOPE campaign in the time period from 2016 until 2025. Included are CTD data, and survey biogeochemical samples including inorganic nutrients, particulate organic carbon and nitrogen, dissolved organic carbon, dissolved organic nitrogen, total dissolved amino acids, bacterial abundance and production.'],
    'dataset_references': ['Carlson, C. A., Giovannoni, S., Liu, S., Halewood, E. (2025) BIOS-SCOPE survey biogeochemical data as collected on Atlantic Explorer cruises (AE1614, AE1712, AE1819, AE1916) from 2016 through 2019. Biological and Chemical Oceanography Data Management Office (BCO-DMO). (Version 1) Version Date 2021-10-17. doi:10.26008/1912/bco-dmo.861266.1 [25 June 2025]'],
    'climatology': [0]
    })

#get the list of cruise names from the bcodmo data file
t = pd.DataFrame(bcodmo['Cruise_ID'].unique())
t.columns = ['cruise_names']
df3 = pd.concat([df3,t],axis=1,ignore_index = True)

In [133]:
df3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,BIOSSCOPE_v1,BIOS-SCOPE discrete sample data,1.0,2025-06-25,observation,"Craig Carlson, Bermuda Institute of Ocean Scie...","Craig Carlson, Bermuda Institute of Ocean Scie...",We thank the BIOS-SCOPE project team and the B...,,This dataset includes analyses from Niskin bot...,"Carlson, C. A., Giovannoni, S., Liu, S., Halew...",0.0,AE1614
1,,,,,,,,,,,,,AE1712
2,,,,,,,,,,,,,AE1819
3,,,,,,,,,,,,,AE1916


In [134]:
fName_CMAP = 'data/forCMAPfromBCODMOwithPython.xlsx'
dataset_names = {'data': df, 'dataset_meta_data': df3, 'vars_meta_data': df2}
with pd.ExcelWriter(fName_CMAP) as writer:
    for sheet_name, data in dataset_names.items():
        data.to_excel(writer, sheet_name=sheet_name, index=False)


In [160]:
from bs4 import BeautifulSoup
import requests
from urllib.request import urlopen
from urllib.parse import urljoin

In [191]:
# url = 'https://www.bco-dmo.org/dataset/861266/data/view/903496'
url = 'https://www.bco-dmo.org/dataset/861266'

#can view the data:
url_view = 'https://www.bco-dmo.org/dataset/861266/data/view'
#data description is here...and parameters will list out the metadata for the variables
url_description = 'https://www.bco-dmo.org/dataset/861266/description'

In [192]:
page = urlopen(url_description)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, 'html.parser')


In [193]:
for link in soup.find_all('a'):
    print(link.get('href'))

None
https://www.bco-dmo.org/dataset/861266
https://www.bco-dmo.org/project/826178
https://www.bco-dmo.org/person/50575
https://www.bco-dmo.org/person/514364
https://www.bco-dmo.org/person/51543
https://www.bco-dmo.org/person/861289
https://www.bco-dmo.org/person/816518
None
#coverage
#description
#acquisition
#processing
#data-files
#supplemental-files
#related-publications
#related-datasets
#parameters
#instruments
#deployments
#projects
#funding
None
None
http://www.msi.ucsb.edu/services/analytical-lab
None
#toc
#top
None
#toc
#top
None
#toc
#top
None
https://doi.org/10.1016/j.dsr2.2010.02.022
https://doi.org/10.1016/j.dsr2.2010.02.013
https://doi.org/10.1029/2004JC002378
https://doi.org/10.1002/lno.11405
https://doi.org/10.1016/j.dsr2.2013.01.008
https://github.com/nbaetge/naames_export_ms/blob/master/Rmd/ARGO.md
https://doi.org/10.4319/lo.1980.25.5.0943
http://www.gso.uri.edu/dcsmith/page3/page19/assets/smithazam92.PDF
https://doi.org/https://doi.org/10.1029/92JC00407
https://doi.

In [164]:
page = urlopen(url)
html = page.read().decode("utf-8")
soup = BeautifulSoup(html, 'html.parser')

In [194]:
#print(soup.get_text())
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   BIOS-SCOPE survey biogeochemical data as collected on Atlantic Explorer cruises (AE1614, AE1712, AE1819, AE1916) from 2016 through 2019
  </title>
  <style type="text/css">
   body {font-family: verdana}
      .label {font-weight: bold}
      table, img, blockquote {page-break-inside: avoid;}
      table tr {page-break-inside: avoid}
      div#data-files-section table, div#supplemental-files-section table {
        border-spacing: 0px;
      }
      div#data-files-section td, div#supplemental-files-section td {
        border: solid 1px #666;
        padding: .4em;
        vertical-align: top;
      }
      div#data-files-section th, div#supplemental-files-section th {
        background: #eee;
        border: solid 1px #666;
        padding: .4em;
        text-align: left;
      }
      .bcodmo-block {display: block;}
  </style>
 </head>
 <body>
  <a name="top">
  </a>
  <div style="width: 98%; display: block">
   <div class="dataset-descrip

In [190]:
for link in soup.find_all('a'):
    print(link.get('href'))

/
None
None
/search/dataset
None
/search/dataset
/search/deployment
/search/project
/search/program
/search/instrument
/search/people
/search/parameter
/search/award
/search/platform
/search/publication
/search/affiliation
/search/funding
/how-to/access-and-reuse/database-search
None
https://submit.bco-dmo.org/welcome
https://submit.bco-dmo.org/welcome
/how-to/contribute
None
/how-to/prepare/data_management_plan
/how-to/prepare
/how-to/education-and-training
/how-to/frequently-asked-questions-faqs
/how-to
None
https://blog.bco-dmo.org/
/about
/team
/policies
None
/search/dataset
/search/dataset
/how-to/access-and-reuse/database-search
/how-to/access-and-reuse/bco-dmo-api
https://submit.bco-dmo.org/welcome
https://submit.bco-dmo.org/welcome
/how-to/prepare
/how-to/contribute
/about
/team
/policies
/products
/how-to/education-and-training
/how-to
/how-to/frequently-asked-questions-faqs
/person/50575
/affiliation/76
/person/514364
/affiliation/9
/person/51543
/affiliation/76
/person/86128

In [172]:
soup.find_all("img") #only searching in the tags...not a general text search ? Seems odd

[<img alt="BCO-DMO Logo" data-nimg="1" decoding="async" height="41.25" loading="lazy" src="/BCO-DMO-LogoNew.svg" style="color:transparent" width="164.033"/>,
 <img alt="NSF Logo" data-nimg="1" decoding="async" height="60" loading="lazy" src="/_next/image?url=%2FNSF-Logo.png&amp;w=128&amp;q=75" srcset="/_next/image?url=%2FNSF-Logo.png&amp;w=64&amp;q=75 1x, /_next/image?url=%2FNSF-Logo.png&amp;w=128&amp;q=75 2x" style="color:transparent" width="60"/>]

In [196]:
soup.title

<title>BIOS-SCOPE survey biogeochemical data as collected on Atlantic Explorer cruises (AE1614, AE1712, AE1819, AE1916) from 2016 through 2019</title>

In [195]:
soup.find_all(string = "Parameters")

['Parameters', 'Parameters']

In [179]:
soup

<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"/><meta content="width=device-width, initial-scale=1" name="viewport"/><link data-precedence="next" href="/_next/static/css/25b5ac988b69c4a6.css" rel="stylesheet"/><link data-precedence="next" href="/_next/static/css/a6f84c82ce130fad.css" rel="stylesheet"/><link data-precedence="next" href="/_next/static/css/6c968578289dcc86.css" rel="stylesheet"/><link data-precedence="next" href="/_next/static/css/f07ed7640274cbc5.css" rel="stylesheet"/><link as="script" fetchpriority="low" href="/_next/static/chunks/webpack-720ea280b8b2bd8b.js" rel="preload"/><script async="" src="/_next/static/chunks/fd9d1056-a313fadf9411ff81.js"></script><script async="" src="/_next/static/chunks/2117-1ca00259eafc8f05.js"></script><script async="" src="/_next/static/chunks/main-app-be3d940aef6bd3eb.js"></script><script async="" src="/_next/static/chunks/e2799680-541c871d8bafd680.js"></script><script async="" src="/_next/static/chunks/4035-187f68ca241c9922.

In [142]:
file_links = []
for link in soup.find_all('a', href=True):
    href = link['href']
    if href.endswith(('.pdf', '.zip', '.xlsx', '.docx')): # Add desired file extensions
        file_links.append(href)

In [153]:

base_url = "https://www.bco-dmo.org/dataset/861266/data/view/903496"
absolute_file_links = [urljoin(base_url, link) for link in file_links]

In [154]:
for file_url in absolute_file_links:
    file_name = file_url.split('/')[-1] # Extract filename from URL
    try:
        file_response = requests.get(file_url, stream=True)
        file_response.raise_for_status() # Raise an exception for bad status codes
        with open(file_name, 'wb') as f:
            for chunk in file_response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"Downloaded: {file_name}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {file_url}: {e}")

In [155]:
absolute_file_links

[]

In [None]:
response = requests.get(url)
html_content = response.text

page =urlopen(url)
html_bytes = page.read()
html = html_bytes.decode("utf-8")

Next cell allows me to store code that is *not* to run below this cell

In [135]:
raise "SystemExit(\"Stop execution here\")

SyntaxError: unterminated string literal (detected at line 1); perhaps you escaped the end quote? (321358534.py, line 1)