Install PyCO2SYS if your environment does not already have it.

In [None]:
# pip install PyCO2SYS

In [None]:
import pandas as pd
import xarray as xr
import numpy as np
import numpy.ma as ma
%matplotlib inline
%config InlineBackend.figure_format = 'jpg'
%config InlineBackend.print_figure_kwargs = {'dpi':300, 'bbox_inches': 'tight'}
import matplotlib as mpl
from matplotlib.ticker import AutoMinorLocator
import matplotlib.pyplot as plt
import scipy
import sklearn.linear_model 
import pickle
import re
import PyCO2SYS as pyco2
import requests
from datetime import datetime
import tempfile

In [None]:
# Create the GCS filesystem handle that the rest of the notebook uses for all reads and writes
import gcsfs 
fs = gcsfs.GCSFileSystem()

In [None]:
# List the contents of the raw-data folder on GCS (replace YOUR_GCS_PATH with your own path)
fs.ls('YOUR_GCS_PATH/Taylor_data/databases/raw')

In [None]:
# Download the BATS bottle file and store it directly on GCS.
# Set the download URL and the save path below.
# A message is printed whether the download succeeds, is skipped, or fails.

# URL of the BATS data.
# Available here: https://bats.bios.asu.edu/bats-data/
# Select "bats_bottle.txt" and click "copy link" on the top right to get the URL.
# The version information of the file is shown in the next cell's output.
# A copied link may end with "dl=0". If so, change it to "dl=1" so that this
# script can download the file directly.
bats_url = 'https://www.dropbox.com/scl/fi/1jygi9z845irab8fo22kj/bats_bottle.txt?rlkey=etw7w4c30umb0gkekgi20ec76&e=1&st=bzpbp9a2&dl=1'

# GCS path to save the data
save_path = 'YOUR_GCS_PATH/Taylor_data/databases'
# Edit the file name if needed.
gcs_path = f'{save_path}/raw/bats_bottle.txt'

# To change the save path or file name of the processed output,
# search for "zarr_gcs_path" later in this notebook.

fs = gcsfs.GCSFileSystem()

if fs.exists(gcs_path):
    print(f"File already exists at {gcs_path}. Skipping download.")
else:
    # The timeout keeps the cell from hanging forever on a stalled connection,
    # and the context manager releases the streamed connection when done.
    with requests.get(bats_url, stream=True, timeout=60) as response:
        if response.status_code == 200:
            try:
                with fs.open(gcs_path, 'wb') as gcs_file:
                    # 1 MiB chunks: far fewer write calls than 1 KiB chunks
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        if chunk:  # filter out keep-alive new chunks
                            gcs_file.write(chunk)
            except Exception:
                # An interrupted transfer must not leave a truncated file behind,
                # otherwise the next run would skip the download and parse it.
                if fs.exists(gcs_path):
                    fs.rm(gcs_path)
                raise
            print(f"Succesfully downloaded and saved to {gcs_path}")
        else:
            print(f"Error! Could not download: {response.status_code}")

In [None]:
# Read the raw BATS file from GCS; every line is decoded to text right away
with fs.open(gcs_path, 'rb') as gcs_file:
    lines = [raw_line.decode('utf-8') for raw_line in gcs_file.readlines()]

# The table (column-name row first) starts at line index 58; everything before
# it is file commentary. The offset is set by hand and should stay valid as
# long as BATS keeps the file format.
data = lines[58:]

# Column names: strip the commas from the header row, then split on whitespace
columns = data[0].replace(',', '').strip().split()

# One list of whitespace-separated fields per data row
records = [row.strip().split() for row in data[1:]]
df = pd.DataFrame(records, columns=columns)

# Detect the period covered by this version of the file.
# This works as long as BATS keeps the format of the 3rd line, currently:
# BATS discrete Bottle Data (excluding HPLC pigments) for  October 1988 (cruise#10001) to June 2023 (cruise# 10405)
info_line = lines[2].strip()
match = re.search(r'for\s+([A-Za-z]+\s+\d{4})\s+\(.*?\)\s+to\s+([A-Za-z]+\s+\d{4})\s+\(.*?\)', info_line)
if match is None:
    raise ValueError("Take care. Fail to detect the latest cruise date!")

# The two capture groups are the "<Month> <year>" of the first and last cruise
start_date, end_date = match.groups()
print(f"The current version is : from {start_date} to  {end_date}")

In [None]:
# Show the column names parsed from the header row
df.columns

In [None]:
# Columns that are not used anywhere in this notebook
unused_columns = [
    'time', 'Pro', 'Syn', 'Naneu', 'Piceu', 'TN', 'Bact', 'latN', 'lonW',
    'QF', 'BSi', 'LSi', 'O2(1)', 'TOC', 'PON', 'OxFix', 'POC', 'Sig-th',
    'Anom1', 'POP', 'TDP', 'NO21', 'NO31',
]
df = df.drop(columns=unused_columns)

# Preview the first three remaining rows
df.head(3)

In [None]:
# Convert the measurement columns from text to numbers.
# errors='coerce' turns any value that cannot be parsed into NaN.
numeric_columns = ('Depth', 'Temp', 'CTD_S', 'Sal1', 'CO2', 'Alk', 'PO41', 'Si1', 'SRP')
for column_name in numeric_columns:
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')

df.head()

In [None]:
# Keep only the samples shallower than 10 meters that have positive alkalinity,
# salinity and DIC (the CO2 column), and whose temperature, silicate and
# phosphate are not -999 (presumably the missing-value marker, since -999 is
# replaced by NaN later in this notebook).
surface_mask = (
    (df['Depth'] < 10)
    & (df['Alk'] > 0)
    & (df['Sal1'] > 0)
    & (df['Temp'] != -999)
    & (df['CO2'] > 0)
    & (df['Si1'] != -999)
    & (df['PO41'] != -999)
)

# .copy() makes ds an independent DataFrame: later cells modify ds (value
# replacement, new column), and without the copy pandas may treat those writes
# as chained assignment on a subset of df and warn about them.
ds = df[surface_mask].copy()
ds.head()

In [None]:
# True if any cell of ds is still equal to -999
has_invalid_values = (ds == -999).any().any()
has_invalid_values

In [None]:
# Replace every remaining -999 with NaN.
# The result is rebound to ds instead of using inplace=True: ds comes from
# filtering df, and rebinding keeps the operation explicit and independent of
# whether pandas considers ds a view or a copy.
ds = ds.replace(-999, np.nan)

# Re-check: this is expected to be False now
has_invalid_values = (ds == -999).any().any()
has_invalid_values

In [None]:
# Define input conditions
# These are the inputs used to calculate pCO2 with the PyCO2SYS package
par1type =    1  # The first parameter supplied is of type "1", which is "alkalinity"
par1     = ds.Alk  # Value of the first parameter
par2type =    2  # The second parameter supplied is of type "2", which is "DIC"
par2     = ds.CO2  # Value of the second parameter
sal      = ds.Sal1  # Salinity of the sample
tempin   = ds.Temp  # Temperature at input conditions
presin   = 0  # Pressure at input conditions
sil      = ds.Si1  # Concentration of silicate in the sample (in umol/kg); the original note also listed a constant 50
po4      = ds.PO41  # Concentration of phosphate in the sample (in umol/kg); the original note also listed a constant 2
pHscale  =    1  # pH scale at which the input pH is reported ("1" means "Total Scale")
                 #  - doesn't matter in this example
k1k2c    =    10  # Choice of H2CO3 and HCO3- dissociation constants K1 and K2. "10" is used on Galen's recommendation; the earlier choice was "4" ("Mehrbach refit"). NOTE(review): option 10 is presumably Lueker et al. (2000) - confirm in the PyCO2SYS docs
kso4c    =    1  # Choice of HSO4- dissociation constants KSO4 ("1" means "Dickson")

In [None]:
# Run CO2SYS with the inputs defined in the previous cell.
# The two carbonate-system parameters and their type codes are passed
# positionally; all other settings are collected here and passed by keyword.
co2sys_settings = {
    'salinity': sal,
    'temperature': tempin,
    'pressure': presin,
    'total_silicate': sil,
    'total_phosphate': po4,
    'opt_pH_scale': pHscale,
    'opt_k_carbonic': k1k2c,
    'opt_k_bisulfate': kso4c,
}
CO2dict = pyco2.sys(par1, par2, par1type, par2type, **co2sys_settings)
print('PyCO2SYS ran successfully!')

In [None]:
# Quick-look time series of the computed pCO2 against decimal year.

# decy is parsed to numbers here because the notebook never converts it;
# plotted as text it would be treated as categories, not a continuous time axis.
decimal_year = pd.to_numeric(ds['decy'], errors='coerce')

# plt.subplots returns a (figure, axes) pair, so unpack both
fig, ax = plt.subplots(1, 1, figsize=(12, 2))
ax.scatter(decimal_year, CO2dict['pCO2'])
ax.set_xlim(decimal_year.min(), decimal_year.max())

In [None]:
# Show the columns available in the filtered data
ds.columns

In [None]:
# Parse the yyyymmdd stamps into datetimes in a single vectorized call
# (instead of calling strptime once per row).
# assign() returns a new DataFrame, so no column is written into a frame that
# was produced by filtering df, which avoids pandas' chained-assignment warning.
ds = ds.assign(date=pd.to_datetime(ds['yyyymmdd'].astype(str), format='%Y%m%d'))

In [None]:
# Preview the first rows of ds, now including the date column
ds.head()

In [None]:
# Month name -> zero-padded month number.
# Only used to build the output file name, not in the data processing itself.
date_mapping = {
    month_name: f"{month_number:02d}"
    for month_number, month_name in enumerate(
        (
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ),
        start=1,
    )
}


def format_date(date_str):
    """Convert a "<Month> <year>" string such as "October 1988" to "1988-10".

    Raises ValueError if date_str does not consist of exactly two
    whitespace-separated parts, and KeyError if the month is not one of the
    full English month names in date_mapping.
    """
    month_name, year = date_str.split()
    return "-".join((year, date_mapping[month_name]))

# Turn the detected first/last cruise months into "YYYY-MM" for the file name
formatted_start_date, formatted_end_date = (
    format_date(month_and_year) for month_and_year in (start_date, end_date)
)

print(f"Time period is: {formatted_start_date} and {formatted_end_date}")

In [None]:
# Output variable name -> source column in ds (order defines the variable order)
source_column_for = {
    'temp': 'Temp',
    'salinity': 'Sal1',
    'CO2': 'CO2',
    'alk': 'Alk',
    'Si': 'Si1',
    'PO4': 'PO41',
}

# Every variable is one-dimensional along "time"
data_vars = {
    variable_name: (['time'], ds[column_name])
    for variable_name, column_name in source_column_for.items()
}
# pCO2 comes from the CO2SYS results rather than from ds
data_vars['spco2'] = (['time'], CO2dict['pCO2'])

bats_out = xr.Dataset(data_vars, coords={'time': (['time'], ds['date'])})

In [None]:
# Preview the path the processed dataset is written to.
# This now matches zarr_gcs_path in the save cell; the earlier preview showed
# a ".nc" file directly under save_path, which is not where the data is saved.
f'{save_path}/processed/bats_spco2_{formatted_start_date}-{formatted_end_date}.zarr'

In [None]:
# Save the dataset to GCS as a zarr store.
# The period covered by the data is part of the store name.
zarr_gcs_path = f'{save_path}/processed/bats_spco2_{formatted_start_date}-{formatted_end_date}.zarr'

# mode='w' replaces a store that already exists at this path
bats_out.to_zarr(zarr_gcs_path, mode='w')

# Check whether the store was saved successfully
if fs.exists(zarr_gcs_path):
    print(f"Successfully saved to {zarr_gcs_path}")
else:
    print(f"Failed to save to {zarr_gcs_path}")

In [None]:
# List the contents of the processed-data folder on GCS
fs.ls(f'{save_path}/processed')


In [None]:
# To delete a file from GCS, remove the surrounding triple quotes and run the code below
'''
delete_path = 'YOUR_GCS_PATH/Taylor_data/GLODAPv2.2023_Merged_Master_File.csv'

fs = gcsfs.GCSFileSystem()

if fs.exists(delete_path):
    fs.rm(delete_path, recursive=True)
    print(f"The file {delete_path} has been deleted")
else:
    print(f"The file {delete_path} does not exist")
'''