# Bottle data in bats


In [1]:
import pandas as pd
import numpy as np
import xarray as xr
import cf_xarray.units  # noqa: F401
import pint_xarray  # noqa: F401
import json

pd.options.plotting.backend = "plotly"

## Load raw data

---


In [2]:
raw_data = pd.read_csv("../../1_raw/bats_primary_production.csv", index_col=0)
raw_metadata = pd.read_csv("../../1_raw/bats_primary_production_meta.csv", index_col=0)

In [3]:
raw_metadata = raw_metadata.set_index("Variable")

raw_data["time"] = pd.to_datetime(raw_data["time"])
raw_data = raw_data.drop(
    columns=["Cruise_ID", "UNOLS", "time_out", "Lat_out", "Long_out"]
)

In [4]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3522 entries, 0 to 3521
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   time         3522 non-null   datetime64[ns]
 1   lat          3522 non-null   float64       
 2   lon          3522 non-null   float64       
 3   depth        3522 non-null   float64       
 4   pres         1554 non-null   float64       
 5   temp         1553 non-null   float64       
 6   salt         2860 non-null   float64       
 7   lt1          3477 non-null   float64       
 8   lt2          3465 non-null   float64       
 9   lt3          3442 non-null   float64       
 10  dark         3484 non-null   float64       
 11  t0           3413 non-null   float64       
 12  pp           3483 non-null   float64       
 13  niskin_flag  3522 non-null   int64         
dtypes: datetime64[ns](1), float64(12), int64(1)
memory usage: 412.7 KB


In [5]:
raw_data.describe()

Unnamed: 0,time,lat,lon,depth,pres,temp,salt,lt1,lt2,lt3,dark,t0,pp,niskin_flag
count,3522,3522.0,3522.0,3522.0,1554.0,1553.0,2860.0,3477.0,3465.0,3442.0,3484.0,3413.0,3483.0,3522.0
mean,2005-04-09 09:11:16.456558848,31.681377,-64.176102,69.97544,71.083526,21.489855,36.652562,4.314907,4.348254,4.372304,1.03572,1.099288,3.325593,1.988643
min,1988-12-18 00:00:00,31.135,-64.914,0.0,0.9,18.285,36.077,-1.05,-0.77,-1.32,0.13,0.0,-3.11,-3.0
25%,1996-07-09 00:00:00,31.665,-64.17,21.625,25.725,19.832,36.602,1.17,1.18,1.16,0.39,0.31,0.48,2.0
50%,2004-09-28 12:00:00,31.667,-64.167,61.3,69.7,20.668,36.656,3.3,3.31,3.28,0.58,0.49,2.45,2.0
75%,2013-06-16 06:35:00,31.67,-64.164,101.475,102.875,22.577,36.713,5.67,5.69,5.71,0.9,0.84,4.595,2.0
max,2022-12-16 08:11:00,32.108,-64.012,160.7,161.9,29.425,37.118,61.11,66.4,66.72,22.98,77.6,53.92,2.0
std,,0.07195,0.072507,45.365006,45.494583,2.426284,0.110324,4.482207,4.601342,4.641804,1.750784,2.727551,4.0398,0.238061


## Clean data

---


We only select the data with a flag equal to 2 (verified/acceptable).


In [6]:
print(f"Count QF flag : {np.unique(raw_data["niskin_flag"], return_counts=True)}")
# Drop when QF is not 2
raw_data = raw_data[raw_data["niskin_flag"] == 2]
raw_data = raw_data.drop(columns=["niskin_flag"])

Count QF flag : (array([-3,  2]), array([   8, 3514]))


In [7]:
fig = (
    raw_data.isna()
    .sum()
    .plot(
        kind="bar",
        title=f"Missing values per column for a total of {len(raw_data)} entries",
        labels=dict(
            index="Column",
            value="Number of missing values",
        ),
        # log y-axis
        log_y=True,
    )
)
# Rotate x-axis labels by 45 degrees
fig.update_xaxes(tickangle=-45)
# multiply the figure ratio of height by 2
fig.update_layout(height=500)
# Remove legend
fig.update_layout(showlegend=False)
fig.show()

## Manage index

---


In [8]:
raw_data["time"].value_counts().sort_index().plot(
    title="Number of entries per time",
    labels=dict(
        index="Time",
        value="Number of entries",
    ),
)

In [9]:
raw_data["lat"].astype(float).plot.hist(nbins=100, title="Latitude")

In [10]:
raw_data["lat"] = np.full_like(raw_data["lat"], 31.6)

In [11]:
raw_data["lon"].astype(float).plot.hist(nbins=100, title="Latitude")

In [12]:
raw_data["lon"] = np.full_like(raw_data["lon"], -64.2)

In [13]:
raw_data["depth"].astype(float).plot.hist(
    nbins=100, title="Depth", labels=dict(value="Depth (m)")
)

Then aggregate data by 50 meters.


In [14]:
# group depth by 50m bins
raw_data["depth"] = pd.cut(
    raw_data["depth"],
    bins=[0, 50, 100, 150, 200, 250, 300, 350, 400],
    labels=[50, 100, 150, 200, 250, 300, 350, 400],
    right=False,
)

In [15]:
raw_data["depth"].astype(float).plot.hist(title="Depth", labels=dict(value="Depth (m)"))

## Produce preprocessed data

---


In [16]:
# This show that there are duplicates index

try:
    raw_data.set_index(["time", "depth", "lat", "lon"], verify_integrity=True)
except ValueError as e:
    print(e)

Index has duplicate keys: MultiIndex([('1988-12-18 00:00:00',  50, 31.6, -64.2),
            ('1988-12-18 00:00:00', 100, 31.6, -64.2),
            ('1988-12-18 00:00:00', 150, 31.6, -64.2),
            ('1989-01-27 00:00:00',  50, 31.6, -64.2),
            ('1989-01-27 00:00:00', 100, 31.6, -64.2),
            ('1989-02-19 00:00:00',  50, 31.6, -64.2),
            ('1989-03-26 00:00:00',  50, 31.6, -64.2),
            ('1989-03-26 00:00:00', 100, 31.6, -64.2),
            ('1989-04-17 00:00:00',  50, 31.6, -64.2),
            ('1989-04-17 00:00:00', 100, 31.6, -64.2),
            ...
            ('2022-08-14 06:02:00', 150, 31.6, -64.2),
            ('2022-09-17 06:04:00',  50, 31.6, -64.2),
            ('2022-09-17 06:04:00', 100, 31.6, -64.2),
            ('2022-09-17 06:04:00', 150, 31.6, -64.2),
            ('2022-11-25 07:55:00',  50, 31.6, -64.2),
            ('2022-11-25 07:55:00', 100, 31.6, -64.2),
            ('2022-11-25 07:55:00', 150, 31.6, -64.2),
            ('2022-12-1

In [17]:
raw_data = raw_data.rename(columns={"lat": "latitude", "lon": "longitude"})

preprocessed_data = xr.Dataset.from_dataframe(
    raw_data.groupby(["time", "depth", "latitude", "longitude"]).mean()
)

for var in preprocessed_data:
    attrs = raw_metadata.loc[var].to_dict()
    # set all attrs keys to lowercase
    attrs = {k.lower(): v for k, v in attrs.items()}
    # transform unit key to units
    attrs["units"] = attrs.pop("unit")
    preprocessed_data[var].attrs = attrs

preprocessed_data["time"].attrs = {
    "standard_name": "time",
    "long_name": "time",
    "axis": "T",
}
preprocessed_data["latitude"].attrs = {
    "standard_name": "latitude",
    "long_name": "latitude",
    "axis": "Y",
    "units": "degrees_north",
}
preprocessed_data["longitude"].attrs = {
    "standard_name": "longitude",
    "long_name": "longitude",
    "axis": "X",
    "units": "degrees_east",
}
preprocessed_data["depth"].attrs = {
    "standard_name": "depth",
    "long_name": "depth",
    "axis": "Z",
    "units": "m",
}

preprocessed_data





## Final plot

---


In [18]:
raw_data.groupby(["time", "latitude", "longitude", "depth"]).mean().plot.box()





## Export preprocessed data

---


In [19]:
try:
    preprocessed_data.pint.quantify()
except Exception as e:
    print(e)
    print("Some units cannot be quantified and are only here for information.")

Cannot parse units:
 -- invalid units for variable 'lt1': mgC/m^3/day (attribute) (reason: 'mgC' is not defined in the unit registry)
 -- invalid units for variable 'lt2': mgC/m^3/day (attribute) (reason: 'mgC' is not defined in the unit registry)
 -- invalid units for variable 't0': mgC/m^3/day (attribute) (reason: 'mgC' is not defined in the unit registry)
 -- invalid units for variable 'lt3': mgC/m^3/day (attribute) (reason: 'mgC' is not defined in the unit registry)
 -- invalid units for variable 'pp': mgC/m^3/day (attribute) (reason: 'mgC' is not defined in the unit registry)
 -- invalid units for variable 'dark': mgC/m^3/day (attribute) (reason: 'mgC' is not defined in the unit registry)
 -- invalid units for variable 'salt': nan (attribute) (reason: Unit expression cannot have a scaling factor.)
Some units cannot be quantified and are only here for information.


In [20]:
preprocessed_data.to_netcdf("../../2_processed/bats_primary_production.nc", mode="w")