# Zooplankton data from HOT (Hawaii Ocean Time-series)


In [27]:
import pandas as pd
import xarray as xr
import cf_xarray.units  # noqa: F401
import pint_xarray  # noqa: F401

# Render every pandas `.plot` call in this notebook with plotly.
pd.set_option("plotting.backend", "plotly")

## Load raw data

---


In [28]:
# Read the measurement table and the table describing its variables.
# In both files the first CSV column becomes the row index.
raw_data, raw_metadata = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../../1_raw/hot_zooplankton.csv",
        "../../1_raw/hot_zooplankton_meta.csv",
    )
)

In [29]:
# Key the metadata by variable name so rows can be looked up with `.loc`.
raw_metadata = raw_metadata.set_index("Variable")

# Parse the timestamps and discard the `cruise` column in one chained step.
raw_data = raw_data.assign(time=pd.to_datetime(raw_data["time"])).drop(
    columns=["cruise"]
)

## Clean data

---


Remove the data when time and position are not known. The bar chart below counts the missing values in each column.


In [30]:
# Count the missing values in each column and show them as a bar chart.
missing_per_column = raw_data.isna().sum()
fig = missing_per_column.plot(
    kind="bar",
    title=f"Missing values per column for a total of {len(raw_data)} entries",
    labels={"index": "Column", "value": "Number of missing values"},
    log_y=True,  # logarithmic y-axis
)
# Tilt the x-axis tick labels by 45 degrees.
fig.update_xaxes(tickangle=-45)
# Fix the figure height to 500 px and hide the legend.
fig.update_layout(height=500, showlegend=False)
fig.show()

## Manage index

---


### Time

The time index must be set to a daily frequency. If multiple data points are available for a single day, the mean is taken.


In [31]:
# Count how many rows share each timestamp, ordered chronologically.
entries_per_time = raw_data["time"].value_counts().sort_index()
entries_per_time.plot(
    title="Number of entries per time",
    labels={"index": "Time", "value": "Number of entries"},
)

In [32]:
# Distribution of the sampling hour, aiming at one bin per hour of the day.
# The bin count is passed as plotly's `nbins`, exactly like the depth histogram
# further down: with the plotly backend, pandas' `bins` keyword is not
# forwarded to the figure and therefore had no effect.
raw_data["time"].dt.hour.plot.hist(
    nbins=24,
    title="Number of entries per hour",
    labels={"value": "Hour of day"},
)

In [33]:
# Flag daytime samples (hours 6 to 17 inclusive) BEFORE discarding the time of
# day: once the timestamps are floored every hour is 0, so the order matters.
# NOTE(review): this presumes the timestamps are in local time — confirm.
sampling_hour = raw_data["time"].dt.hour
raw_data = raw_data.assign(
    is_day=sampling_hour.between(6, 17),
    time=raw_data["time"].dt.floor("D"),
)
raw_data

Unnamed: 0,time,lat,lon,depth,frac,tow,vol,svol,wwt,dwt,carb,nit,abnd,is_day
0,1994-02-17,22.75,-158,181,0,8,726,,0.2642,0.0132,3.95,1.09,24.0,True
1,1994-02-17,22.75,-158,181,1,8,726,,0.4170,0.0417,14.05,3.26,221.0,True
2,1994-02-17,22.75,-158,181,2,8,726,,0.2302,0.0230,8.89,2.19,240.0,True
3,1994-02-17,22.75,-158,181,3,8,726,,0.3738,0.0374,14.21,3.24,2331.0,True
4,1994-02-17,22.75,-158,181,4,8,726,,0.2749,0.0302,11.81,2.66,5595.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9343,2022-09-02,22.75,-158,241,1,1640,135,,2.9467,0.4162,135.35,34.69,,False
9344,2022-09-02,22.75,-158,241,2,1640,135,,2.1526,0.3467,118.38,30.20,,False
9345,2022-09-02,22.75,-158,241,3,1640,135,,2.5806,0.4098,143.47,35.31,,False
9346,2022-09-02,22.75,-158,241,4,1640,135,,1.8745,0.3151,116.69,27.91,,False


### Depth


In [34]:
# Distribution of the sampling depths before they are binned.
depth_in_m = raw_data["depth"].astype(float)
depth_in_m.plot.hist(nbins=100, title="Depth", labels={"value": "Depth (m)"})

Then aggregate the data into 50 m depth bins, each labelled by its upper bound.


In [35]:
# group depth by 50m bins
raw_data["depth"] = pd.cut(
    raw_data["depth"],
    bins=[0, 50, 100, 150, 200, 250, 300, 350, 400],
    labels=[50, 100, 150, 200, 250, 300, 350, 400],
    right=False,
)

In [36]:
# Same histogram after binning: the values are now the bin labels.
binned_depth = raw_data["depth"].astype(float)
binned_depth.plot.hist(title="Depth", labels={"value": "Depth (m)"})

## Produce preprocessed data

---


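Average over tows: the `tow` column is dropped and the mean is taken over rows that share the same time, day/night flag, position, depth bin and fraction. Use the mesh size of each fraction (in mm) rather than the fraction flag to represent the size of the zooplankton.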


In [40]:
# Show that these columns alone do not identify a row uniquely:
# `verify_integrity=True` makes `set_index` raise on duplicated keys, and the
# error message is printed instead of stopping the notebook.
candidate_keys = ["time", "is_day", "depth", "lat", "lon", "frac"]
try:
    raw_data.set_index(candidate_keys, verify_integrity=True)
except ValueError as duplicate_error:
    print(duplicate_error)

Index has duplicate keys: MultiIndex([('1994-03-10', False, 200, 22.75, -158, 0),
            ('1994-03-10', False, 200, 22.75, -158, 1),
            ('1994-03-10', False, 200, 22.75, -158, 2),
            ('1994-03-10', False, 200, 22.75, -158, 3),
            ('1994-03-10', False, 200, 22.75, -158, 4),
            ('1994-03-10', False, 200, 22.75, -158, 5),
            ('1994-06-19', False, 200, 22.75, -158, 0),
            ('1994-06-19', False, 200, 22.75, -158, 1),
            ('1994-06-19', False, 200, 22.75, -158, 2),
            ('1994-06-19', False, 200, 22.75, -158, 3),
            ...
            ('2022-09-01', False, 250, 22.75, -158, 2),
            ('2022-09-01', False, 250, 22.75, -158, 3),
            ('2022-09-01', False, 250, 22.75, -158, 4),
            ('2022-09-01', False, 250, 22.75, -158, 5),
            ('2022-09-02',  True, 200, 22.75, -158, 0),
            ('2022-09-02',  True, 200, 22.75, -158, 1),
            ('2022-09-02',  True, 200, 22.75, -158, 2),
      

In [38]:
# `tow` is not kept as a variable: the rows are averaged per group afterwards.
raw_data = raw_data.drop("tow", axis="columns")

In [41]:
# Average the rows that share the same grouping keys and convert the result to
# an xarray Dataset indexed by those keys.
group_keys = ["time", "is_day", "lat", "lon", "depth", "frac"]
preprocessed_data = xr.Dataset.from_dataframe(raw_data.groupby(group_keys).mean())

# Discard fraction 5, then replace the remaining fraction codes by the mesh
# size in mm.
without_frac_5 = preprocessed_data.where(preprocessed_data.frac != 5, drop=True)
preprocessed_data = without_frac_5.assign_coords({"frac": [0.2, 0.5, 1, 2, 5]})

# Attach the metadata row of each data variable as its attributes.
for var_name in preprocessed_data:
    metadata_row = raw_metadata.loc[var_name].to_dict()
    # Attribute keys are lower-cased ...
    var_attrs = {key.lower(): value for key, value in metadata_row.items()}
    # ... and the "unit" key is renamed to "units".
    var_attrs["units"] = var_attrs.pop("unit")
    preprocessed_data[var_name].attrs = var_attrs

# Attributes of the coordinates, applied in one pass below.
coordinate_attrs = {
    "time": {
        "standard_name": "time",
        "long_name": "time",
        "axis": "T",
    },
    "is_day": {
        "flag_values": f"{[True, False]}",
        "flag_meanings": "day night",
        "standard_name": "is_day",
        "long_name": "Is day",
        "description": "Flag to indicate if the time is during the day or night",
    },
    "lat": {
        "standard_name": "latitude",
        "long_name": "latitude",
        "axis": "Y",
        "units": "degrees_north",
    },
    "lon": {
        "standard_name": "longitude",
        "long_name": "longitude",
        "axis": "X",
        "units": "degrees_east",
    },
    "depth": {
        "standard_name": "depth",
        "long_name": "depth",
        "axis": "Z",
        "units": "m",
    },
    "frac": {
        "standard_name": "fraction",
        "long_name": "fraction",
        "units": "mm",
    },
}
for coord_name, coord_attrs in coordinate_attrs.items():
    preprocessed_data[coord_name].attrs = coord_attrs

preprocessed_data





## Final plot

---


In [42]:
# Box plot of every column after averaging per time, position and depth bin.
# NOTE(review): `frac` and `is_day` are not grouping keys here, so they are
# averaged and plotted like the measurements — confirm this is intended.
profile_means = raw_data.groupby(["time", "lat", "lon", "depth"]).mean()
profile_means.plot.box()





## Export preprocessed data

---


In [43]:
# Check whether pint can interpret the "units" attributes. The quantified
# dataset is discarded; a failure is only reported and the notebook carries on.
try:
    preprocessed_data.pint.quantify()
except Exception as quantify_error:
    print(quantify_error)
    print("Some units cannot be quantified and are only here for information.")


Some units cannot be quantified and are only here for information.


In [45]:
# Write the dataset to the processed-data folder. `mode="w"` overwrites a store
# left by a previous run; xarray's default mode ("w-") raises when the store
# already exists, which breaks re-running the notebook from a fresh kernel.
preprocessed_data.to_zarr("../../2_processed/hot_zooplankton.zarr", mode="w")

<xarray.backends.zarr.ZarrStore at 0x301fe23c0>