# Zooplankton data in HOT


In [16]:
import pandas as pd
import xarray as xr
import cf_xarray.units  # noqa: F401
import pint_xarray  # noqa: F401

# Route every pandas `.plot` call in this notebook through plotly.
pd.set_option("plotting.backend", "plotly")

## Load raw data

---


In [17]:
# Both tables are read with their first CSV column as the row index.
raw_metadata = pd.read_csv(
    "../../1_raw/hot_zooplankton_meta.csv",
    index_col=0,
)
raw_data = pd.read_csv(
    "../../1_raw/hot_zooplankton.csv",
    index_col=0,
)

In [18]:
# Re-key the metadata by its "Variable" column so rows can be looked up by name.
raw_metadata = raw_metadata.set_index("Variable")

# Parse the timestamps and discard the cruise column in one chained expression.
raw_data = raw_data.assign(time=pd.to_datetime(raw_data["time"])).drop(
    columns=["cruise"]
)

## Clean data

---


Remove the data when time and position are not known.


In [19]:
# Count the NaNs in each column and draw the counts as bars.
missing_per_column = raw_data.isna().sum()
fig = missing_per_column.plot(
    kind="bar",
    title=f"Missing values per column for a total of {len(raw_data)} entries",
    labels={"index": "Column", "value": "Number of missing values"},
    log_y=True,  # logarithmic y-axis
)
# Tilt the x tick labels by 45 degrees
fig.update_xaxes(tickangle=-45)
# Fix the figure height at 500 and hide the legend
fig.update_layout(height=500, showlegend=False)
fig.show()

## Manage index

---


In [20]:
# Tally how many rows share each timestamp, ordered by time.
entries_per_time = raw_data["time"].value_counts().sort_index()
entries_per_time.plot(
    title="Number of entries per time",
    labels={"index": "Time", "value": "Number of entries"},
)

In [21]:
# Histogram of the depth column (cast to float) using 100 bins.
depth_values = raw_data["depth"].astype(float)
depth_values.plot.hist(
    nbins=100,
    title="Depth",
    labels={"value": "Depth (m)"},
)

Then aggregate the data into 50-meter depth bins.


In [22]:
# Bin depth into 50 m classes. Each class is left-closed ([lower, upper)) and
# is labelled with its upper edge.
depth_edges = [0, 50, 100, 150, 200, 250, 300, 350, 400]
raw_data["depth"] = pd.cut(
    raw_data["depth"],
    bins=depth_edges,
    labels=depth_edges[1:],
    right=False,
)

In [23]:
# Histogram of the depth column cast to float.
binned_depth = raw_data["depth"].astype(float)
binned_depth.plot.hist(title="Depth", labels={"value": "Depth (m)"})

## Produce preprocessed data

---


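Average over tows: the tow identifier is dropped and entries that share the same time, position, depth bin and size fraction are replaced by their mean. Use the mesh size of each fraction (in mm) rather than its flag to represent the size of the zooplankton.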


In [24]:
# This shows that the index contains duplicate keys when rows are not grouped by tow.
index_columns = ["time", "depth", "lat", "lon", "frac"]
try:
    raw_data.set_index(index_columns, verify_integrity=True)
except ValueError as err:
    print(err)

Index has duplicate keys: MultiIndex([('1996-10-01 10:00:00', 150, 22.75, -158, 0),
            ('1996-10-01 10:00:00', 150, 22.75, -158, 1),
            ('1996-10-01 10:00:00', 150, 22.75, -158, 2),
            ('1996-10-01 10:00:00', 150, 22.75, -158, 3),
            ('1996-10-01 10:00:00', 150, 22.75, -158, 4),
            ('1996-10-01 10:00:00', 150, 22.75, -158, 5)],
           names=['time', 'depth', 'lat', 'lon', 'frac'])


In [25]:
# Remove the tow column from the table.
raw_data = raw_data.drop(columns="tow")

In [38]:
# Average the rows that share the same (time, lat, lon, depth, frac) key and
# turn the resulting table into an xarray Dataset indexed by those keys.
grouped_mean = raw_data.groupby(["time", "lat", "lon", "depth", "frac"]).mean()
preprocessed_data = xr.Dataset.from_dataframe(grouped_mean)

# Drop fraction flag 5, then replace the remaining flags by the mesh size in mm.
# NOTE(review): the relabelling is positional and expects exactly five
# fractions to remain after the drop.
without_flag_5 = preprocessed_data.where(preprocessed_data.frac != 5, drop=True)
preprocessed_data = without_flag_5.assign_coords({"frac": [0.2, 0.5, 1, 2, 5]})

# Copy each variable's metadata row into its attrs.
for name in preprocessed_data.data_vars:
    # lowercase every metadata key
    meta = {key.lower(): value for key, value in raw_metadata.loc[name].to_dict().items()}
    # expose the "unit" entry under the key "units"
    meta["units"] = meta.pop("unit")
    preprocessed_data[name].attrs = meta

# Descriptive attributes for the coordinates.
coordinate_attrs = {
    "time": {
        "standard_name": "time",
        "long_name": "time",
        "axis": "T",
    },
    "lat": {
        "standard_name": "latitude",
        "long_name": "latitude",
        "axis": "Y",
        "units": "degrees_north",
    },
    "lon": {
        "standard_name": "longitude",
        "long_name": "longitude",
        "axis": "X",
        "units": "degrees_east",
    },
    "depth": {
        "standard_name": "depth",
        "long_name": "depth",
        "axis": "Z",
        "units": "m",
    },
    "frac": {
        "standard_name": "fraction",
        "long_name": "fraction",
        "units": "mm",
    },
}
for coord_name, coord_meta in coordinate_attrs.items():
    preprocessed_data[coord_name].attrs = coord_meta

preprocessed_data





## Final plot

---


In [39]:
# Box plot of every remaining column after averaging the rows that share the
# same time, position and depth bin.
mean_per_sample = raw_data.groupby(["time", "lat", "lon", "depth"]).mean()
mean_per_sample.plot.box()





## Export preprocessed data

---


In [40]:
# Try to attach pint units to the dataset. The quantified result is discarded,
# so this is only a check; a failure is printed instead of stopping the notebook.
try:
    preprocessed_data.pint.quantify()
except Exception as err:
    print(err)
    print("Some units cannot be quantified and are only here for information.")


Some units cannot be quantified and are only here for information.


In [42]:
# mode="w" replaces any store left by a previous run. The default mode ("w-")
# raises when the target already exists, which breaks re-running the notebook
# from a fresh kernel.
preprocessed_data.to_zarr("../../2_processed/hot_zooplankton.zarr", mode="w")

<xarray.backends.zarr.ZarrStore at 0x302f99ec0>