# Zooplankton data in BATS


In [1]:
import pandas as pd
import numpy as np
import xarray as xr
import cf_xarray.units  # noqa: F401
import pint_xarray  # noqa: F401
import json

# Make every pandas `.plot` call in this notebook render through plotly
# (the keyword arguments used below, e.g. `nbins`, `labels`, `log_y`, are passed to that backend).
pd.options.plotting.backend = "plotly"

## Load raw data

---


In [2]:
# Measurement table; the first CSV column is used as the row index.
raw_data = pd.read_csv(
    "../../1_raw/bats_zooplankton.csv",
    index_col=0,
)
# Companion metadata table, read with the same index convention.
raw_metadata = pd.read_csv(
    "../../1_raw/bats_zooplankton_meta.csv",
    index_col=0,
)

In [3]:
# Index the metadata by variable name so it can be looked up with `.loc[name]`.
raw_metadata = raw_metadata.set_index("Variable")

# Columns that are not carried over into the processed dataset.
unused_columns = ["Cruise_ID", "time_out", "duration_minutes", "UNOLS"]

# Parse the timestamps, mark the sieve size as categorical, drop the rest.
raw_data = raw_data.assign(
    time=pd.to_datetime(raw_data["time"]),
    sieve_size=raw_data["sieve_size"].astype("category"),
).drop(columns=unused_columns)

## Clean data

---


In [4]:
# Number of missing values in each column, drawn as a bar chart.
missing_per_column = raw_data.isna().sum()
fig = missing_per_column.plot(
    kind="bar",
    title=f"Missing values per column for a total of {len(raw_data)} entries",
    labels={"index": "Column", "value": "Number of missing values"},
    log_y=True,  # logarithmic y-axis
)
# Tilt the column names by 45 degrees so they stay readable.
fig.update_xaxes(tickangle=-45)
# Fix the figure height to 500 px and hide the legend.
fig.update_layout(height=500, showlegend=False)
fig.show()

## Manage index

---


In [5]:
# `time_in` is read as an HHMM value: its text form is left-padded with
# zeros to four characters so that the first two characters are the hours
# and the remainder the minutes.
hhmm = raw_data["time_in"].astype(str).str.zfill(4)

# Build "HH:MM:00" strings and parse the whole column with a single
# vectorised call instead of one `pd.to_timedelta` call per row.
time_in_offset = pd.to_timedelta(hhmm.str[:2] + ":" + hhmm.str[2:] + ":00")

# Shift each timestamp by its time of day, then discard the helper column.
# NOTE: this cell consumes `time_in`, so re-running it without reloading
# `raw_data` raises a KeyError.
raw_data["time"] = raw_data["time"] + time_in_offset
raw_data = raw_data.drop(columns=["time_in"])

In [6]:
# Number of rows sharing each timestamp, in chronological order.
entries_per_time = raw_data["time"].value_counts().sort_index()
entries_per_time.plot(
    title="Number of entries per time",
    labels={"index": "Time", "value": "Number of entries"},
)

In [7]:
# Distribution of the recorded latitudes.
latitudes = raw_data["lat"].astype(float)
latitudes.plot.hist(nbins=100, title="Latitude")

In [8]:
# Replace every recorded latitude with one constant value
# (presumably the nominal station position -- TODO confirm).
# The scalar is assigned directly rather than through `np.full_like`, which
# would reuse the column's dtype and e.g. truncate 31.6 to 31 for an
# integer column.
raw_data["lat"] = 31.6

In [9]:
# Distribution of the recorded longitudes.
longitudes = raw_data["lon"].astype(float)
longitudes.plot.hist(nbins=100, title="Longitude")

In [10]:
# Replace every recorded longitude with one constant value
# (presumably the nominal station position -- TODO confirm).
# The scalar is assigned directly rather than through `np.full_like`, which
# would reuse the column's dtype and e.g. truncate -64.2 to -64 for an
# integer column.
raw_data["lon"] = -64.2

In [11]:
# Distribution of the sampling depths before any binning.
depths = raw_data["depth"].astype(float)
depths.plot.hist(nbins=100, title="Depth", labels={"value": "Depth (m)"})

Then aggregate the data into 50-meter depth bins.


In [12]:
# 50 m depth layers from 0 to 400 m; each layer is labelled by its upper edge.
depth_edges = list(range(0, 401, 50))
raw_data["depth"] = pd.cut(
    raw_data["depth"],
    bins=depth_edges,
    labels=depth_edges[1:],
    # Left-closed intervals: [0, 50), [50, 100), ..., [350, 400).
    # Depths outside these intervals (including exactly 400) become NaN.
    right=False,
)

In [13]:
# Distribution of the binned depths (the categorical labels are cast to numbers).
binned_depths = raw_data["depth"].astype(float)
binned_depths.plot.hist(title="Depth", labels={"value": "Depth (m)"})

## Produce preprocessed data

---


Group by tow and use the mean values. Use the size of the fraction (the sieve size), rather than the flag, to represent the size of the zooplankton.


In [14]:
# This shows that the candidate index contains duplicates:
# `verify_integrity=True` makes `set_index` raise a ValueError on duplicate
# keys, and the message is printed instead of stopping the notebook.
index_columns = ["time", "depth", "lat", "lon", "sieve_size"]
try:
    raw_data.set_index(index_columns, verify_integrity=True)
except ValueError as err:
    print(err)

In [15]:
raw_data = raw_data.assign(
    # The binned depth labels are categorical; turn them back into numbers.
    depth=raw_data["depth"].astype(float),
    # Convert to millimeters (the stored sieve size is divided by 1000).
    sieve_size=raw_data["sieve_size"].astype(float) / 1000,
)

In [16]:
# Use the full coordinate names in the output dataset.
raw_data = raw_data.rename(columns={"lat": "latitude", "lon": "longitude"})

# Average all rows that share the same coordinates, then turn the resulting
# MultiIndex levels into the dimensions of an xarray Dataset.
dimension_names = ["time", "depth", "latitude", "longitude", "sieve_size"]
coordinate_means = raw_data.groupby(dimension_names).mean()
preprocessed_data = xr.Dataset.from_dataframe(coordinate_means)

# Attach each data variable's metadata row as its attributes.
for var_name in preprocessed_data:
    # Attribute keys are stored in lowercase.
    var_attrs = {
        key.lower(): value
        for key, value in raw_metadata.loc[var_name].to_dict().items()
    }
    # The metadata key "unit" is stored under the name "units".
    var_attrs["units"] = var_attrs.pop("unit")
    preprocessed_data[var_name].attrs = var_attrs

# Attributes of the coordinates, keyed by coordinate name.
coordinate_attrs = {
    "time": {
        "standard_name": "time",
        "long_name": "time",
        "axis": "T",
    },
    "latitude": {
        "standard_name": "latitude",
        "long_name": "latitude",
        "axis": "Y",
        "units": "degrees_north",
    },
    "longitude": {
        "standard_name": "longitude",
        "long_name": "longitude",
        "axis": "X",
        "units": "degrees_east",
    },
    "depth": {
        "standard_name": "depth",
        "long_name": "depth",
        "axis": "Z",
        "units": "m",
    },
    "sieve_size": {
        "standard_name": "size",
        "long_name": "Sieve size",
        "units": "mm",
    },
}
for coord_name, coord_attrs in coordinate_attrs.items():
    preprocessed_data[coord_name].attrs = coord_attrs

preprocessed_data

## Final plot

---


In [17]:
# Average per time / position / depth, then box-plot every remaining column.
tow_means = raw_data.groupby(["time", "latitude", "longitude", "depth"]).mean()
tow_means.plot.box()

## Export preprocessed data

---


In [18]:
# Check whether the `units` attributes can be quantified by pint.
# The quantified copy is discarded; a failure is reported, not raised.
try:
    preprocessed_data.pint.quantify()
except Exception as err:
    print(
        err,
        "Some units cannot be quantified and are only here for information.",
        sep="\n",
    )

In [19]:
# Write the dataset as a Zarr store; mode="w" replaces an existing store.
zarr_path = "../../2_processed/bats_zooplankton.zarr"
preprocessed_data.to_zarr(zarr_path, mode="w")

<xarray.backends.zarr.ZarrStore at 0x12d391040>