# Zooplankton data at station Papa


In [40]:
import pandas as pd
import xarray as xr
import numpy as np
import cf_xarray.units  # noqa: F401
import pint_xarray  # noqa: F401
import seaborn as sns
import plotly.express as px

# Route every pandas `.plot(...)` call in this notebook through plotly.
pd.set_option("plotting.backend", "plotly")

## Load raw data

---


Observation data is zooplankton biomass dry weight in mg/m3 (wet weights for the period 1850/1980), as described in the official technical report.


In [41]:
# The raw export is semicolon-separated and writes numbers with a decimal comma.
raw_data = pd.read_csv(
    "../../1_raw/papa_zooplankton.csv",
    decimal=",",
    sep=";",
)
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1077 entries, 0 to 1076
Columns: 114 entries, Key to XXXX >> Remainder
dtypes: float64(95), int64(9), object(10)
memory usage: 959.3+ KB


In [42]:
# Parse the sampling date once and keep it on the raw frame as a `time` column.
raw_data["time"] = pd.to_datetime(raw_data["Date"], format="%d %m %Y")

# Sampling metadata columns, listed in output order, with the dtype each must have.
index_dtypes = {
    "time": "datetime64[ns]",
    "lat": "float64",
    "lon": "float64",
    "Twilight": "category",
    "Mesh_Size(um)": "float64",
    "DEPTH_STRT": "float64",
    "DEPTH_END": "float64",
    "Volume Filtered(m3)": "float64",
}
index = raw_data[list(index_dtypes)].astype(index_dtypes)
index.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1077 entries, 0 to 1076
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   time                 1077 non-null   datetime64[ns]
 1   lat                  1077 non-null   float64       
 2   lon                  1077 non-null   float64       
 3   Twilight             1077 non-null   category      
 4   Mesh_Size(um)        1077 non-null   float64       
 5   DEPTH_STRT           1077 non-null   float64       
 6   DEPTH_END            1077 non-null   float64       
 7   Volume Filtered(m3)  1077 non-null   float64       
dtypes: category(1), datetime64[ns](1), float64(6)
memory usage: 60.2 KB


In [43]:
# Taxon biomass columns start at position 20 of the raw table and run to its end.
# The helper `time` column (appended to `raw_data` when the index is built) is
# dropped by name rather than with a positional `-1`, so this cell selects the same
# taxon columns whether or not `time` has been added yet.
# Zeros are kept as measured values; they are not converted to NaN.
data = (
    raw_data.drop(columns="time", errors="ignore")
    .iloc[:, 20:]
    .astype("float64")
)
data

Unnamed: 0,ANNE:POLY: >> POLY larvae s1,ANNE:POLY: >> POLY s1,ANNE:POLY: >> POLY s2,ANNE:POLY: >> POLY s3,ARCR:: >> CIRRI s1,ARCR:: >> CRUST larvae s1,ARCR:AMPH:GAMM >> GAMM s1,ARCR:AMPH:GAMM >> GAMM s2,ARCR:AMPH:GAMM >> GAMM s3,ARCR:AMPH:HYPE >> HYPER s1,...,UROC:THAL: >> DOLIO s2,UROC:THAL: >> DOLIO s3,UROC:THAL: >> SALP s1,UROC:THAL: >> SALP s2,UROC:THAL: >> SALP s3,VERT:PISC: >> PISCES egg s1,VERT:PISC: >> PISCES s1,VERT:PISC: >> PISCES s2,VERT:PISC: >> PISCES s3,XXXX >> Remainder
0,0.0,0.0,0.00000,0.00000,0.00000,0.0,0.0,0.00000,0.000,0.00000,...,0.00000,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.00000,0.07819,0.00000
1,0.0,0.0,0.00000,0.00000,0.00000,0.0,0.0,0.00000,0.000,0.00000,...,0.00000,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.00000,4.22832,0.00000
2,0.0,0.0,0.00000,0.00000,0.00000,0.0,0.0,0.05811,0.000,0.00000,...,0.00000,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.00000,0.00000,0.00000
3,0.0,0.0,0.00000,0.25838,0.00000,0.0,0.0,0.00000,0.000,0.00000,...,0.00000,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.94146,0.00000,0.00000
4,0.0,0.0,0.00000,0.00000,0.00000,0.0,0.0,0.18938,0.000,0.00000,...,0.00000,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.33458,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1072,0.0,0.0,0.00568,0.00000,0.01070,0.0,0.0,0.00000,0.000,0.19212,...,0.01070,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.24398,1.27507,0.00000
1073,0.0,0.0,0.00000,0.05045,0.00000,0.0,0.0,0.00000,0.000,0.90018,...,0.00000,0.00000,0.0,0.00000,0.13353,0.0,0.00000,0.00000,0.00000,0.00000
1074,0.0,0.0,0.04762,0.41514,0.00000,0.0,0.0,0.00000,0.087,0.00000,...,4.41514,1.53846,0.0,0.00000,0.00000,0.0,0.02365,0.27778,3.73779,0.00000
1075,0.0,0.0,0.00000,0.00000,0.00000,0.0,0.0,0.00000,0.000,0.12242,...,0.00000,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.00000,0.00000,0.02066


In [44]:
# Attach the sampling metadata to the taxon biomass columns, row by row.
# `data` is overwritten by its own merge, so any metadata columns already present
# (i.e. when this cell is re-run) are dropped first; otherwise the second run would
# produce duplicated `_x` / `_y` columns.
data = pd.merge(
    index,
    data.drop(columns=index.columns, errors="ignore"),
    left_index=True,
    right_index=True,
)
data

Unnamed: 0,time,lat,lon,Twilight,Mesh_Size(um),DEPTH_STRT,DEPTH_END,Volume Filtered(m3),ANNE:POLY: >> POLY larvae s1,ANNE:POLY: >> POLY s1,...,UROC:THAL: >> DOLIO s2,UROC:THAL: >> DOLIO s3,UROC:THAL: >> SALP s1,UROC:THAL: >> SALP s2,UROC:THAL: >> SALP s3,VERT:PISC: >> PISCES egg s1,VERT:PISC: >> PISCES s1,VERT:PISC: >> PISCES s2,VERT:PISC: >> PISCES s3,XXXX >> Remainder
0,1995-09-24,49.820,-128.563,Daylight,236.0,246.0,0.0,86.33,0.0,0.0,...,0.00000,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.00000,0.07819,0.00000
1,1995-09-24,49.820,-128.563,Night,236.0,50.0,0.0,36.09,0.0,0.0,...,0.00000,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.00000,4.22832,0.00000
2,1995-09-25,49.560,-128.259,Daylight,236.0,247.0,0.0,113.58,0.0,0.0,...,0.00000,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.00000,0.00000,0.00000
3,1995-09-25,49.560,-128.259,Night,236.0,49.0,0.0,24.77,0.0,0.0,...,0.00000,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.94146,0.00000,0.00000
4,1995-09-25,49.670,-128.075,Night,236.0,235.0,0.0,69.70,0.0,0.0,...,0.00000,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.33458,0.00000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1072,2020-08-17,49.283,-134.667,Night,236.0,250.0,0.0,59.84,0.0,0.0,...,0.01070,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.24398,1.27507,0.00000
1073,2020-08-18,49.569,-138.666,Night,236.0,250.0,0.0,67.40,0.0,0.0,...,0.00000,0.00000,0.0,0.00000,0.13353,0.0,0.00000,0.00000,0.00000,0.00000
1074,2020-08-20,50.000,-145.000,Night,236.0,250.0,0.0,65.52,0.0,0.0,...,4.41514,1.53846,0.0,0.00000,0.00000,0.0,0.02365,0.27778,3.73779,0.00000
1075,2020-08-30,50.463,-129.916,Daylight,236.0,250.0,0.0,61.95,0.0,0.0,...,0.00000,0.00000,0.0,0.00000,0.00000,0.0,0.00000,0.00000,0.00000,0.02066


## Define taxa groups

---


In [45]:
# Output group -> substrings looked up in the raw taxon column names.
# A raw column belongs to a group when its name contains one of the substrings.
taxa_groups = dict(
    benthos=["ANNE:POLY: >> POLY larvae s1"],
    crustacean=["ARCR"],
    chaetognatha=["CHAE"],
    small_gelatinous=["CNID", "CTEN"],
    larvacean=["LARV"],
    thaliacea=["THAL"],
    others=["ECHI", "ECTO", "MOCE", "MOGA", "MOLL", "PROT", "PISC", "XXXX"],
)

## Clean data

---


Remove the data when time and position are not known.


In [46]:
# Bar chart of the number of missing values in each raw column.
missing_per_column = raw_data.isna().sum()
fig = missing_per_column.plot(
    kind="bar",
    title=f"Missing values per column for a total of {len(raw_data)} entries",
    labels={"index": "Column", "value": "Number of missing values"},
    log_y=True,  # logarithmic y-axis
)
# Tilt the column names so they stay readable.
fig.update_xaxes(tickangle=-45)
# Fixed figure height of 500 px, and no legend (there is a single series).
fig.update_layout(height=500, showlegend=False)
fig.show()

## Manage index

---


### Time

Must be set to daily frequency. If multiple data points are available for a single day, the mean is taken.


In [47]:
# Distribution of the samples over time, with a box plot in the margin.
px.histogram(
    index,
    x="time",
    marginal="box",
    title="Number of entries per time",
    labels={"time": "Time", "count": "Number of entries"},
)

In [48]:
# Number of samples per `Twilight` category.
# NOTE(review): the `index` / `value` label keys look copied from another plot;
# confirm they affect the axis titles when x="Twilight".
index.plot(
    kind="hist",
    x="Twilight",
    title="Number of entries during day/night",
    labels={"index": "Time", "value": "Number of entries"},
)

### Space


In [49]:
# Distribution of the samples over latitude, with a box plot in the margin.
px.histogram(
    index,
    x="lat",
    marginal="box",
    title="Number of entries on latitude",
    labels={"lat": "Latitude", "count": "Number of entries"},
)


In [50]:
# Distribution of the samples over longitude, with a box plot in the margin.
px.histogram(
    index,
    x="lon",
    marginal="box",
    title="Number of entries on longitude",
    labels={"lon": "Longitude", "count": "Number of entries"},
)


### Depth


In [51]:
# Distribution of the samples over DEPTH_STRT, with a violin plot in the margin.
px.histogram(
    index,
    x="DEPTH_STRT",
    marginal="violin",
    title="Number of entries on depth",
    labels={"DEPTH_STRT": "Depth", "count": "Number of entries"},
)


### Volume filtered


In [52]:
# Distribution of the samples over the filtered water volume, with a box plot in the margin.
px.histogram(
    index,
    x="Volume Filtered(m3)",
    marginal="box",
    title="Number of entries by volume filtered",
    labels=dict(
        [
            ("Volume Filtered(m3)", "Volume filtered (m3)"),
            ("count", "Number of entries"),
        ]
    ),
)


### Zero values


In [53]:
# Per sample, the number of taxon columns with a strictly positive biomass.
# (Despite its name, `nb_zeros` holds the count of NON-zero values.)
# All sampling metadata columns (those of `index`) are excluded by name: a
# positional `iloc[:, 6:]` also counted DEPTH_END and Volume Filtered(m3),
# which are not taxa.
nb_zeros = data.drop(columns=index.columns).gt(0).sum(axis=1)

In [54]:
# Non-zero taxon counts (`nb_zeros`) plotted against the tow start depth.
px.histogram(
    x=data["DEPTH_STRT"],
    y=nb_zeros,
    nbins=100,
    title="Number of non-zero values by depth",
    labels={"x": "Depth (m)", "y": "Number of non-zero values"},
)

In [55]:
# Non-zero taxon counts (`nb_zeros`) plotted against the filtered water volume.
px.histogram(
    x=data["Volume Filtered(m3)"],
    y=nb_zeros,
    nbins=100,
    title="Number of non-zero values by volume filtered",
    labels={"x": "Volume filtered (m3)", "y": "Number of non-zero values"},
)


## Aggregate by taxa group

---


In [56]:
# Show the group -> column-name-substring mapping used for the aggregation below.
taxa_groups

{'benthos': ['ANNE:POLY: >> POLY larvae s1'],
 'crustacean': ['ARCR'],
 'chaetognatha': ['CHAE'],
 'small_gelatinous': ['CNID', 'CTEN'],
 'larvacean': ['LARV'],
 'thaliacea': ['THAL'],
 'others': ['ECHI', 'ECTO', 'MOCE', 'MOGA', 'MOLL', 'PROT', 'PISC', 'XXXX']}

In [57]:
# For every taxa group, list the columns of `data` whose name contains one of the
# group's substrings. Columns are collected pattern by pattern, in column order.
# `dict.fromkeys` removes repeats while keeping that order, so a column that
# matches two patterns of the same group cannot be summed twice later on.
res = {
    group: list(
        dict.fromkeys(
            column
            for pattern in patterns
            for column in data.columns
            if pattern in column
        )
    )
    for group, patterns in taxa_groups.items()
}

In [58]:
# One biomass column per taxa group: the row-wise sum of the group's raw columns.
group_biomass = pd.DataFrame(
    {group: data[columns].sum(axis=1) for group, columns in res.items()}
)

# Join the group sums to the sampling metadata, switch to short snake_case names
# and drop the tow end depth.
final_data = (
    index.merge(group_biomass, left_index=True, right_index=True)
    .rename(
        columns={
            "Twilight": "is_day",
            "Mesh_Size(um)": "mesh_size",
            "Volume Filtered(m3)": "volume_filtered",
            "DEPTH_STRT": "depth",
        }
    )
    .drop(columns="DEPTH_END")
)

# Total biomass is the sum over all taxa groups.
final_data["total"] = final_data[list(taxa_groups)].sum(axis=1)
final_data

Unnamed: 0,time,lat,lon,is_day,mesh_size,depth,volume_filtered,benthos,crustacean,chaetognatha,small_gelatinous,larvacean,thaliacea,others,total
0,1995-09-24,49.820,-128.563,Daylight,236.0,246.0,86.33,0.0,14.24295,2.54984,1.93420,0.03966,0.00000,1.32172,20.08837
1,1995-09-24,49.820,-128.563,Night,236.0,50.0,36.09,0.0,29.86960,2.66833,6.63730,0.07093,0.00000,4.31805,43.56421
2,1995-09-25,49.560,-128.259,Daylight,236.0,247.0,113.58,0.0,6.69830,3.16646,0.00000,0.00563,0.00000,0.16840,10.03879
3,1995-09-25,49.560,-128.259,Night,236.0,49.0,24.77,0.0,36.23745,1.39943,2.21558,0.00000,0.00000,1.52656,41.37902
4,1995-09-25,49.670,-128.075,Night,236.0,235.0,69.70,0.0,10.62110,3.38267,0.76872,0.01240,0.00000,1.03913,15.82402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1072,2020-08-17,49.283,-134.667,Night,236.0,250.0,59.84,0.0,27.80584,3.73822,4.67863,0.00000,0.01070,2.24733,38.48072
1073,2020-08-18,49.569,-138.666,Night,236.0,250.0,67.40,0.0,30.44516,8.03965,3.21958,0.00000,0.13353,1.33575,43.17367
1074,2020-08-20,50.000,-145.000,Night,236.0,250.0,65.52,0.0,26.65790,5.02856,3.90876,0.15629,9.00122,5.08919,49.84192
1075,2020-08-30,50.463,-129.916,Daylight,236.0,250.0,61.95,0.0,6.76577,3.16848,0.50268,0.63018,0.00000,6.28915,17.35626


## Aggregate by depth and space

---


In [59]:
# Bin depth, longitude and latitude onto a regular grid.
# Every interval is right-closed, (edge_i, edge_i+1]; values outside the edges
# become NaN. Depth bins are labelled by their lower bound in depth (the deeper
# edge); lon/lat bins are labelled by the centre of their 1-degree cell.
depth_edges = [0, 50, 100, 150, 200, 250, 300, 350, 400]
lon_edges = np.arange(-158, -126, 1)
lat_edges = np.arange(46, 59, 1)

# The bins are computed from the untouched columns of `index` (which shares its
# row index with `final_data`) instead of from `final_data` itself, so re-running
# this cell always gives the same result rather than re-cutting already-binned
# categories.
final_data = final_data.assign(
    depth=pd.cut(
        index["DEPTH_STRT"],
        bins=depth_edges,
        labels=depth_edges[1:],
        right=True,
    ),
    lon=pd.cut(
        index["lon"],
        bins=lon_edges,
        labels=lon_edges[:-1] + 0.5,
        right=True,
    ),
    lat=pd.cut(
        index["lat"],
        bins=lat_edges,
        labels=lat_edges[:-1] + 0.5,
        right=True,
    ),
)

final_data

Unnamed: 0,time,lat,lon,is_day,mesh_size,depth,volume_filtered,benthos,crustacean,chaetognatha,small_gelatinous,larvacean,thaliacea,others,total
0,1995-09-24,49.5,-128.5,Daylight,236.0,250,86.33,0.0,14.24295,2.54984,1.93420,0.03966,0.00000,1.32172,20.08837
1,1995-09-24,49.5,-128.5,Night,236.0,50,36.09,0.0,29.86960,2.66833,6.63730,0.07093,0.00000,4.31805,43.56421
2,1995-09-25,49.5,-128.5,Daylight,236.0,250,113.58,0.0,6.69830,3.16646,0.00000,0.00563,0.00000,0.16840,10.03879
3,1995-09-25,49.5,-128.5,Night,236.0,50,24.77,0.0,36.23745,1.39943,2.21558,0.00000,0.00000,1.52656,41.37902
4,1995-09-25,49.5,-128.5,Night,236.0,250,69.70,0.0,10.62110,3.38267,0.76872,0.01240,0.00000,1.03913,15.82402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1072,2020-08-17,49.5,-134.5,Night,236.0,250,59.84,0.0,27.80584,3.73822,4.67863,0.00000,0.01070,2.24733,38.48072
1073,2020-08-18,49.5,-138.5,Night,236.0,250,67.40,0.0,30.44516,8.03965,3.21958,0.00000,0.13353,1.33575,43.17367
1074,2020-08-20,49.5,-145.5,Night,236.0,250,65.52,0.0,26.65790,5.02856,3.90876,0.15629,9.00122,5.08919,49.84192
1075,2020-08-30,50.5,-129.5,Daylight,236.0,250,61.95,0.0,6.76577,3.16848,0.50268,0.63018,0.00000,6.28915,17.35626


## Produce preprocessed data

---


Group by tow and use the mean values. Use the size of the frac rather than the flag to represent the size of the zooplankton.


In [60]:
# This shows that the index has duplicate keys when the data is not grouped by tow:
# `verify_integrity=True` makes `set_index` raise on duplicates, and the message
# is printed instead of stopping the notebook.
tow_keys = ["time", "is_day", "depth", "lat", "lon"]

try:
    final_data.set_index(tow_keys, verify_integrity=True)
except ValueError as duplicate_error:
    print(duplicate_error)

Index has duplicate keys: MultiIndex([('1995-09-25',    'Night',  50, 49.5, -128.5),
            ('1997-02-21',    'Night', 150, 49.5, -145.5),
            ('1997-03-29', 'Daylight', 150, 49.5, -141.5),
            ('1997-03-29', 'Daylight', 200, 49.5, -141.5),
            ('1997-06-16', 'Daylight', 150, 49.5, -145.5),
            ('1997-07-03', 'Daylight', 150, 52.5, -149.5),
            ('1997-07-05', 'Daylight', 150, 50.5, -145.5),
            ('1997-07-06', 'Daylight', 150, 50.5, -144.5),
            ('1997-07-07', 'Daylight', 150, 51.5, -145.5),
            ('1997-07-08', 'Daylight', 150, 52.5, -144.5),
            ...
            ('2017-07-26',    'Night', 250, 49.5, -132.5),
            ('2017-07-26', 'Daylight', 250, 50.5, -130.5),
            ('2017-07-27',    'Night', 250, 50.5, -130.5),
            ('2017-07-28', 'Daylight', 250, 50.5, -130.5),
            ('2017-09-06', 'Daylight', 250, 50.5, -129.5),
            ('2018-05-13', 'Daylight', 250, 49.5, -128.5),
            ('

In [61]:
# Boolean day flag: True for "Daylight" samples, False for everything else.
# It is derived from the original `Twilight` column of `index` (same row index as
# `final_data`) so the cell can be re-run; comparing the already-converted boolean
# column with "Daylight" again would turn every value into False.
final_data["is_day"] = index["Twilight"] == "Daylight"

In [62]:
# Average all samples that fall into the same (time, is_day, lat, lon, depth) cell.
# lat / lon / depth are categorical after binning with `pd.cut`; `observed=False`
# is spelled out so that every bin stays on the output grid and the result does
# not depend on the pandas default (recent pandas versions warn that the default
# is changing).
cell_keys = ["time", "is_day", "lat", "lon", "depth"]
cell_means = final_data.groupby(cell_keys, observed=False).mean()

preprocessed_data = xr.Dataset.from_dataframe(cell_means).rename(
    {"lat": "latitude", "lon": "longitude"}
)

# Metadata of each biomass variable (one per taxa group, plus the total).
variable_attrs = {
    var: {
        "standard_name": var,
        "long_name": f"dry weight of {var} zooplankton",
        "units": "mg/m3",
    }
    for var in [*taxa_groups, "total"]
}

# Metadata of the coordinates and of the two sampling descriptors.
variable_attrs.update(
    {
        "time": {
            "standard_name": "time",
            "long_name": "time",
            "axis": "T",
        },
        "is_day": {
            "flag_values": "[True, False]",
            "flag_meanings": "day night",
            "standard_name": "is_day",
            "long_name": "Is day",
            "description": "Flag to indicate if the time is during the day or night",
        },
        "latitude": {
            "standard_name": "latitude",
            "long_name": "latitude",
            "axis": "Y",
            "units": "degrees_north",
        },
        "longitude": {
            "standard_name": "longitude",
            "long_name": "longitude",
            "axis": "X",
            "units": "degrees_east",
        },
        "depth": {
            "standard_name": "depth",
            "long_name": "depth",
            "axis": "Z",
            "units": "m",
        },
        "volume_filtered": {
            "standard_name": "volume",
            "long_name": "filtered volume of water",
            "units": "m3",
        },
        "mesh_size": {
            "standard_name": "mesh_size",
            "long_name": "net mesh size",
            "units": "um",
        },
    }
)

for name, attrs in variable_attrs.items():
    preprocessed_data[name].attrs = attrs

preprocessed_data





## Final plot

---


In [65]:
# Box plot of the per-cell means.
# `observed=True` keeps only the category combinations that actually occur. The
# unobserved combinations would be all-NaN rows that `dropna()` removes anyway, so
# the plotted data is the same without building the full cross product of bins.
(
    final_data.groupby(["time", "is_day", "lat", "lon", "depth"], observed=True)
    .mean()
    .dropna()
    .plot.box()
)





## Export preprocessed data

---


In [66]:
# Best-effort check that the `units` attributes can be parsed by pint.
# The quantified dataset is discarded; a failure is only reported, never raised.
try:
    preprocessed_data.pint.quantify()
except Exception as unit_error:
    print(
        unit_error,
        "Some units cannot be quantified and are only here for information.",
        sep="\n",
    )

In [68]:
# Write the preprocessed dataset to the processed-data folder.
# mode="w" replaces an existing store; with the default mode the export raises as
# soon as the store already exists, which breaks a second "Run All".
preprocessed_data.to_zarr("../../2_processed/papa_zooplankton.zarr", mode="w")

<xarray.backends.zarr.ZarrStore at 0x131d37440>