In [21]:
import pandas as pd
import xarray as xr
import cf_xarray.units  # noqa: F401
import pint_xarray  # noqa: F401

# Route every pandas `.plot` call in this notebook through the plotly backend.
pd.options.plotting.backend = "plotly"

In [2]:
# Read the biomass sheet of the raw P26 zooplankton workbook, skipping the
# first row of the sheet.
data = pd.read_excel(
    io="../../1_raw/P26-zoo_1995-2020 major taxa by size.xlsx",
    sheet_name="P26 mg per m3_groups",
    skiprows=1,
)

We only need these columns:


In [3]:
# Names of the columns of interest: the leading descriptive columns first, then
# the per-group mg/m3 columns.
# NOTE(review): neither tuple is referenced by the later cells shown in this
# notebook, and the spreadsheet headers printed by the next cell contain a
# newline before the unit (e.g. "CRUST \n(mg/m3)"), so these strings would not
# match them as-is — confirm whether they are still needed.
header = ("date", "N/Day", "Net_Mouth_Dia(m)", "DEPTH_STRT", "Volume Filtered(m3)")
columns = (
    "CRUST (mg/m3)",
    "LARV (mg/m3)",
    "THAL(mg/m3)",
    "CHAETO(mg/m3)",
    "PTEROPOD(mg/m3)",
    "small GEL(mg/m3)",
    "FORAM(mg/m3)",
    "OTH(mg/m3)",
    "TOTAL non GEL(mg/m3)",
)

In [4]:
# Keep the first 5 columns and the last 9 columns (selected by position).
data = data.iloc[:, list(range(5)) + list(range(-9, 0))]
data.head(2)

Unnamed: 0,date,N/Day,Net_Mouth_Dia(m),DEPTH_STRT,Volume Filtered(m3),CRUST \n(mg/m3),LARV \n(mg/m3),THAL\n(mg/m3),CHAETO\n(mg/m3),PTEROPOD\n(mg/m3),small GEL\n(mg/m3),FORAM\n(mg/m3),OTH\n(mg/m3),TOTAL non GEL\n(mg/m3)
0,1997-02-21,N,0.56,123,64.79,5.75863,0.05149,0.0,0.27368,0.06297,0.51674,0.00519,0.00648,6.10695
1,1997-02-21,N,0.56,136,77.41,5.49039,0.32688,0.0,0.29913,0.07854,1.27374,0.0,0.32088,6.18894


Then we rename the columns to short names and keep the units in separate variables.


In [5]:
# Short names for the depth and per-group columns; the keys are the exact
# spreadsheet headers, including the newline before the unit.
column_aliases = {
    "DEPTH_STRT": "depth",
    "CRUST \n(mg/m3)": "crustaceans",
    "LARV \n(mg/m3)": "larvae",
    "THAL\n(mg/m3)": "thaliaceans",
    "CHAETO\n(mg/m3)": "chaetognaths",
    "PTEROPOD\n(mg/m3)": "pteropods",
    "small GEL\n(mg/m3)": "small_gelatinous",
    "FORAM\n(mg/m3)": "foram",
    "OTH\n(mg/m3)": "other",
    "TOTAL non GEL\n(mg/m3)": "total_non_gelatinous",
}
data = data.rename(columns=column_aliases)

# Units removed from the column names are kept as plain strings;
# `species_unit` is attached to the dataset variables later in the notebook.
species_unit = "mg/m3"
volum_unit = "m3"

In [6]:
# Preview the first two rows to check the renamed columns.
data.head(2)

Unnamed: 0,date,N/Day,Net_Mouth_Dia(m),depth,Volume Filtered(m3),crustaceans,larvae,thaliaceans,chaetognaths,pteropods,small_gelatinous,foram,other,total_non_gelatinous
0,1997-02-21,N,0.56,123,64.79,5.75863,0.05149,0.0,0.27368,0.06297,0.51674,0.00519,0.00648,6.10695
1,1997-02-21,N,0.56,136,77.41,5.49039,0.32688,0.0,0.29913,0.07854,1.27374,0.0,0.32088,6.18894


The net mouth diameter is not the same for every sample.


In [7]:
# Distribution of the net mouth diameter across samples. A title and an axis
# label are set so the figure can be read on its own, in the same way as the
# binned-depth histogram further down.
fig = data["Net_Mouth_Dia(m)"].plot(
    kind="hist",
    title="Net mouth diameter",
    labels=dict(value="Net mouth diameter (m)"),
)
fig.update_layout(width=600, height=300)

In [8]:
# Distribution of the filtered volume across samples. A title and an axis
# label are set so the figure can be read on its own, in the same way as the
# binned-depth histogram further down.
fig = data["Volume Filtered(m3)"].plot(
    kind="hist",
    title="Volume filtered",
    labels=dict(value="Volume filtered (m3)"),
)
fig.update_layout(width=600, height=300)

In [9]:
# The two sampling-gear columns were only needed for the histograms above;
# remove them before going further.
data = data.drop(["Net_Mouth_Dia(m)", "Volume Filtered(m3)"], axis="columns")
data.head(2)

Unnamed: 0,date,N/Day,depth,crustaceans,larvae,thaliaceans,chaetognaths,pteropods,small_gelatinous,foram,other,total_non_gelatinous
0,1997-02-21,N,123,5.75863,0.05149,0.0,0.27368,0.06297,0.51674,0.00519,0.00648,6.10695
1,1997-02-21,N,136,5.49039,0.32688,0.0,0.29913,0.07854,1.27374,0.0,0.32088,6.18894


---

Now we clean the data.


In [10]:
# Box plot of every column from position 3 onwards; per the preview above,
# positions 0-2 hold date, N/Day and depth, so this covers the per-group columns.
data.iloc[:, 3:].plot.box(title="Data distribution")

We see a lot of 0 values. Does this mean that the individuals are not in the region? Difficult to say, and even more difficult on a larger scale.


In [11]:
# Use "time" as the name of the date column.
data = data.rename({"date": "time"}, axis="columns")

In [12]:
papa_latitude = 50
papa_longitude = -150

# Every sample gets the same constant position.
# NOTE(review): Ocean Station Papa is commonly quoted at 50°N, 145°W — confirm
# that -150 is the intended longitude.
data = data.assign(latitude=papa_latitude, longitude=papa_longitude)

In [13]:
# Turn the "N/Day" column into a boolean: True where the value is "D",
# False for anything else, then give it an explicit name.
data["N/Day"] = data["N/Day"].eq("D")
data = data.rename(columns={"N/Day": "is_day"})
data.head(2)

Unnamed: 0,time,is_day,depth,crustaceans,larvae,thaliaceans,chaetognaths,pteropods,small_gelatinous,foram,other,total_non_gelatinous,latitude,longitude
0,1997-02-21,False,123,5.75863,0.05149,0.0,0.27368,0.06297,0.51674,0.00519,0.00648,6.10695,50,-150
1,1997-02-21,False,136,5.49039,0.32688,0.0,0.29913,0.07854,1.27374,0.0,0.32088,6.18894,50,-150


In [14]:
# Distribution of the raw depths before binning. A title and an axis label are
# set so the figure can be read on its own, like the binned-depth histogram below.
data["depth"].plot.hist(title="Depth (raw)", labels=dict(value="Depth (m)"))

In [15]:
# group depth by 50m bins
data["depth"] = pd.cut(
    data["depth"],
    bins=[0, 50, 100, 150, 200, 250, 300, 350, 400],
    labels=[50, 100, 150, 200, 250, 300, 350, 400],
    right=False,
)

In [16]:
# Histogram of the binned depths; the categorical bin labels are cast to float
# before plotting.
data["depth"].astype(float).plot.hist(title="Depth", labels=dict(value="Depth (m)"))

In [17]:
# Average the samples that share the same (time, is_day, latitude, longitude,
# depth bin) and convert the grouped frame into a Dataset indexed by those keys.
# `depth` is categorical after binning: `observed=False` is spelled out so the
# result does not depend on the pandas version (relying on the implicit default
# is deprecated and emits a FutureWarning).
preprocessed_data = xr.Dataset.from_dataframe(
    data.groupby(
        ["time", "is_day", "latitude", "longitude", "depth"], observed=False
    ).mean()
)
# Remove the depth bins for which every value is missing.
preprocessed_data = preprocessed_data.dropna("depth", how="all")





In [18]:
# Every data variable is expressed in the same unit.
for variable_name in preprocessed_data.data_vars:
    preprocessed_data[variable_name].attrs = {"units": species_unit}

In [19]:
# Descriptive metadata for each index of the dataset, applied in one pass.
coordinate_attrs = {
    "time": {
        "standard_name": "time",
        "long_name": "time",
        "axis": "T",
    },
    "is_day": {
        "flag_values": f"{[True, False]}",
        "flag_meanings": "day night",
        "standard_name": "is_day",
        "long_name": "Is day",
        "description": "Flag to indicate if the time is during the day or night",
    },
    "latitude": {
        "standard_name": "latitude",
        "long_name": "latitude",
        "axis": "Y",
        "units": "degrees_north",
    },
    "longitude": {
        "standard_name": "longitude",
        "long_name": "longitude",
        "axis": "X",
        "units": "degrees_east",
    },
    "depth": {
        "standard_name": "depth",
        "long_name": "depth",
        "axis": "Z",
        "units": "m",
    },
}
for coordinate_name, attrs in coordinate_attrs.items():
    preprocessed_data[coordinate_name].attrs = attrs

preprocessed_data

## Export preprocessed data

---


In [22]:
# Best-effort check that the declared units can be quantified; the quantified
# result is not kept and a failure is only reported, not raised.
try:
    preprocessed_data.pint.quantify()
except Exception as error:
    print(error)
    print("Some units cannot be quantified and are only here for information.")

In [23]:
# Write the dataset to the post-processed Zarr store. mode="w" replaces an
# existing store so the notebook can be re-run from a fresh kernel; without it
# xarray only creates the store and raises when the target already exists.
preprocessed_data.to_zarr("../../3_post_processed/papa_zooplankton.zarr", mode="w")

<xarray.backends.zarr.ZarrStore at 0x168bc8240>