# Primary production data from HOT (Hawaii Ocean Time-series)


In [1]:
import pandas as pd
import xarray as xr
import cf_xarray.units  # noqa: F401
import pint_xarray  # noqa: F401

# Use plotly as the backend for every pandas `.plot` call in this notebook.
pd.options.plotting.backend = "plotly"

## Load raw data

---


In [2]:
# Measurements table; the first CSV column becomes the row index.
raw_data = pd.read_csv(
    "../../1_raw/hot_primary_production.csv",
    index_col=0,
)
# Companion metadata table, read the same way.
raw_metadata = pd.read_csv(
    "../../1_raw/hot_primary_production_meta.csv",
    index_col=0,
)

In [3]:
# Key the metadata by variable name so rows can be looked up with `.loc[name]`.
raw_metadata = raw_metadata.set_index("Variable")

# Parse `time` into datetimes, then drop the `cruise`, `stime`, `etime` and
# `itype` columns in a single chained expression.
dropped_columns = ["cruise", "stime", "etime", "itype"]
raw_data = raw_data.assign(time=pd.to_datetime(raw_data["time"])).drop(
    columns=dropped_columns
)

In [4]:
# Show each column's dtype and non-null count.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2589 entries, 0 to 2588
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   time    2589 non-null   datetime64[ns]
 1   lat     2589 non-null   float64       
 2   lon     2589 non-null   int64         
 3   depth   2589 non-null   int64         
 4   chl     2579 non-null   float64       
 5   phaeo   2579 non-null   float64       
 6   l12     2153 non-null   float64       
 7   d12     900 non-null    float64       
 8   bsal    2121 non-null   float64       
 9   pbact   1981 non-null   float64       
 10  hbact   1977 non-null   float64       
 11  sbact   1988 non-null   float64       
 12  ebact   1988 non-null   float64       
dtypes: datetime64[ns](1), float64(10), int64(2)
memory usage: 283.2 KB


In [5]:
# Summary statistics (count, mean, min, quartiles, max, std) per column.
raw_data.describe()

Unnamed: 0,time,lat,lon,depth,chl,phaeo,l12,d12,bsal,pbact,hbact,sbact,ebact
count,2589,2589.0,2589.0,2589.0,2579.0,2579.0,2153.0,900.0,2121.0,1981.0,1977.0,1988.0,1988.0
mean,2004-12-01 02:35:10.776361600,22.75,-158.0,85.522596,0.118511,0.169703,3.707033,0.131623,35.143407,126099.200909,382577.03743,1119.045775,952.511569
min,1988-10-31 00:00:00,22.75,-158.0,0.0,0.004,0.0,0.015,0.0,34.3871,15.0,60189.0,0.0,0.0
25%,1997-04-09 00:00:00,22.75,-158.0,25.0,0.069,0.065,0.94,0.07,35.0364,30709.0,266712.0,47.75,496.0
50%,2004-05-19 00:00:00,22.75,-158.0,75.0,0.101,0.116,3.53,0.116,35.1695,137536.0,392558.0,909.5,843.5
75%,2013-02-12 00:00:00,22.75,-158.0,125.0,0.158,0.238,5.953,0.173,35.2816,204649.0,479802.0,1699.25,1240.5
max,2022-09-01 00:00:00,22.75,-158.0,178.0,0.5,0.887,27.22,0.71,35.5255,369400.0,840648.0,9823.0,6670.0
std,,0.0,0.0,56.373355,0.069985,0.140999,2.943736,0.084598,0.185581,92719.850237,136839.259434,1212.666813,695.518793


In [6]:
# Preview the first rows of the table.
raw_data.head()

Unnamed: 0,time,lat,lon,depth,chl,phaeo,l12,d12,bsal,pbact,hbact,sbact,ebact
0,1988-10-31,22.75,-158,24,0.129,0.077,12.21,,,,,,
1,1988-10-31,22.75,-158,42,0.192,0.051,2.47,0.07,,,,,
2,1988-10-31,22.75,-158,68,0.285,0.303,2.58,0.06,,,,,
3,1988-10-31,22.75,-158,114,0.326,0.044,0.72,0.16,,,,,
4,1988-12-02,22.75,-158,0,0.202,0.0,,,,,,,


## Clean data

---


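Data for which time or position is not known should be removed. The plot below counts the missing values in each column to check whether any such rows exist.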


In [7]:
# Number of NaN entries in each column.
missing_per_column = raw_data.isna().sum()

fig = missing_per_column.plot(
    kind="bar",
    title=f"Missing values per column for a total of {len(raw_data)} entries",
    labels={"index": "Column", "value": "Number of missing values"},
)
# Tilt the x-axis tick labels by 45 degrees.
fig.update_xaxes(tickangle=-45)
# Set the figure height to 500 and hide the legend.
fig.update_layout(height=500, showlegend=False)
fig.show()

## Manage index

---


In [8]:
# Count the rows recorded for each timestamp, ordered chronologically.
entries_per_time = raw_data["time"].value_counts().sort_index()

entries_per_time.plot(
    title="Number of entries per time",
    labels={"index": "Time", "value": "Number of entries"},
)

In [9]:
# Distribution of the sampling depths, cast to float before plotting.
depth_values = raw_data["depth"].astype(float)

depth_values.plot.hist(
    nbins=100,
    title="Depth",
    labels={"value": "Depth (m)"},
)

Then aggregate the data into 50-meter depth bins.


In [10]:
# Group depth into 50 m bins. Bins are half-open on the right ([0, 50),
# [50, 100), ...) and each one is labelled with its upper edge.
#
# The column is overwritten in place, so the cell is guarded: once `depth` is
# categorical it has already been binned, and re-running the cell would bin
# the bin labels instead of the measured depths.
if not isinstance(raw_data["depth"].dtype, pd.CategoricalDtype):
    raw_data["depth"] = pd.cut(
        raw_data["depth"],
        bins=[0, 50, 100, 150, 200],
        labels=[50, 100, 150, 200],
        right=False,
    )

In [11]:
# Number of rows that fall in each depth bin.
rows_per_depth_bin = raw_data["depth"].astype(float).value_counts()
rows_per_depth_bin.plot(kind="bar")

## Produce preprocessed data

---


In [12]:
# Average every measurement that shares the same (time, lat, lon, depth) key
# and turn the result into an xarray Dataset with those four keys as dimensions.
#
# `depth` is categorical after binning. `observed=False` makes explicit the
# behaviour the groupby previously got from the pandas default, which is
# deprecated and scheduled to change; stating it keeps the result stable and
# avoids the FutureWarning.
preprocessed_data = xr.Dataset.from_dataframe(
    raw_data.groupby(["time", "lat", "lon", "depth"], observed=False).mean()
)

# Attach the metadata row of each data variable as its attributes.
for var in preprocessed_data:
    attrs = raw_metadata.loc[var].to_dict()
    # set all attrs keys to lowercase
    attrs = {k.lower(): v for k, v in attrs.items()}
    # rename the "unit" key to "units"
    attrs["units"] = attrs.pop("unit")
    preprocessed_data[var].attrs = attrs

# Describe the four coordinates (standard name, long name, axis, units).
preprocessed_data["time"].attrs = {
    "standard_name": "time",
    "long_name": "time",
    "axis": "T",
}
preprocessed_data["lat"].attrs = {
    "standard_name": "latitude",
    "long_name": "latitude",
    "axis": "Y",
    "units": "degrees_north",
}
preprocessed_data["lon"].attrs = {
    "standard_name": "longitude",
    "long_name": "longitude",
    "axis": "X",
    "units": "degrees_east",
}
preprocessed_data["depth"].attrs = {
    "standard_name": "depth",
    "long_name": "depth",
    "axis": "Z",
    "units": "m",
}

preprocessed_data





## Final plot

---


In [13]:
# Box plot of the per-(time, lat, lon, depth) means of every variable.
# `observed=False` states explicitly how the categorical `depth` key is
# grouped instead of relying on the deprecated pandas default.
raw_data.groupby(["time", "lat", "lon", "depth"], observed=False).mean().plot.box()





## Export preprocessed data

---


In [14]:
# Try to quantify the dataset's units with pint. The quantified result is
# discarded; a failure is only reported and does not stop the notebook.
try:
    preprocessed_data.pint.quantify()
except Exception as err:
    print(err)
    print("Some units cannot be quantified and are only here for information.")


Some units cannot be quantified and are only here for information.


In [15]:
# Write the dataset to a NetCDF file; mode "w" overwrites an existing file.
preprocessed_data.to_netcdf(
    path="../../2_processed/hot_primary_production.nc",
    mode="w",
)