# Bottle data in hot


In [81]:
import pandas as pd
import numpy as np
import xarray as xr
import cf_xarray.units  # noqa: F401
import pint_xarray  # noqa: F401
import json

pd.options.plotting.backend = "plotly"

## Load raw data

---


In [82]:
with open("../../1_raw/hot_zooplankton.json") as f:
    metadata = json.load(f)

In [83]:
INDEX = ["cruise_number", "date", "time", "size_fraction", "depth"]
DATA = list(set(metadata.keys()) - set(INDEX))
HEADER = list(metadata.keys())
DTYPE = {k: v["type"] for k, v in metadata.items()}
LATITUDE = 22.5
LONGITUDE = -158.0
SIZE_FRACTION = {0: 0.2, 1: 0.5, 2: 1, 3: 2, 4: 5}

In [84]:
# Set the data type of each column manualy
raw_data = pd.read_csv(
    "../../1_raw/hot_zooplankton.txt",
    skiprows=5,
    names=HEADER,
    dtype=DTYPE,
    # Only keep the first 12 columns
    usecols=HEADER,
)
raw_data.head()

Unnamed: 0,cruise_number,date,time,size_fraction,depth,zooplankton_dry_weight
0,52,21794,1108,0,181.0,0.0132
1,52,21794,1108,1,181.0,0.0417
2,52,21794,1108,2,181.0,0.023
3,52,21794,1108,3,181.0,0.0374
4,52,21794,1108,4,181.0,0.0302


In [85]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9348 entries, 0 to 9347
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   cruise_number           9348 non-null   int64  
 1   date                    9348 non-null   int64  
 2   time                    9348 non-null   int64  
 3   size_fraction           9348 non-null   int64  
 4   depth                   9348 non-null   float64
 5   zooplankton_dry_weight  9348 non-null   float64
dtypes: float64(2), int64(4)
memory usage: 438.3 KB


In [86]:
raw_data.describe()

Unnamed: 0,cruise_number,date,time,size_fraction,depth,zooplankton_dry_weight
count,9348.0,9348.0,9348.0,9348.0,9348.0,9348.0
mean,194.007702,67096.484596,1335.306162,2.5,165.889602,0.240679
std,83.201974,33703.129147,743.368368,1.707916,37.588684,0.951514
min,52.0,10797.0,0.0,0.0,9.0,-9.0
25%,120.0,40695.0,1005.0,1.0,140.0,0.114775
50%,193.0,70219.0,1238.5,2.5,167.0,0.20535
75%,265.0,100196.0,2204.0,4.0,192.0,0.3464
max,339.0,122207.0,2358.0,5.0,271.0,3.5814


## Clean data

---


Remove the data when time and position are not known.


In [87]:
fig = (
    raw_data.isna()
    .sum()
    .plot(
        kind="bar",
        title=f"Missing values per column for a total of {len(raw_data)} entries",
        labels=dict(
            index="Column",
            value="Number of missing values",
        ),
    )
)
# Rotate x-axis labels by 45 degrees
fig.update_xaxes(tickangle=-45)
# multiply the figure ratio of height by 2
fig.update_layout(height=500)
# Remove legend
fig.update_layout(showlegend=False)
fig.show()

## Manage date

---


In [88]:
raw_data["date"] = raw_data["date"].astype(str).apply(lambda x: x.zfill(6))
raw_data["time"] = raw_data["time"].astype(str).apply(lambda x: x.zfill(4))

# ---- #

raw_data["year"] = raw_data["date"].apply(
    lambda x: 1900 + int(x[-2:]) if int(x[-2:]) > 30 else 2000 + int(x[-2:])
)
raw_data["month"] = raw_data["date"].apply(lambda x: x.zfill(6)[:2])
raw_data["day"] = raw_data["date"].apply(lambda x: x[2:4])
raw_data["hour"] = raw_data["time"].apply(lambda x: x[:2])
raw_data["minute"] = raw_data["time"].apply(lambda x: x[2:])
raw_data["time"] = pd.to_datetime(raw_data[["year", "month", "day", "hour", "minute"]])
raw_data = raw_data.drop(
    columns=["year", "month", "day", "hour", "minute", "date", "cruise_number"]
)
raw_data.head()

Unnamed: 0,time,size_fraction,depth,zooplankton_dry_weight
0,1994-02-17 11:08:00,0,181.0,0.0132
1,1994-02-17 11:08:00,1,181.0,0.0417
2,1994-02-17 11:08:00,2,181.0,0.023
3,1994-02-17 11:08:00,3,181.0,0.0374
4,1994-02-17 11:08:00,4,181.0,0.0302


In [89]:
_ = raw_data.set_index(["time", "size_fraction", "depth"], verify_integrity=True)

In [95]:
raw_data["time"].plot.hist(nbins=100, title="Time")

In [92]:
raw_data["depth"].astype(float).plot.hist(
    nbins=100, title="Depth", labels=dict(value="Depth (m)")
)

## Produce preprocessed data

---


In [96]:
preprocessed_data = pd.DataFrame(
    {
        "time": raw_data["time"],
        "latitude": np.full(raw_data["time"].size, LATITUDE).astype(float),
        "longitude": np.full(raw_data["time"].size, LONGITUDE).astype(float),
        "depth": pd.cut(
            raw_data["depth"],
            bins=[0, 50, 100, 150, 200, 250, 300],
            include_lowest=True,
            labels=[50, 100, 150, 200, 250, 300],
        ).astype(float),
        **{k: raw_data[k] for k in DATA},
    }
)
preprocessed_data.head()

Unnamed: 0,time,latitude,longitude,depth,zooplankton_dry_weight
0,1994-02-17 11:08:00,22.5,-158.0,200.0,0.0132
1,1994-02-17 11:08:00,22.5,-158.0,200.0,0.0417
2,1994-02-17 11:08:00,22.5,-158.0,200.0,0.023
3,1994-02-17 11:08:00,22.5,-158.0,200.0,0.0374
4,1994-02-17 11:08:00,22.5,-158.0,200.0,0.0302


## Final plot

---


In [98]:
df_normalized = preprocessed_data.groupby(
    ["time", "latitude", "longitude", "depth"]
).mean()
df_normalized = (df_normalized - df_normalized.min()) / (
    df_normalized.max() - df_normalized.min()
)
fig = df_normalized.plot.box()
fig.update_xaxes(title_text="Variable")
fig.update_yaxes(title_text="Normalized values distribution")
# rotate x-axis labels by 45 degrees
fig.update_xaxes(tickangle=-20)
fig.show()

## Export preprocessed data

---


In [105]:
preprocessed_data.to_csv("../../2_processed/hot_zooplankton.csv", index=False)

In [106]:
out_data = xr.Dataset.from_dataframe(preprocessed_data)
for k, v in metadata.items():
    if k not in out_data:
        continue
    if "long_name" in v and v["long_name"] is not None:
        out_data[k].attrs["long_name"] = v["long_name"]
    if "standard_name" in v and v["standard_name"] is not None:
        out_data[k].attrs["standard_name"] = v["standard_name"]
    if "units" in v and v["units"] is not None:
        out_data[k].attrs["units"] = v["units"]
    if "attrs" in v and v["attrs"] is not None:
        out_data[k].attrs.update(v["attrs"])
out_data["time"].attrs = {"axis": "T"}
out_data["latitude"].attrs = {"axis": "Y", "units": "degrees_north"}
out_data["longitude"].attrs = {"axis": "X", "units": "degrees_east"}
out_data["depth"].attrs = {"axis": "Z", "units": "meters"}
out_data

In [107]:
try:
    out_data.pint.quantify()
except Exception as e:
    print(e)
    print("Some units cannot be quantified and are only here for information.")

In [108]:
out_data.to_zarr("../../2_processed/hot_zooplankton.zarr")

<xarray.backends.zarr.ZarrStore at 0x177dcb7c0>