# Bottle data in hot


In [1]:
import pandas as pd
import numpy as np
import xarray as xr
import cf_xarray.units  # noqa: F401
import pint_xarray  # noqa: F401
import json

pd.options.plotting.backend = "plotly"

## Load raw data

---


In [2]:
with open("../../1_raw/hot_primary_production.json") as f:
    metadata = json.load(f)

In [3]:
INDEX = ["cruise_number", "date", "start_time", "end_time", "instrument_type", "depth"]
DATA = list(set(metadata.keys()) - set(INDEX))
HEADER = list(metadata.keys())
DTYPE = {k: v["type"] for k, v in metadata.items()}
LATITUDE = 22.5
LONGITUDE = -158.0

In [4]:
# Set the data type of each column manualy
raw_data = pd.read_csv(
    "../../1_raw/hot_primary_production.txt",
    skiprows=5,
    names=HEADER,
    dtype=DTYPE,
    # Only keep the first 12 columns
    usecols=HEADER,
)
raw_data.head()

Unnamed: 0,cruise_number,date,start_time,end_time,instrument_type,depth,chlorophyll,bottle_salinity,phototrophic_bacteria,heterotrophic_bacteria,synechococcus_bacteria,eukaryotic_bacteria
0,1,881031,-9,-9,0,24.0,0.129,-9.0,-9,-9,-9,-9
1,1,881031,-9,-9,0,42.0,0.192,-9.0,-9,-9,-9,-9
2,1,881031,-9,-9,0,68.0,0.285,-9.0,-9,-9,-9,-9
3,1,881031,-9,-9,0,114.0,0.326,-9.0,-9,-9,-9,-9
4,2,881202,-9,-9,0,0.0,0.202,-9.0,-9,-9,-9,-9


We replace NaN values in time column with 0.


In [5]:
# convert -999 to NaN
raw_data = raw_data.replace(-9, np.nan)
# replace NaN with 0 in time column because day/night cycle is not relevant here
raw_data[["start_time", "end_time"]] = (
    raw_data[["start_time", "end_time"]].fillna(0).astype(int)
)

In [6]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2586 entries, 0 to 2585
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   cruise_number           2586 non-null   int64  
 1   date                    2586 non-null   int64  
 2   start_time              2586 non-null   int64  
 3   end_time                2586 non-null   int64  
 4   instrument_type         2586 non-null   object 
 5   depth                   2586 non-null   float64
 6   chlorophyll             2579 non-null   float64
 7   bottle_salinity         2121 non-null   float64
 8   phototrophic_bacteria   1981 non-null   float64
 9   heterotrophic_bacteria  1977 non-null   float64
 10  synechococcus_bacteria  1988 non-null   float64
 11  eukaryotic_bacteria     1988 non-null   float64
dtypes: float64(7), int64(4), object(1)
memory usage: 242.6+ KB


## Clean data

---


Remove the data when time and position are not known.


In [7]:
fig = (
    raw_data.isna()
    .sum()
    .plot(
        kind="bar",
        title=f"Missing values per column for a total of {len(raw_data)} entries",
        labels=dict(
            index="Column",
            value="Number of missing values",
        ),
    )
)
# Rotate x-axis labels by 45 degrees
fig.update_xaxes(tickangle=-45)
# multiply the figure ratio of height by 2
fig.update_layout(height=500)
# Remove legend
fig.update_layout(showlegend=False)
fig.show()

## Manage date

---


In [8]:
raw_data["date"] = raw_data["date"].apply(lambda x: str(x).zfill(6))
raw_data["start_time"] = raw_data["start_time"].apply(lambda x: str(x).zfill(4))
raw_data["end_time"] = raw_data["end_time"].apply(lambda x: str(x).zfill(4))

In [9]:
raw_data["year"] = raw_data["date"].apply(
    lambda x: 1900 + int(x[:2]) if int(x[:2]) > 30 else 2000 + int(x[:2])
)
raw_data["month"] = raw_data["date"].apply(lambda x: str(x).zfill(6)[2:4]).apply(int)
raw_data["day"] = raw_data["date"].apply(lambda x: x[4:])
raw_data["hour"] = raw_data["start_time"].apply(lambda x: x[:2])
raw_data["minute"] = raw_data["start_time"].apply(lambda x: x[2:])
raw_data = raw_data.drop(columns=["date", "start_time", "end_time"])
raw_data["time"] = pd.to_datetime(raw_data[["year", "month", "day", "hour", "minute"]])
raw_data = raw_data.drop(columns=["year", "month", "day", "hour", "minute"])

In [10]:
_ = raw_data.set_index(["instrument_type", "time", "depth"], verify_integrity=True)

## Produce preprocessed data

---


In [11]:
preprocessed_data = pd.DataFrame(
    {
        "instrument_type": raw_data["instrument_type"].astype(int),
        "time": raw_data["time"],
        "latitude": np.full(raw_data["time"].size, LATITUDE).astype(float),
        "longitude": np.full(raw_data["time"].size, LONGITUDE).astype(float),
        "depth": pd.cut(
            raw_data["depth"],
            bins=[0, 50, 100, 150, 200, 250, 300],
            include_lowest=True,
            labels=[50, 100, 150, 200, 250, 300],
        ).astype(float),
        **{k: raw_data[k] for k in DATA},
    }
)
preprocessed_data.head()

Unnamed: 0,instrument_type,time,latitude,longitude,depth,heterotrophic_bacteria,synechococcus_bacteria,phototrophic_bacteria,bottle_salinity,eukaryotic_bacteria,chlorophyll
0,0,1988-10-31,22.5,-158.0,50.0,,,,,,0.129
1,0,1988-10-31,22.5,-158.0,50.0,,,,,,0.192
2,0,1988-10-31,22.5,-158.0,100.0,,,,,,0.285
3,0,1988-10-31,22.5,-158.0,150.0,,,,,,0.326
4,0,1988-12-02,22.5,-158.0,50.0,,,,,,0.202


## Final plot

---


In [12]:
df_normalized = preprocessed_data.groupby(
    ["instrument_type", "time", "latitude", "longitude", "depth"]
).mean()
df_normalized = (df_normalized - df_normalized.min()) / (
    df_normalized.max() - df_normalized.min()
)
fig = df_normalized.plot.box()
fig.update_xaxes(title_text="Variable")
fig.update_yaxes(title_text="Normalized values distribution")
# rotate x-axis labels by 45 degrees
fig.update_xaxes(tickangle=-20)
fig.show()

## Export preprocessed data

---


In [13]:
preprocessed_data.to_csv("../../2_processed/hot_primary_production.csv", index=False)

In [14]:
out_data = xr.Dataset.from_dataframe(preprocessed_data)
for k, v in metadata.items():
    if k not in out_data:
        continue
    if "long_name" in v and v["long_name"] is not None:
        out_data[k].attrs["long_name"] = v["long_name"]
    if "standard_name" in v and v["standard_name"] is not None:
        out_data[k].attrs["standard_name"] = v["standard_name"]
    if "units" in v and v["units"] is not None:
        out_data[k].attrs["units"] = v["units"]
    if "attrs" in v and v["attrs"] is not None:
        out_data[k].attrs.update(v["attrs"])
out_data["time"].attrs = {"axis": "T"}
out_data["latitude"].attrs = {"axis": "Y", "units": "degrees_north"}
out_data["longitude"].attrs = {"axis": "X", "units": "degrees_east"}
out_data["depth"].attrs = {"axis": "Z", "units": "meters"}
out_data

In [15]:
try:
    out_data.pint.quantify()
except Exception as e:
    print(e)
    print("Some units cannot be quantified and are only here for information.")


Some units cannot be quantified and are only here for information.


In [16]:
out_data.to_zarr("../../2_processed/hot_primary_production.zarr")

<xarray.backends.zarr.ZarrStore at 0x1593efec0>