In [1]:
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import xarray as xr
from plotly.subplots import make_subplots
from scipy.stats import t
from statsmodels.nonparametric.smoothers_lowess import lowess

# Route pandas' `.plot` accessor through plotly instead of the default backend.
pd.options.plotting.backend = "plotly"

In [2]:
# Station whose observation product is processed in this notebook.
station = "Papa"
# Bounds of the study period (used below to slice the time axis).
time_start = "2002-01-01"
time_end = "2015-01-01"
# Number of years in the study period. Derived from the bounds so that it
# cannot drift out of sync when `time_start` / `time_end` are edited.
NB_YEARS = pd.Timestamp(time_end).year - pd.Timestamp(time_start).year
# Minimum average number of observations per year required to keep a position.
MINIMUM_OBS_BY_YEAR = 2

## Load


In [3]:
# Load the filtered observation product of the selected station.
obs_path = f"../data/1_products/{station}_obs_filtered.zarr"
data = xr.load_dataset(obs_path, engine="zarr")
data

In [4]:
# Keep the original layer coordinate aside: it is re-attached at export time.
layer = data["layer"]
# Average over the layer dimension while preserving variable attributes.
with xr.set_options(keep_attrs=True):
    data = data.mean(dim="layer")

## Plot all observations


In [5]:
# Spatial mean, then monthly mean (bins labelled by month start).
monthly_spatial_mean = data.mean(["latitude", "longitude"]).resample({"time": "1MS"}).mean()
# Long format: one row per (time, variable) pair, with the value in "biomass".
data_df = (
    monthly_spatial_mean.to_dataframe()
    .dropna(how="all")
    .stack()
    .rename("biomass")
    .reset_index()
    .rename(columns={"level_1": "is_day"})
)
data_df = data_df.assign(
    year=data_df["time"].dt.year,
    month=data_df["time"].dt.month,
)

In [6]:
# One colour per year, sampled evenly along the "plasma" colormap.
cmap = plt.get_cmap("plasma")
n_colors = data_df.year.unique().size
# np.linspace yields the same evenly spaced fractions as i / (n_colors - 1)
# but does not divide by zero when only a single year is present.
rgba_colors = [cmap(fraction) for fraction in np.linspace(0.0, 1.0, n_colors)]
hex_colors = [mcolors.to_hex(c) for c in rgba_colors]

# Monthly biomass values, one facet row per variable, coloured by year.
px.strip(
    data_df,
    x="month",
    y="biomass",
    color="year",
    facet_row="is_day",
    title=f"Biomass at {station}",
    color_discrete_sequence=hex_colors,
).update_layout(height=800, width=800)

## Computing the trend


In [7]:
def _fit_trend(series, poly_order, confidence_level):
    """Fit a polynomial trend to ``series`` and return ``(trend_line, half_width)``.

    ``trend_line`` is the fitted polynomial evaluated at every sample of
    ``series``; ``half_width`` is the constant half-width of the band drawn
    around it (Student-t critical value times the standard error of the
    residuals).
    """
    # Elapsed time since the first sample, as integers
    # (presumably nanoseconds for datetime64[ns] coordinates -- TODO confirm).
    elapsed = (series["time"] - series["time"][0]).astype(int)
    coefficients = np.polyfit(elapsed, series, poly_order)
    trend_line = np.polyval(coefficients, elapsed)
    std_error = np.std(series - trend_line) / np.sqrt(len(series))
    # A polynomial of order p has p + 1 fitted coefficients; this equals the
    # previous hard-coded ``n - 2`` for the default linear fit.
    degrees_freedom = len(series) - (poly_order + 1)
    t_critical = t.ppf((1 + confidence_level) / 2, degrees_freedom)
    return trend_line, float(t_critical * std_error)


def _add_trend_traces(figure, series, trend_line, half_width, label, color, fill_color):
    """Add the markers, trend line and confidence band of one series to ``figure``."""
    times = series.indexes["time"]
    figure.add_trace(
        go.Scatter(
            x=times,
            y=series,
            mode="markers",
            name=f"{label} Data",
            marker={"color": color},
        )
    )
    figure.add_trace(
        go.Scatter(
            x=times,
            y=trend_line,
            mode="lines",
            name=f"{label} Trend Line",
            line={"color": color},
        )
    )
    # Closed polygon: upper bound left-to-right, then lower bound right-to-left.
    figure.add_trace(
        go.Scatter(
            x=list(times) + list(times)[::-1],
            y=list(trend_line + half_width) + list(trend_line - half_width)[::-1],
            fill="toself",
            fillcolor=fill_color,
            line_color="rgba(0, 0, 0, 0)",
            name=f"{label} Confidence Interval",
        )
    )


def plot_trend(day_data, night_data, poly_order=1, confidence_level=0.95):
    """Plot day and night observations with a polynomial trend and confidence band.

    Parameters
    ----------
    day_data, night_data
        Series indexed by a ``time`` coordinate (the notebook passes 1-D
        xarray DataArrays without missing values).
    poly_order : int
        Order of the polynomial fitted to each series.
    confidence_level : float
        Confidence level used for the Student-t critical value of the band.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with three traces per series (data, trend line, band), day
        traces first.
    """
    figure = go.Figure()

    day_trend_line, day_half_width = _fit_trend(day_data, poly_order, confidence_level)
    _add_trend_traces(
        figure, day_data, day_trend_line, day_half_width, "Day", "firebrick", "rgba(255, 0, 0, 0.2)"
    )

    night_trend_line, night_half_width = _fit_trend(night_data, poly_order, confidence_level)
    # The night markers were previously mislabelled "Day Data" in the legend.
    _add_trend_traces(
        figure, night_data, night_trend_line, night_half_width, "Night", "royalblue", "rgba(0, 0, 255, 0.2)"
    )

    figure.update_layout(height=600)
    return figure

In [8]:
confidence_level = 0.95
# Spatially averaged series, keeping only time steps that have a value.
day_data = data["day"].mean(["latitude", "longitude"]).dropna("time", how="all")
night_data = data["night"].mean(["latitude", "longitude"]).dropna("time", how="all")

# Pass the confidence level explicitly so that editing the variable above
# actually changes the plotted band (it was previously unused).
plot_trend(day_data, night_data, poly_order=1, confidence_level=confidence_level).update_layout(
    title=f"{station} Day and Night Trend (1st order polynomial fit)",
    xaxis_title="Time",
    yaxis_title="Biomass (mg/m3)",
).show()

In [9]:
# Same series as above, fitted with a quadratic instead of a straight line.
second_order_figure = plot_trend(day_data, night_data, poly_order=2)
second_order_figure.update_layout(
    title=f"{station} Day and Night Trend (2nd order polynomial fit)",
    xaxis_title="Time",
    yaxis_title="Biomass (mg/m3)",
)
second_order_figure.show()

In [10]:
fig = go.Figure()

# Scatter every day and night observation over the full record.
for label, series in (("Day", day_data), ("Night", night_data)):
    fig.add_trace(
        go.Scatter(
            x=series.time.data.flatten(),
            y=series.data.flatten(),
            mode="markers",
            name=label,
        )
    )

# Dashed vertical lines marking the bounds of the study period. The shape
# names are derived from the bounds (they previously read "Year 2000" although
# the period starts at `time_start`).
for bound in (time_start, time_end):
    fig.add_shape(
        type="line",
        x0=bound,
        y0=0,
        x1=bound,
        y1=1,
        xref="x",
        yref="paper",  # span the full height of the plotting area
        line={"color": "black", "width": 2, "dash": "dash"},
        name=f"Year {pd.Timestamp(bound).year}",
    )

# Update layout
fig.update_layout(
    title=f"Day and Night Data at {station} : remove interannual variability",
    xaxis_title="Time",
    yaxis_title="Zooplankton biomass (mg/m3)",
    legend_title="Legend",
    height=600,
    showlegend=True,
)

fig.show()

In [11]:
# Restrict both series to the study period and refit the quadratic trend.
study_period = slice(time_start, time_end)
day_data = day_data.sel(time=study_period)
night_data = night_data.sel(time=study_period)
plot_trend(day_data, night_data, poly_order=2).show()

In [12]:
# Apply the same time window to the gridded dataset.
data = data.sel({"time": slice(time_start, time_end)})
data

## Number of samples


### Total


In [13]:
# Number of non-missing values per calendar month / per year, summed over all positions.
spatial_dims = ["latitude", "longitude"]
counts_month = data.groupby("time.month").count().sum(dim=spatial_dims).to_dataframe()
counts_year = data.groupby("time.year").count().sum(dim=spatial_dims).to_dataframe()

In [14]:
month_labels = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

# Grouped (side-by-side) bars: one bar per variable for each calendar month.
monthly_counts_figure = px.bar(
    counts_month,
    x=counts_month.index,
    y=counts_month.columns,
    barmode="group",
    title=f"Counts of {station} data per month",
    labels={"value": "Counts"},
    color_discrete_sequence=px.colors.qualitative.Plotly,
)
monthly_counts_figure.update_layout(height=600)
monthly_counts_figure.update_xaxes(
    title_text="Month",
    tickvals=np.arange(1, 13),
    ticktext=month_labels,
)
monthly_counts_figure.show()

In [15]:
# Grouped (side-by-side) bars: one bar per variable for each year.
yearly_counts_figure = px.bar(
    counts_year,
    x=counts_year.index,
    y=counts_year.columns,
    barmode="group",
    title=f"Counts of {station} data per year",
    labels={"value": "Counts"},
    color_discrete_sequence=px.colors.qualitative.Plotly,
)
yearly_counts_figure.update_layout(xaxis_title="Year", height=600)
yearly_counts_figure.show()

### By position


In [16]:
# Counts per calendar month and position. NOTE: despite its name,
# `counts_by_year_position` is grouped by month, not by year.
counts_per_month_and_position = data.groupby("time.month").count().to_dataframe()
counts_by_year_position = (
    counts_per_month_and_position.dropna(how="all")
    .query("day > 0 or night > 0")  # keep rows with at least one observation
    .stack()
    .rename_axis(index={None: "day_night"})
    .rename("counts")
    .reset_index()
)

# Total number of observations (day + night, all months) at each position,
# divided by the number of years to get a yearly average.
total_counts_by_position = counts_by_year_position.groupby(["latitude", "longitude"])["counts"].sum()
average_nb_of_obs_by_year_and_position = (total_counts_by_position / NB_YEARS).reset_index()

In [17]:
# Distribution across positions of the day / night counts for each calendar month.
counts_box_figure = px.box(counts_by_year_position, x="month", y="counts", color="day_night")
counts_box_figure.update_layout(
    height=600,
    title=f"Counts of {station} day and night data by month and location",
)
counts_box_figure.update_xaxes(
    title_text="Month",
    tickvals=np.arange(1, 13),
    ticktext=["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
)
counts_box_figure.show()

In [18]:
# Display the average yearly number of observations (day + night) per position.
average_nb_of_obs_by_year_and_position

Unnamed: 0,latitude,longitude,counts
0,46.5,-151.5,0.076923
1,46.5,-144.5,0.076923
2,47.5,-145.5,0.076923
3,47.5,-132.5,0.076923
4,47.5,-130.5,0.076923
...,...,...,...
56,56.5,-145.5,0.076923
57,56.5,-138.5,0.076923
58,57.5,-145.5,0.076923
59,57.5,-139.5,0.076923


In [19]:
# Horizontal box plot (log-scaled x axis) of the average yearly counts per position.
fig = px.box(average_nb_of_obs_by_year_and_position, x="counts", orientation="h", points="all")
fig.update_layout(
    title=f"Average number of observation (day+night) data by year for each position at {station}",
)
fig.update_xaxes(type="log")

# Dashed vertical line at the minimum-observation threshold.
threshold_line_style = {"color": "black", "width": 2, "dash": "dash"}
fig.add_shape(
    type="line",
    x0=MINIMUM_OBS_BY_YEAR,
    x1=MINIMUM_OBS_BY_YEAR,
    y0=-1,
    y1=1,
    xref="x",
    line=threshold_line_style,
    name="Minimum number of observations by year",
)
fig.show()

## Remove locations where there is not enough data (fewer than `MINIMUM_OBS_BY_YEAR` = 2 observations per year on average)


In [20]:
# Keep only the positions whose yearly average reaches the threshold.
has_enough_obs = average_nb_of_obs_by_year_and_position["counts"] >= MINIMUM_OBS_BY_YEAR
positions = average_nb_of_obs_by_year_and_position.loc[has_enough_obs, ["latitude", "longitude"]]
positions

Unnamed: 0,latitude,longitude
5,48.5,-130.5
6,48.5,-128.5
11,49.5,-138.5
12,49.5,-134.5
15,49.5,-128.5
24,50.5,-129.5


In [21]:
# One single-cell dataset per retained (latitude, longitude) pair; the list
# brackets in `sel` keep both dimensions so the pieces can be recombined.
data_by_position = [
    data.sel(latitude=[lat], longitude=[lon]) for lat, lon in positions.itertuples(index=False)
]
data = xr.combine_by_coords(data_by_position)
data

In [22]:
# Average yearly number of observations (all variables summed) per position,
# keeping only positions that have at least one observation.
average_counts_by_position = (
    (data.count("time") / NB_YEARS)
    .to_dataframe()
    .sum(axis=1)
    .rename("count")
    .reset_index()
    .query("count > 0")
)

px.scatter_map(
    average_counts_by_position,
    lon="longitude",
    lat="latitude",
    color="count",
    size="count",
).update_layout(
    title=f"Average number of observations at {station} locations with at least {MINIMUM_OBS_BY_YEAR} observations by year",
    height=600,
    width=800,
)

## Group data by month


In [23]:
# Mean per calendar month; reindexing guarantees all twelve months are present
# (months without data are filled with missing values).
mean_by_calendar_month = data.groupby("time.month").mean()
grouped_data = mean_by_calendar_month.reindex(month=np.arange(1, 13))
grouped_data

In [24]:
# Long format: one row per (month, latitude, longitude, variable).
biomass_by_month = (
    grouped_data.to_dataframe()
    .stack()
    .rename("biomass")
    .reset_index()
    .rename(columns={"level_3": "is_day"})
)

biomass_box_figure = px.box(
    biomass_by_month,
    x="month",
    y="biomass",
    color="is_day",
    title=f"Biomass at {station} by month",
    points="all",
    facet_row="is_day",
)
biomass_box_figure.update_xaxes(
    tickvals=np.arange(1, 13),
    ticktext=["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
)
biomass_box_figure.update_layout(height=700)
biomass_box_figure.show()

## Duplicating observation across the entire time period


In [25]:
# Monthly climatology as a table indexed by calendar month (1..12).
value_by_month = grouped_data.to_dataframe().reset_index().set_index("month")

# One timestamp per month (month-end frequency) over the study period, each
# tagged with its calendar month so the climatology can be joined onto it.
time = pd.date_range(start=time_start, end=time_end, freq="ME")
month = time.month
month_by_day = pd.DataFrame({"month": month}, index=time)

# Repeat the climatological value of each calendar month for every year.
aggregated_smoothed = month_by_day.join(value_by_month, on="month").drop(columns=["month"])
aggregated_smoothed.index.name = "time"

# verify_integrity raises if a (time, latitude, longitude) triple is duplicated.
aggregated_smoothed = aggregated_smoothed.reset_index().set_index(
    ["time", "latitude", "longitude"], verify_integrity=True
)

climato_observations = xr.Dataset.from_dataframe(aggregated_smoothed)
for var in climato_observations:
    climato_observations[var].attrs = {"units": "mg/m3"}
# The description previously claimed a lowess filter, but no smoothing is
# applied in this notebook: the values are calendar-month means.
climato_observations.attrs = {
    "desc": "Monthly climatology of the observations (mean by calendar month and position), "
    "repeated over the whole time period.",
}
climato_observations

## Plot final results


In [26]:
# Two stacked panels sharing the time axis: day on top, night below.
fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=("Day Data", "Night Data"),
    shared_xaxes=True,
    vertical_spacing=0.1,
)
# Make the figure taller.
fig.update_layout(height=800)

# Initial (spatially averaged) observations: day in the first row, night in the second.
initial_day_trace = go.Scatter(x=day_data["time"], y=day_data, mode="lines", name="Initial Day Data")
fig.add_trace(initial_day_trace, row=1, col=1)

initial_night_trace = go.Scatter(x=night_data["time"], y=night_data, mode="lines", name="Initial Night Data")
fig.add_trace(initial_night_trace, row=2, col=1)

# Processed observations, averaged over positions, for the day panel.
climato_day = climato_observations.day.mean(["latitude", "longitude"]).dropna("time", how="all")
fig.add_trace(
    go.Scatter(x=climato_day.indexes["time"], y=climato_day, mode="lines", name="day"),
    row=1,
    col=1,
)

# Processed observations, averaged over positions, for the night panel.
climato_night = climato_observations.night.mean(["latitude", "longitude"]).dropna("time", how="all")
fig.add_trace(
    go.Scatter(x=climato_night.indexes["time"], y=climato_night, mode="lines", name="night"),
    row=2,
    col=1,
)

# Update the layout.
fig.update_layout(
    title=f"Comparison of Initial and Processed Data at {station} : With Trend",
    xaxis_title="Time",
    yaxis_title="Zooplankton biomass (mg/m3)",
)

fig.show()

## Export


In [27]:
# Re-attach the layer dimension set aside earlier and fix the dimension order.
climato_observations = climato_observations.expand_dims(layer=layer).transpose(
    "time", "latitude", "longitude", "layer"
)

# Copy coordinate and variable attributes from the source dataset.
for name in ("latitude", "longitude", "time", "day", "night"):
    climato_observations[name].attrs = data[name].attrs
# `data` no longer has a layer coordinate, so its attributes come from `layer`.
climato_observations["layer"].attrs = layer.attrs

climato_observations

In [28]:
# Output store named after the station and the first / last year of the study
# period; mode="w" overwrites an existing store.
climato_path = f"../data/1_products/{station}_obs_zoo_climato_monthly_{pd.Timestamp(time_start).year}_{pd.Timestamp(time_end).year}.zarr"
climato_observations.to_zarr(climato_path, mode="w")

<xarray.backends.zarr.ZarrStore at 0x3232289c0>