In [1]:
import os
import sys
import datetime
import pytz
import json

import numpy as np
from scipy.signal import medfilt
import matplotlib.pyplot as plt
import polars as pl
import pandas as pd

import synoptic

from brc_tools.utils.lookups import _VRBLS

# JRL: if problems, delete your existing config toml file
# then create new one by uncommenting:
# synoptic.configure(token="blah")

# Use Helvetica for plots, falling back to Arial and then to matplotlib's
# bundled DejaVu Sans. Selecting the generic 'sans-serif' family and listing
# candidates in 'font.sans-serif' lets matplotlib pick the first installed font
# instead of warning when Helvetica is missing.
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Helvetica', 'Arial', 'DejaVu Sans']
plt.rcParams['font.size'] = 12


ModuleNotFoundError: No module named 'brc_tools'

# Importing observation data from Synoptic Weather as a Polars DataFrame
We then save the data as JSON to send to our team's UBAIR website server.

The plan goes as follows:
- Import data from Synoptic Weather based on date, location, variable, etc
- Export a JSON file from the Polars DataFrame (it is converted to pandas during export)
- Note the formatting so we can make the website read it in a predictable format

First off: we want time series for multiple stations.

Some of these data are noisy and/or contain errors, so we will filter some variables later.

In [None]:
# Stations to request from Synoptic: a first group of station IDs followed by
# the COOP sites. Three COOP sites are parked in a comment so they can be
# re-enabled quickly.
stid_list = (
    ["KSLC", "UTORM", "CLN", "UTHEB", "UTCOP", "UTSTV", "UBHSP", "UB7ST", "UBCSP"]
    # + ['COOPDINU1', 'COOPROSU1', 'COOPVELU1']
    + ["COOPFTDU1", "COOPALMU1", "COOPDSNU1", "COOPNELU1"]
)

# Local HDF5 file locations for observations and station metadata.
data_root, data_fname, metadata_fname = "./data", "df_obs_pp.h5", "df_metadata.h5"
df_obs_fpath, df_meta_fpath = (
    os.path.join(data_root, fname) for fname in (data_fname, metadata_fname)
)

# Request window (naive datetimes, no tzinfo attached).
start_date = datetime.datetime(year=2025, month=1, day=24)
end_date = datetime.datetime(year=2025, month=2, day=4)
# Alternative, longer window:
# end_date = datetime.datetime(2025, 3, 16, 0, 0, 0)

# Earlier workflow that read cached files instead of querying Synoptic:
# df_meta = load_pickle(df_meta_fpath)
# df_obs = pd.read_hdf(df_obs_fpath, key='df_obs')
# df_obs_winter = df_obs[df_obs.index.month.isin([11, 12, 1, 2, 3])]


In [None]:
def replace_max_values(df, vrbl, max_value=None):
    """Null out every sample equal to ``max_value`` and interpolate over the gaps.

    The caller is expected to pass rows for a single station (already filtered
    by stid).

    Args:
        df: Polars DataFrame containing the column ``vrbl``.
        vrbl: Name of the column to clean.
        max_value: Value to remove. When ``None``, the column's own maximum is
            used, so the largest sample is always removed.

    Returns:
        A DataFrame in which ``vrbl`` has the matching samples replaced by
        interpolated values.
    """
    column = pl.col(vrbl)
    if max_value is None:
        max_value = df.select(column.max())[0, 0]

    # Turn matching samples into nulls so interpolate() can bridge them.
    masked = pl.when(column == max_value).then(None).otherwise(column).alias(vrbl)
    return df.with_columns(masked).with_columns(column.interpolate())

def apply_median_filter(df, vrbl, kernel_size):
    """Replace column ``vrbl`` with its running median.

    The caller is expected to pass rows for a single station (already filtered
    by stid).

    Args:
        df: Polars DataFrame containing the column ``vrbl``.
        vrbl: Name of the column to smooth.
        kernel_size: Window length handed to ``scipy.signal.medfilt``.

    Returns:
        A DataFrame whose ``vrbl`` column holds the filtered float32 values.
    """
    samples = df[vrbl].to_numpy().astype("float32")
    smoothed = pl.Series(name=vrbl, values=medfilt(samples, kernel_size=kernel_size))
    return df.with_columns(smoothed)

def filter_snow_depth(df, kernel_size):
    """Clean the ``snow_depth`` column: drop its maximum, then median-filter it.

    Args:
        df: DataFrame holding a ``snow_depth`` column.
        kernel_size: Window length forwarded to ``apply_median_filter``.

    Returns:
        The DataFrame after both preprocessing steps.
    """
    despiked = replace_max_values(df, "snow_depth")
    return apply_median_filter(despiked, "snow_depth", kernel_size=kernel_size)

def plot_snow_depth(ax, df, stid, kernel_size=5):
    """Draw one station's snow-depth time series on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        df: Polars DataFrame with ``stid``, ``date_time`` and ``snow_depth``
            columns.
        stid: Station ID to plot; also used as the legend label.
        kernel_size: Median-filter window. ``None`` plots the data unfiltered.

    Stations whose ID starts with "COOP" are drawn dashed, all others solid.
    """
    station = df.filter(pl.col("stid") == stid)

    if kernel_size is not None:
        station = filter_snow_depth(station, kernel_size=kernel_size)
        station = station.with_columns(pl.col("snow_depth").interpolate())

    # Show timestamps in Mountain Time.
    station = station.with_columns(
        pl.col("date_time").dt.convert_time_zone("America/Denver")
    )

    if stid.startswith("COOP"):
        line_style = "--"
    else:
        line_style = "-"

    ax.plot(
        station["date_time"],
        station["snow_depth"],
        label=f"{stid}",
        alpha=0.5,
        lw=0.75,
        linestyle=line_style,
    )



In [None]:
# Gather the Synoptic variable name (the "synoptic" entry) from every
# per-variable dictionary in _VRBLS, in the lookup table's own order.
synoptic_vrbls = [spec['synoptic'] for _name, spec in _VRBLS.items()]

In [None]:
# Fetch station metadata for the configured station list and display it.
df_meta = synoptic.Metadata(
    stid=stid_list,
    verbose=True,
).df()
df_meta

In [None]:
# Request the time series for every station and variable over the configured
# window, then pivot the result. Presumably the pivot yields one column per
# variable (later cells read "snow_depth" directly) -- confirm against SynopticPy.
df_data = (
    synoptic.TimeSeries(
        stid=stid_list,
        start=start_date,
        end=end_date,
        vars=synoptic_vrbls,
        verbose=True,
        # rename_set_1=False, rename_value_1=False
    )
    .df()
    .synoptic.pivot()
)
df_data.head(20)

In [None]:
def clean_dataframe_for_json(df):
    """Return a pandas copy of ``df`` whose cells serialise to valid JSON.

    Two clean-ups are applied, in this order:

    1. Surrounding double quotes are stripped from string cells. Only genuine
       ``str`` values are touched, so numbers or missing values sitting in an
       object column are left alone rather than being turned into NaN.
    2. Every missing value (NaN, NaT, None, pd.NA) becomes ``None`` so that
       ``json.dump`` writes ``null``. The frame is cast to ``object`` first,
       because ``None`` written into a float column is coerced straight back
       to NaN.

    Args:
        df: A pandas DataFrame, or any object exposing ``to_pandas()`` (for
            example a Polars DataFrame).

    Returns:
        A new object-dtype pandas DataFrame; the input is not modified.
    """
    if hasattr(df, "to_pandas"):
        df = df.to_pandas()
    else:
        # Work on a copy so the caller's frame is never mutated.
        df = df.copy()

    def _strip_quotes(value):
        return value.strip('"') if isinstance(value, str) else value

    # Clean string columns (remove unnecessary quotes).
    for col in df.columns:
        dtype = df[col].dtype
        if pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
            df[col] = df[col].map(_strip_quotes)

    # Replace missing values with None last, on object dtype, so they survive.
    return df.astype(object).where(df.notna(), None)

def export_data(df, filename, orient='records'):
    """Clean ``df`` and write it to ``filename`` as JSON.

    Args:
        df: DataFrame accepted by ``clean_dataframe_for_json``.
        filename: Destination path. Missing parent directories are created, so
            the export also works when the output folder does not exist yet.
        orient: Layout passed to ``DataFrame.to_dict`` (default ``'records'``,
            i.e. a list with one object per row).

    Returns:
        None. A one-line summary is printed after the file is written.
    """
    df = clean_dataframe_for_json(df)

    # Make sure the target folder exists before opening the file.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Export to JSON; default=str stringifies values json cannot encode
    # natively (e.g. timestamps).
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(df.to_dict(orient=orient), f, default=str)

    print(f"Exported {len(df)} records to {filename}")

In [None]:
# Write the pivoted observations to a test JSON file.
export_data(df=df_data, filename="data/df_obs_test.json")

# Ideas: also export a random subsample, or a subset by station, etc.
# Operationally on the UBAIR site, we want obs for map stations in the last hour.

## Visuals

In [None]:
# Snow depth over time, one line per station.
# Stations report at different frequencies and/or times, so each one is plotted
# independently from its own rows.
# All stations are in the Mountain Time Zone.
fig, ax = plt.subplots(figsize=(12, 6))

for stid in stid_list:
    # Stations skipped here (presumably so the plot stays zoomed on the rest).
    if stid in ("KSLC", "UTCOP", "CLN"):
        continue

    # COOP stations are plotted unfiltered; every other station gets a
    # 51-sample median filter.
    # But the snow 24h variable has 0.51 cm while depth has zero! It was cold!
    ks = None if stid.startswith("COOP") else 51
    plot_snow_depth(ax, df_data, stid, kernel_size=ks)

ax.set(
    xlabel="Time",
    ylabel=_VRBLS["snow"]["label"],
    title="Case study 2024/2025: high ozone in UB",
)

# Light grey background
ax.set_facecolor("#f0f0f0")

ax.legend()
ax.grid(False)
plt.show()