# Load data

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from datetime import timedelta
import matplotlib.pyplot as plt
from IPython.display import display, Markdown

def skip_cell():
    """Render a "skipped" banner in the current cell's output.

    Only a Markdown notice is displayed; the function then returns normally
    and does not interrupt the notebook, so the calling cell has to branch
    around whatever code it wants to skip.
    """
    banner = Markdown("**⏭️ Skipped this cell**")
    display(banner)
# --------------------------
# CONFIGURATION
# --------------------------

# Input locations, given relative to the working directory.
GROUNDWATER_PATH = "../data/wasserportal/processed/gw_master_2022-01-01_2025-04-30.parquet"
STATIONS_PATH = "../data/wasserportal/stations_groundwater.csv"
PRECIP_ZARR_PATH = "../data/dwd/processed/radolan_berlin_2022-01-01_2025-04-30.zarr"

# Target station (change to the station you want to model). Kept as a string;
# it is cast with int() where it is matched against the data.
STATION_ID = "9931"

# Feature settings — presumably read by later modelling cells; verify there.
N_GW_LAGS = 4
N_PRCP_LAGS = 4
INCLUDE_PRCP_T_PLUS_1 = True
SEASONALITY = True

# Data Loading and Preprocessing

In this section, we will load the required datasets for groundwater level prediction:
1. **Groundwater levels**: Historical measurements from monitoring stations
2. **Precipitation data**: Radar-based precipitation data from DWD (German Weather Service)
3. **Station metadata**: Geographic coordinates and station information

Our goal is to predict groundwater levels using historical groundwater data and precipitation patterns.

In [None]:
# --------------------------
# LOAD DATA
# --------------------------
import xarray as xr

# STATION_ID is configured as a string; the station columns are matched as ints,
# so the cast is done once and reused below.
target_station_id = int(STATION_ID)

# Groundwater levels: preview the raw table, then parse the date column.
gw_df = pd.read_parquet(GROUNDWATER_PATH)
display(gw_df.head())
gw_df['date'] = pd.to_datetime(gw_df['date'])

# Rows of the target station without missing values, indexed by date. The
# column is already datetime at this point, so the index needs no second
# conversion.
gw_series = gw_df[gw_df["station"] == target_station_id].dropna().set_index('date')

# Timestamps
dates = gw_series.index
display(dates)

# Load precipitation from zarr; .values materialises the array in memory.
precip_ds = xr.open_zarr(PRECIP_ZARR_PATH)
precip_array = precip_ds['precipitation'].values  # presumably (time, 30, 30) — verify

# Load coordinates
lats = precip_ds['lat'].values
lons = precip_ds['lon'].values

# Load station metadata. An unknown ID is reported with a readable message;
# the exception type matches what a bare .iloc[0] on an empty frame raises.
stations_df = pd.read_csv(STATIONS_PATH)
station_rows = stations_df[stations_df['ID'] == target_station_id]
if station_rows.empty:
    raise IndexError(f"Station {STATION_ID} not found in {STATIONS_PATH}")
station = station_rows.iloc[0]


## Connect to the MySQL server

In [None]:
import os
import io
import time
import datetime
from sqlalchemy import create_engine, text, DateTime
import mysql.connector
from dotenv import load_dotenv

from urllib.parse import quote

# Load environment variables from .env file
load_dotenv()

# Inside container: DB_HOST is set via docker-compose
# On host: fallback to LOCAL_DB_HOST, then to "localhost"
DB_HOST = os.getenv("DB_HOST", os.getenv("LOCAL_DB_HOST", "localhost"))
# NOTE(review): the user/password/database fallbacks look like local-development
# defaults — supply real values through the environment or the .env file.
DB_USER = os.getenv("DB_USER", "root")
DB_PASSWORD = os.getenv("DB_PASSWORD", "mysecretpassword")
DB_NAME = os.getenv("DB_NAME", "mydatabase")

# Build SQLAlchemy connection string. The credentials are percent-encoded so
# that characters such as "@", ":", "/" or "%" in them cannot break URL parsing;
# values without special characters produce the same URL as before.
DATABASE_URL = (
    f"mysql+mysqlconnector://{quote(DB_USER, safe='')}:{quote(DB_PASSWORD, safe='')}"
    f"@{DB_HOST}/{DB_NAME}"
)

# Create engine
engine = create_engine(DATABASE_URL, pool_pre_ping=True)

# Test connection
try:
    with engine.connect() as conn:
        result = conn.execute(text("SELECT 1"))
        print("✅ Connected to MySQL, test query result:", result.scalar())
except Exception as e:
    # Connectivity smoke test: report the failure and let the notebook continue.
    print("❌ Connection failed:", e)

print(engine)


## Write data to database

In [None]:
import importlib
import sys

# Make the parent directory and ../src/utils importable from this notebook.
sys.path.extend(["..", "../src/utils"])

# Reload after importing so that edits to the helper module are picked up
# without restarting the kernel, then pull its public names into the notebook.
import MySQL_write_append  # resolved through the sys.path entries added above
importlib.reload(MySQL_write_append)
from MySQL_write_append import *


In [None]:
# Persist the groundwater table, the station metadata and the precipitation
# dataset to MySQL, each under its own table name.
# The helpers are presumably provided by the star import of MySQL_write_append.
# NOTE(review): whether they replace or append to an existing table is not
# visible here — check the helper module before re-running this cell.
write_df_to_sql(gw_df,engine,"gw_table")
write_df_to_sql(stations_df, engine, "stations_meta")
write_precip_data(precip_ds, engine, table_name="precip_table")


## Read back from database

1. DataFrames are straightforward: read them back with `read_sql`.
2. For the 3D dataset, the DataFrame retrieved from MySQL needs to be transformed back into the zarr grid structure.


In [None]:
# Test by reading back: gw_table should round-trip to the same data as gw_df.
df_tmp = pd.read_sql("SELECT * FROM gw_table ", engine).set_index('date')
display(df_tmp.head())
# df_tmp is indexed by date while gw_df keeps date as a regular column, so
# gw_df is indexed the same way before comparing; comparing the two frames
# directly is always False regardless of the stored values.
# NOTE(review): assumes write_df_to_sql stores exactly gw_df's columns with
# unchanged dtypes — confirm against the helper if this still reports False.
df_tmp.equals(gw_df.set_index('date'))  # should be True

In [None]:
# Test by reading the station metadata back from MySQL.
# The engine's connection pool is disposed first, presumably so that the query
# below runs on a freshly opened connection.
engine.dispose()
df_stations = pd.read_sql(sql="SELECT * FROM stations_meta", con=engine)
df_stations.head()

#### Here comes the 3D grid

In [None]:
# Test read back of the precipitation data. The table name matches the one
# passed to write_precip_data ("precip_table").
df_tmp2 = pd.read_sql("SELECT * FROM precip_table ", engine, parse_dates=['time'])

# Recreate an xarray Dataset on a (time, x, y) grid from the long-format rows.
tmp_ds = df_tmp2.set_index(["time", "x", "y"]).to_xarray()

# Reattach curvilinear lat/lon as 2-D coords (from stored values). The rows are
# reduced to one per (x, y) cell once and pivoted into x-by-y grids.
# NOTE(review): assumes lat/lon of a cell do not change over time — confirm.
grid_cells = df_tmp2.drop_duplicates(subset=["x", "y"])
lat_grid = grid_cells.pivot(index="x", columns="y", values="lat").values
lon_grid = grid_cells.pivot(index="x", columns="y", values="lon").values

tmp_ds3 = tmp_ds.assign_coords(
    lat=(("x", "y"), lat_grid),
    lon=(("x", "y"), lon_grid),
)

# Drop the x/y index coordinates.
tmp_ds3 = tmp_ds3.drop_vars(['x', 'y'])

# This should be similar to precip_ds. The datasets themselves are printed
# rather than their bound ``.all`` method.
print(tmp_ds3)  # This will be the final zarr grid
print(precip_ds)


In [None]:
# Optional consistency check: set should_skip = True to skip the comparison.
should_skip = False
if should_skip:
    skip_cell()
else:
    # Compare the dataset rebuilt from MySQL with the dataset loaded from zarr.
    print(tmp_ds3)
    # Remove the time-of-day component from precip_ds' time axis (in place).
    precip_ds["time"] = ("time", pd.to_datetime(precip_ds["time"].values).normalize())
    # assert_allclose returns None and raises AssertionError on a mismatch, so
    # it is called directly instead of being printed.
    xr.testing.assert_allclose(precip_ds, tmp_ds3)

    # Direct difference of precipitation values
    diff = precip_ds["precipitation"] - tmp_ds3["precipitation"]

    print("Max difference:", float(diff.max()))
    print("Min difference:", float(diff.min()))

    # Check if all values are exactly equal (tiny non-zero differences would
    # make this False even though the allclose check above passed).
    print("All equal:", bool((diff == 0).all()))