# DWD Data
This notebook shows how to import and process RADOLAN precipitation data from DWD (Deutscher Wetterdienst).

## Data Import
Functions from `data_loading_dwd.py` are used to download and extract the data.

In [None]:
import importlib
import sys

# Make the project root (needed for `src.*`) and the utils folder importable.
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once.
for extra_path in ("..", '../utils'):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

from src.utils import data_loading_dwd

# Reload so edits to the module are picked up without restarting the kernel.
importlib.reload(data_loading_dwd)

# Usage examples:

# Recent data for 2025-05-01 .. 2025-06-30.
# Keep raw files (recommended for production)
results_recent = data_loading_dwd.import_radolan_recent('2025-05-01',
                                                        '2025-06-30',
                                                        '../data/dwd/',
                                                        keep_raw=True)

# Historical data for 2022-01-01 .. 2022-02-28.
# Extract only noon files (12:50)
results_historical = data_loading_dwd.import_radolan_historical(
    '2022-01-01',
    '2022-02-28',
    '../data/dwd/',
    time_to_keep=1250,
    keep_raw=True)

# `results` used to be assigned twice, which silently discarded the return
# value of the recent import. Both results are now kept under their own
# names; `results` still ends up bound to the historical result, as before.
results = results_historical


## Data Processing


Functions from `data_processing_dwd.py` can be used to process the data. The goal is to take the daily readings for a certain region of the RADOLAN grid and combine them into a time series. This time series and a coordinate array (WGS84 latitude and longitude in decimal degrees) are then saved as a zarr file.

Say we have 365 days of data with coordinate bounds restricting the data to a 10x10 grid. Then we can create a tensor of shape (10,10,2) for the coordinates and a (365,10,10) tensor for the time series (time is the leading axis, which is how the exploration cells below index the data).

In [None]:
from src.utils import data_processing_dwd

# Reload so edits to the module are picked up without restarting the kernel.
importlib.reload(data_processing_dwd)

# Latitude/longitude box around Berlin used to crop the grid.
berlin_bounds = {
    'min_lat': 52.4,
    'max_lat': 52.65,
    'min_lon': 13.15,
    'max_lon': 13.6
}

# Same period as the historical import in the data-import cell.
processing_period = {
    'start_date': '2022-01-01',
    'end_date': '2022-02-28'
}

# Example configuration: crop region, period, a region label, and the
# input/output folders.
config = {
    'bounds': berlin_bounds,
    'date_range': processing_period,
    'region_name': 'berlin',
    'data_directory': '../data/dwd/extracted',
    'output_directory': '../data/dwd/processed'
}

data_processing_dwd.create_radolan_timeseries(config)

## Data Loading

In [None]:
import xarray as xr

# Location of the processed Berlin store for 2022-01-01 .. 2022-02-28.
# NOTE(review): presumably the store written by the processing cell for the
# Berlin config — confirm the file name pattern against data_processing_dwd.
processed_store = '../data/dwd/processed/radolan_berlin_2022-01-01_2022-02-28.zarr'

data_series_loaded = xr.open_zarr(processed_store)
data_series_loaded

## Data Exploration
With the time series of cropped RADOLAN data created, this section visualizes the results and plots selected statistics of the data.

In [None]:
import matplotlib.pyplot as plt

# Position along the leading axis of `precipitation` to display.
# NOTE(review): the leading axis is assumed to be time, matching how the
# statistics cell sums over axes 1 and 2 — confirm against the stored dataset.
TIME_INDEX = 10

# Single 2-D precipitation field for the selected step.
cropped_data = data_series_loaded.precipitation.values[TIME_INDEX]

# Use the explicit figure/axes interface throughout, so labels and the
# colorbar do not depend on which axes pyplot currently considers "active".
fig, ax = plt.subplots(figsize=(8, 6))
pm = ax.pcolormesh(
    data_series_loaded.lon.values,  # longitude
    data_series_loaded.lat.values,  # latitude
    cropped_data,
    cmap="viridis",
    shading="auto",
    vmin=0)
fig.colorbar(pm, ax=ax, label="mm/h")
ax.set_title("Cropped RADOLAN Data (Berlin Area)")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
plt.show()

In [None]:
import pandas as pd

# Number of trailing time steps shown in the "recent" panel; used for both
# the slice and the panel title so the two cannot drift apart.
N_RECENT = 100

# Total precipitation for each time step (sum over all grid points).
# NOTE(review): a plain sum propagates NaN; if the store marks no-data cells
# as NaN, a NaN-aware sum would be needed — confirm against the processed data.
precip_sum = data_series_loaded.precipitation.values.sum(axis=(1, 2))
time_values = data_series_loaded.time.values

# 2x2 grid of panels for different views of the same series.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# 1. Full time series
ax_series = axes[0, 0]
ax_series.plot(time_values, precip_sum, linewidth=1, alpha=0.8)
ax_series.set_title('Full Time Series')
ax_series.set_ylabel('Total Precipitation (mm/h)')
ax_series.grid(True, alpha=0.3)

# 2. Histogram of the per-step totals
ax_hist = axes[0, 1]
ax_hist.hist(precip_sum, bins=50, alpha=0.7, edgecolor='black')
ax_hist.set_title('Distribution of Precipitation Values')
ax_hist.set_xlabel('Total Precipitation (mm/h)')
ax_hist.set_ylabel('Frequency')

# 3. Box plot grouped by calendar month
df = pd.DataFrame({
    'time': time_values,
    'precip': precip_sum
}).assign(month=lambda frame: pd.to_datetime(frame['time']).dt.month)

ax_box = axes[1, 0]
df.boxplot(column='precip', by='month', ax=ax_box)
# Set the panel title once, after boxplot(), so it replaces the title pandas
# generates for grouped box plots (the original set it twice).
ax_box.set_title('Monthly Precipitation Distribution')
ax_box.set_xlabel('Month')
ax_box.set_ylabel('Total Precipitation (mm/h)')

# 4. Most recent data (last N_RECENT points)
ax_recent = axes[1, 1]
ax_recent.plot(time_values[-N_RECENT:],
               precip_sum[-N_RECENT:],
               'o-',
               linewidth=1,
               markersize=3)
ax_recent.set_title(f'Recent Precipitation (Last {N_RECENT} Points)')
ax_recent.set_ylabel('Total Precipitation (mm/h)')
ax_recent.tick_params(axis='x', rotation=45)

# Clear the figure-level title that the grouped boxplot adds.
fig.suptitle('')
fig.tight_layout()
plt.show()