In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import layers, models, optimizers, losses
import datetime as dt
import pymannkendall as mk
from properscoring import crps_ensemble
from dtaidistance import dtw
from statsmodels.tsa.stattools import acf

In [2]:
%run C:\Users\mvana\Documents\MSc\Temp_O3_H2O\000_Functions.ipynb

In [3]:
# Load data into tf.data.Dataset
dataset = tf.data.experimental.make_csv_dataset(
    file_pattern=r"C:\Users\mvana\Documents\MSc\Temp_O3_H2O\Data\monthly_series.csv",
    batch_size=1, num_epochs=1, shuffle=False)

In [4]:
# Collect into a dict of tensors
dataset = dataset.unbatch().batch(10_000)  # adjust size
data_dict = next(iter(dataset))

In [5]:
# Separate year_month
year_month = data_dict.pop("year_month")  # shape (N,)
timeseries = tf.stack(list(data_dict.values()), axis=1)  # (N, num_features)

In [6]:
# Fill missing values (NaN -> 0 or forward-fill)
timeseries = tf.where(tf.math.is_nan(timeseries),
                      tf.zeros_like(timeseries),
                      timeseries)

In [7]:
# Create numeric time index
time_index = tf.range(tf.shape(timeseries)[0], dtype=tf.float32)

In [8]:
# --- MinMax scaling ---
min_vals = tf.reduce_min(timeseries, axis=0)
max_vals = tf.reduce_max(timeseries, axis=0)
timeseries_scaled = (timeseries - min_vals) / (max_vals - min_vals + 1e-8)

In [9]:
SEQ_LEN = 24
PRED_LEN = 24

# Build the targets explicitly as sliding windows of length PRED_LEN
targets = np.array([
    timeseries_scaled[i+SEQ_LEN : i+SEQ_LEN+PRED_LEN]
    for i in range(len(timeseries_scaled) - SEQ_LEN - PRED_LEN + 1)
])

# Now create the dataset
dataset = tf.keras.utils.timeseries_dataset_from_array(
    data=timeseries_scaled[:-PRED_LEN],  # input sequences
    targets=targets,                     # multi-step targets
    sequence_length=SEQ_LEN,
    sequence_stride=1,
    shuffle=True,
    batch_size=16
)

# Inspect one batch
for X_batch, y_batch in dataset.take(1):
    print("X shape:", X_batch.shape)  # (batch, SEQ_LEN, features)
    print("y shape:", y_batch.shape)  # (batch, PRED_LEN, features) or (batch, PRED_LEN)

X shape: (16, 24, 1)
y shape: (16, 24, 1)


In [10]:
# Model
def build_model(seq_len, pred_len):
    inputs = layers.Input(shape=(seq_len, 1))
    x = layers.LSTM(64, return_sequences=False, activation='relu')(inputs)
    x = layers.Dense(32, activation='relu')(x)
    outputs = layers.Dense(pred_len)(x)
    model = models.Model(inputs=inputs, outputs = outputs)
    loss_fn = combined_loss_fn(a=slope, b=intercept_scaled, lambda_mse=1.0, lambda_ode=0.5)
    model.compile(optimizer='adam', loss=loss_fn, metrics=['mae', 'mse'])

    return model

model = build_model(SEQ_LEN, PRED_LEN)

NameError: name 'slope' is not defined

In [None]:
history = model.fit(dataset, epochs=500, validation_split=0.2)

In [None]:
# Plotting accuracy
plt.figure(figsize=(20, 5))

plt.subplot(1, 3, 1)
plt.plot(history.history['mae'], label='Train MAE')
plt.plot(history.history['val_mae'], label='Val MAE')
plt.title('Model MSE')
plt.xlabel('Epoch')
plt.ylabel('MAE')
plt.legend()

plt.subplot(1, 3, 2)
plt.plot(history.history['mse'], label='Train MSE')
plt.plot(history.history['val_mse'], label='Val MSE')
plt.title('Model MSE')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.legend()

# Plotting loss
plt.subplot(1, 3, 3)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# Get the last sequence from your dataset
last_input = timeseries_scaled[-SEQ_LEN:]
# Reshape to match model input shape
last_input = last_input.reshape(1, SEQ_LEN, 1)
# Predict
forecast_scaled = model.predict(last_input)
# Unscale forecast
forecast_unscaled_raw = scaler.inverse_transform(forecast_scaled.reshape(-1, 1)).flatten()
# Shift forecast: timeseries[-1] becomes forecast_unscaled[0], forecast_unscaled_raw[0] becomes [1], etc.
forecast_unscaled = np.concatenate([timeseries.values[-1], forecast_unscaled_raw[:-1]])
# Set parameters
start_date = np.datetime64("2002-01", 'M')

date_index = np.arange(start_date, start_date + n, dtype='datetime64[M]')

# Create forecast dates: PRED_LEN months after last original date
PRED_LEN = 24  # or your actual prediction length
forecast_start = date_index[-1]# + 1
forecast_dates = np.arange(forecast_start, forecast_start + PRED_LEN, dtype='datetime64[M]')

# Combine full time range for xticks
full_time_range = np.concatenate([date_index, forecast_dates])
tick_locs = np.arange(date_index[0], full_time_range[-1] + 1, 6, dtype='datetime64[M]')

# Compute the trend line equation string
slope = result.slope
intercept = result.intercept
equation_text = f"Linear Trend: y ={slope:.4f}*t +  {intercept:.2f}"

In [None]:
# Plot
plt.figure(figsize=(15, 3), dpi=1000)
plt.plot(date_index, timeseries, marker='o', label="Original")
plt.plot(date_index, df['sen_trend'], label="Sen's Slope Trend")
plt.plot(forecast_dates, forecast_unscaled, label="Forecast", color="red")
# plt.fill_between(forecast_dates, lower_bound, upper_bound, color='red', alpha=0.2, label="95% CI")
plt.axvline(date_index[-1], color='gray', linestyle='--', linewidth=1)
plt.xticks(tick_locs, rotation=45)

# Add the equation text
plt.text(
    x=date_index[len(date_index) // 3],  # adjust index for placement
    y=max(df['ktemp']) - 0.5,     # adjust y-position as needed
    s=equation_text,
    fontsize=12,
    color='darkred',
    bbox=dict(facecolor='white', alpha=0.8, edgecolor='gray')
)

plt.legend()
# plt.title("Physics-Informed LSTM Forecast")
plt.xlabel("Date")
plt.ylabel("ktemp")
plt.grid(True)
plt.tight_layout()
plt.savefig(r"C:\Users\mvana\Documents\MSc\Temp_O3_H2O\Plots\publish\Time_Series_Forecast.png", format='png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
y_pred = model.predict(X_test.reshape(-1, SEQ_LEN, 1))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test RMSE: {rmse}")

In [None]:
print("MAE:", mean_absolute_error(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))
print("RMSE:", root_mean_squared_error(y_test, y_pred))
print("MAPE:", mean_absolute_percentage_error(y_test, y_pred))
print("sMAPE:", symmetric_mape(y_test, y_pred))
print("MASE:", mean_absolute_scaled_error(y_test, y_pred))
print("Quantile loss:", quantile_loss(y_test, y_pred, quantile=0.9))

In [None]:
print("DTW:", dtw_distance(y_test[0], y_pred[0]))
print("Pinball (q=0.9):", pinball_loss(y_test, y_pred, 0.9))
print("FSS (vs. naive):", forecast_skill_score(np.mean((y_test - y_pred)**2), 1.0))
print("NSE:", nash_sutcliffe_efficiency(y_test, y_pred))

In [None]:
# Naive baseline
baseline_pred = np.tile(X_test[:, -1].reshape(-1, 1), (1, PRED_LEN))  # last observed value repeated

# Evaluate
evaluate_forecast(y_test, y_pred, baseline_pred=baseline_pred, title="LSTM Prediction")