In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from sklearn.metrics import r2_score

# ----------------- CONFIG -----------------
# Source table read by load_exports_univariate().
PARQUET_FILE = "Exports-by-branches-of-processing-and-countries-2015-2025.parquet"

# Univariate exports-only training settings
VAL_FRACTION = 0.2      # share of (window, target) pairs held out at the end of the series
MIN_SERIES_LEN = 40     # series with fewer observations are skipped
MIN_NONZERO = 12        # series with fewer non-zero observations are skipped
EPOCHS = 80             # upper bound on epochs passed to model.fit
BATCH_SIZE = 1

# Hyperparameter grid: every lookback is combined with every architecture.
LOOKBACK_OPTIONS = [6, 9, 12, 18]

# "layers" lists the LSTM unit counts, one entry per stacked LSTM layer.
ARCH_CONFIGS = [
    dict(name="1x64", layers=[64]),
    dict(name="1x128", layers=[128]),
    dict(name="2x64_32", layers=[64, 32]),
    dict(name="2x128_64", layers=[128, 64]),
]

# (country, branch) pairs to model
TOP_CATEGORIES = [
    ("Sweden", "00 Whole fish, fresh, chilled or on ice"),
]

# Seeds NumPy's global RNG.
# NOTE(review): no TensorFlow seed is set in this file, so Keras weight
# initialisation is presumably still non-deterministic -- confirm.
np.random.seed(42)

# ============================================================
#                       HELPERS
# ============================================================
def make_sequences(X, y, lookback):
    """Cut aligned series into sliding windows with next-step targets.

    Window ``k`` is ``X[k:k + lookback]`` and its target is
    ``y[k + lookback]``. ``len(X) - lookback`` pairs are produced; when that
    count is not positive both returned arrays are empty.

    Returns:
        Tuple ``(windows, targets)`` of NumPy arrays.
    """
    starts = range(len(X) - lookback)
    windows = [X[k:k + lookback] for k in starts]
    targets = [y[k + lookback] for k in starts]
    return np.array(windows), np.array(targets)

def calculate_wape(y_true, y_pred):
    """Return the weighted absolute percentage error (WAPE) in percent.

    WAPE = sum(|y_true - y_pred|) / sum(|y_true|) * 100.

    Args:
        y_true: array-like of actual values.
        y_pred: array-like of predicted values.

    Returns:
        The WAPE as a float, or ``nan`` when ``sum(|y_true|)`` is zero.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Same number of elements but different shapes (e.g. (n,) targets against
    # (n, 1) model output) would broadcast to an n-by-n matrix and inflate the
    # error; compare element by element instead.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_true = y_true.ravel()
        y_pred = y_pred.ravel()

    denom = np.sum(np.abs(y_true))
    if denom == 0:
        return np.nan
    return np.sum(np.abs(y_true - y_pred)) / denom * 100.0

def build_lstm_model_simple(input_shape, layer_sizes):
    """
    Assemble and compile the univariate LSTM regressor.

    Layer stack, in order:
      - Input of shape ``input_shape``
      - one LSTM per entry of ``layer_sizes`` (L2 1e-4 on the kernel,
        L2 1e-5 on the recurrent weights), each followed by Dropout(0.2);
        every LSTM except the last one returns the full sequence
      - Dense(32, relu) with L2 1e-4, then Dense(1)

    The model is compiled with Adam (learning rate 3e-4) and MSE loss.
    """
    net = Sequential()
    net.add(Input(shape=input_shape))

    final_idx = len(layer_sizes) - 1
    for idx, n_units in enumerate(layer_sizes):
        recurrent = LSTM(
            n_units,
            # Only the last LSTM collapses the sequence to a single vector.
            return_sequences=(idx != final_idx),
            kernel_regularizer=l2(1e-4),
            recurrent_regularizer=l2(1e-5),
        )
        net.add(recurrent)
        net.add(Dropout(0.2))

    net.add(Dense(32, activation="relu", kernel_regularizer=l2(1e-4)))
    net.add(Dense(1))

    net.compile(optimizer=Adam(learning_rate=3e-4), loss="mse")
    return net

def train_univariate_lstm_for_series(
    series_values,
    lookback,
    layer_sizes
):
    """
    Train one univariate LSTM on a single series and score it on a
    chronological hold-out.

    series_values: 1D array-like of FOB values (already sorted by date)
    lookback: int, number of past observations fed to the model per prediction
    layer_sizes: list of LSTM units, e.g. [64] or [64, 32]

    The last VAL_FRACTION of the (window, target) pairs form the validation
    split. The scaling mean/std are computed from the training span of the
    series only, so the validation targets do not influence preprocessing.

    Returns None when the series is skipped (shorter than MIN_SERIES_LEN,
    contains non-finite values, fewer than MIN_NONZERO non-zero values,
    constant, fewer than 10 windows, or a degenerate split). Otherwise a dict:
      model        fitted Keras model
      r2, wape     validation metrics on the original (un-normalized) scale
      y_mean/y_std scaling statistics (training span only)
      n_points     length of the input series
      y_true_val / y_pred_val  1D validation actuals / predictions
      val_indices  positions in series_values of the validation targets
    """
    y = np.asarray(series_values, dtype=float)

    if len(y) < MIN_SERIES_LEN:
        return None

    # NaN/inf would propagate through scaling and training.
    if not np.all(np.isfinite(y)):
        return None

    if np.count_nonzero(y) < MIN_NONZERO:
        return None

    if np.allclose(y, y[0]):
        return None

    # One (window, target) pair per position that has `lookback` predecessors.
    n_total = len(y) - lookback
    if n_total < 10:
        return None

    n_train = int(np.floor(n_total * (1.0 - VAL_FRACTION)))
    if n_train < 1 or n_train >= n_total:
        return None

    # Training pairs only touch y[:n_train + lookback]; fit the scaler there
    # so that no validation target leaks into the normalization.
    train_span = y[:n_train + lookback]
    y_mean = train_span.mean()
    y_std = train_span.std()
    if y_std == 0:
        return None

    # Features = just y itself, so inputs and targets share one scaled column.
    scaled = ((y - y_mean) / y_std).reshape(-1, 1)
    X_seq, y_seq = make_sequences(scaled, scaled, lookback)

    # Map sequence indexes back to original time indexes:
    # sequence i predicts original y[i + lookback]
    target_indices = np.arange(len(y))[lookback:]
    val_indices = target_indices[n_train:]

    X_train, X_val = X_seq[:n_train], X_seq[n_train:]
    y_train, y_val = y_seq[:n_train], y_seq[n_train:]

    model = build_lstm_model_simple(
        input_shape=(lookback, X_seq.shape[2]),
        layer_sizes=layer_sizes,
    )

    # NOTE(review): the validation split drives early stopping and is also
    # the split the returned metrics are computed on.
    callbacks = [
        EarlyStopping(
            monitor="val_loss",
            patience=10,
            restore_best_weights=True,
            verbose=0
        )
    ]

    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=callbacks,
        verbose=0
    )

    # Predict and un-normalize
    y_pred_n = model.predict(X_val, verbose=0)
    y_true = ((y_val * y_std) + y_mean).ravel()
    y_pred = ((y_pred_n * y_std) + y_mean).ravel()

    r2   = r2_score(y_true, y_pred)
    wape = calculate_wape(y_true, y_pred)

    return {
        "model":        model,
        "r2":           float(r2),
        "wape":         float(wape),
        "y_mean":       float(y_mean),
        "y_std":        float(y_std),
        "n_points":     int(len(y)),
        "y_true_val":   y_true,
        "y_pred_val":   y_pred,
        "val_indices":  val_indices,
    }

# ============================================================
#                       DATA LOADING
# ============================================================
def load_exports_univariate(country_filter=None):
    """
    Read PARQUET_FILE and return monthly FOB totals per (Country, Branches).

    country_filter: None for all countries, a list/tuple/set of country
    names, or a single value compared for equality against the Country column.

    Returns a DataFrame with columns: Country, Branches, date, value.
    """
    raw = pd.read_parquet(PARQUET_FILE)
    print("Parquet columns:", raw.columns)

    # Clean: non-breaking spaces become plain spaces, then trim.
    country_text = raw["Country"].astype(str)
    raw["Country"] = country_text.str.replace("\xa0", " ", regex=False).str.strip()
    raw["Branches"] = raw["Branches"].astype(str).str.strip()

    # Month -> datetime: replace "M" with "-" and append "-01"; rows that do
    # not parse as %Y-%m-%d are dropped.
    month_start = raw["Month"].astype(str).str.replace("M", "-") + "-01"
    raw["date"] = pd.to_datetime(month_start, format="%Y-%m-%d", errors="coerce")
    dated = raw.dropna(subset=["date"])

    # Keep FOB value only, with numeric DATA
    fob = dated[dated["Unit"] == "Fob value"].copy()
    fob["DATA"] = pd.to_numeric(fob["DATA"], errors="coerce")
    fob = fob.dropna(subset=["DATA"])

    # Country filter
    if country_filter is not None:
        if isinstance(country_filter, (list, tuple, set)):
            keep = fob["Country"].isin(country_filter)
        else:
            keep = fob["Country"] == country_filter
        fob = fob[keep]

    # Aggregate to one row per (Country, Branches, date)
    grouped = fob.groupby(["Country", "Branches", "date"], as_index=False)
    totals = grouped["DATA"].sum().rename(columns={"DATA": "value"})

    print(f"\nTotal rows after filtering: {len(totals)}")
    print("Countries present:", sorted(totals["Country"].unique()))
    print("Example rows:")
    print(totals.head())

    return totals

# ============================================================
#       HYPERPARAM SEARCH ON SIMPLE UNIVARIATE MODEL
# ============================================================
def evaluate_univariate_hyperparams_on_categories(
    categories,
    exports_df,
    lookback_options,
    arch_configs
):
    """
    For each (country, branch) and each (lookback, arch),
    train univariate LSTM and record metrics. Then pick the best
    config (by R²) per category.

    categories: iterable of (country, branch) pairs
    exports_df: DataFrame with columns Country, Branches, date, value
    lookback_options: iterable of lookback lengths to try
    arch_configs: iterable of dicts with keys "name" and "layers"

    Returns (all_df, best_per_cat): one row per trained config, and the
    complete highest-R² row per (country, branch), ordered by those keys.
    Two empty DataFrames are returned when nothing could be trained.
    """
    all_results = []

    for country, branch in categories:
        print(f"\n===== Category: {country} - {branch} =====")

        series_df = exports_df[
            (exports_df["Country"] == country) &
            (exports_df["Branches"] == branch)
        ].copy()

        if series_df.empty:
            print("  No export data found; skipping.")
            continue

        series_df = series_df.sort_values("date")
        y_series = series_df["value"].values

        best_for_cat = None

        for lookback in lookback_options:
            for arch in arch_configs:
                arch_name = arch["name"]
                layer_sizes = arch["layers"]

                info = train_univariate_lstm_for_series(
                    series_values=y_series,
                    lookback=lookback,
                    layer_sizes=layer_sizes
                )

                if info is None:
                    print(f"  lookback={lookback}, arch={arch_name}: skipped")
                    continue

                print(
                    f"  lookback={lookback}, arch={arch_name}: "
                    f"R²={info['r2']:.3f}, WAPE={info['wape']:.2f}%, n={info['n_points']}"
                )

                row = {
                    "country":   country,
                    "branch":    branch,
                    "lookback":  lookback,
                    "arch":      arch_name,
                    "layers":    str(layer_sizes),
                    "n_points":  info["n_points"],
                    "r2":        info["r2"],
                    "wape":      info["wape"],
                }
                all_results.append(row)

                # A NaN incumbent can never lose a ">" comparison, so replace
                # it explicitly; otherwise the first of equal scores wins.
                if (
                    best_for_cat is None
                    or np.isnan(best_for_cat["r2"])
                    or info["r2"] > best_for_cat["r2"]
                ):
                    best_for_cat = row

        if best_for_cat is not None:
            print(
                f"  >>> Best for {country} - {branch}: "
                f"lookback={best_for_cat['lookback']}, arch={best_for_cat['arch']}, "
                f"R²={best_for_cat['r2']:.3f}, WAPE={best_for_cat['wape']:.2f}%"
            )
        else:
            print("  No valid model for this category.")

    if not all_results:
        print("\nNo models trained successfully.")
        return pd.DataFrame(), pd.DataFrame()

    all_df = pd.DataFrame(all_results)

    # Stable sort: configs with equal R² keep their training order, matching
    # the "first wins" rule used in the loop above. NaN R² sorts last.
    ranked = all_df.sort_values("r2", ascending=False, kind="stable")

    print("\n===== ALL CONFIGS (TOP 20 BY R²) =====")
    print(ranked.head(20).to_string(index=False))

    # Keep the whole top-ranked row per category. groupby(...).first() is not
    # used because it takes the first non-null value column by column, which
    # would pair a NaN-WAPE winner with another config's WAPE.
    best_per_cat = (
        ranked.drop_duplicates(subset=["country", "branch"], keep="first")
              .sort_values(["country", "branch"])
              .reset_index(drop=True)
    )

    print("\n===== BEST CONFIG PER CATEGORY =====")
    print(best_per_cat.to_string(index=False))

    return all_df, best_per_cat

# ============================================================
#                 PLOTTING BEST CONFIGS
# ============================================================
def plot_best_configs(best_per_cat_df, exports_df):
    """
    Re-train each winning configuration once and plot Actual vs Predicted
    over its validation window.

    best_per_cat_df: rows with columns country, branch, lookback, arch
    exports_df: DataFrame with columns Country, Branches, date, value

    The model is fitted again here, so the R²/WAPE shown in the title come
    from this new fit.
    """
    # arch name -> LSTM layer sizes, taken from the module-level ARCH_CONFIGS
    layers_by_arch = {cfg["name"]: cfg["layers"] for cfg in ARCH_CONFIGS}

    for _, best in best_per_cat_df.iterrows():
        country, branch = best["country"], best["branch"]
        lookback = int(best["lookback"])
        arch_name = best["arch"]
        layer_sizes = layers_by_arch[arch_name]

        print(f"\nPlotting {country} - {branch} (lookback={lookback}, arch={arch_name})")

        in_category = (
            (exports_df["Country"] == country)
            & (exports_df["Branches"] == branch)
        )
        history = exports_df[in_category].copy().sort_values("date")

        if history.empty:
            print("  No data for this category, skipping plot.")
            continue

        fit = train_univariate_lstm_for_series(
            series_values=history["value"].values,
            lookback=lookback,
            layer_sizes=layer_sizes
        )

        if fit is None:
            print("  Could not train model for plotting; skipping.")
            continue

        # val_indices are positions in the date-sorted series.
        val_dates = history["date"].values[fit["val_indices"]]
        heading = (
            f"{country} - {branch}\n"
            f"lookback={lookback}, arch={arch_name}, "
            f"R²={fit['r2']:.3f}, WAPE={fit['wape']:.2f}%"
        )

        plt.figure(figsize=(10, 4))
        plt.plot(val_dates, fit["y_true_val"], label="Actual", linewidth=2)
        plt.plot(val_dates, fit["y_pred_val"], label="Predicted", linestyle="--")
        plt.title(heading)
        plt.xlabel("Date")
        plt.ylabel("Fob value")
        plt.grid(True, alpha=0.3)
        plt.legend()
        plt.tight_layout()
        plt.show()

# ============================================================
#                       MAIN
# ============================================================
if __name__ == "__main__":
    # Load only the countries that appear in TOP_CATEGORIES.
    selected_countries = sorted(set(country for country, _ in TOP_CATEGORIES))
    exports_df = load_exports_univariate(country_filter=selected_countries)

    # Grid-search lookbacks x architectures for every category.
    all_configs_df, best_per_cat_df = evaluate_univariate_hyperparams_on_categories(
        categories=TOP_CATEGORIES,
        exports_df=exports_df,
        lookback_options=LOOKBACK_OPTIONS,
        arch_configs=ARCH_CONFIGS,
    )

    # Plot actual vs predicted for each best config (skipped when nothing trained).
    if not best_per_cat_df.empty:
        plot_best_configs(best_per_cat_df, exports_df)


Parquet columns: Index(['Branches', 'Country', 'Month', 'Unit', 'DATA'], dtype='object')

Total rows after filtering: 6096
Countries present: ['Sweden']
Example rows:
  Country                                 Branches       date    value
0  Sweden  00 Whole fish, fresh, chilled or on ice 2015-01-01  24.5692
1  Sweden  00 Whole fish, fresh, chilled or on ice 2015-02-01  23.7582
2  Sweden  00 Whole fish, fresh, chilled or on ice 2015-03-01  48.4907
3  Sweden  00 Whole fish, fresh, chilled or on ice 2015-04-01  22.6574
4  Sweden  00 Whole fish, fresh, chilled or on ice 2015-05-01  46.9166

===== Category: Sweden - 00 Whole fish, fresh, chilled or on ice =====
  lookback=6, arch=1x64: R²=0.771, WAPE=35.36%, n=127
  lookback=6, arch=1x128: R²=0.767, WAPE=35.94%, n=127
  lookback=6, arch=2x64_32: R²=0.750, WAPE=39.38%, n=127
  lookback=6, arch=2x128_64: R²=0.770, WAPE=38.40%, n=127
  lookback=9, arch=1x64: R²=0.605, WAPE=51.81%, n=127
  lookback=9, arch=1x128: R²=0.708, WAPE=38.98%, n=127
  

KeyboardInterrupt: 