In [1]:
# Imports and settings
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks


In [5]:
# Load dataset and basic cleaning

PATH = "widr_zone_day_model_ready_2021_2023.csv"
TARGET = "waste_tons_day"

df = pd.read_csv(PATH)
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values(["zone_id", "date"]).reset_index(drop=True)

print("Rows:", len(df))
print("Columns:", len(df.columns))


Rows: 37345
Columns: 41


In [6]:
# Choose LSTM input features
# We use your feature-engineered dataset but we exclude lag/rolling features
# because the LSTM learns temporal memory from sequences

exclude = {"zone_id", "zone_name", "date", TARGET}

feature_cols = [c for c in df.columns if c not in exclude]

# Remove lag and rolling features for a clean LSTM setup
feature_cols = [
    c for c in feature_cols
    if not (c.startswith("lag_") or c.startswith("roll_"))
]

print("Num features used:", len(feature_cols))
print("Sample features:", feature_cols[:25])


Num features used: 37
Sample features: ['population', 'waste_baseline_tons', 'dow', 'month', 'is_umuganda', 'event_count_cal', 'event_intensity_cal', 'rain_mm', 'temp_c', 'humidity', 'ndvi', 'season_month', 'season_week', 'event_factor', 'weather_factor', 'ndvi_factor', 'year', 'day', 'is_weekend', 'weekofyear', 'month_sin', 'month_cos', 'dow_sin', 'dow_cos', 'waste_tons_day_lag1']


In [7]:
# Time-based split to avoid leakage

train_df = df[df["date"] < "2023-01-01"].copy()
test_df  = df[df["date"] >= "2023-01-01"].copy()

val_cutoff = "2022-10-01"
tr_df = train_df[train_df["date"] < val_cutoff].copy()
va_df = train_df[train_df["date"] >= val_cutoff].copy()

print("Train rows:", len(tr_df))
print("Val rows:", len(va_df))
print("Test rows:", len(test_df))


Train rows: 21350
Val rows: 3220
Test rows: 12775


In [10]:
# Scale features

scaler = StandardScaler()
scaler.fit(tr_df[feature_cols])

def scale_block(block: pd.DataFrame) -> pd.DataFrame:
    out = block.copy()
    out[feature_cols] = scaler.transform(out[feature_cols])
    return out

tr_df = scale_block(tr_df)
va_df = scale_block(va_df)
test_df = scale_block(test_df)


# Build sequences per zone (sliding window)
# Predict next-day waste from the previous SEQ_LEN days of inputs

def make_sequences(panel_df: pd.DataFrame, feature_cols, target_col: str, seq_len: int = 28):
    X_list, y_list, meta_list = [], [], []

    for zid, g in panel_df.groupby("zone_id", sort=False):
        g = g.sort_values("date")

        Xg = g[feature_cols].to_numpy(dtype=np.float32)
        yg = g[target_col].to_numpy(dtype=np.float32)

        dates = g["date"].to_numpy()
        zname = g["zone_name"].to_numpy() if "zone_name" in g.columns else np.array([None] * len(g))

        for i in range(seq_len, len(g)):
            X_list.append(Xg[i - seq_len:i, :])
            y_list.append(yg[i])
            meta_list.append((dates[i], zid, zname[i]))

    X = np.stack(X_list) if X_list else np.empty((0, seq_len, len(feature_cols)), dtype=np.float32)
    y = np.array(y_list, dtype=np.float32)
    meta = pd.DataFrame(meta_list, columns=["date", "zone_id", "zone_name"])

    return X, y, meta

SEQ_LEN = 28

X_train, y_train, meta_train = make_sequences(tr_df, feature_cols, TARGET, SEQ_LEN)
X_val, y_val, meta_val       = make_sequences(va_df, feature_cols, TARGET, SEQ_LEN)
X_test, y_test, meta_test    = make_sequences(test_df, feature_cols, TARGET, SEQ_LEN)


In [11]:
# Define the LSTM model

n_features = X_train.shape[-1]

model = models.Sequential([
    layers.Input(shape=(SEQ_LEN, n_features)),
    layers.LSTM(64, return_sequences=True),
    layers.Dropout(0.2),
    layers.LSTM(32),
    layers.Dropout(0.2),
    layers.Dense(32, activation="relu"),
    layers.Dense(1)
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss="mse",
    metrics=[tf.keras.metrics.MeanAbsoluteError(name="mae")]
)

model.summary()


# Train with early stopping

cb = [
    callbacks.EarlyStopping(monitor="val_loss", patience=8, restore_best_weights=True),
    callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6)
]

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=60,
    batch_size=256,
    callbacks=cb,
    verbose=1
)


In [12]:
# Evaluate and compute metrics (MAE, RMSE, MAPE)

def rmse(y_true, y_pred):
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

def mae(y_true, y_pred):
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)
    return float(np.mean(np.abs(y_true - y_pred)))

def mape(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)
    denom = np.maximum(np.abs(y_true), eps)
    return float(np.mean(np.abs((y_true - y_pred) / denom)) * 100.0)

y_pred_test = model.predict(X_test, verbose=0).reshape(-1)

print("Test MAE :", mae(y_test, y_pred_test))
print("Test RMSE:", rmse(y_test, y_pred_test))
print("Test MAPE:", mape(y_test, y_pred_test), "%")




Test MAE : 30.208993911743164
Test RMSE: 34.71136474609375
Test MAPE: 101.45325469970703 %


In [13]:
# Export predictions in the format you need for routing
# date, zone_id, zone_name, pred_lstm

pred_out = meta_test.copy()
pred_out["pred_lstm"] = y_pred_test
pred_out = pred_out.sort_values(["date", "zone_id"]).reset_index(drop=True)

pred_out.to_csv("forecast_zone_day_lstm.csv", index=False)
pred_out.head()

# Save model and scaler info

model.save("widr_lstm_model.keras")

# Save the feature list used
pd.Series(feature_cols).to_csv("lstm_feature_cols.csv", index=False, header=["feature"])

print("Saved: widr_lstm_model.keras")
print("Saved: forecast_zone_day_lstm.csv")
print("Saved: lstm_feature_cols.csv")


Saved: widr_lstm_model.keras
Saved: forecast_zone_day_lstm.csv
Saved: lstm_feature_cols.csv
