In [1]:
# Dataset load

from google.colab import files
uploaded = files.upload()

Saving final_climate_data_with_interpolation.csv to final_climate_data_with_interpolation.csv


In [None]:
import pandas as pd

final_data = pd.read_csv("/content/final_climate_data_with_interpolation.csv")

In [None]:
final_data.head()


Unnamed: 0,dt,CleanName,Continent,AverageTemperature,AverageTemperatureUncertainty,is_nan,gap_block,gap_size
0,1838-04-01,Afghanistan,Asia,13.008,2.586,False,1,0
1,1838-05-01,Afghanistan,Asia,18.479,,True,2,1
2,1838-06-01,Afghanistan,Asia,23.95,2.51,False,3,0
3,1838-07-01,Afghanistan,Asia,26.877,2.883,False,3,0
4,1838-08-01,Afghanistan,Asia,24.938,2.992,False,3,0


In [None]:
print(final_data.shape)
print(final_data.columns.tolist)

(533022, 8)
<bound method IndexOpsMixin.tolist of Index(['dt', 'CleanName', 'Continent', 'AverageTemperature',
       'AverageTemperatureUncertainty', 'is_nan', 'gap_block', 'gap_size'],
      dtype='object')>


# Global LSTM Model

We are starting off with normalization

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Make sure sorted
final_data["dt"] = pd.to_datetime(final_data["dt"])
final_data = final_data.sort_values(["CleanName", "dt"]).reset_index(drop=True)

Min-Max scalar standardization

In [None]:
scaler = MinMaxScaler()

final_data["TempNorm"] = scaler.fit_transform(
    final_data[["AverageTemperature"]]
)

we made look back window 36 months

In [None]:
WINDOW = 36

def build_sequences(df, window=WINDOW):
    X_list = []
    y_list = []
    meta_list = []

    for country, g in df.groupby("CleanName"):
        g = g.sort_values("dt")
        temps = g["TempNorm"].values

        # If a country has fewer than window+1 months, skip it
        if len(temps) <= window:
            continue

        for i in range(len(temps) - window):
            X_list.append(temps[i:i+window])
            y_list.append(temps[i+window])  # next month
            meta_list.append((country, g.iloc[i+window]["dt"]))

    X = np.array(X_list)
    y = np.array(y_list)
    return X, y, meta_list

X, y, meta = build_sequences(final_data, window=WINDOW)
print(X.shape, y.shape)  # (num_samples, 36), (num_samples,)

(524490, 36) (524490,)


In [None]:
X = X.reshape(X.shape[0], X.shape[1], 1)  # (N, 36, 1)

Train / Val / Test

In [None]:
n = len(X)

train_end = int(0.70 * n)
val_end   = int(0.85 * n)

X_train = X[:train_end]
y_train = y[:train_end]

X_val   = X[train_end:val_end]
y_val   = y[train_end:val_end]

X_test  = X[val_end:]
y_test  = y[val_end:]

print(X_train.shape, X_val.shape, X_test.shape)

(367143, 36, 1) (78673, 36, 1) (78674, 36, 1)


In [None]:
class ClimateDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = ClimateDataset(X_train, y_train)
val_ds   = ClimateDataset(X_val, y_val)
test_ds  = ClimateDataset(X_test, y_test)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=False)
val_loader   = DataLoader(val_ds, batch_size=64, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=64, shuffle=False)

In [None]:
class AttentionLSTM(nn.Module):
    def __init__(self, hidden_dim=64, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=1,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )

        self.attn = nn.Linear(hidden_dim, 1)
        self.fc   = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        # x: (batch, seq_len, 1)
        lstm_out, _ = self.lstm(x)

        attn_scores  = self.attn(lstm_out)
        attn_weights = torch.softmax(attn_scores, dim=1)

        # weighted sum of LSTM outputs
        context = torch.sum(attn_weights * lstm_out, dim=1)

        out = self.fc(context)
        return out.squeeze(-1)

move to gpu

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AttentionLSTM(hidden_dim=64, num_layers=1).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

In [None]:
EPOCHS = 30

for epoch in range(EPOCHS):
    # ---- TRAIN ----
    model.train()
    train_loss = 0.0

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        preds = model(X_batch)
        loss  = loss_fn(preds, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # ---- VALIDATION ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            preds = model(X_batch)
            loss  = loss_fn(preds, y_batch)
            val_loss += loss.item()

    val_loss /= len(val_loader)

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

Epoch 1/30 | Train Loss: 0.0103 | Val Loss: 0.0074
Epoch 2/30 | Train Loss: 0.0007 | Val Loss: 0.0006
Epoch 3/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 4/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 5/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 6/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 7/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 8/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 9/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 10/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 11/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 12/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 13/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 14/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 15/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 16/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 17/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 18/30 | Train Loss: 0.0006 | Val Loss: 0.0006
Epoch 19/30 | Train Loss: 0.0005 | Val Loss: 0.0006
Epoch 20/30 | Train L

# Evaluation

In [None]:
model.eval()
test_preds = []

with torch.no_grad():
    for X_batch, _ in test_loader:
        X_batch = X_batch.to(device)
        preds = model(X_batch)
        test_preds.append(preds.cpu().numpy())

test_preds = np.concatenate(test_preds)

In [None]:
mse  = mean_squared_error(y_test, test_preds)
mae  = mean_absolute_error(y_test, test_preds)
rmse = np.sqrt(mse)

print("Test RMSE (normalized):", rmse)
print("Test MAE (normalized):", mae)

Test RMSE (normalized): 0.025829872774188106
Test MAE (normalized): 0.015533677984101516


Directional Accuracy (DA) measures whether your model correctly predicts the direction of change, not the exact value.


In [None]:
actual_diff = np.sign(np.diff(y_test_orig))
pred_diff   = np.sign(np.diff(preds_orig))

directional_accuracy = (actual_diff == pred_diff).mean()
print("Directional Accuracy:", directional_accuracy)

Directional Accuracy: 0.8423601489710574


In [None]:
final_data

Unnamed: 0,dt,CleanName,Continent,AverageTemperature,AverageTemperatureUncertainty,is_nan,gap_block,gap_size,TempNorm
0,1838-04-01,Afghanistan,Asia,13.008,2.586,False,1,0,0.662301
1,1838-05-01,Afghanistan,Asia,18.479,,True,2,1,0.733817
2,1838-06-01,Afghanistan,Asia,23.950,2.510,False,3,0,0.805333
3,1838-07-01,Afghanistan,Asia,26.877,2.883,False,3,0,0.843595
4,1838-08-01,Afghanistan,Asia,24.938,2.992,False,3,0,0.818248
...,...,...,...,...,...,...,...,...,...
533017,2013-05-01,Åland,Europe,10.327,0.612,False,3697,0,0.627255
533018,2013-06-01,Åland,Europe,14.068,0.423,False,3697,0,0.676157
533019,2013-07-01,Åland,Europe,16.447,0.483,False,3697,0,0.707255
533020,2013-08-01,Åland,Europe,16.425,0.378,False,3697,0,0.706967


# Continent Models

In [None]:
continents = [
    "Europe",
    "Asia",
    "Africa",
    "North America",
    "Oceania",
    "South America"
    # Excluding Antarctica
]

In [None]:
def train_lstm_continent(df, continent_name, window=36, epochs=30):
    print(f"\n\n===== Training model for {continent_name} =====")

    # ------------------------
    # 1. Subset for continent
    # ------------------------
    df_sub = df[df["Continent"] == continent_name].copy()
    df_sub = df_sub.sort_values(["CleanName", "dt"])

    if df_sub.empty:
        print(f"No data for {continent_name}")
        return None

    # ------------------------
    # 2. Normalize
    # ------------------------
    scaler = MinMaxScaler()
    df_sub["TempNorm"] = scaler.fit_transform(df_sub[["AverageTemperature"]])

    # ------------------------
    # 3. Build sequences
    # ------------------------
    X, y, meta = build_sequences(df_sub, window)
    X = X.reshape(X.shape[0], X.shape[1], 1)

    if len(X) < 500:
        print(f"Warning: only {len(X)} samples for {continent_name}.")

    # ------------------------
    # 4. Train/Val/Test split
    # ------------------------
    n = len(X)
    train_end = int(0.7 * n)
    val_end = int(0.85 * n)

    X_train, y_train = X[:train_end], y[:train_end]
    X_val,   y_val   = X[train_end:val_end], y[train_end:val_end]
    X_test,  y_test  = X[val_end:], y[val_end:]

    # PyTorch datasets
    train_ds = ClimateDataset(X_train, y_train)
    val_ds   = ClimateDataset(X_val, y_val)
    test_ds  = ClimateDataset(X_test, y_test)

    train_loader = DataLoader(train_ds, batch_size=64, shuffle=False)
    val_loader   = DataLoader(val_ds, batch_size=64, shuffle=False)
    test_loader  = DataLoader(test_ds, batch_size=64, shuffle=False)

    # ------------------------
    # 5. Build model
    # ------------------------
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AttentionLSTM(hidden_dim=64).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    # ------------------------
    # 6. Training loop
    # ------------------------
    for epoch in range(epochs):
        model.train()
        train_loss = 0

        for Xb, yb in train_loader:
            Xb, yb = Xb.to(device), yb.to(device)

            optimizer.zero_grad()
            preds = model(Xb)
            loss = loss_fn(preds, yb)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        # Validation loss
        model.eval()
        val_loss = 0

        with torch.no_grad():
            for Xb, yb in val_loader:
                Xb, yb = Xb.to(device), yb.to(device)
                preds = model(Xb)
                loss = loss_fn(preds, yb)
                val_loss += loss.item()

        val_loss /= len(val_loader)

        print(f"Epoch {epoch+1}/{epochs} | Train {train_loss:.4f} | Val {val_loss:.4f}")

    # ------------------------
    # 7. Evaluation
    # ------------------------
    model.eval()
    preds = []

    with torch.no_grad():
        for Xb, _ in test_loader:
            Xb = Xb.to(device)
            preds.append(model(Xb).cpu().numpy())

    preds = np.concatenate(preds)

    # Denormalize
    y_test_orig = scaler.inverse_transform(y_test.reshape(-1,1)).flatten()
    preds_orig  = scaler.inverse_transform(preds.reshape(-1,1)).flatten()

    # Metrics
    mse  = mean_squared_error(y_test_orig, preds_orig)
    mae  = mean_absolute_error(y_test_orig, preds_orig)
    rmse = np.sqrt(mse)

    # Directional Accuracy
    actual_diff = np.sign(np.diff(y_test_orig))
    pred_diff   = np.sign(np.diff(preds_orig))
    directional_acc = (actual_diff == pred_diff).mean()

    print(f"{continent_name} → RMSE: {rmse:.3f}, MAE: {mae:.3f}, DA: {directional_acc:.3f}")

    return {
        "continent": continent_name,
        "rmse": rmse,
        "mae": mae,
        "directional_accuracy": directional_acc
    }

In [None]:
results = []

for cont in continents:
    res = train_lstm_continent(final_data, cont, window=36, epochs=30)
    if res:
        results.append(res)

pd.DataFrame(results)



===== Training model for Europe =====
Epoch 1/30 | Train 0.0157 | Val 0.0109
Epoch 2/30 | Train 0.0019 | Val 0.0044
Epoch 3/30 | Train 0.0019 | Val 0.0063
Epoch 4/30 | Train 0.0019 | Val 0.0078
Epoch 5/30 | Train 0.0019 | Val 0.0068
Epoch 6/30 | Train 0.0018 | Val 0.0060
Epoch 7/30 | Train 0.0018 | Val 0.0060
Epoch 8/30 | Train 0.0018 | Val 0.0065
Epoch 9/30 | Train 0.0018 | Val 0.0076
Epoch 10/30 | Train 0.0018 | Val 0.0089
Epoch 11/30 | Train 0.0018 | Val 0.0097
Epoch 12/30 | Train 0.0019 | Val 0.0098
Epoch 13/30 | Train 0.0019 | Val 0.0097
Epoch 14/30 | Train 0.0019 | Val 0.0093
Epoch 15/30 | Train 0.0019 | Val 0.0089
Epoch 16/30 | Train 0.0019 | Val 0.0086
Epoch 17/30 | Train 0.0018 | Val 0.0083
Epoch 18/30 | Train 0.0018 | Val 0.0081
Epoch 19/30 | Train 0.0018 | Val 0.0079
Epoch 20/30 | Train 0.0018 | Val 0.0078
Epoch 21/30 | Train 0.0018 | Val 0.0077
Epoch 22/30 | Train 0.0018 | Val 0.0076
Epoch 23/30 | Train 0.0018 | Val 0.0075
Epoch 24/30 | Train 0.0018 | Val 0.0075
Epoch 25/

Unnamed: 0,continent,rmse,mae,directional_accuracy
0,Europe,4.391178,3.693739,0.869093
1,Asia,2.036672,1.249834,0.885417
2,Africa,1.204461,0.839264,0.830061
3,North America,1.41378,0.735268,0.84163
4,Oceania,0.6802,0.489724,0.651606
5,South America,1.082204,0.722365,0.792405


# LSTM with month sin/month cos, lag, and roll

adding month_sin and month_cos

month_sin and month_cos encode the month of the year as a smooth, continuous cycle so the model can learn seasonality and understand that December and January are close together.

In [None]:
final_data["month"] = final_data["dt"].dt.month

final_data["month_sin"] = np.sin(2 * np.pi * final_data["month"] / 12)
final_data["month_cos"] = np.cos(2 * np.pi * final_data["month"] / 12)

adding lag features

Lag features give the model the temperature from previous years (12 or 24 months ago) so it can learn long-term seasonal patterns and year-to-year trends.

In [None]:
final_data = final_data.sort_values(["CleanName", "dt"])

final_data["lag12"] = final_data.groupby("CleanName")["AverageTemperature"].shift(12)
final_data["lag24"] = final_data.groupby("CleanName")["AverageTemperature"].shift(24)

adding rolling average


A rolling average smooths the data by replacing each month’s temperature with the average of that month and the previous 2 months, capturing short-term trends and reducing noise.

In [None]:
final_data["roll3"] = (
    final_data.groupby("CleanName")["AverageTemperature"]
    .rolling(window=3, min_periods=1)
    .mean()
    .reset_index(level=0, drop=True)
)

scaling
This code uses MinMaxScaler to scale your temperature features (TempNorm, lag12, lag24, roll3) into a 0–1 range so all features are on the same scale and the LSTM can train more smoothly.

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler_main = MinMaxScaler()
final_data["TempNorm"] = scaler_main.fit_transform(final_data[["AverageTemperature"]])

scaler_extra = MinMaxScaler()
final_data[["lag12", "lag24", "roll3"]] = scaler_extra.fit_transform(
    final_data[["lag12", "lag24", "roll3"]]
)

we need to remvove the NANs for the lag values

NaNs happen because lag features shift the data backward, so the first 12–24 months have no earlier values to pull from (e.g., you can’t get “12 months ago” for the first year), leaving those rows empty.

In [None]:
final_data_clean = final_data.dropna(subset=[
    "TempNorm",
    "month_sin", "month_cos",
    "lag12", "lag24",
    "roll3"
]).copy()

In [None]:
final_data[["lag12", "lag24"]].isna().sum()

Unnamed: 0,0
lag12,2844
lag24,5688


updated model

In [None]:
feature_cols = ["TempNorm", "month_sin", "month_cos", "lag12", "lag24", "roll3"]

def build_sequences_multifeat(df, window=36):
    X_list = []
    y_list = []
    meta_list = []

    for country, g in df.groupby("CleanName"):
        g = g.sort_values("dt")

        feats = g[feature_cols].values
        targets = g["TempNorm"].values

        if len(feats) <= window:
            continue

        for i in range(len(feats) - window):
            X_list.append(feats[i:i+window])
            y_list.append(targets[i+window])
            meta_list.append((country, g.iloc[i+window]["dt"]))

    return np.array(X_list), np.array(y_list), meta_list

In [None]:
X, y, meta = build_sequences_multifeat(final_data_clean)
print(X.shape)

(518802, 36, 6)


In [None]:
print("X contains NaNs?", np.isnan(X).any())
print("y contains NaNs?", np.isnan(y).any())

X contains NaNs? False
y contains NaNs? False


In [None]:
n = len(X)

train_end = int(0.70 * n)
val_end   = int(0.85 * n)

X_train = X[:train_end]
y_train = y[:train_end]

X_val = X[train_end:val_end]
y_val = y[train_end:val_end]

X_test = X[val_end:]
y_test = y[val_end:]

In [None]:
train_ds = ClimateDataset(X_train, y_train)
val_ds   = ClimateDataset(X_val, y_val)
test_ds  = ClimateDataset(X_test, y_test)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=False)
val_loader   = DataLoader(val_ds, batch_size=64, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=64, shuffle=False)

defining LSTM + attention with input_size = 6

In [None]:
class AttentionLSTM(nn.Module):
    def __init__(self, hidden_dim=64, num_layers=1):
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=6,           # IMPORTANT
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True
        )

        self.attn = nn.Linear(hidden_dim, 1)
        self.fc   = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)           # (batch, seq_len, hidden_dim)

        attn_scores = self.attn(lstm_out)    # (batch, seq_len, 1)
        attn_weights = torch.softmax(attn_scores, dim=1)

        context = torch.sum(attn_weights * lstm_out, dim=1)

        out = self.fc(context)
        return out.squeeze(-1)

# Gradient clipping

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AttentionLSTM(hidden_dim=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
loss_fn = nn.MSELoss()

EPOCHS = 30

for epoch in range(EPOCHS):
    # ---- TRAIN ----
    model.train()
    train_loss = 0

    for Xb, yb in train_loader:
        Xb, yb = Xb.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(Xb)
        loss = loss_fn(preds, yb)
        loss.backward()

        # prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)

        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # ---- VALIDATION ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for Xb, yb in val_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            preds = model(Xb)
            val_loss += loss_fn(preds, yb).item()

    val_loss /= len(val_loader)

    print(f"Epoch {epoch+1}/{EPOCHS} | Train {train_loss:.4f} | Val {val_loss:.4f}")

Epoch 1/30 | Train 0.0050 | Val 0.0080
Epoch 2/30 | Train 0.0009 | Val 0.0005
Epoch 3/30 | Train 0.0004 | Val 0.0004
Epoch 4/30 | Train 0.0004 | Val 0.0004
Epoch 5/30 | Train 0.0004 | Val 0.0004
Epoch 6/30 | Train 0.0004 | Val 0.0004
Epoch 7/30 | Train 0.0004 | Val 0.0004
Epoch 8/30 | Train 0.0004 | Val 0.0004
Epoch 9/30 | Train 0.0004 | Val 0.0004
Epoch 10/30 | Train 0.0004 | Val 0.0004
Epoch 11/30 | Train 0.0004 | Val 0.0004
Epoch 12/30 | Train 0.0004 | Val 0.0004
Epoch 13/30 | Train 0.0004 | Val 0.0004
Epoch 14/30 | Train 0.0004 | Val 0.0004
Epoch 15/30 | Train 0.0004 | Val 0.0004
Epoch 16/30 | Train 0.0004 | Val 0.0004
Epoch 17/30 | Train 0.0004 | Val 0.0004
Epoch 18/30 | Train 0.0004 | Val 0.0004
Epoch 19/30 | Train 0.0004 | Val 0.0004
Epoch 20/30 | Train 0.0004 | Val 0.0003
Epoch 21/30 | Train 0.0004 | Val 0.0004
Epoch 22/30 | Train 0.0004 | Val 0.0004
Epoch 23/30 | Train 0.0004 | Val 0.0004
Epoch 24/30 | Train 0.0003 | Val 0.0004
Epoch 25/30 | Train 0.0003 | Val 0.0003
Epoch 26/

Evaluation

In [None]:
model.eval()
test_preds = []

with torch.no_grad():
    for Xb, _ in test_loader:
        Xb = Xb.to(device)
        test_preds.append(model(Xb).cpu().numpy())

test_preds = np.concatenate(test_preds)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np


rmse = np.sqrt(mean_squared_error(y_test, test_preds))
mae  = mean_absolute_error(y_test, test_preds)

# ---- Directional Accuracy ----
actual_diff = np.sign(np.diff(y_test))
pred_diff   = np.sign(np.diff(test_preds))
directional_acc = (actual_diff == pred_diff).mean()

# ---- Print Results ----
print("RMSE (norm):", rmse)
print("MAE (norm):", mae)
print("Directional Accuracy:", directional_acc)

RMSE (norm): 0.020401446259499367
MAE (norm): 0.01328609722683089
Directional Accuracy: 0.8753919300950912


# continent-specific models

In [None]:
def train_lstm_continent_multifeat(df, continent_name, window=36, epochs=30):
    print(f"\n\n===== Training model for {continent_name} =====")

    # ------------------------
    # 1. Subset for continent
    # ------------------------
    df_sub = df[df["Continent"] == continent_name].copy()
    df_sub = df_sub.sort_values(["CleanName", "dt"])

    if df_sub.empty:
        print(f"No data for {continent_name}")
        return None

    # ------------------------
    # 2. Normalize temperature only
    # ------------------------
    scaler = MinMaxScaler()
    df_sub["TempNorm"] = scaler.fit_transform(df_sub[["AverageTemperature"]])

    # ------------------------
    # 3. Build multi-feature sequences
    # ------------------------
    X, y, meta = build_sequences_multifeat(df_sub, window)

    if len(X) < 500:
        print(f"Warning: only {len(X)} samples for {continent_name}.")

    # ------------------------
    # 4. Train/Val/Test split
    # ------------------------
    n = len(X)
    train_end = int(0.7 * n)
    val_end   = int(0.85 * n)

    X_train, y_train = X[:train_end], y[:train_end]
    X_val,   y_val   = X[train_end:val_end], y[train_end:val_end]
    X_test,  y_test  = X[val_end:], y[val_end:]

    train_ds = ClimateDataset(X_train, y_train)
    val_ds   = ClimateDataset(X_val, y_val)
    test_ds  = ClimateDataset(X_test, y_test)

    train_loader = DataLoader(train_ds, batch_size=64, shuffle=False)
    val_loader   = DataLoader(val_ds, batch_size=64, shuffle=False)
    test_loader  = DataLoader(test_ds, batch_size=64, shuffle=False)

    # ------------------------
    # 5. Model
    # ------------------------
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AttentionLSTM(hidden_dim=64).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
    loss_fn = nn.MSELoss()

    # ------------------------
    # 6. Training
    # ------------------------
    for epoch in range(epochs):
        model.train()
        train_loss = 0

        for Xb, yb in train_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            optimizer.zero_grad()

            preds = model(Xb)
            loss = loss_fn(preds, yb)
            loss.backward()

            # prevent NaNs
            torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)

            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for Xb, yb in val_loader:
                Xb, yb = Xb.to(device), yb.to(device)
                preds = model(Xb)
                val_loss += loss_fn(preds, yb).item()

        val_loss /= len(val_loader)
        print(f"Epoch {epoch+1}/{epochs} | Train {train_loss:.4f} | Val {val_loss:.4f}")

    # ------------------------
    # 7. Evaluation
    # ------------------------
    model.eval()
    preds = []
    with torch.no_grad():
        for Xb, _ in test_loader:
            Xb = Xb.to(device)
            preds.append(model(Xb).cpu().numpy())

    preds = np.concatenate(preds)

    # denormalize
    y_test_orig = scaler.inverse_transform(y_test.reshape(-1,1)).flatten()
    preds_orig  = scaler.inverse_transform(preds.reshape(-1,1)).flatten()

    # metrics
    mse  = mean_squared_error(y_test_orig, preds_orig)
    mae  = mean_absolute_error(y_test_orig, preds_orig)
    rmse = np.sqrt(mse)

    actual_diff = np.sign(np.diff(y_test_orig))
    pred_diff   = np.sign(np.diff(preds_orig))
    directional_acc = (actual_diff == pred_diff).mean()

    print(f"{continent_name} → RMSE: {rmse:.3f}, MAE: {mae:.3f}, DA: {directional_acc:.3f}")

    return {
        "continent": continent_name,
        "rmse": rmse,
        "mae": mae,
        "directional_accuracy": directional_acc
    }

In [None]:
results = []
for cont in continents:
    res = train_lstm_continent_multifeat(final_data_clean, cont, window=36, epochs=30)
    if res:
        results.append(res)

pd.DataFrame(results)



===== Training model for Europe =====
Epoch 1/30 | Train 0.0110 | Val 0.0114
Epoch 2/30 | Train 0.0022 | Val 0.0128
Epoch 3/30 | Train 0.0022 | Val 0.0160
Epoch 4/30 | Train 0.0021 | Val 0.0166
Epoch 5/30 | Train 0.0020 | Val 0.0139
Epoch 6/30 | Train 0.0019 | Val 0.0121
Epoch 7/30 | Train 0.0018 | Val 0.0109
Epoch 8/30 | Train 0.0017 | Val 0.0097
Epoch 9/30 | Train 0.0016 | Val 0.0086
Epoch 10/30 | Train 0.0016 | Val 0.0077
Epoch 11/30 | Train 0.0015 | Val 0.0068
Epoch 12/30 | Train 0.0014 | Val 0.0061
Epoch 13/30 | Train 0.0014 | Val 0.0056
Epoch 14/30 | Train 0.0013 | Val 0.0051
Epoch 15/30 | Train 0.0013 | Val 0.0048
Epoch 16/30 | Train 0.0013 | Val 0.0046
Epoch 17/30 | Train 0.0013 | Val 0.0044
Epoch 18/30 | Train 0.0013 | Val 0.0042
Epoch 19/30 | Train 0.0013 | Val 0.0041
Epoch 20/30 | Train 0.0012 | Val 0.0040
Epoch 21/30 | Train 0.0012 | Val 0.0039
Epoch 22/30 | Train 0.0012 | Val 0.0039
Epoch 23/30 | Train 0.0012 | Val 0.0038
Epoch 24/30 | Train 0.0012 | Val 0.0038
Epoch 25/

Unnamed: 0,continent,rmse,mae,directional_accuracy
0,Europe,3.02192,2.478485,0.881826
1,Asia,1.65024,1.138645,0.904542
2,Africa,1.072081,0.723852,0.851309
3,North America,3.075578,1.732746,0.847692
4,Oceania,0.885159,0.699873,0.702714
5,South America,2.609257,2.226599,0.694011


In [None]:
final_data.head()

Unnamed: 0,dt,CleanName,Continent,AverageTemperature,AverageTemperatureUncertainty,is_nan,gap_block,gap_size,TempNorm,month,month_sin,month_cos,lag12,lag24,roll3
0,1838-04-01,Afghanistan,Asia,13.008,2.586,False,1,0,0.662301,4,0.8660254,-0.5,,,0.660903
1,1838-05-01,Afghanistan,Asia,18.479,,True,2,1,0.733817,5,0.5,-0.866025,,,0.698109
2,1838-06-01,Afghanistan,Asia,23.95,2.51,False,3,0,0.805333,6,1.224647e-16,-1.0,,,0.735315
3,1838-07-01,Afghanistan,Asia,26.877,2.883,False,3,0,0.843595,7,-0.5,-0.866025,,,0.798194
4,1838-08-01,Afghanistan,Asia,24.938,2.992,False,3,0,0.818248,8,-0.8660254,-0.5,,,0.827477


In [None]:
final_data[["lag12", "lag24"]].isna().sum()

Unnamed: 0,0
lag12,2844
lag24,5688


Bi-LSTM

In [None]:
class AttentionBiLSTM(nn.Module):
    def __init__(self, hidden_dim=64, num_layers=1):
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=6,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True      # represents the Bi-LSTM
        )


        self.attn = nn.Linear(hidden_dim * 2, 1)
        self.fc   = nn.Linear(hidden_dim * 2, 1)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)

        # Attention scores
        attn_scores = self.attn(lstm_out)
        attn_weights = torch.softmax(attn_scores, dim=1)

        # Weighted sum
        context = torch.sum(attn_weights * lstm_out, dim=1)

        out = self.fc(context)
        return out.squeeze(-1)

In [None]:
model = AttentionBiLSTM(hidden_dim=64).to(device)

In [None]:
# Instantiate model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AttentionBiLSTM(hidden_dim=64).to(device)

# Optimizer and loss
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
loss_fn = nn.MSELoss()

In [None]:
X, y, meta = build_sequences_multifeat(final_data_clean, window=36)
print(X.shape, y.shape)

(518802, 36, 6) (518802,)


In [None]:
train_ds = ClimateDataset(X_train, y_train)
val_ds   = ClimateDataset(X_val, y_val)
test_ds  = ClimateDataset(X_test, y_test)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=False)
val_loader   = DataLoader(val_ds, batch_size=64, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=64, shuffle=False)

In [None]:
EPOCHS = 30

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0

    for Xb, yb in train_loader:
        Xb, yb = Xb.to(device), yb.to(device)

        optimizer.zero_grad()
        preds = model(Xb)
        loss = loss_fn(preds, yb)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for Xb, yb in val_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            val_loss += loss_fn(model(Xb), yb).item()

    val_loss /= len(val_loader)
    print(f"Epoch {epoch+1}/{EPOCHS} | Train {train_loss:.4f} | Val {val_loss:.4f}")

Epoch 1/30 | Train 0.0022 | Val 0.0044
Epoch 2/30 | Train 0.0010 | Val 0.0007
Epoch 3/30 | Train 0.0004 | Val 0.0006
Epoch 4/30 | Train 0.0004 | Val 0.0005
Epoch 5/30 | Train 0.0004 | Val 0.0005
Epoch 6/30 | Train 0.0004 | Val 0.0004
Epoch 7/30 | Train 0.0003 | Val 0.0004
Epoch 8/30 | Train 0.0003 | Val 0.0004
Epoch 9/30 | Train 0.0003 | Val 0.0004
Epoch 10/30 | Train 0.0003 | Val 0.0004
Epoch 11/30 | Train 0.0003 | Val 0.0003
Epoch 12/30 | Train 0.0003 | Val 0.0004
Epoch 13/30 | Train 0.0003 | Val 0.0004
Epoch 14/30 | Train 0.0003 | Val 0.0004
Epoch 15/30 | Train 0.0003 | Val 0.0004
Epoch 16/30 | Train 0.0003 | Val 0.0004
Epoch 17/30 | Train 0.0003 | Val 0.0004
Epoch 18/30 | Train 0.0003 | Val 0.0004
Epoch 19/30 | Train 0.0003 | Val 0.0004
Epoch 20/30 | Train 0.0003 | Val 0.0004
Epoch 21/30 | Train 0.0003 | Val 0.0004
Epoch 22/30 | Train 0.0003 | Val 0.0004
Epoch 23/30 | Train 0.0003 | Val 0.0004
Epoch 24/30 | Train 0.0003 | Val 0.0004
Epoch 25/30 | Train 0.0003 | Val 0.0004
Epoch 26/

# evaluation

In [None]:
model.eval()
test_preds = []

with torch.no_grad():
    for Xb, _ in test_loader:
        Xb = Xb.to(device)
        test_preds.append(model(Xb).cpu().numpy())

test_preds = np.concatenate(test_preds)

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, test_preds))
mae  = mean_absolute_error(y_test, test_preds)

actual_diff = np.sign(np.diff(y_test))
pred_diff   = np.sign(np.diff(test_preds))
directional_acc = (actual_diff == pred_diff).mean()

print("GLOBAL BiLSTM RMSE:", rmse)
print("GLOBAL BiLSTM MAE:", mae)
print("GLOBAL BiLSTM Directional Accuracy:", directional_acc)

GLOBAL BiLSTM RMSE: 0.021112179862847875
GLOBAL BiLSTM MAE: 0.014814876988453114
GLOBAL BiLSTM Directional Accuracy: 0.8700462606013878


Continent-Specific BiLSTM

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# ===============================
# DATASET CLASS
# ===============================
class ClimateDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


# ===============================
# MULTI-FEATURE SEQUENCE BUILDER
# ===============================
feature_cols = ["TempNorm", "month_sin", "month_cos", "lag12", "lag24", "roll3"]

def build_sequences_multifeat(df, window=36):
    X_list, y_list, meta_list = [], [], []

    for country, g in df.groupby("CleanName"):
        g = g.sort_values("dt")
        feats = g[feature_cols].values
        targets = g["TempNorm"].values

        if len(feats) <= window:
            continue

        for i in range(len(feats) - window):
            X_list.append(feats[i:i+window])
            y_list.append(targets[i+window])
            meta_list.append((country, g.iloc[i+window]["dt"]))

    return np.array(X_list), np.array(y_list), meta_list


# ===============================
# ATTENTION BiLSTM MODEL
# ===============================
class AttentionBiLSTM(nn.Module):
    def __init__(self, input_size=6, hidden_dim=64, num_layers=1):
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True
        )

        self.attn = nn.Linear(hidden_dim * 2, 1)
        self.fc   = nn.Linear(hidden_dim * 2, 1)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        attn_scores = self.attn(lstm_out)
        attn_weights = torch.softmax(attn_scores, dim=1)
        context = torch.sum(attn_weights * lstm_out, dim=1)
        out = self.fc(context)
        return out.squeeze(-1)


# ===============================
# TRAINING FUNCTION (PER CONTINENT)
# ===============================
def train_bilstm_continent(df, continent_name, window=36, epochs=30):
    print(f"\n\n===== Training BiLSTM for {continent_name} =====")

    df_sub = df[df["Continent"] == continent_name].copy()
    df_sub = df_sub.sort_values(["CleanName", "dt"])

    if df_sub.empty:
        print("No data for this continent.")
        return None

    # Scale temperature only
    scaler = MinMaxScaler()
    df_sub["TempNorm"] = scaler.fit_transform(df_sub[["AverageTemperature"]])

    # Build sequences
    X, y, meta = build_sequences_multifeat(df_sub, window)

    if len(X) < 500:
        print(f"Warning: low samples ({len(X)})")

    # Train/Val/Test
    n = len(X)
    train_end = int(0.70 * n)
    val_end   = int(0.85 * n)

    X_train, y_train = X[:train_end], y[:train_end]
    X_val,   y_val   = X[train_end:val_end], y[train_end:val_end]
    X_test,  y_test  = X[val_end:], y[val_end:]

    train_ds = ClimateDataset(X_train, y_train)
    val_ds   = ClimateDataset(X_val, y_val)
    test_ds  = ClimateDataset(X_test, y_test)

    train_loader = DataLoader(train_ds, batch_size=64, shuffle=False)
    val_loader   = DataLoader(val_ds, batch_size=64, shuffle=False)
    test_loader  = DataLoader(test_ds, batch_size=64, shuffle=False)

    # Model
    model = AttentionBiLSTM(hidden_dim=64).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
    loss_fn = nn.MSELoss()

    # Training loop
    for epoch in range(epochs):
        model.train()
        train_loss = 0

        for Xb, yb in train_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            optimizer.zero_grad()

            preds = model(Xb)
            loss = loss_fn(preds, yb)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)
            optimizer.step()

            train_loss += loss.item()
        train_loss /= len(train_loader)

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for Xb, yb in val_loader:
                Xb, yb = Xb.to(device), yb.to(device)
                val_loss += loss_fn(model(Xb), yb).item()
        val_loss /= len(val_loader)

        print(f"Epoch {epoch+1}/{epochs} | Train {train_loss:.4f} | Val {val_loss:.4f}")

    # Predictions
    model.eval()
    preds = []
    with torch.no_grad():
        for Xb, _ in test_loader:
            Xb = Xb.to(device)
            preds.append(model(Xb).cpu().numpy())
    preds = np.concatenate(preds)

    # Metrics (normalized)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae  = mean_absolute_error(y_test, preds)

    actual_diff = np.sign(np.diff(y_test))
    pred_diff   = np.sign(np.diff(preds))
    da = (actual_diff == pred_diff).mean()

    print(f"{continent_name} → RMSE: {rmse:.4f}, MAE: {mae:.4f}, DA: {da:.4f}")

    return {
        "continent": continent_name,
        "rmse": rmse,
        "mae": mae,
        "directional_accuracy": da
    }


# ===============================
# RUN FOR ALL CONTINENTS
# ===============================
continents = [
    "Europe", "Asia", "Africa",
    "North America", "Oceania", "South America"
]

results_bilstm = []
for cont in continents:
    res = train_bilstm_continent(final_data_clean, cont)
    if res:
        results_bilstm.append(res)

df_results_bilstm = pd.DataFrame(results_bilstm)
df_results_bilstm



===== Training BiLSTM for Europe =====
Epoch 1/30 | Train 0.0079 | Val 0.0136
Epoch 2/30 | Train 0.0023 | Val 0.0139
Epoch 3/30 | Train 0.0022 | Val 0.0179
Epoch 4/30 | Train 0.0021 | Val 0.0171
Epoch 5/30 | Train 0.0020 | Val 0.0145
Epoch 6/30 | Train 0.0019 | Val 0.0133
Epoch 7/30 | Train 0.0019 | Val 0.0124
Epoch 8/30 | Train 0.0018 | Val 0.0115
Epoch 9/30 | Train 0.0018 | Val 0.0107
Epoch 10/30 | Train 0.0017 | Val 0.0098
Epoch 11/30 | Train 0.0016 | Val 0.0091
Epoch 12/30 | Train 0.0015 | Val 0.0083
Epoch 13/30 | Train 0.0015 | Val 0.0076
Epoch 14/30 | Train 0.0014 | Val 0.0070
Epoch 15/30 | Train 0.0014 | Val 0.0065
Epoch 16/30 | Train 0.0013 | Val 0.0060
Epoch 17/30 | Train 0.0013 | Val 0.0056
Epoch 18/30 | Train 0.0012 | Val 0.0051
Epoch 19/30 | Train 0.0012 | Val 0.0046
Epoch 20/30 | Train 0.0011 | Val 0.0042
Epoch 21/30 | Train 0.0011 | Val 0.0037
Epoch 22/30 | Train 0.0011 | Val 0.0034
Epoch 23/30 | Train 0.0011 | Val 0.0042
Epoch 24/30 | Train 0.0011 | Val 0.0043
Epoch 25

Unnamed: 0,continent,rmse,mae,directional_accuracy
0,Europe,0.041422,0.03369,0.883119
1,Asia,0.03152,0.023536,0.881022
2,Africa,0.034632,0.024564,0.853076
3,North America,0.028855,0.019073,0.850125
4,Oceania,0.049648,0.038547,0.699769
5,South America,0.085169,0.071998,0.687718
