# Multivariate LSTM for Arctic Sea Ice Extent Forecasting

This notebook extends the basic LSTM by incorporating pan-arctic climate variables as additional features.

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from src.data_utils import load_data

In [2]:
class MultivariateArcticDataset(torch.utils.data.Dataset):
    """Sliding-window dataset over a date-ordered multivariate time series.

    Each sample pairs a window of ``sequence_length`` consecutive rows of the
    normalized feature matrix with the normalized target value located
    ``forecast_horizon`` rows after the last row of that window.

    Args:
        data: DataFrame with a ``'date'`` column plus every column named in
            ``features`` and ``lag_features``. It is sorted by date into a
            new frame; the caller's frame is not modified.
        sequence_length: Number of rows in each input window.
        forecast_horizon: Row offset from the end of the window to the
            target row (1 = the row immediately after the window).
        features: Column names used as model inputs. Defaults to
            ``['extent_mkm2']``. The sequence is copied, never mutated.
        target: Column to predict; must be one of the final features.
        scaler: Optional ``(mean, std)`` pair used for normalization, e.g.
            the statistics of a training dataset. When omitted, per-feature
            statistics are computed from ``data``.
        lag_features: Optional mapping ``column -> iterable of lags``. For
            each lag ``k`` a column ``f"{column}_lag{k}"`` holding the value
            ``k`` rows earlier is appended to the features.
        add_cyclical_time: When true, append sine/cosine encodings of the
            day of year as two extra features.

    Raises:
        ValueError: If ``target`` is not among the final feature columns.
    """

    def __init__(self, data, sequence_length=30, forecast_horizon=1, features=None, target='extent_mkm2', scaler=None, lag_features=None, add_cyclical_time=False):
        # sort_values/reset_index return a new frame, so the helper columns
        # added below never leak into the caller's DataFrame.
        self.data = data.sort_values('date').reset_index(drop=True)
        self.sequence_length = sequence_length
        self.forecast_horizon = forecast_horizon
        self.target = target

        # Work on a private copy so the caller's feature list is not extended.
        self.features = ['extent_mkm2'] if features is None else list(features)

        if add_cyclical_time:
            # Map day-of-year onto the unit circle so the end of one year
            # sits next to the start of the following one.
            day_of_year = pd.to_datetime(self.data['date']).dt.dayofyear
            angle = 2 * np.pi * day_of_year / 365.25
            self.data['day_of_year_sin'] = np.sin(angle)
            self.data['day_of_year_cos'] = np.cos(angle)
            self.features.extend(['day_of_year_sin', 'day_of_year_cos'])

        if lag_features is not None:
            for column, lags in lag_features.items():
                for lag_days in lags:
                    lagged_column_name = f"{column}_lag{lag_days}"
                    self.data[lagged_column_name] = self.data[column].shift(lag_days)
                    self.features.append(lagged_column_name)

            # shift() leaves NaN in the leading rows. Only the model's own
            # feature columns are inspected, so gaps in unrelated columns do
            # not discard otherwise usable rows.
            self.data = self.data.dropna(subset=self.features).reset_index(drop=True)

        self.data_values = self.data[self.features].values.astype(np.float32)

        if self.target not in self.features:
            raise ValueError(
                f"target {self.target!r} must be one of the feature columns: {self.features}"
            )
        self.target_idx = self.features.index(self.target)

        if scaler is None:
            self.mean = self.data_values.mean(axis=0, keepdims=True)
            self.std = self.data_values.std(axis=0, keepdims=True)
            # Constant columns have zero spread; divide by 1 instead of 0.
            self.std = np.where(self.std == 0, 1.0, self.std)
        else:
            self.mean, self.std = scaler

        self.data_normalized = (self.data_values - self.mean) / self.std

    def __len__(self):
        """Return the number of complete (window, target) pairs, never negative."""
        usable = len(self.data_normalized) - self.sequence_length - self.forecast_horizon + 1
        # A series shorter than one window plus horizon yields no samples;
        # len() itself raises on negative values, so clamp at zero.
        return max(0, usable)

    def __getitem__(self, idx):
        """Return ``(X, y)`` for sample ``idx``.

        ``X`` is a float32 tensor of shape ``(sequence_length, n_features)``
        and ``y`` is a 0-d float32 tensor holding the normalized target.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_samples = len(self)
        if idx < 0:
            # Support Python-style negative indexing.
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"sample index out of range for dataset of length {n_samples}")

        window_end = idx + self.sequence_length
        X = self.data_normalized[idx:window_end]
        y = self.data_normalized[window_end + self.forecast_horizon - 1][self.target_idx]

        X = torch.tensor(X, dtype=torch.float32)
        y = torch.tensor(y, dtype=torch.float32)

        return X, y

## Load Data and Explore Features

In [3]:
# Chronological split of the pan-Arctic records. range() excludes its stop
# value, so training covers 1989-2019 and testing covers 2020-2023.
train_data = load_data(regions='pan_arctic', years=range(1989, 2020))
test_data = load_data(regions='pan_arctic', years=range(2020, 2024))

# Report the size of each split.
train_shape = train_data.shape
test_shape = test_data.shape
print(f"Training data shape: {train_shape}")
print(f"Test data shape: {test_shape}")

# List every column so candidate input features can be picked by name.
print(f"\nAvailable columns:")
for column_name in train_data.columns:
    print(f"  - {column_name}")

Training data shape: (11322, 27)
Test data shape: (1461, 27)

Available columns:
  - date
  - region
  - msl_mean
  - msl_p15
  - msl_p85
  - msl_std
  - t2m_mean
  - t2m_p15
  - t2m_p85
  - t2m_std
  - tp_mean
  - tp_p15
  - tp_p85
  - tp_std
  - u10_mean
  - u10_p15
  - u10_p85
  - u10_std
  - v10_mean
  - v10_p15
  - v10_p85
  - v10_std
  - wind_speed_mean
  - wind_speed_p15
  - wind_speed_p85
  - wind_speed_std
  - extent_mkm2


In [4]:
# Model inputs, grouped as mean/std pairs per climate variable. The order
# here fixes the column order of every input tensor built from this list.
features = [
    'extent_mkm2',                          # the forecast target, also fed back as an input
    't2m_mean', 't2m_std',                  # presumably 2 m air temperature -- confirm with data source
    'msl_mean', 'msl_std',                  # presumably mean sea-level pressure -- confirm with data source
    'wind_speed_mean', 'wind_speed_std',
]

## Multivariate LSTM Model

LSTM cells handle multivariate sequences naturally: at every timestep the whole feature vector is multiplied by the same input weight matrices, so adding variables only changes the model's `input_size`.

In [5]:
class IceExtentLSTM(nn.Module):
    """Stacked LSTM with a linear head applied to the final timestep.

    Args:
        input_size: Number of features per timestep.
        hidden_size: Width of each LSTM layer's hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values predicted per sequence.
        dropout: Dropout applied between LSTM layers; forced to 0 when
            there is only one layer, where inter-layer dropout does not apply.
    """

    def __init__(self, input_size, hidden_size=64, num_layers=2, output_size=1, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
        )

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch of sequences to one prediction per sequence.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``
                (``batch_first=True``).

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        # nn.LSTM zero-initialises the hidden and cell state when none is
        # given, matching the input's device and dtype. Building the zeros by
        # hand allocated on CPU in the default dtype and then copied them,
        # which also broke under .double()/.half().
        out, _ = self.lstm(x)

        # Only the last timestep's hidden output feeds the regression head.
        return self.fc(out[:, -1, :])

## Create Datasets (Non-Lagged)

In [6]:
# Windowing settings shared by both splits: 30-row input windows, predicting
# the row immediately after each window.
window_config = dict(
    sequence_length=30,
    forecast_horizon=1,
    features=features,
    target='extent_mkm2',
)

# The training split computes its own normalization statistics ...
train_dataset = MultivariateArcticDataset(train_data, **window_config)

# ... and the test split reuses them, so both are scaled identically.
test_dataset = MultivariateArcticDataset(
    test_data,
    scaler=(train_dataset.mean, train_dataset.std),
    **window_config,
)

# Shuffle only the training windows; keep the test windows in date order.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=32)

print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")

# Peek at a single batch to confirm the tensor shapes.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    X_batch, y_batch = first_batch
    print(f"\nBatch X shape: {X_batch.shape}")
    print(f"Batch y shape: {y_batch.shape}")

Training samples: 11292
Test samples: 1431

Batch X shape: torch.Size([32, 30, 7])
Batch y shape: torch.Size([32])


## Train Model (Non-Lagged)

In [7]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# One input channel per selected feature column.
model = IceExtentLSTM(
    input_size=len(features),
    hidden_size=64,
    num_layers=2,
    output_size=1,
    dropout=0.2,
).to(device)

# MSE on normalized targets; the scheduler halves the learning rate when the
# monitored loss has not improved for 5 epochs.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=5,
)

parameter_count = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {parameter_count:,}")

Using device: cpu
Model parameters: 52,033


In [8]:
# Train for at most 200 epochs, stopping early after 15 consecutive epochs
# without a new best validation loss.
num_epochs = 200
best_val_loss = float('inf')
patience = 15
patience_counter = 0

# Per-epoch mean batch losses (on normalized targets).
train_losses = []
val_losses = []

print("Starting training...\n")
for epoch in range(num_epochs):
    # ---- optimisation pass over the training windows ----
    model.train()
    train_loss = 0.0

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        predictions = model(X_batch)
        # Remove only the trailing output dimension. A bare squeeze() would
        # also drop the batch dimension of a single-sample batch, leaving a
        # 0-d prediction to be compared against a 1-element target.
        loss = criterion(predictions.squeeze(-1), y_batch)

        optimizer.zero_grad()
        loss.backward()
        # Cap the gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)

    # ---- evaluation pass ----
    # NOTE(review): the test loader doubles as the validation set, so the LR
    # schedule, early stopping and checkpoint selection all see test data.
    # Consider carving a validation split out of the training years.
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            predictions = model(X_batch)
            loss = criterion(predictions.squeeze(-1), y_batch)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(test_loader)

    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)

    scheduler.step(avg_val_loss)

    if (epoch + 1) % 5 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'  Train Loss: {avg_train_loss:.6f}')
        print(f'  Val Loss: {avg_val_loss:.6f}')

    # Checkpoint on every improvement. `model` itself keeps the weights of
    # the last epoch run; load the saved file to recover the best ones.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'best_multivariate_model.pt')
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"\nEarly stopping at epoch {epoch+1}")
            break

print(f"\nTraining complete! Best validation loss: {best_val_loss:.6f}")

Starting training...

Epoch 5/200
  Train Loss: 0.002130
  Val Loss: 0.001298
Epoch 10/200
  Train Loss: 0.001572
  Val Loss: 0.001344
Epoch 15/200
  Train Loss: 0.001221
  Val Loss: 0.001461
Epoch 20/200
  Train Loss: 0.001112
  Val Loss: 0.000868
Epoch 25/200
  Train Loss: 0.000983
  Val Loss: 0.000851
Epoch 30/200
  Train Loss: 0.000938
  Val Loss: 0.001075
Epoch 35/200
  Train Loss: 0.000852
  Val Loss: 0.000688
Epoch 40/200
  Train Loss: 0.000779
  Val Loss: 0.000665
Epoch 45/200
  Train Loss: 0.000703
  Val Loss: 0.000447
Epoch 50/200
  Train Loss: 0.000661
  Val Loss: 0.000419
Epoch 55/200
  Train Loss: 0.000578
  Val Loss: 0.000502
Epoch 60/200
  Train Loss: 0.000497
  Val Loss: 0.000504
Epoch 65/200
  Train Loss: 0.000489
  Val Loss: 0.000400
Epoch 70/200
  Train Loss: 0.000430
  Val Loss: 0.000406
Epoch 75/200
  Train Loss: 0.000425
  Val Loss: 0.000396
Epoch 80/200
  Train Loss: 0.000399
  Val Loss: 0.000350
Epoch 85/200
  Train Loss: 0.000386
  Val Loss: 0.000342
Epoch 90/2

## Lagged Features

Add temporal lags to capture recent trends in ice extent and temperature.

In [9]:
# Lag offsets per source column. The dataset shifts by rows, so these are
# 7/14/30 days only if the data has one row per day -- confirm with the loader.
lag_features = {
    column: [7, 14, 30]
    for column in ('extent_mkm2', 't2m_mean')
}

## Create Datasets with Lagged Features

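The `MultivariateArcticDataset` class creates one lagged column per (variable, lag) pair, appends it to the feature set, and drops the leading rows that lack a full lag history (the first 30 rows of each split here).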

In [10]:
# Windowing settings shared by both splits, now including the lag spec.
lagged_config = dict(
    sequence_length=30,
    forecast_horizon=1,
    features=features,
    target='extent_mkm2',
    lag_features=lag_features,
)

# Training statistics are computed over base and lagged columns alike ...
train_dataset_lagged = MultivariateArcticDataset(train_data, **lagged_config)

# ... and reused unchanged to normalize the test split.
test_dataset_lagged = MultivariateArcticDataset(
    test_data,
    scaler=(train_dataset_lagged.mean, train_dataset_lagged.std),
    **lagged_config,
)

# Shuffle only the training windows; keep the test windows in date order.
train_loader_lagged = torch.utils.data.DataLoader(train_dataset_lagged, shuffle=True, batch_size=32)
test_loader_lagged = torch.utils.data.DataLoader(test_dataset_lagged, shuffle=False, batch_size=32)

print(f"Training samples: {len(train_dataset_lagged)}")
print(f"Test samples: {len(test_dataset_lagged)}")
print(f"Number of features: {len(train_dataset_lagged.features)}")

# Index -> column name, i.e. the channel layout of each input tensor.
print(f"\nFeatures:")
for position, feature_name in enumerate(train_dataset_lagged.features):
    print(f"  {position}: {feature_name}")

Training samples: 11262
Test samples: 1401
Number of features: 13

Features:
  0: extent_mkm2
  1: t2m_mean
  2: t2m_std
  3: msl_mean
  4: msl_std
  5: wind_speed_mean
  6: wind_speed_std
  7: extent_mkm2_lag7
  8: extent_mkm2_lag14
  9: extent_mkm2_lag30
  10: t2m_mean_lag7
  11: t2m_mean_lag14
  12: t2m_mean_lag30


In [11]:
# Pull a single batch (if any) to confirm the lagged tensor shapes.
first_batch = next(iter(train_loader_lagged), None)
if first_batch is not None:
    X_batch, y_batch = first_batch
    print(f"Batch X shape: {X_batch.shape}")
    print(f"Batch y shape: {y_batch.shape}")

Batch X shape: torch.Size([32, 30, 13])
Batch y shape: torch.Size([32])


## Train Model with Lagged Features

In [12]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Input width follows the dataset's final feature list (base + lagged columns).
lagged_input_size = len(train_dataset_lagged.features)
model_lagged = IceExtentLSTM(
    input_size=lagged_input_size,
    hidden_size=64,
    num_layers=2,
    output_size=1,
    dropout=0.2,
).to(device)

# Fresh loss, optimizer and LR scheduler bound to this model's parameters.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_lagged.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=5,
)

parameter_count = sum(p.numel() for p in model_lagged.parameters())
print(f"Model parameters: {parameter_count:,}")

Using device: cpu
Model parameters: 53,569


In [13]:
# Train for at most 200 epochs, stopping early after 15 consecutive epochs
# without a new best validation loss.
num_epochs = 200
best_val_loss = float('inf')
patience = 15
patience_counter = 0

# Per-epoch mean batch losses (on normalized targets) for the lagged model.
train_losses_lagged = []
val_losses_lagged = []

print("Starting training...\n")
for epoch in range(num_epochs):
    # ---- optimisation pass over the training windows ----
    model_lagged.train()
    train_loss = 0.0

    for X_batch, y_batch in train_loader_lagged:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        predictions = model_lagged(X_batch)
        # Remove only the trailing output dimension. A bare squeeze() would
        # also drop the batch dimension of a single-sample batch, leaving a
        # 0-d prediction to be compared against a 1-element target.
        loss = criterion(predictions.squeeze(-1), y_batch)

        optimizer.zero_grad()
        loss.backward()
        # Cap the gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model_lagged.parameters(), max_norm=1.0)
        optimizer.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader_lagged)

    # ---- evaluation pass ----
    # NOTE(review): the test loader doubles as the validation set, so the LR
    # schedule, early stopping and checkpoint selection all see test data.
    # Consider carving a validation split out of the training years.
    model_lagged.eval()
    val_loss = 0.0

    with torch.no_grad():
        for X_batch, y_batch in test_loader_lagged:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            predictions = model_lagged(X_batch)
            loss = criterion(predictions.squeeze(-1), y_batch)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(test_loader_lagged)

    train_losses_lagged.append(avg_train_loss)
    val_losses_lagged.append(avg_val_loss)

    scheduler.step(avg_val_loss)

    if (epoch + 1) % 5 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'  Train Loss: {avg_train_loss:.6f}')
        print(f'  Val Loss: {avg_val_loss:.6f}')

    # Checkpoint on every improvement. `model_lagged` itself keeps the weights
    # of the last epoch run; load the saved file to recover the best ones.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model_lagged.state_dict(), 'best_lagged_model.pt')
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"\nEarly stopping at epoch {epoch+1}")
            break

print(f"\nTraining complete! Best validation loss: {best_val_loss:.6f}")

Starting training...

Epoch 5/200
  Train Loss: 0.002292
  Val Loss: 0.002560
Epoch 10/200
  Train Loss: 0.001585
  Val Loss: 0.001721
Epoch 15/200
  Train Loss: 0.001330
  Val Loss: 0.000811
Epoch 20/200
  Train Loss: 0.001038
  Val Loss: 0.000773
Epoch 25/200
  Train Loss: 0.000939
  Val Loss: 0.001622
Epoch 30/200
  Train Loss: 0.000908
  Val Loss: 0.000952
Epoch 35/200
  Train Loss: 0.000741
  Val Loss: 0.000665
Epoch 40/200
  Train Loss: 0.000698
  Val Loss: 0.000914
Epoch 45/200
  Train Loss: 0.000633
  Val Loss: 0.000693
Epoch 50/200
  Train Loss: 0.000697
  Val Loss: 0.000641
Epoch 55/200
  Train Loss: 0.000501
  Val Loss: 0.000447
Epoch 60/200
  Train Loss: 0.000498
  Val Loss: 0.000542
Epoch 65/200
  Train Loss: 0.000471
  Val Loss: 0.000364
Epoch 70/200
  Train Loss: 0.000424
  Val Loss: 0.000365
Epoch 75/200
  Train Loss: 0.000421
  Val Loss: 0.000361
Epoch 80/200
  Train Loss: 0.000411
  Val Loss: 0.000393
Epoch 85/200
  Train Loss: 0.000376
  Val Loss: 0.000313
Epoch 90/2

## Cyclical Day-of-Year Encoding

Arctic ice extent has strong seasonal patterns. Cyclical encoding (sin/cos) preserves the circular nature of time where Dec 31 and Jan 1 are neighbors.

In [14]:
# Windowing settings shared by both splits, with day-of-year sin/cos enabled.
cyclical_config = dict(
    sequence_length=30,
    forecast_horizon=1,
    features=features,
    target='extent_mkm2',
    add_cyclical_time=True,
)

# Training statistics cover the base columns and the two time encodings ...
train_dataset_cyclical = MultivariateArcticDataset(train_data, **cyclical_config)

# ... and are reused unchanged to normalize the test split.
test_dataset_cyclical = MultivariateArcticDataset(
    test_data,
    scaler=(train_dataset_cyclical.mean, train_dataset_cyclical.std),
    **cyclical_config,
)

# Shuffle only the training windows; keep the test windows in date order.
train_loader_cyclical = torch.utils.data.DataLoader(train_dataset_cyclical, shuffle=True, batch_size=32)
test_loader_cyclical = torch.utils.data.DataLoader(test_dataset_cyclical, shuffle=False, batch_size=32)

print(f"Training samples: {len(train_dataset_cyclical)}")
print(f"Test samples: {len(test_dataset_cyclical)}")
print(f"Number of features: {len(train_dataset_cyclical.features)}")

# Index -> column name, i.e. the channel layout of each input tensor.
print(f"\nFeatures:")
for position, feature_name in enumerate(train_dataset_cyclical.features):
    print(f"  {position}: {feature_name}")

Training samples: 11292
Test samples: 1431
Number of features: 9

Features:
  0: extent_mkm2
  1: t2m_mean
  2: t2m_std
  3: msl_mean
  4: msl_std
  5: wind_speed_mean
  6: wind_speed_std
  7: day_of_year_sin
  8: day_of_year_cos


## Train Model with Cyclical Time Features

In [15]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Input width follows the dataset's final feature list (base + sin/cos columns).
cyclical_input_size = len(train_dataset_cyclical.features)
model_cyclical = IceExtentLSTM(
    input_size=cyclical_input_size,
    hidden_size=64,
    num_layers=2,
    output_size=1,
    dropout=0.2,
).to(device)

# Fresh loss, optimizer and LR scheduler bound to this model's parameters.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_cyclical.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=5,
)

parameter_count = sum(p.numel() for p in model_cyclical.parameters())
print(f"Model parameters: {parameter_count:,}")

Using device: cpu
Model parameters: 52,545


In [16]:
# Train for at most 200 epochs, stopping early after 15 consecutive epochs
# without a new best validation loss.
num_epochs = 200
best_val_loss = float('inf')
patience = 15
patience_counter = 0

# Per-epoch mean batch losses (on normalized targets) for the cyclical model.
train_losses_cyclical = []
val_losses_cyclical = []

print("Starting training...\n")
for epoch in range(num_epochs):
    # ---- optimisation pass over the training windows ----
    model_cyclical.train()
    train_loss = 0.0

    for X_batch, y_batch in train_loader_cyclical:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        predictions = model_cyclical(X_batch)
        # Remove only the trailing output dimension. A bare squeeze() would
        # also drop the batch dimension of a single-sample batch, leaving a
        # 0-d prediction to be compared against a 1-element target.
        loss = criterion(predictions.squeeze(-1), y_batch)

        optimizer.zero_grad()
        loss.backward()
        # Cap the gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model_cyclical.parameters(), max_norm=1.0)
        optimizer.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader_cyclical)

    # ---- evaluation pass ----
    # NOTE(review): the test loader doubles as the validation set, so the LR
    # schedule, early stopping and checkpoint selection all see test data.
    # Consider carving a validation split out of the training years.
    model_cyclical.eval()
    val_loss = 0.0

    with torch.no_grad():
        for X_batch, y_batch in test_loader_cyclical:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            predictions = model_cyclical(X_batch)
            loss = criterion(predictions.squeeze(-1), y_batch)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(test_loader_cyclical)

    train_losses_cyclical.append(avg_train_loss)
    val_losses_cyclical.append(avg_val_loss)

    scheduler.step(avg_val_loss)

    if (epoch + 1) % 5 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'  Train Loss: {avg_train_loss:.6f}')
        print(f'  Val Loss: {avg_val_loss:.6f}')

    # Checkpoint on every improvement. `model_cyclical` itself keeps the
    # weights of the last epoch run; load the saved file to recover the best ones.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model_cyclical.state_dict(), 'best_cyclical_model.pt')
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"\nEarly stopping at epoch {epoch+1}")
            break

print(f"\nTraining complete! Best validation loss: {best_val_loss:.6f}")

Starting training...

Epoch 5/200
  Train Loss: 0.002004
  Val Loss: 0.001188
Epoch 10/200
  Train Loss: 0.001784
  Val Loss: 0.001101
Epoch 15/200
  Train Loss: 0.001332
  Val Loss: 0.001086
Epoch 20/200
  Train Loss: 0.001158
  Val Loss: 0.000900
Epoch 25/200
  Train Loss: 0.000848
  Val Loss: 0.000647
Epoch 30/200
  Train Loss: 0.000827
  Val Loss: 0.000807
Epoch 35/200
  Train Loss: 0.000663
  Val Loss: 0.000672
Epoch 40/200
  Train Loss: 0.000619
  Val Loss: 0.000508
Epoch 45/200
  Train Loss: 0.000582
  Val Loss: 0.000520
Epoch 50/200
  Train Loss: 0.000559
  Val Loss: 0.000421
Epoch 55/200
  Train Loss: 0.000525
  Val Loss: 0.000424
Epoch 60/200
  Train Loss: 0.000511
  Val Loss: 0.000360
Epoch 65/200
  Train Loss: 0.000512
  Val Loss: 0.000404
Epoch 70/200
  Train Loss: 0.000442
  Val Loss: 0.000398
Epoch 75/200
  Train Loss: 0.000436
  Val Loss: 0.000346
Epoch 80/200
  Train Loss: 0.000407
  Val Loss: 0.000328
Epoch 85/200
  Train Loss: 0.000397
  Val Loss: 0.000339
Epoch 90/2