DGM

In [None]:
import os
import yaml
import torch
import pickle
import numpy as np
from models.mart import MART
from types import SimpleNamespace
from torch.utils.data import Dataset

In [None]:
# Show the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi


Mon Jan 13 15:25:01 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Install a mutually compatible CUDA 11.8 PyTorch stack.
# torchvision 0.16.0 is the release built against torch 2.1.0; the 0.15.x line
# targets torch 2.0.x and therefore conflicts with the torch pin below.
# %pip (instead of !pip) installs into the environment of the running kernel.
# Run this before importing torch; if torch is already imported, restart the kernel.
%pip install torch==2.1.0+cu118 torchvision==0.16.0+cu118 torchaudio==2.1.0+cu118 -f https://download.pytorch.org/whl/torch_stable.html

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==2.1.0+cu118
  Downloading https://download.pytorch.org/whl/cu118/torch-2.1.0%2Bcu118-cp310-cp310-linux_x86_64.whl (2325.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m680.5 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.15.0+cu118
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.15.0%2Bcu118-cp310-cp310-linux_x86_64.whl (6.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==2.1.0+cu118
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.1.0%2Bcu118-cp310-cp310-linux_x86_64.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
Collecting triton==2.1.0 (from torch==2.1.0+cu118)
  Downloading triton-2.1.0-0-cp310-cp310-manylinux2014_x86_6

In [None]:
import torch
# Prints True when this PyTorch build can see a CUDA device.
print(torch.cuda.is_available())

True


In [None]:
def load_config(config_path):
    """
    Read a YAML configuration file and return its parsed contents.

    Args:
        config_path (str): Location of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` (a dict when the file holds
        a top-level mapping).
    """
    print(f"[INFO] Loading configuration from {config_path}...")
    with open(config_path, 'r') as stream:
        parsed = yaml.safe_load(stream)
    print("[INFO] Configuration loaded successfully!")
    return parsed


In [None]:
CHECKPOINT_PATH = "files/sdd_ckpt_best.pth"  # Checkpoint handed to load_model; update to your location
TEST_DATA_PATH = "files/sdd_test.pkl"  # Pickled test trajectories; NOTE(review): not referenced by the cells visible here
CONFIG_PATH = "files/mart_sdd_reproduce.yaml"  # YAML options handed to load_config; update to your location
BATCH_SIZE = 64  # Batch size of the evaluation DataLoader

In [None]:
def load_model(checkpoint_path, opts):
    """
    Build a MART model and restore its weights from a checkpoint file.

    Args:
        checkpoint_path (str): Path to a checkpoint written with ``torch.save``.
        opts (dict or SimpleNamespace): Model options. A dict is wrapped in a
            ``SimpleNamespace``; any other object is passed to ``MART`` as is.

    Returns:
        The ``MART`` instance with the checkpoint weights loaded. No device
        move is performed here; callers place the model where they need it.

    Raises:
        KeyError: If the checkpoint has neither a ``'state_dict'`` nor a
            ``'model_state_dict'`` entry.
    """
    print("[INFO] Loading model from checkpoint...")
    # Accept both option styles used in this notebook (plain dict / namespace).
    args = SimpleNamespace(**opts) if isinstance(opts, dict) else opts
    model = MART(args)
    # map_location='cpu' makes loading independent of the device the checkpoint
    # was saved from; weights_only=True restricts what the unpickler may build.
    checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=True)

    # Checkpoints in this notebook are stored under either of these two keys.
    for key in ('state_dict', 'model_state_dict'):
        if key in checkpoint:
            model.load_state_dict(checkpoint[key])
            break
    else:
        raise KeyError("Checkpoint does not contain 'state_dict' or 'model_state_dict'.")

    print("[INFO] Model loaded successfully!")
    return model


In [None]:
def evaluate_model(model, test_loader, opts):
    """
    Compute minimum-over-candidates ADE and FDE of ``model`` on ``test_loader``.

    Args:
        model: Module called as ``model(x_abs, x_rel)``. It is moved to the GPU
            when one is available (CPU otherwise) and switched to eval mode.
        test_loader: Iterable yielding ``(x_abs, y)`` pairs. ``x_abs`` must be
            4-D (batch, agents, time, coords) with at least two time steps.
        opts (dict): Must provide ``"pred_rel"`` (model output is per-step
            displacements that have to be accumulated) and ``"scale"``
            (factor applied to the averaged errors).

    Returns:
        tuple[float, float]: ``(ade_avg, fde_avg)`` after scaling.

    Raises:
        ValueError: If ``test_loader`` yields no samples.

    Note:
        Every agent slot of every batch is weighted equally.
        NOTE(review): if the dataset zero-pads agent slots, those slots are
        part of the average too - confirm that this is intended.
    """
    print("[INFO] Starting evaluation...")
    ade_sum, fde_sum, total_agents = 0.0, 0.0, 0

    # Set the device to GPU or CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Modules are in training mode by default; metrics must be computed with
    # train-only behaviour (e.g. dropout) switched off.
    model.eval()

    with torch.no_grad():
        for x_abs, y in test_loader:
            x_abs, y = x_abs.to(device), y.to(device)

            batch_size, num_agents, _, _ = x_abs.size()

            # Per-step displacements; the first step reuses the second one so
            # that x_rel keeps the same length as x_abs.
            x_rel = torch.zeros_like(x_abs)
            x_rel[:, :, 1:] = x_abs[:, :, 1:] - x_abs[:, :, :-1]
            x_rel[:, :, 0] = x_rel[:, :, 1]

            y_pred = model(x_abs, x_rel)

            if opts["pred_rel"]:
                # Accumulate displacements along the time axis and anchor them
                # at the last observed position.
                cur_pos = x_abs[:, :, [-1]].unsqueeze(2)
                y_pred = torch.cumsum(y_pred, dim=3) + cur_pos

            # Distance per candidate and step, averaged over steps, then the
            # best candidate (dim=2) is kept for each agent.
            step_err = torch.norm(y_pred - y[:, :, None], dim=-1)
            ade = torch.min(torch.mean(step_err, dim=3), dim=2)[0].mean().item()
            final_err = torch.norm(y_pred[:, :, :, -1:] - y[:, :, None, -1:], dim=-1)
            fde = torch.min(torch.mean(final_err, dim=3), dim=2)[0].mean().item()

            # Re-weight the batch means by the number of agent slots so that a
            # smaller last batch does not get the same weight as a full one.
            slots = num_agents * batch_size
            ade_sum += ade * slots
            fde_sum += fde * slots
            total_agents += slots

    if total_agents == 0:
        raise ValueError("test_loader produced no samples; cannot compute ADE/FDE.")

    ade_avg = (ade_sum / total_agents) * opts["scale"]
    fde_avg = (fde_sum / total_agents) * opts["scale"]

    print(f"[INFO] Evaluation Results: ADE = {ade_avg:.4f}, FDE = {fde_avg:.4f}")
    return ade_avg, fde_avg


In [None]:
import torch
import numpy as np

class TrajectoryDataset(Dataset):
    """
    Dataset of multi-agent trajectories loaded from a pickle file.

    The pickle is expected to hold an iterable of arrays indexed as
    (agents, time, coords). Each item is split into an observed part
    (first ``obs_len`` steps) and a future part (the remaining steps), scaled
    by ``scale`` and zero-padded along the agent axis up to ``max_agents``.

    Args:
        obs_len (int): Number of observed time steps.
        pred_len (int): Number of future time steps (stored; the future part
            returned by ``__getitem__`` is everything after ``obs_len``).
        mode (str): Dataset split. ``'train'`` additionally adds a copy of
            every scene flipped along axis 1. Also selects the default file.
        scale (float): Factor applied to the coordinates in ``__getitem__``.
        inputs: Container of input feature names. When it contains both
            ``'pos_x'`` and ``'pos_y'`` every scene is shifted so that each
            agent starts at the origin. ``None`` is treated as empty.
        max_agents (int): Agent count to pad to. Scenes with more agents are
            returned unpadded and untruncated.
        data_path (str, optional): Pickle file to read. Defaults to
            ``'files/sdd_<mode>.pkl'``.
    """

    def __init__(
        self, obs_len=8, pred_len=12, mode='train', scale=10, inputs=None, max_agents=50,
        data_path=None
    ):
        super(TrajectoryDataset, self).__init__()

        self.obs_len = obs_len
        self.pred_len = pred_len
        self.seq_len = self.obs_len + self.pred_len
        self.scale = scale
        self.max_agents = max_agents

        # The split name selects the file unless an explicit path is given.
        if data_path is None:
            data_path = 'files/sdd_{}.pkl'.format(mode)

        # SECURITY: pickle.load can execute arbitrary code - only open files
        # that come from a trusted source.
        with open(data_path, 'rb') as f:
            traj = pickle.load(f)

        scenes = []
        for t in traj:
            scenes.append(t)
            if mode == 'train':
                # np.flip returns a view; copy it so the augmented scene does
                # not share memory with the original one.
                scenes.append(np.flip(t, axis=1).copy())

        requested = () if inputs is None else inputs
        if 'pos_x' in requested and 'pos_y' in requested:
            # Out-of-place subtraction: the loaded arrays stay untouched and
            # no scene can be modified through another one.
            self.traj = [t - t[:, :1, :] for t in scenes]
        else:
            self.traj = scenes

    def __len__(self):
        """Return the number of scenes (including flipped copies in train mode)."""
        return len(self.traj)

    def __getitem__(self, index):
        """
        Return ``[past_traj, future_traj]`` for one scene as float tensors.

        Both tensors are scaled by ``self.scale`` and zero-padded along the
        agent axis when the scene has fewer than ``self.max_agents`` agents.
        """
        scene = self.traj[index]
        past_traj = scene[:, :self.obs_len] * self.scale
        future_traj = scene[:, self.obs_len:] * self.scale

        # Padding the number of agents to max_agents if necessary
        pad_size = self.max_agents - past_traj.shape[0]
        if pad_size > 0:
            agent_pad = ((0, pad_size), (0, 0), (0, 0))
            past_traj = np.pad(past_traj, agent_pad, mode='constant')
            future_traj = np.pad(future_traj, agent_pad, mode='constant')

        past_traj = torch.from_numpy(past_traj).type(torch.float)
        future_traj = torch.from_numpy(future_traj).type(torch.float)

        return [past_traj, future_traj]


In [None]:
from torch.utils.data import DataLoader
if __name__ == "__main__":
    # Read the YAML options and restore the model from the checkpoint.
    opts = load_config(CONFIG_PATH)
    model = load_model(CHECKPOINT_PATH, opts)

    # Prepare the test dataset and DataLoader
    test_dataset = TrajectoryDataset(
        obs_len=opts["past_length"],
        pred_len=opts["future_length"],
        mode="test",  # no flipped copies are added outside 'train' mode
        scale=opts["scale"],
        inputs=opts["inputs"]
    )
    # shuffle=False keeps the evaluation order fixed between runs.
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

    print("[INFO] Starting evaluation with test data...")
    evaluate_model(model, test_loader, opts)

[INFO] Loading configuration from files/mart_sdd_reproduce.yaml...
[INFO] Configuration loaded successfully!
[INFO] Loading model from checkpoint...
[INFO] PRT Agg: cat
[INFO] HRT Agg: avg
[INFO] Binary Threshold Function Type: 2
[INFO] Model loaded successfully!
[INFO] Starting evaluation with test data...
[INFO] Starting evaluation...




[INFO] Evaluation Results: ADE = 0.7582, FDE = 1.2163


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from copy import deepcopy
from collections import OrderedDict
import yaml
from types import SimpleNamespace
import importlib
import warnings
import os

In [None]:
# Import the updated TrajectoryDataset and seq_collate
# NOTE: this rebinds the name TrajectoryDataset, replacing the class defined
# earlier in this notebook for every cell that runs afterwards.
import dataloader_eth  # Replace with the correct module name if different
importlib.reload(dataloader_eth)  # pick up edits to the module without restarting the kernel
from dataloader_eth import TrajectoryDataset, seq_collate

In [None]:
# Custom loss: MSE of the best of several candidate predictions per agent
class MinMSELoss(nn.Module):
    """
    Mean squared error of the closest candidate prediction.

    For every agent the squared error is averaged over coordinates and time
    steps for each candidate, and only the smallest value over the candidate
    axis is kept.

    Args:
        reduction (str): ``'mean'`` or ``'sum'`` reduce the per-agent minima to
            a scalar; any other value returns the unreduced
            ``[batch_size, max_agents]`` tensor.
    """

    def __init__(self, reduction='mean'):
        super(MinMSELoss, self).__init__()
        self.reduction = reduction
        self.mse = nn.MSELoss(reduction='none')

    def forward(self, inputs, targets):
        """
        Args:
            inputs: Tensor of shape [batch_size, max_agents, num_preds, pred_len, 2]
            targets: Tensor of shape [batch_size, max_agents, pred_len, 2]
        Returns:
            Scalar loss for 'mean' / 'sum', otherwise a tensor of shape
            [batch_size, max_agents].
        """
        # Expand the targets over the candidate axis explicitly. Handing
        # differently sized tensors to nn.MSELoss works through broadcasting
        # but makes PyTorch emit a size-mismatch UserWarning on every call.
        expanded_targets = targets.unsqueeze(2).expand_as(inputs)

        # Compute MSE loss without reduction
        mse = self.mse(inputs, expanded_targets)  # Shape: [batch_size, max_agents, num_preds, pred_len, 2]
        mse = mse.mean(dim=-1)  # Shape: [batch_size, max_agents, num_preds, pred_len]
        mse = mse.mean(dim=-1)  # Shape: [batch_size, max_agents, num_preds]

        # Select the minimum loss across predictions
        min_mse, _ = mse.min(dim=-1)  # Shape: [batch_size, max_agents]

        if self.reduction == 'mean':
            return min_mse.mean()
        elif self.reduction == 'sum':
            return min_mse.sum()
        else:
            return min_mse

In [None]:
# Load the model configuration as a plain Python object (a dict for a mapping-style YAML file)
with open('files/mart_sdd_reproduce.yaml', 'r') as file:
    config_dict = yaml.safe_load(file)

In [None]:
# Wrap the dictionary so options can be read as attributes (e.g. config.lr, config.past_length)
config = SimpleNamespace(**config_dict)

In [None]:
# Ensure the base model is loaded from the SDD checkpoint
from models.mart import MART  # Ensure this import is correct

def load_model(checkpoint_path, opts):
    """
    Build a MART model and restore its weights from a checkpoint file.

    Args:
        checkpoint_path (str): Path to a checkpoint written with ``torch.save``.
        opts (SimpleNamespace or dict): Model options. A dict is wrapped in a
            ``SimpleNamespace``; any other object is passed to ``MART`` as is.

    Returns:
        The ``MART`` instance with the checkpoint weights loaded. No device
        move is performed here.

    Raises:
        KeyError: If the checkpoint has neither a ``'model_state_dict'`` nor a
            ``'state_dict'`` entry.
    """
    print("[INFO] Loading model from checkpoint...")
    # Accept both option styles used in this notebook (namespace / plain dict).
    args = SimpleNamespace(**opts) if isinstance(opts, dict) else opts
    model = MART(args)
    try:
        checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=True)
    except TypeError:
        # torch.load without the weights_only keyword: fall back to a full
        # unpickle (only do this for trusted files). The FutureWarning filter
        # is scoped to this call instead of being installed process-wide.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=FutureWarning)
            checkpoint = torch.load(checkpoint_path, map_location='cpu')

    # Ensure the correct key is used based on how the model was saved
    for key in ('model_state_dict', 'state_dict'):
        if key in checkpoint:
            model.load_state_dict(checkpoint[key])
            break
    else:
        raise KeyError("Checkpoint does not contain 'model_state_dict' or 'state_dict'.")
    print("[INFO] Model loaded successfully!")
    return model

In [None]:
CHECKPOINT_PATH = "files/sdd_ckpt_best.pth"  # Ensure this path is correct
# Restore the checkpoint weights; this model is the starting point that is copied for fine-tuning.
base_model = load_model(CHECKPOINT_PATH, config)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
base_model.to(device)

In [None]:
# ETH-UCY subsets: subset name -> directory that holds its data files
datasets = {
    'eth': 'Data/eth',
    'hotel': 'Data/hotel',
    'univ': 'Data/univ',
    'zara1': 'Data/zara1',
    'zara2': 'Data/zara2',
}

In [None]:
# Build one shuffled DataLoader per subset whose directory is present;
# subsets with a missing directory are reported and left out.
data_loaders = {}
for subset, path in datasets.items():
    if os.path.isdir(path):
        dataset = TrajectoryDataset(
            args=config,
            data_dir=path,
            obs_len=config.past_length,
            pred_len=config.future_length,
            delim='\t',
            max_agents=60,  # upper bound on agents per scene used for padding
        )
        loader = DataLoader(
            dataset,
            batch_size=config.batch_size,
            shuffle=True,
            collate_fn=seq_collate,
            num_workers=2,
        )
        data_loaders[subset] = loader
    else:
        print(f"[ERROR] Directory not found: {path}. Please ensure the dataset is correctly downloaded and extracted.")

In [None]:
# Stop here when every subset directory was missing, so later cells never run on an empty set
if not data_loaders:
    raise ValueError("No valid dataset directories found. Please check your dataset paths.")

def fine_tune_model(model, data_loader, optimizer, criterion, num_epochs=10, pred_rel=None):
    """
    Fine-tune ``model`` on ``data_loader`` and return a copy of its weights.

    Args:
        model: Module called as ``model(inputs, x_rel)``; trained in place.
        data_loader: Iterable of batches, each a mapping with the keys
            ``'past_traj'`` and ``'future_traj'``.
        optimizer: Optimizer built over ``model``'s parameters.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar.
        num_epochs (int): Number of passes over ``data_loader``.
        pred_rel (bool, optional): Whether the model outputs per-step
            displacements that must be accumulated into absolute positions
            before the loss, exactly as the evaluation functions in this
            notebook do. When ``None`` the value is read from a module-level
            ``config.pred_rel`` if such an object exists, else ``False``.

    Returns:
        A deep copy of ``model.state_dict()`` taken after the last epoch.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    if pred_rel is None:
        pred_rel = bool(getattr(globals().get("config"), "pred_rel", False))

    # Run on the device the model already lives on rather than on a global.
    try:
        run_device = next(model.parameters()).device
    except StopIteration:
        run_device = torch.device("cpu")

    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        num_batches = 0
        for batch_idx, batch in enumerate(data_loader):
            # 'past_traj' and 'future_traj' are expected from the collate function
            inputs = batch['past_traj'].to(run_device)      # Shape: [batch_size, max_agents, obs_len, 2]
            targets = batch['future_traj'].to(run_device)   # Shape: [batch_size, max_agents, pred_len, 2]

            # Report the tensor shapes once instead of once per epoch.
            if epoch == 0 and batch_idx == 0:
                print(f"Epoch {epoch+1}, Batch {batch_idx+1}:")
                print(f"  Inputs shape: {inputs.shape}")
                print(f"  Targets shape: {targets.shape}")

            # Per-step displacements; the first step reuses the second one.
            x_rel = torch.zeros_like(inputs)
            x_rel[:, :, 1:] = inputs[:, :, 1:] - inputs[:, :, :-1]
            x_rel[:, :, 0] = x_rel[:, :, 1]

            # Forward pass
            outputs = model(inputs, x_rel)

            if pred_rel:
                # Turn displacements into absolute positions anchored at the
                # last observed step so they are comparable with the targets.
                cur_pos = inputs[:, :, [-1]].unsqueeze(2)
                outputs = torch.cumsum(outputs, dim=3) + cur_pos

            # Compute loss
            loss = criterion(outputs, targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("data_loader produced no batches; nothing to fine-tune on.")
        avg_loss = epoch_loss / num_batches
        print(f"Epoch [{epoch+1}/{num_epochs}] Loss: {avg_loss:.4f}")
    # Deep copy so later training of the same model cannot alter the result.
    return deepcopy(model.state_dict())


In [None]:
def average_weights(weight_list):
    """
    Average several state dicts entry by entry.

    Args:
        weight_list (list): State dicts (mappings name -> tensor) that share
            the keys of the first element and have matching tensor shapes.

    Returns:
        OrderedDict: For every key of the first state dict, the element-wise
        mean over all state dicts. Floating-point and complex entries keep
        their dtype; other entries (for example integer counters) are averaged
        in float64, rounded and cast back to their original dtype, because
        ``torch.mean`` does not accept them directly.

    Raises:
        ValueError: If ``weight_list`` is empty.
    """
    if not weight_list:
        raise ValueError("weight_list is empty; nothing to average.")

    avg_weights = OrderedDict()
    for key, reference in weight_list[0].items():
        # Stack weights for the current key from all models and compute the mean
        stacked = torch.stack([weights[key] for weights in weight_list])
        if stacked.is_floating_point() or stacked.is_complex():
            avg_weights[key] = torch.mean(stacked, dim=0)
        else:
            averaged = torch.mean(stacked.to(torch.float64), dim=0)
            avg_weights[key] = averaged.round().to(reference.dtype)
    return avg_weights

# Collects one fine-tuned state dict per subset; the entries are averaged afterwards.
fine_tuned_weights = []


In [None]:
# Fine-tune on each subset separately
# Start from an empty list every time this cell runs: appending to a list
# created in another cell would duplicate state dicts on a re-run and skew
# the weight average.
fine_tuned_weights = []
for subset_name, data_loader in data_loaders.items():
    print(f"\nFine-tuning on subset: {subset_name}")
    # Create a fresh copy of the base model for each subset so that the
    # subsets are fine-tuned independently of each other
    model_copy = deepcopy(base_model)
    model_copy.to(device)

    # Define a separate optimizer for each model copy
    optimizer = torch.optim.Adam(model_copy.parameters(), lr=config.lr)

    # Loss over the best candidate prediction of every agent
    criterion = MinMSELoss(reduction='mean')

    # Fine-tune the model copy on the current subset
    state_dict = fine_tune_model(model_copy, data_loader, optimizer, criterion, num_epochs=config.num_epochs)

    # Collect the fine-tuned weights
    fine_tuned_weights.append(state_dict)
    print(f"Finished fine-tuning on subset: {subset_name}")


if not fine_tuned_weights:
    raise ValueError("No models were fine-tuned. Please check your data loaders and dataset directories.")

In [None]:
# Perform weight averaging across all fine-tuned models
print("\nAveraging weights from all fine-tuned models...")
final_weights = average_weights(fine_tuned_weights)

# Load the averaged weights into the base model (this overwrites the weights base_model held before)
base_model.load_state_dict(final_weights)

# Save the fine-tuned and averaged model; the weights are stored under the 'model_state_dict' key
torch.save({'model_state_dict': base_model.state_dict()}, 'files/eth_ucy_finetuned.pth')
print("[INFO] Fine-tuned model saved successfully at 'files/eth_ucy_finetuned.pth'.")


[INFO] Loading model from checkpoint...
[INFO] PRT Agg: cat
[INFO] HRT Agg: avg
[INFO] Binary Threshold Function Type: 2
[INFO] Model loaded successfully!
Processing Data .....


100%|██████████| 253/253 [00:00<00:00, 1551.58it/s]


Processing Data .....


100%|██████████| 445/445 [00:00<00:00, 1185.22it/s]


Processing Data .....


100%|██████████| 320/320 [00:00<00:00, 1171.66it/s]


Processing Data .....


100%|██████████| 705/705 [00:00<00:00, 1033.40it/s]


Processing Data .....


100%|██████████| 695/695 [00:01<00:00, 612.26it/s]


Fine-tuning on subset: eth





Epoch 1, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Targets shape: torch.Size([64, 60, 12, 2])


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/300] Loss: 0.3578
Epoch 2, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Targets shape: torch.Size([64, 60, 12, 2])
Epoch [2/300] Loss: 0.1949
Epoch 3, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Targets shape: torch.Size([64, 60, 12, 2])
Epoch [3/300] Loss: 0.1504
Epoch 4, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Targets shape: torch.Size([64, 60, 12, 2])
Epoch [4/300] Loss: 0.1053
Epoch 5, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Targets shape: torch.Size([64, 60, 12, 2])
Epoch [5/300] Loss: 0.0786
Epoch 6, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Targets shape: torch.Size([64, 60, 12, 2])
Epoch [6/300] Loss: 0.0712
Epoch 7, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Targets shape: torch.Size([64, 60, 12, 2])
Epoch [7/300] Loss: 0.0570
Epoch 8, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Targets shape: torch.Size([64, 60, 12, 2])
Epoch [8/300] Loss: 0.0466
Epoch 9, Batch 1:
  Inputs shape: torch.Si

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 2, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Targets shape: torch.Size([64, 60, 12, 2])
Epoch [2/300] Loss: 0.1547
Epoch 3, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Targets shape: torch.Size([64, 60, 12, 2])
Epoch [3/300] Loss: 0.0821
Epoch 4, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Targets shape: torch.Size([64, 60, 12, 2])
Epoch [4/300] Loss: 0.0738
Epoch 5, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Targets shape: torch.Size([64, 60, 12, 2])
Epoch [5/300] Loss: 0.0769
Epoch 6, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Targets shape: torch.Size([64, 60, 12, 2])
Epoch [6/300] Loss: 0.0654
Epoch 7, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Targets shape: torch.Size([64, 60, 12, 2])
Epoch [7/300] Loss: 0.0626
Epoch 8, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Targets shape: torch.Size([64, 60, 12, 2])
Epoch [8/300] Loss: 0.0680
Epoch 9, Batch 1:
  Inputs shape: torch.Size([64, 60, 8, 2])
  Target

In [None]:
def evaluate_finetuned_model(model, test_loader, opts):
    """
    Compute minimum-over-candidates ADE and FDE of a fine-tuned model.

    Args:
        model: Module called as ``model(inputs, x_rel)``. It is switched to
            eval mode and evaluated on the device it already lives on.
        test_loader: Iterable of batches, each a mapping with the keys
            ``'past_traj'`` and ``'future_traj'``. ``'past_traj'`` must be 4-D
            (batch, agents, time, coords) with at least two time steps.
        opts: Object with the attributes ``pred_rel`` (model output is
            per-step displacements that have to be accumulated) and ``scale``
            (factor applied to the averaged errors).

    Returns:
        tuple[float, float]: ``(ade_avg, fde_avg)`` after scaling.

    Raises:
        ValueError: If ``test_loader`` yields no batches.

    Note:
        Every agent slot of every batch is weighted equally.
        NOTE(review): if the dataset zero-pads agent slots, those slots are
        part of the average too - confirm that this is intended.
    """
    print("[INFO] Starting evaluation of the fine-tuned model...")
    ade_sum, fde_sum, total_agents = 0.0, 0.0, 0

    # Follow the model's own device instead of relying on a notebook global.
    try:
        run_device = next(model.parameters()).device
    except StopIteration:
        run_device = torch.device("cpu")

    model.eval()
    with torch.no_grad():
        for sample in test_loader:
            inputs = sample['past_traj'].to(run_device)      # Shape: [batch_size, max_agents, obs_len, 2]
            targets = sample['future_traj'].to(run_device)   # Shape: [batch_size, max_agents, pred_len, 2]

            batch_size, num_agents, _, _ = inputs.size()

            # Per-step displacements; the first step reuses the second one.
            x_rel = torch.zeros_like(inputs)
            x_rel[:, :, 1:] = inputs[:, :, 1:] - inputs[:, :, :-1]
            x_rel[:, :, 0] = x_rel[:, :, 1]

            y_pred = model(inputs, x_rel)

            if opts.pred_rel:
                # Accumulate displacements along the time axis and anchor them
                # at the last observed position.
                cur_pos = inputs[:, :, [-1]].unsqueeze(2)
                y_pred = torch.cumsum(y_pred, dim=3) + cur_pos

            # Distance per candidate and step, averaged over steps, then the
            # best candidate (dim=2) is kept for each agent.
            step_err = torch.norm(y_pred - targets[:, :, None], dim=-1)
            ade = torch.min(torch.mean(step_err, dim=3), dim=2)[0].mean().item()
            final_err = torch.norm(y_pred[:, :, :, -1:] - targets[:, :, None, -1:], dim=-1)
            fde = torch.min(torch.mean(final_err, dim=3), dim=2)[0].mean().item()

            # Re-weight the batch means by the number of agent slots.
            slots = num_agents * batch_size
            ade_sum += ade * slots
            fde_sum += fde * slots
            total_agents += slots

    if total_agents == 0:
        raise ValueError("test_loader produced no samples; cannot compute ADE/FDE.")

    ade_avg = (ade_sum / total_agents) * opts.scale
    fde_avg = (fde_sum / total_agents) * opts.scale

    print(f"[INFO] Fine-Tuned Model Evaluation Results: ADE = {ade_avg:.4f}, FDE = {fde_avg:.4f}")
    return ade_avg, fde_avg

# Prepare the test DataLoader
test_dataset = TrajectoryDataset(
    args=config,
    data_dir='Data/eth_ucy_test',  # Ensure this is a directory containing test `.txt` files
    obs_len=config.past_length,
    pred_len=config.future_length,
    delim='\t',
    max_agents=50,  # NOTE(review): the training loaders in this notebook use max_agents=60 - confirm 50 is intended
)
test_loader = DataLoader(
    test_dataset,
    batch_size=config.batch_size,
    shuffle=False,  # fixed order so repeated evaluations see the batches in the same sequence
    collate_fn=seq_collate,
    num_workers=2,
)

# Evaluate the fine-tuned model (base_model holds the averaged weights once the averaging cell has run)
evaluate_finetuned_model(base_model, test_loader, config)
