In [1]:
from video_diffusion_pytorch import Unet3D, GaussianDiffusion, Trainer
import torch


In [2]:
from video_diffusion_pytorch import Unet3D

model = Unet3D(
    dim=64,
    dim_mults=(1, 2, 4),
    channels=6   # or 5, depending on how many spectral bands you use
)


In [3]:
from video_diffusion_pytorch import GaussianDiffusion
diffusion = GaussianDiffusion(
    model,
    image_size=128,
    num_frames=8,
    timesteps=1000,
    loss_type='l2',
    channels=6   # ✅ explicitly set channels here
)

In [4]:
# from video_diffusion_pytorch import GaussianDiffusion

# diffusion = GaussianDiffusion(
#     model,            # the U-Net we just created
#     image_size=128,         # input images must be resized to 128x128
#     num_frames=8,           # number of time steps (frames) in your video clip
#     timesteps=1000,         # number of diffusion steps (noise levels)
#     loss_type='l2'          # standard pixel-wise loss
# )


In [6]:
import h5py
import numpy as np
import torch

# Function to load a sequence of 8 frames from multiple .h5 files
def load_sequence(h5_file_paths, bands, sequence_length=8):
    """
    Load a sequence of 8 frames, each with stacked 6-channel data (VIS/WV/etc.)

    Args:
        h5_file_paths: list of 8 h5 file paths (1 per timestamp)
        bands: list of 6 band names to extract
        sequence_length: number of frames (default=8)

    Returns:
        A tensor of shape (8, 6, 128, 128)
    """
    sequence = []

    for path in h5_file_paths[:sequence_length]:
        with h5py.File(path, 'r') as f:
            frame = []
            for band in bands:
                data = f[band][0]  # shape: (128, 128)
                frame.append(data)
            frame = np.stack(frame)  # shape: (6, 128, 128)
            sequence.append(frame)

    sequence_np = np.stack(sequence)  # shape: (8, 6, 128, 128)
    return torch.tensor(sequence_np, dtype=torch.float32)


In [7]:
from torch.utils.data import Dataset, DataLoader

class CloudMotionDataset(Dataset):
    def __init__(self, h5_file_paths, bands, sequence_length=8):
        self.h5_file_paths = h5_file_paths
        self.bands = bands
        self.sequence_length = sequence_length

        # Ensure only full sequences are kept
        self.valid_start_indices = list(range(0, len(h5_file_paths) - sequence_length + 1))

    def __len__(self):
        return len(self.valid_start_indices)

    def __getitem__(self, idx):
        start_idx = self.valid_start_indices[idx]
        sequence_paths = self.h5_file_paths[start_idx : start_idx + self.sequence_length]
        return load_sequence(sequence_paths, self.bands, self.sequence_length)  # shape: (8, 6, 128, 128)


In [8]:
from google.colab import drive
import os

# 1. Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
import glob
import os
# Replace this path with your actual folder containing the .h5 files
folder_path = ''

# This will collect all .h5 files in sorted time order
all_h5_paths = sorted(glob.glob(folder_path + '*.h5'))

# Define spectral bands you want to use
band_list = ['IMG_VIS', 'IMG_WV', 'IMG_TIR1', 'IMG_TIR2', 'IMG_MIR', 'IMG_SWIR']

# Create dataset
dataset = CloudMotionDataset(all_h5_paths, band_list)


In [11]:
print(len(all_h5_paths))  # should be >= 8


9


In [13]:
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm

# Create DataLoader
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, num_workers=0)

# Optimizer
optimizer = torch.optim.Adam(diffusion.parameters(), lr=8e-5)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
diffusion.to(device)

# Training Loop
num_epochs = 10  # or define based on steps you want
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    for batch in tqdm(dataloader):
        batch = batch.to(device)  # shape: (B, 8, 6, 128, 128)

        # Reorder dims to (B, C, F, H, W) as expected by diffusion
        batch = batch.permute(0, 2, 1, 3, 4)  # (B, C, F, H, W)

        loss = diffusion(batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} loss: {loss.item():.6f}")


Epoch 1/10


100%|██████████| 1/1 [03:09<00:00, 189.74s/it]


Epoch 1 loss: 14341.846680
Epoch 2/10


100%|██████████| 1/1 [03:19<00:00, 199.34s/it]


Epoch 2 loss: 9011.032227
Epoch 3/10


100%|██████████| 1/1 [03:09<00:00, 189.65s/it]


Epoch 3 loss: 670.778076
Epoch 4/10


100%|██████████| 1/1 [02:34<00:00, 154.03s/it]


Epoch 4 loss: 1806.480591
Epoch 5/10


100%|██████████| 1/1 [03:15<00:00, 195.37s/it]


Epoch 5 loss: 1016.767517
Epoch 6/10


100%|██████████| 1/1 [02:36<00:00, 156.09s/it]


Epoch 6 loss: 1515.173706
Epoch 7/10


100%|██████████| 1/1 [02:54<00:00, 174.81s/it]


Epoch 7 loss: 870.424133
Epoch 8/10


100%|██████████| 1/1 [02:40<00:00, 160.17s/it]


Epoch 8 loss: 1256.790039
Epoch 9/10


100%|██████████| 1/1 [02:48<00:00, 168.31s/it]


Epoch 9 loss: 1062.807861
Epoch 10/10


100%|██████████| 1/1 [02:49<00:00, 169.29s/it]

Epoch 10 loss: 1550.648071





In [14]:
# Save the model and diffusion weights
torch.save({
    'model_state_dict': model.state_dict(),
    'diffusion_state_dict': diffusion.state_dict()
}, 'cloud_motion_diffusion.pth')


In [15]:
from video_diffusion_pytorch import Unet3D, GaussianDiffusion
import torch

# Recreate the model structure
model = Unet3D(
    dim=64,
    dim_mults=(1, 2, 4),
    channels=6  # must match training
)

# Recreate the diffusion wrapper
diffusion = GaussianDiffusion(
    model,
    image_size=128,
    num_frames=8,
    timesteps=1000
)

# Load the saved weights
checkpoint = torch.load('cloud_motion_diffusion.pth', map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
diffusion.load_state_dict(checkpoint['diffusion_state_dict'])

# Set to evaluation mode
model.eval()
diffusion.eval()

GaussianDiffusion(
  (denoise_fn): Unet3D(
    (time_rel_pos_bias): RelativePositionBias(
      (relative_attention_bias): Embedding(32, 8)
    )
    (init_conv): Conv3d(6, 64, kernel_size=(1, 7, 7), stride=(1, 1, 1), padding=(0, 3, 3))
    (init_temporal_attn): Residual(
      (fn): PreNorm(
        (fn): EinopsToAndFrom(
          (fn): Attention(
            (rotary_emb): RotaryEmbedding()
            (to_qkv): Linear(in_features=64, out_features=768, bias=False)
            (to_out): Linear(in_features=256, out_features=64, bias=False)
          )
        )
        (norm): LayerNorm()
      )
    )
    (time_mlp): Sequential(
      (0): SinusoidalPosEmb()
      (1): Linear(in_features=64, out_features=256, bias=True)
      (2): GELU(approximate='none')
      (3): Linear(in_features=256, out_features=256, bias=True)
    )
    (downs): ModuleList(
      (0): ModuleList(
        (0-1): 2 x ResnetBlock(
          (mlp): Sequential(
            (0): SiLU()
            (1): Linear(in_fea

In [26]:
import torch



# move to GPU if available

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

diffusion.to(device)



# define shape: (B, C, F, H, W)

sample_shape = (1, 6, 8, 128, 128)



# generate sample

with torch.no_grad():

    sample = diffusion.sample(shape=sample_shape).to('cpu')  # shape: (1, 6, 8, 128, 128)



# remove batch dim

sample_np = sample[0].numpy()  # shape: (6, 8, 128, 128)

TypeError: GaussianDiffusion.sample() got an unexpected keyword argument 'shape'