In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torchdiffeq import odeint

In [2]:
# Quick sanity check of the GPU setup before any training code runs
print(torch.cuda.device_count())  # number of CUDA devices PyTorch can see
print(torch.version.cuda)  # CUDA version reported by the installed PyTorch build (the recorded run printed 12.8, not 12.6)

1
12.8


In [None]:
# Dynamics function for the Neural ODE: a small MLP that maps a state x to dx/dt.
class ODEFunc(nn.Module):
    """Vector field f(t, x) of the ODE, parameterised by a two-layer MLP.

    The network maps ``hidden_dim -> width -> hidden_dim`` with a ReLU in
    between, so the returned derivative always has the same trailing
    dimension as the state it was computed from.

    Args:
        hidden_dim: Size of the state vector (input and output width).
        width: Number of units in the single hidden layer. Defaults to 50,
            the value that was previously hard-coded, so existing callers
            that only pass ``hidden_dim`` build exactly the same network.
    """

    def __init__(self, hidden_dim, width=50):
        super().__init__()

        # The Sequential layout (Linear, ReLU, Linear) and the attribute name
        # `net` are kept so state_dict keys stay `net.0.*` / `net.2.*`.
        self.net = nn.Sequential(
            nn.Linear(hidden_dim, width),   # state -> hidden features
            nn.ReLU(),
            nn.Linear(width, hidden_dim),   # hidden features -> derivative
        )

    def forward(self, t, x):
        """Evaluate the derivative at state ``x``.

        Args:
            t: Current integration time. It is part of the signature the ODE
                solver calls with, but this vector field does not depend on
                it and the value is ignored.
            x: State tensor whose last dimension is ``hidden_dim``.

        Returns:
            Tensor with the same shape as ``x`` holding ``self.net(x)``.
        """
        return self.net(x)

In [4]:
print(torch.cuda.is_available())  # True when PyTorch can use a CUDA device; the setup cell falls back to "cpu" otherwise

True


In [11]:
# Define the ODE block that integrates the ODE function
class ODEBlock(nn.Module):
    """Integrate ``odefunc`` over a time grid and return the final state.

    Args:
        odefunc: Module called by the solver as ``odefunc(t, x)`` and
            returning the derivative of the state (e.g. an ``ODEFunc``).
        t: Tensor of time points handed to ``odeint``. When omitted, a fresh
            ``tensor([0., 1.])`` is created for this instance.
    """

    def __init__(self, odefunc, t=None):
        super().__init__()
        self.odefunc = odefunc

        # A tensor used directly as the default argument is created once, at
        # function-definition time, and would then be registered as the buffer
        # of EVERY block built without an explicit `t`. Any in-place update of
        # one block's buffer (load_state_dict copies into it, for instance)
        # would silently change the time grid of all the others. Allocating
        # the default here gives each instance its own storage.
        if t is None:
            t = torch.tensor([0., 1.])

        # Buffer, not Parameter: it follows the module across `.to(device)`
        # and is saved in the state_dict, but is never trained.
        self.register_buffer('t', t)

    def forward(self, x):
        """Solve the ODE starting from ``x`` and return the last solution.

        Args:
            x: Initial state passed to the solver as the value at ``t[0]``.

        Returns:
            The last entry along the first (time) axis of the ``odeint``
            result, i.e. the state at the final time point in ``self.t``.
        """
        out = odeint(self.odefunc, x, self.t)
        return out[-1]

In [None]:
# Define the full Neural ODE model for MNIST
class NeuralODEMNIST(nn.Module):
    """CNN feature extractor -> linear projection -> ODE block -> classifier.

    The layer sizes assume single-channel 28x28 inputs: two 2x2 max-pools
    reduce 28x28 to 7x7, and the last convolution has 32 channels, which is
    where the ``32 * 7 * 7`` input width of ``fc_in`` comes from.

    Args:
        hidden_dim: Dimensionality of the state evolved by the ODE block.
    """

    def __init__(self, hidden_dim=64):
        super().__init__()

        # Convolutional feature extractor (padding=1 keeps spatial size, each
        # pool halves it): 1x28x28 -> 16x14x14 -> 32x7x7
        self.conv = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16, 32, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        # Project the flattened feature map into the ODE state space
        self.fc_in = nn.Linear(32 * 7 * 7, hidden_dim)

        # Continuous-depth block operating on the hidden state
        self.odeblock = ODEBlock(ODEFunc(hidden_dim))

        # Classifier head: hidden state -> 10 digit logits
        self.fc_out = nn.Linear(hidden_dim, 10)

    def forward(self, x):
        """Compute class logits for a batch of images.

        Args:
            x: Image batch; the layer sizes above require shape
                ``(N, 1, 28, 28)``.

        Returns:
            Tensor of shape ``(N, 10)`` with unnormalised class scores.
        """
        x = self.conv(x)
        # flatten() rather than view(): view raises when the pooled features
        # are not contiguous (e.g. channels_last inputs), while flatten copies
        # only if it has to and yields the same values in the same order.
        x = torch.flatten(x, 1)
        x = self.fc_in(x)
        x = self.odeblock(x)
        x = self.fc_out(x)
        return x

In [None]:
# Set up training parameters
batch_size = 64
learning_rate = 0.001
epochs = 5

# Seed PyTorch's global RNG so that a fresh "Restart & Run All" starts from the
# same weight initialisation and the same shuffling order. Some GPU kernels can
# still be nondeterministic, so losses may differ in the last digits.
seed = 42
torch.manual_seed(seed)

# Define transforms for the MNIST data
transform = transforms.Compose([
    transforms.ToTensor(),                      # PIL image -> float tensor
    transforms.Normalize((0.1307,), (0.3081,))  # Normalize using MNIST mean and std
])

In [None]:
# Fetch MNIST into ./data (downloaded on first use) for both splits
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
test_dataset = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)

# Only the training split is shuffled; the test split keeps a fixed order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Use the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Build the network on that device, then the loss and the optimizer that
# updates the network's parameters
model = NeuralODEMNIST().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
print(model)

cuda
NeuralODEMNIST(
  (conv): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc_in): Linear(in_features=1568, out_features=64, bias=True)
  (odeblock): ODEBlock(
    (odefunc): ODEFunc(
      (net): Sequential(
        (0): Linear(in_features=64, out_features=50, bias=True)
        (1): ReLU()
        (2): Linear(in_features=50, out_features=64, bias=True)
      )
    )
  )
  (fc_out): Linear(in_features=64, out_features=10, bias=True)
)


In [None]:
# Optimise the network for `epochs` full passes over the training set
model.train()  # training mode for everything that follows
for epoch in range(1, epochs + 1):
    total_loss = 0.0  # running sum of the per-batch losses in this epoch

    for batch_idx, (data, target) in enumerate(train_loader):
        # Inputs and labels must live on the same device as the model
        data = data.to(device)
        target = target.to(device)

        # One optimisation step: reset grads, forward, loss, backward, update
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Progress line on batches 0, 100, 200, ...
        if not batch_idx % 100:
            print(f"Epoch [{epoch}/{epochs}], Batch [{batch_idx}/{len(train_loader)}], Loss: {loss.item():.4f}")

    # Mean of the batch losses; every batch carries the same weight
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch}/{epochs}] Average Loss: {avg_loss:.4f}")

Epoch [1/5], Batch [0/938], Loss: 2.3122
Epoch [1/5], Batch [100/938], Loss: 0.2021
Epoch [1/5], Batch [200/938], Loss: 0.1393
Epoch [1/5], Batch [300/938], Loss: 0.0466
Epoch [1/5], Batch [400/938], Loss: 0.1049
Epoch [1/5], Batch [500/938], Loss: 0.0866
Epoch [1/5], Batch [600/938], Loss: 0.0269
Epoch [1/5], Batch [700/938], Loss: 0.0547
Epoch [1/5], Batch [800/938], Loss: 0.0582
Epoch [1/5], Batch [900/938], Loss: 0.0454
Epoch [1/5] Average Loss: 0.1459
Epoch [2/5], Batch [0/938], Loss: 0.0392
Epoch [2/5], Batch [100/938], Loss: 0.0288
Epoch [2/5], Batch [200/938], Loss: 0.0717
Epoch [2/5], Batch [300/938], Loss: 0.0079
Epoch [2/5], Batch [400/938], Loss: 0.1161
Epoch [2/5], Batch [500/938], Loss: 0.0725
Epoch [2/5], Batch [600/938], Loss: 0.0475
Epoch [2/5], Batch [700/938], Loss: 0.0198
Epoch [2/5], Batch [800/938], Loss: 0.0123
Epoch [2/5], Batch [900/938], Loss: 0.0637
Epoch [2/5] Average Loss: 0.0498
Epoch [3/5], Batch [0/938], Loss: 0.0095
Epoch [3/5], Batch [100/938], Loss: 0

In [None]:
# Measure classification accuracy on the held-out test split
model.eval()  # switch the model to evaluation mode
correct = 0   # number of correctly classified samples so far
total = 0     # number of samples seen so far

# No gradients are needed for scoring, so autograd is turned off
with torch.no_grad():
    for data, target in test_loader:
        data = data.to(device)
        target = target.to(device)

        output = model(data)
        pred = output.argmax(dim=1)  # predicted class = index of the largest logit

        correct += pred.eq(target).sum().item()
        total += target.shape[0]

print(f"Test Accuracy: {100 * correct / total:.2f}%")  # percentage over the whole test set

Test Accuracy: 98.91%
