In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import mlflow
import mlflow.pytorch
from mlflow.exceptions import MlflowException

# Define the CNN model
class CNNModel(nn.Module):
    """Two-stage convolutional classifier for small square images.

    The network is two ``Conv2d -> ReLU -> MaxPool2d(2, 2)`` stages (32 and
    64 filters) followed by a 256-unit hidden layer and a linear output layer
    that produces one logit per class.

    Args:
        in_channels: Number of channels in the input images.
        num_classes: Number of logits produced per sample.
        image_size: Height and width of the (square) input images. The 3x3
            convolutions use padding 1 and so keep the spatial size; each of
            the two pooling layers halves it (rounding down).

    Raises:
        ValueError: If ``image_size`` is too small to survive both pooling
            stages.

    The defaults reproduce the original fixed architecture: 3-channel
    32x32 inputs and 10 output classes.
    """

    def __init__(self, in_channels=3, num_classes=10, image_size=32):
        super().__init__()
        # Spatial side length left after the two 2x2 poolings.
        pooled_size = image_size // 4
        if pooled_size < 1:
            raise ValueError(
                f"image_size must be at least 4, got {image_size}"
            )

        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            # 64 feature maps of pooled_size x pooled_size each.
            nn.Linear(64 * pooled_size * pooled_size, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return the class logits for a batch of images ``x``."""
        features = self.conv_layers(x)
        return self.fc_layers(features)

# Load CIFAR-10 Dataset
def load_data(batch_size):
    """Create the CIFAR-10 training and test data loaders.

    Both splits are stored under ``./data`` (downloaded when requested by
    ``download=True``), converted to tensors and normalized per channel with
    mean 0.5 and standard deviation 0.5.

    Args:
        batch_size: Number of samples per batch for both loaders.

    Returns:
        A ``(train_loader, test_loader)`` tuple. Only the training loader
        shuffles its samples.
    """
    normalize = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    pipeline = transforms.Compose([transforms.ToTensor(), normalize])

    def build_loader(is_train):
        # The training split is the only one that gets shuffled.
        split = datasets.CIFAR10(
            root='./data', train=is_train, transform=pipeline, download=True
        )
        return DataLoader(split, batch_size=batch_size, shuffle=is_train)

    return build_loader(True), build_loader(False)

# Train the model
def train_model(model, train_loader, criterion, optimizer, device):
    """Run one training epoch and report the epoch's loss and accuracy.

    Args:
        model: Network to optimize; it is switched to training mode.
        train_loader: Iterable of ``(images, labels)`` batches.
        criterion: Loss callable applied to ``(outputs, labels)``. It is
            expected to return the mean loss over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)``: the loss averaged over all samples seen
        in the epoch, and the percentage of correctly classified samples.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    correct = 0
    total = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_count = labels.size(0)
        # Weight each batch mean by its size so that a shorter final batch
        # does not count as much as a full one in the epoch average.
        loss_sum += loss.item() * batch_count
        # detach() keeps the prediction out of the autograd graph without
        # resorting to the discouraged ``.data`` attribute.
        predicted = outputs.detach().argmax(dim=1)
        total += batch_count
        correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("train_loader yielded no samples")
    return loss_sum / total, 100 * correct / total

# Evaluate the model
def evaluate_model(model, test_loader, criterion, device):
    """Measure loss and accuracy of ``model`` over ``test_loader``.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        test_loader: Iterable of ``(images, labels)`` batches.
        criterion: Loss callable applied to ``(outputs, labels)``. It is
            expected to return the mean loss over the batch (the default
            reduction of ``nn.CrossEntropyLoss``).
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)``: the loss averaged over all evaluated
        samples, and the percentage of correctly classified samples.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            batch_count = labels.size(0)
            # Weight each batch mean by its size so that a shorter final
            # batch does not count as much as a full one in the average.
            loss_sum += criterion(outputs, labels).item() * batch_count
            predicted = outputs.argmax(dim=1)
            total += batch_count
            correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("test_loader yielded no samples")
    return loss_sum / total, 100 * correct / total

# Hyperparameter tuning and MLflow logging
def run_experiment(batch_size, learning_rate, num_epochs, device):
    """Train a fresh ``CNNModel`` and record the run in MLflow.

    The hyperparameters are logged as MLflow params, the train/test loss and
    accuracy are logged as metrics after every epoch (with the zero-based
    epoch index as the step), and the trained model is logged under the
    artifact path ``"cnn_model"`` once training finishes.

    Args:
        batch_size: Batch size passed to ``load_data``.
        learning_rate: Learning rate for the Adam optimizer.
        num_epochs: Number of train/evaluate cycles to run.
        device: Device the model and batches are placed on.
    """
    train_loader, test_loader = load_data(batch_size)
    model = CNNModel().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    run_params = {
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "num_epochs": num_epochs,
    }

    with mlflow.start_run():
        for param_name, param_value in run_params.items():
            mlflow.log_param(param_name, param_value)

        for epoch in range(num_epochs):
            train_loss, train_acc = train_model(
                model, train_loader, criterion, optimizer, device
            )
            test_loss, test_acc = evaluate_model(
                model, test_loader, criterion, device
            )

            progress = (
                f"Epoch {epoch + 1}/{num_epochs}, "
                f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, "
                f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%"
            )
            print(progress)

            epoch_metrics = {
                "train_loss": train_loss,
                "train_acc": train_acc,
                "test_loss": test_loss,
                "test_acc": test_acc,
            }
            for metric_name, metric_value in epoch_metrics.items():
                mlflow.log_metric(metric_name, metric_value, step=epoch)

        # Store the model as it stands after the last epoch.
        mlflow.pytorch.log_model(model, "cnn_model")
        print("Model logged in MLflow.")

# Main execution
if __name__ == "__main__":
    # Use the GPU when PyTorch reports one as available, otherwise the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # One entry per run; the keys match run_experiment's parameter names so
    # each entry can be unpacked directly into the call.
    hyperparams = [
        dict(batch_size=64, learning_rate=0.001, num_epochs=10),
        dict(batch_size=128, learning_rate=0.0005, num_epochs=10),
    ]

    for params in hyperparams:
        print(f"Running experiment with params: {params}")
        run_experiment(device=device, **params)


Running experiment with params: {'batch_size': 64, 'learning_rate': 0.001, 'num_epochs': 10}


0.1%

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data\cifar-10-python.tar.gz


100.0%


Extracting ./data\cifar-10-python.tar.gz to ./data
Files already downloaded and verified
Epoch 1/10, Train Loss: 1.3171, Train Acc: 52.51%, Test Loss: 1.0623, Test Acc: 62.37%
Epoch 2/10, Train Loss: 0.9384, Train Acc: 66.65%, Test Loss: 0.9015, Test Acc: 68.50%
Epoch 3/10, Train Loss: 0.7651, Train Acc: 73.08%, Test Loss: 0.8198, Test Acc: 71.55%
Epoch 4/10, Train Loss: 0.6253, Train Acc: 78.10%, Test Loss: 0.8118, Test Acc: 72.27%
Epoch 5/10, Train Loss: 0.4963, Train Acc: 82.66%, Test Loss: 0.8191, Test Acc: 73.40%
Epoch 6/10, Train Loss: 0.3819, Train Acc: 86.56%, Test Loss: 0.8627, Test Acc: 72.84%
Epoch 7/10, Train Loss: 0.2749, Train Acc: 90.65%, Test Loss: 0.9570, Test Acc: 73.29%
Epoch 8/10, Train Loss: 0.1946, Train Acc: 93.31%, Test Loss: 1.1124, Test Acc: 72.33%
Epoch 9/10, Train Loss: 0.1355, Train Acc: 95.41%, Test Loss: 1.3744, Test Acc: 71.00%
Epoch 10/10, Train Loss: 0.1081, Train Acc: 96.32%, Test Loss: 1.4060, Test Acc: 71.43%




Model logged in MLflow.
Running experiment with params: {'batch_size': 128, 'learning_rate': 0.0005, 'num_epochs': 10}
Files already downloaded and verified
Files already downloaded and verified
Epoch 1/10, Train Loss: 1.4895, Train Acc: 47.06%, Test Loss: 1.2274, Test Acc: 56.23%
Epoch 2/10, Train Loss: 1.1439, Train Acc: 59.81%, Test Loss: 1.0996, Test Acc: 61.31%
Epoch 3/10, Train Loss: 1.0050, Train Acc: 64.88%, Test Loss: 1.0025, Test Acc: 65.20%
Epoch 4/10, Train Loss: 0.8971, Train Acc: 68.72%, Test Loss: 0.9141, Test Acc: 67.85%
Epoch 5/10, Train Loss: 0.8166, Train Acc: 71.63%, Test Loss: 0.8791, Test Acc: 69.12%
Epoch 6/10, Train Loss: 0.7360, Train Acc: 74.58%, Test Loss: 0.8832, Test Acc: 69.39%
Epoch 7/10, Train Loss: 0.6684, Train Acc: 76.91%, Test Loss: 0.8576, Test Acc: 70.47%
Epoch 8/10, Train Loss: 0.6023, Train Acc: 79.33%, Test Loss: 0.8409, Test Acc: 71.33%
Epoch 9/10, Train Loss: 0.5379, Train Acc: 81.58%, Test Loss: 0.8571, Test Acc: 71.23%
Epoch 10/10, Train Los



Model logged in MLflow.
