In [22]:
%pip install mlflow
%pip install torch
%pip install torchvision

from time import time
from typing import List, Dict, Any, Tuple

import mlflow
import torch
from torch.utils.data.dataloader import DataLoader
from torch import nn
from torch import optim
from torchvision import datasets, transforms

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [23]:
def load_images(batch_size: int) -> Tuple[DataLoader, DataLoader, int, int, float]:
    """Download MNIST (if necessary) and wrap both splits in DataLoaders.

    Args:
        batch_size: Number of images per batch, used for both loaders.

    Returns:
        A 5-tuple ``(train_loader, test_loader, train_size, test_size,
        load_time_sec)``. The sizes are the number of samples in each dataset
        and ``load_time_sec`` is the wall-clock time spent in this function,
        which includes any download that had to happen.
    """
    # Start of load time.
    start_time = time()
    print("Loading images")

    # Convert images to tensors, then normalize with mean 0.5 and std 0.5.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ])

    # Download (when not already present under ./mnistdata) and load both splits.
    train_dataset = datasets.MNIST('./mnistdata', download=True, train=True, transform=transform)
    test_dataset = datasets.MNIST('./mnistdata', download=True, train=False, transform=transform)

    # Training batches are reshuffled every epoch.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # Evaluation only counts correct predictions, so sample order is irrelevant
    # and shuffling the test split would be wasted work.
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader, len(train_dataset), len(test_dataset), (time() - start_time)

In [24]:
class MNISTModel(nn.Module):
    """Fully connected classifier: two ReLU hidden layers, log-softmax output."""

    def __init__(self, input_size: int, hidden_sizes: List[int], output_size: int):
        """Build the three linear layers and the two activations.

        Args:
            input_size: Number of features in each input vector.
            hidden_sizes: Widths of the hidden layers; only the first two
                entries are read.
            output_size: Number of output classes.
        """
        super().__init__()

        first_width = hidden_sizes[0]
        self.lin1 = nn.Linear(input_size, first_width)
        second_width = hidden_sizes[1]
        self.lin2 = nn.Linear(first_width, second_width)
        self.lin3 = nn.Linear(second_width, output_size)
        self.activation = nn.ReLU()
        self.output_activation = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return log-probabilities (log-softmax over dim 1) for input ``x``."""
        hidden = self.activation(self.lin1(x))
        hidden = self.activation(self.lin2(hidden))
        return self.output_activation(self.lin3(hidden))

In [25]:
def train_model(model: nn.Module, loader: DataLoader, params: Dict[str, Any]) -> Dict[str, Any]:
    """Train ``model`` with SGD + NLL loss and log the per-epoch loss to MLflow.

    Args:
        model: Network to train in place. Because ``nn.NLLLoss`` is used, its
            output is expected to be log-probabilities.
        loader: Iterable of ``(images, labels)`` batches that also supports
            ``len()``. Each image batch is flattened to ``(batch, -1)`` before
            being fed to the model.
        params: Must contain ``'lr'``, ``'momentum'`` and ``'epochs'``.

    Returns:
        ``{'training_time_sec': <wall-clock seconds spent in this function>}``.

    Raises:
        ZeroDivisionError: If ``loader`` is empty and ``epochs`` is positive,
            because the mean loss is computed as ``total_loss / len(loader)``.
    """
    start_time = time()
    loss_func = nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=params['lr'], momentum=params['momentum'])
    training_metrics = {}

    # Force training mode: the model may have been switched to eval mode by an
    # earlier evaluation (e.g. when this cell is re-run in the same kernel).
    model.train()

    for epoch in range(params['epochs']):
        total_loss = 0
        for images, labels in loader:
            # Flatten every image into a single feature vector per sample.
            images = images.view(images.shape[0], -1)

            # Gradients accumulate by default, so clear them for each batch.
            optimizer.zero_grad()

            output = model(images)
            loss = loss_func(output, labels)

            # Backpropagate, then let the optimizer update the weights.
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Plain post-loop code: the original `for ... else` always ran its
        # else-branch (there is no `break`), which only obscured the intent.
        mean_loss = total_loss / len(loader)
        mlflow.log_metric('training_loss', mean_loss, step=epoch + 1)
        print("Epoch {} - Training loss: {}".format(epoch + 1, mean_loss))

    training_time_sec = (time() - start_time)
    training_metrics['training_time_sec'] = training_time_sec
    print("\nTraining Time (in seconds) =", training_time_sec)
    return training_metrics

In [26]:
def test_model(model: MNISTModel, loader: DataLoader) -> Dict[str, Any]:
    correct_count, total_count = 0, 0
    for images,labels in loader:
        for i in range(len(labels)):
            img = images[i].view(1, 784)
            # Turn off gradients to speed up this part
            with torch.no_grad():
                logps = model(img)

            # Output of the network are log-probabilities, need to take exponential for probabilities
            ps = torch.exp(logps)
            probab = list(ps.numpy()[0])
            pred_label = probab.index(max(probab))
            true_label = labels.numpy()[i]
            if(true_label == pred_label):
                correct_count += 1
            total_count += 1
    
    testing_metrics = {
        'incorrect_count': total_count-correct_count,
        'correct_count': correct_count,
        'accuracy': (correct_count/total_count)
    }
    print("Number Of Images Tested =", total_count)
    print("\nModel Accuracy =", (correct_count/total_count))
    return testing_metrics

In [28]:
# Setup parameters
params = {
    'batch_size': 64,
    'epochs': 5,
    'input_size': 784,
    'hidden_sizes': [128, 64],
    'lr': 0.035,
    'momentum': 0.5,
    'output_size': 10
    }

# Setup mlflow to point to our server.
run_name = f'Learning rate={params["lr"]}'
mlflow.set_tracking_uri('http://localhost:5000/')
mlflow.set_experiment('MNIST 3-layer network2')

# Use the run as a context manager so it is always ended, even when a step
# below raises. With a bare start_run()/end_run() pair, a failure would leave
# the run active and break the next execution of this cell.
with mlflow.start_run(run_name=run_name):
    # Log parameters
    mlflow.log_params(params)

    # Load the data and log loading metrics.
    train_loader, test_loader, train_size, test_size, load_time_sec = load_images(params['batch_size'])
    mlflow.log_metric('train_size', train_size)
    mlflow.log_metric('test_size', test_size)
    mlflow.log_metric('load_time_sec', load_time_sec)

    # Train the model and log training metrics.
    model = MNISTModel(params['input_size'], params['hidden_sizes'], params['output_size'])
    training_metrics = train_model(model, train_loader, params)
    mlflow.log_metrics(training_metrics)

    # Test the model and log the accuracy as a metric.
    testing_metrics = test_model(model, test_loader)
    mlflow.log_metrics(testing_metrics)

    # Log the raw data and the trained model as artifacts.
    mlflow.log_artifacts('./mnistdata', artifact_path='mnistdata')
    mlflow.pytorch.log_model(model, artifact_path='mnistmodel')

Loading images
Epoch 1 - Training loss: 0.45849590311680777
Epoch 2 - Training loss: 0.20103183534862137
Epoch 3 - Training loss: 0.14401927826675906
Epoch 4 - Training loss: 0.11503322042826651
Epoch 5 - Training loss: 0.09628694527137127

Training Time (in seconds) = 49.57977652549744
Number Of Images Tested = 10000

Model Accuracy = 0.9674




🏃 View run Learning rate=0.035 at: http://localhost:5000/#/experiments/2/runs/75a63a8441554366852239cc4f1be19e
🧪 View experiment at: http://localhost:5000/#/experiments/2
