
## Saving and Loading Models
To load previously trained models to use in making predictions or to continue training on new data.

In [8]:

%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt

import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torchvision import datasets, transforms


# Define a transform to normalize the data
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.5,), (0.5,))])
# Download and load the training data
trainset = datasets.FashionMNIST('F_MNIST_data/', download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)

# Download and load the test data
testset = datasets.FashionMNIST('F_MNIST_data/', download=True, train=False, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=True)





In [14]:
#Creating model 
import torch
from torch import nn
import torch.nn.functional as F


class Network(nn.Module):
    def __init__(self, input_size, output_size, hidden_layers, drop_p=0.5):
        ''' Builds a feedforward network with arbitrary hidden layers.
        
            Arguments
            ---------
            input_size: integer, size of the input layer
            output_size: integer, size of the output layer
            hidden_layers: list of integers, the sizes of the hidden layers
        
        '''
        super().__init__()
        # Input to a hidden layer
        self.hidden_layers = nn.ModuleList([nn.Linear(input_size, hidden_layers[0])])
        
        # Add a variable number of more hidden layers
        layer_sizes = zip(hidden_layers[:-1], hidden_layers[1:])
        self.hidden_layers.extend([nn.Linear(h1, h2) for h1, h2 in layer_sizes])
        
        self.output = nn.Linear(hidden_layers[-1], output_size)
        
        self.dropout = nn.Dropout(p=drop_p)
        
    def forward(self, x):
        ''' Forward pass through the network, returns the output logits '''
        
        for each in self.hidden_layers:
            x = F.relu(each(x))
            x = self.dropout(x)
        x = self.output(x)
        
        return F.log_softmax(x, dim=1)


def validation(model, testloader, criterion):
    accuracy = 0
    test_loss = 0
    for images, labels in testloader:

        images = images.resize_(images.size()[0], 784)

        output = model.forward(images)
        test_loss += criterion(output, labels).item()

        ## Calculating the accuracy 
        # Model's output is log-softmax, take exponential to get the probabilities
        ps = torch.exp(output)
        # Class with highest probability is our predicted class, compare with true label
        equality = (labels.data == ps.max(1)[1])
        # Accuracy is number of correct predictions divided by all predictions, just take the mean
        accuracy += equality.type_as(torch.FloatTensor()).mean()

    return test_loss, accuracy


def train(model, trainloader, testloader, criterion, optimizer, epochs=5, print_every=40):
    
    steps = 0
    running_loss = 0
    for e in range(epochs):
        # Model in training mode, dropout is on
        model.train()
        for images, labels in trainloader:
            steps += 1
            
            # Flatten images into a 784 long vector
            images.resize_(images.size()[0], 784)
            
            optimizer.zero_grad()
            
            output = model.forward(images)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()

            if steps % print_every == 0:
                # Model in inference mode, dropout is off
                model.eval()
                
                # Turn off gradients for validation, will speed up inference
                with torch.no_grad():
                    test_loss, accuracy = validation(model, testloader, criterion)
                
                print("Epoch: {}/{}.. ".format(e+1, epochs),
                      "Training Loss: {:.3f}.. ".format(running_loss/print_every),
                      "Test Loss: {:.3f}.. ".format(test_loss/len(testloader)),
                      "Test Accuracy: {:.3f}".format(accuracy/len(testloader)))
                
                running_loss = 0
                
                # Make sure dropout and grads are on for training
                model.train()
                

                

# Create the network, define the criterion and optimizer
modelDeep = Network(784, 10, [512, 256, 128])
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [18]:
train(modelDeep, trainloader, testloader, criterion, optimizer)


Epoch: 1/5..  Training Loss: 2.317..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 1/5..  Training Loss: 2.312..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 1/5..  Training Loss: 2.309..  Test Loss: 2.308..  Test Accuracy: 0.070
Epoch: 1/5..  Training Loss: 2.313..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 1/5..  Training Loss: 2.307..  Test Loss: 2.308..  Test Accuracy: 0.070
Epoch: 1/5..  Training Loss: 2.313..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 1/5..  Training Loss: 2.314..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 1/5..  Training Loss: 2.314..  Test Loss: 2.308..  Test Accuracy: 0.070
Epoch: 1/5..  Training Loss: 2.313..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 1/5..  Training Loss: 2.313..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 1/5..  Training Loss: 2.311..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 1/5..  Training Loss: 2.309..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 1/5..  Training Loss: 2.311..  Test Loss: 2.3

Epoch: 5/5..  Training Loss: 2.314..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 5/5..  Training Loss: 2.315..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 5/5..  Training Loss: 2.314..  Test Loss: 2.308..  Test Accuracy: 0.070
Epoch: 5/5..  Training Loss: 2.312..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 5/5..  Training Loss: 2.317..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 5/5..  Training Loss: 2.313..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 5/5..  Training Loss: 2.311..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 5/5..  Training Loss: 2.310..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 5/5..  Training Loss: 2.310..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 5/5..  Training Loss: 2.308..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 5/5..  Training Loss: 2.314..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 5/5..  Training Loss: 2.314..  Test Loss: 2.308..  Test Accuracy: 0.069
Epoch: 5/5..  Training Loss: 2.312..  Test Loss: 2.3


## Saving and loading networks
- The parameters for PyTorch networks are stored in a **model's state_dict**
- We can see the state dict contains the weight and bias matrices for each of our layers.





In [19]:
# model_name.save_dict() contains all parameters and bias list wise
print("Our model: \n\n", modelDeep, '\n')
print("The state dict keys: \n\n", modelDeep.state_dict().keys())

#Save the state_dict with torch.save
#We can save it in a file checkpoint.pth
torch.save(modelDeep.state_dict(), 'checkpoint.pth')


Our model: 

 Network(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): Linear(in_features=512, out_features=256, bias=True)
    (2): Linear(in_features=256, out_features=128, bias=True)
  )
  (output): Linear(in_features=128, out_features=10, bias=True)
  (dropout): Dropout(p=0.5)
) 

The state dict keys: 

 odict_keys(['hidden_layers.0.weight', 'hidden_layers.0.bias', 'hidden_layers.1.weight', 'hidden_layers.1.bias', 'hidden_layers.2.weight', 'hidden_layers.2.bias', 'output.weight', 'output.bias'])


In [21]:
#Loading parameters using torch.load 
state_dict = torch.load('checkpoint.pth')



# initialise new model with random parameters and then replace those by earlier ones

#Creating model 
import torch
from torch import nn
import torch.nn.functional as F


class Network(nn.Module):
    def __init__(self, input_size, output_size, hidden_layers, drop_p=0.5):
        ''' Builds a feedforward network with arbitrary hidden layers.
        
            Arguments
            ---------
            input_size: integer, size of the input layer
            output_size: integer, size of the output layer
            hidden_layers: list of integers, the sizes of the hidden layers
        
        '''
        super().__init__()
        # Input to a hidden layer
        self.hidden_layers = nn.ModuleList([nn.Linear(input_size, hidden_layers[0])])
        
        # Add a variable number of more hidden layers
        layer_sizes = zip(hidden_layers[:-1], hidden_layers[1:])
        self.hidden_layers.extend([nn.Linear(h1, h2) for h1, h2 in layer_sizes])
        
        self.output = nn.Linear(hidden_layers[-1], output_size)
        
        self.dropout = nn.Dropout(p=drop_p)
        
    def forward(self, x):
        ''' Forward pass through the network, returns the output logits '''
        
        for each in self.hidden_layers:
            x = F.relu(each(x))
            x = self.dropout(x)
        x = self.output(x)
        
        return F.log_softmax(x, dim=1)


def validation(model, testloader, criterion):
    accuracy = 0
    test_loss = 0
    for images, labels in testloader:

        images = images.resize_(images.size()[0], 784)

        output = model.forward(images)
        test_loss += criterion(output, labels).item()

        ## Calculating the accuracy 
        # Model's output is log-softmax, take exponential to get the probabilities
        ps = torch.exp(output)
        # Class with highest probability is our predicted class, compare with true label
        equality = (labels.data == ps.max(1)[1])
        # Accuracy is number of correct predictions divided by all predictions, just take the mean
        accuracy += equality.type_as(torch.FloatTensor()).mean()

    return test_loss, accuracy


def train(model, trainloader, testloader, criterion, optimizer, epochs=5, print_every=40):
    
    steps = 0
    running_loss = 0
    for e in range(epochs):
        # Model in training mode, dropout is on
        model.train()
        for images, labels in trainloader:
            steps += 1
            
            # Flatten images into a 784 long vector
            images.resize_(images.size()[0], 784)
            
            optimizer.zero_grad()
            
            output = model.forward(images)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()

            if steps % print_every == 0:
                # Model in inference mode, dropout is off
                model.eval()
                
                # Turn off gradients for validation, will speed up inference
                with torch.no_grad():
                    test_loss, accuracy = validation(model, testloader, criterion)
                
                print("Epoch: {}/{}.. ".format(e+1, epochs),
                      "Training Loss: {:.3f}.. ".format(running_loss/print_every),
                      "Test Loss: {:.3f}.. ".format(test_loss/len(testloader)),
                      "Test Accuracy: {:.3f}".format(accuracy/len(testloader)))
                
                running_loss = 0
                
                # Make sure dropout and grads are on for training
                model.train()
                

                

# Create the network, define the criterion and optimizer
model = Network(784, 10, [512, 256, 128])
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [22]:
#loading new model with earlier parameters
model.load_state_dict(state_dict)

In [23]:
# this models trains with continuation of earlier model 
# Initial model ended at 70% this continues and ended at 81% accuracy
train(model, trainloader, testloader, criterion, optimizer,epochs=1)

Epoch: 1/1..  Training Loss: 1.737..  Test Loss: 1.014..  Test Accuracy: 0.655
Epoch: 1/1..  Training Loss: 1.031..  Test Loss: 0.742..  Test Accuracy: 0.730
Epoch: 1/1..  Training Loss: 0.877..  Test Loss: 0.680..  Test Accuracy: 0.745
Epoch: 1/1..  Training Loss: 0.769..  Test Loss: 0.630..  Test Accuracy: 0.755
Epoch: 1/1..  Training Loss: 0.772..  Test Loss: 0.623..  Test Accuracy: 0.758
Epoch: 1/1..  Training Loss: 0.707..  Test Loss: 0.602..  Test Accuracy: 0.774
Epoch: 1/1..  Training Loss: 0.680..  Test Loss: 0.586..  Test Accuracy: 0.783
Epoch: 1/1..  Training Loss: 0.675..  Test Loss: 0.558..  Test Accuracy: 0.789
Epoch: 1/1..  Training Loss: 0.694..  Test Loss: 0.560..  Test Accuracy: 0.792
Epoch: 1/1..  Training Loss: 0.666..  Test Loss: 0.539..  Test Accuracy: 0.801
Epoch: 1/1..  Training Loss: 0.631..  Test Loss: 0.543..  Test Accuracy: 0.804
Epoch: 1/1..  Training Loss: 0.619..  Test Loss: 0.543..  Test Accuracy: 0.794
Epoch: 1/1..  Training Loss: 0.612..  Test Loss: 0.5

### Above approach only works if architecture of new model is same as earlier model ,so rebuild the new model same as the old one

In [25]:
# Trying over new model architecture
# Try this# Try t 
model2  = Network(784, 10, [400, 200, 100])
# This will throw an error because the tensor sizes are wrong!
model2.load_state_dict(state_dict)

RuntimeError: Error(s) in loading state_dict for Network:
	size mismatch for hidden_layers.0.weight: copying a param with shape torch.Size([512, 784]) from checkpoint, the shape in current model is torch.Size([400, 784]).
	size mismatch for hidden_layers.0.bias: copying a param with shape torch.Size([512]) from checkpoint, the shape in current model is torch.Size([400]).
	size mismatch for hidden_layers.1.weight: copying a param with shape torch.Size([256, 512]) from checkpoint, the shape in current model is torch.Size([200, 400]).
	size mismatch for hidden_layers.1.bias: copying a param with shape torch.Size([256]) from checkpoint, the shape in current model is torch.Size([200]).
	size mismatch for hidden_layers.2.weight: copying a param with shape torch.Size([128, 256]) from checkpoint, the shape in current model is torch.Size([100, 200]).
	size mismatch for hidden_layers.2.bias: copying a param with shape torch.Size([128]) from checkpoint, the shape in current model is torch.Size([100]).
	size mismatch for output.weight: copying a param with shape torch.Size([10, 128]) from checkpoint, the shape in current model is torch.Size([10, 100]).

## Note - BETTER APPROACH
- This means we need to rebuild the model exactly as it was when trained.
- Information about the model architecture needs to be saved in the checkpoint, along with the state dict.
- To do this, you build a dictionary with all the information you need to compeletely rebuild the model.

In [30]:
checkpoint = {'input_size': 784,
              'output_size': 10,
              'hidden_layers': [each.out_features for each in model.hidden_layers],
              'state_dict': model.state_dict()}

torch.save(checkpoint, 'checkpoint2.pth')

def load_checkpoint(filepath):
    checkpoint = torch.load(filepath)
    model =Network(checkpoint['input_size'],
                             checkpoint['output_size'],
                             checkpoint['hidden_layers'])
    model.load_state_dict(checkpoint['state_dict'])
    
    return model


In [32]:
model = load_checkpoint('checkpoint2.pth')
print(model)

Network(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): Linear(in_features=512, out_features=256, bias=True)
    (2): Linear(in_features=256, out_features=128, bias=True)
  )
  (output): Linear(in_features=128, out_features=10, bias=True)
  (dropout): Dropout(p=0.5)
)
