# References
https://tangshusen.me/Dive-into-DL-PyTorch/#/chapter03_DL-basics/3.6_softmax-regression-scratch

#  Basic elements:

1. training data
2. model
3. loss function
4. optimization function


# Implement the Softmax Regression from Scratch

In [1]:
import torch
import torchvision
import numpy as np
import torchvision.transforms as transforms

## Load data

In [2]:
batch_size = 256
mnist_train = torchvision.datasets.FashionMNIST(root='../data', 
                                                train=True, download=False, transform=transforms.ToTensor())
mnist_test = torchvision.datasets.FashionMNIST(root='../data', 
                                               train=False, download=False, transform=transforms.ToTensor())

train_iter = torch.utils.data.DataLoader(mnist_train, batch_size=batch_size, shuffle=True, num_workers=4)
test_iter = torch.utils.data.DataLoader(mnist_test, batch_size=batch_size, shuffle=False, num_workers=4)

## Initialize Parameters

* For the sample problem, the number of class is 10, size of input vector for each sample is 28x28=784
* Thus the W.shape is (784,10) b.shape is (1,10)

In [3]:
num_inputs = 28*28  
num_outputs = 10

W = torch.tensor(np.random.normal(0, 0.01, size=(num_inputs, num_outputs)), dtype=torch.float)
b = torch.zeros(num_outputs, dtype=torch.float)

W.requires_grad_(requires_grad=True)
b.requires_grad_(requires_grad=True) 

W.shape, b.shape

(torch.Size([784, 10]), torch.Size([10]))

## Define model

In [4]:
def softmax(O):
    O_exp = O.exp()
    partition = O_exp.sum(dim=1, keepdim=True)
    return O_exp / partition   # broadcast

def net(X):
    return softmax(torch.mm(X.view((-1, num_inputs)), W) + b)

## Define loss function

In [5]:
def cross_entropy(y_hat, y):
    return - torch.log(y_hat.gather(1, y.view(-1, 1)))

## Define optimization function

In [10]:
def sgd(params, lr, batch_size):  
    for param in params:
        param.data -= lr * param.grad / batch_size  # use param.data

## Evaluation (accuracy)

In [6]:
def accuracy(y_hat, y):
    return (y_hat.argmax(dim=1) == y).float().mean().item()

# explanation:
y = torch.tensor([1,2])
y_hat = torch.tensor([[0.1, 0.3, 0.6],
                      [0.3, 0.2, 0.5]])
print(y_hat.argmax(dim=1),(y_hat.argmax(dim=1) == y).float().mean().item())


def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    for X, y in data_iter:
        acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
        n += y.shape[0]
    return acc_sum / n

tensor([2, 2]) 0.5


In [7]:
evaluate_accuracy(train_iter,net)

0.17428333333333335

## Train model

In [13]:
num_epochs, lr = 5, 0.1

def train_softmax(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            y_hat = net(X)
            l = loss(y_hat, y).sum()
            
            if params is not None and params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
                    
            l.backward()

            # optimize parameters
            sgd(params, lr, batch_size)

            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
            
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))

train_softmax(net, train_iter, test_iter, cross_entropy, num_epochs, batch_size, [W, b], lr)

epoch 1, loss 0.7867, train acc 0.752, test acc 0.794
epoch 2, loss 0.5703, train acc 0.813, test acc 0.810
epoch 3, loss 0.5245, train acc 0.826, test acc 0.822
epoch 4, loss 0.5000, train acc 0.833, test acc 0.826
epoch 5, loss 0.4861, train acc 0.837, test acc 0.827


## Make Prediction

In [14]:
## ....

# Implement the Softmax Regression by using Pytorch Module

## Load data

In [16]:
# same as above

## Define model & initialize parameters

In [24]:
import torch.nn as nn
num_inputs = 784
num_outputs = 10

class FlattenLayer(nn.Module):
    def __init__(self):
        super(FlattenLayer, self).__init__()
    def forward(self, x): # x shape: (batch, *, *, ...)
        return x.view(x.shape[0], -1)
    
class LinearNet(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(LinearNet, self).__init__()
        self.linear = nn.Linear(num_inputs, num_outputs)
    def forward(self, x): # x shape: (batch, 1, 28, 28)
        y = self.linear(x.view(x.shape[0], -1))
        return y


In [25]:
net = nn.Sequential()
net.add_module('flatten', FlattenLayer())
net.add_module('linear',  nn.Linear(num_inputs, num_outputs))

print(net)

Sequential(
  (flatten): FlattenLayer()
  (linear): Linear(in_features=784, out_features=10, bias=True)
)


In [34]:
import torch.nn.init as init
init.normal_(net.linear.weight, mean=0, std=0.01)
init.constant_(net.linear.bias, val=0) 

Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)

## Define loss function & optimization algorithm

In [43]:
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.1)

`CrossEntropyLoss in PyTorch  = softmax + CrossEntropyLoss` which means the output of the network is the value of linear regression instead of probability

## Train model

In [46]:
num_epochs = 5

def train_softmax_torch(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, optimizer=None):
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n = 0.0, 0.0, 0
        for X, y in train_iter:
            y_hat = net(X)
            l = loss(y_hat, y).sum()

            # zero gradient
            optimizer.zero_grad()
            
            l.backward()
        
            optimizer.step() 


            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc))

In [47]:
train_softmax_torch(net, train_iter, test_iter, loss, num_epochs, batch_size, None, None, optimizer)

epoch 1, loss 0.0031, train acc 0.746, test acc 0.793
epoch 2, loss 0.0022, train acc 0.814, test acc 0.808
epoch 3, loss 0.0021, train acc 0.825, test acc 0.813
epoch 4, loss 0.0020, train acc 0.831, test acc 0.821
epoch 5, loss 0.0019, train acc 0.838, test acc 0.823


## Prediction

In [48]:
# ...