In [1]:
import torch
import numpy as np
import random

# Reference: 
https://d2l.ai/chapter_linear-networks/linear-regression-scratch.html <br>
https://tangshusen.me/Dive-into-DL-PyTorch/#/chapter03_DL-basics/3.3_linear-regression-pytorch

#  Basic elements:
1. training data
2. model
3. loss function
4. optimization function

# Implement the Linear Regression from Scratch

## Prepare data 

In [2]:
num_inputs = 2                                                       # number of input feature
num_examples = 1000                                                  # number of input
true_w = [2, -3.4]                                                   # true w
true_b = 4.2                                                         # true bias 
X = torch.randn(num_examples, num_inputs,dtype=torch.float32)        # X
y = true_w[0] * X[:, 0] + true_w[1] * X[:, 1] + true_b               # y
y += torch.tensor(np.random.normal(0, 0.01, size=y.size()),dtype=torch.float32)

## Read  Data

In [3]:
def data_iter(batch_size, X, y):
    num_examples = len(X)
    indices = list(range(num_examples)) 
    random.shuffle(indices)   # shuffle data by shuffling each sample's index 
    for i in range(0, num_examples, batch_size):
        j = torch.LongTensor(indices[i: min(i + batch_size, num_examples)]) # at last time, may not eough bacth_size
        yield  X.index_select(0, j), y.index_select(0, j)

In [4]:
batch_size = 10

for X_batch, y_batch in data_iter(batch_size, X, y):
    print(X_batch, y_batch)
    break

tensor([[-0.3273, -2.2829],
        [ 1.0348,  1.1587],
        [ 0.2759, -0.8850],
        [-0.6894, -0.3430],
        [-0.5639,  0.1136],
        [-0.6001, -0.2558],
        [-1.3017,  1.2472],
        [-0.6990,  1.3417],
        [-0.4300,  1.4635],
        [ 0.4237, -0.8379]]) tensor([11.3000,  2.3287,  7.7660,  3.9756,  2.7125,  3.8669, -2.6294, -1.7832,
        -1.6199,  7.8921])


## Initialize parameters

In [5]:
w = torch.tensor(np.random.normal(0, 0.01, (num_inputs, 1)), dtype=torch.float32)
b = torch.zeros(1, dtype=torch.float32)

w.requires_grad_(requires_grad=True)
b.requires_grad_(requires_grad=True) 

w.shape,b.shape

(torch.Size([2, 1]), torch.Size([1]))

## Define Model

In [6]:
def linreg(X, w, b): 
    return torch.mm(X, w) + b

## Define loss function

In [7]:
def squared_loss(y_hat, y):  
    # return a vector, besides,MSELoss in pyTorch doesn't divided by 2
    return (y_hat - y.view(y_hat.size())) ** 2 / 2

## Define optimization function

In [8]:
def sgd(params, lr, batch_size):  
    for param in params:
        param.data -= lr * param.grad / batch_size  # use param.data

## Train model

In [9]:
lr = 0.03
num_epochs = 5
net = linreg
loss = squared_loss

for epoch in range(num_epochs): 

    for X_batch, y_batch in data_iter(batch_size, X, y):
        l = loss(net(X_batch, w, b), y_batch).sum()  
        l.backward()   # Compute gradient on l with respect to [w,b]
        sgd([w, b], lr, batch_size) 

        # zero gradient
        w.grad.data.zero_()
        b.grad.data.zero_()
        
    train_l = loss(net(X, w, b), y)
    print('epoch %d, loss %f' % (epoch + 1, train_l.mean().item()))

epoch 1, loss 0.051443
epoch 2, loss 0.000237
epoch 3, loss 0.000051
epoch 4, loss 0.000050
epoch 5, loss 0.000050


In [10]:
print(true_w, '\t\t', w)
print(true_b, '\t\t', b)

[2, -3.4] 		 tensor([[ 1.9998],
        [-3.3994]], requires_grad=True)
4.2 		 tensor([4.2000], requires_grad=True)


# Implement the Linear Regression by using Pytorch module

* torch.utils.data
* torch.nn
* torch.nn.init
* torch.optim

## Read Data (Using DataSet, DataLoader)

In [11]:
import torch.utils.data as Data

batch_size = 10
# combine X and y
dataset = Data.TensorDataset(X, y)
# load batch 
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True)

In [12]:
for X, y in data_iter:
    print(X, y)
    break

tensor([[-1.0294,  0.7481],
        [-1.1275,  0.2111],
        [ 0.1487, -0.7129],
        [-0.3390, -0.5591],
        [ 0.6595, -0.3149],
        [-1.5672, -1.6979],
        [ 0.1037,  0.8913],
        [ 1.2237, -0.2482],
        [ 2.2195,  0.9758],
        [-0.6770,  0.1716]]) tensor([-0.4062,  1.2368,  6.9290,  5.4332,  6.6043,  6.8430,  1.3789,  7.4874,
         5.3312,  2.2704])


`torch.nn` only supports the input of one batch of samples and does not support single sample input. 
If there is only a single sample `x`, use `x.unsqueeze(0)` to add one dimension.
in this way x.shape from `(torch.Size([d])` to `torch.Size([1, d])`

## Define model

In [13]:
import torch.nn as nn
net = nn.Sequential(nn.Linear(2, 1))

In [14]:
# net.parameters() return a generator
for param in net.parameters():
    print(param)

Parameter containing:
tensor([[-0.3538, -0.0508]], requires_grad=True)
Parameter containing:
tensor([0.2526], requires_grad=True)


## Initialize parameters

In [15]:
from torch.nn import init

init.normal_(net[0].weight, mean=0, std=0.01)
init.constant_(net[0].bias, val=0)  # another way: net[0].bias.data.fill_(0)

Parameter containing:
tensor([0.], requires_grad=True)

In [16]:
for param in net.parameters():
    print(param)

Parameter containing:
tensor([[0.0053, 0.0011]], requires_grad=True)
Parameter containing:
tensor([0.], requires_grad=True)


## Define loss function

In [17]:
loss = nn.MSELoss()

## Define optimization function

In [18]:
import torch.optim as optim

optimizer = optim.SGD(net.parameters(), lr=0.03)
print(optimizer)

SGD (
Parameter Group 0
    dampening: 0
    lr: 0.03
    momentum: 0
    nesterov: False
    weight_decay: 0
)


we can set different learning rate for different subnets(useful for finetune) , like 
```
optimizer =optim.SGD([
                {'params': net.subnet1.parameters()}, # lr=0.03
                {'params': net.subnet2.parameters(), 'lr': 0.01}
            ], lr=0.03)
```

## Train model

In [19]:
num_epochs = 5
for epoch in range(1, num_epochs + 1):
    for X_batch, y_batch in data_iter:
        output = net(X_batch)
        l = loss(output, y_batch.view(-1, 1))
        optimizer.zero_grad() # equivalent to net.zero_grad()
        l.backward()
        optimizer.step()
    print('epoch %d, loss: %f' % (epoch, l.item()))

epoch 1, loss: 0.000372
epoch 2, loss: 0.000116
epoch 3, loss: 0.000090
epoch 4, loss: 0.000053
epoch 5, loss: 0.000120


In [20]:
dense = net[0]
print(true_w, dense.weight)
print(true_b, dense.bias)

[2, -3.4] Parameter containing:
tensor([[ 1.9994, -3.4011]], requires_grad=True)
4.2 Parameter containing:
tensor([4.2005], requires_grad=True)
