In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

### Parameter initialization

In [2]:
#export
from exp.nb_01 import *

def get_data():
    """Load the pickled dataset behind `MNIST_URL` and wrap its splits with `tensor`.

    Returns a lazy `map` that yields, in order, x_train, y_train, x_valid and
    y_valid. The third split stored in the archive is discarded.
    """
    archive = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(archive, 'rb') as fh:
        # NOTE: unpickling can execute arbitrary code -- only use with a trusted archive.
        train, valid, _ = pickle.load(fh, encoding='latin-1')
    x_train, y_train = train
    x_valid, y_valid = valid
    return map(tensor, (x_train, y_train, x_valid, y_valid))

def normalize(x, m, s):
    "Standardize `x`: subtract the mean `m`, then divide by the standard deviation `s`."
    centered = x - m
    return centered / s

In [6]:
x_train,y_train,x_valid,y_valid = get_data()

In [7]:
train_mean,train_std = x_train.mean(),x_train.std()
train_mean,train_std

(tensor(0.1304), tensor(0.3073))

In [8]:
# Normalize both splits with the training-set statistics.
# NOTE: x_train/x_valid are rebound over the raw data, so re-running this cell
# normalizes the already-normalized tensors again.
x_train = normalize(x_train, train_mean, train_std)
# NB: Use training, not validation mean for validation set
x_valid = normalize(x_valid, train_mean, train_std)

In [9]:
# Sanity check: after normalization the training stats should be ~(0, 1).
# NOTE(review): this rebinds train_mean/train_std to the post-normalization values,
# so the raw-data statistics are no longer available under these names.
train_mean,train_std = x_train.mean(),x_train.std()
train_mean,train_std

(tensor(0.0001), tensor(1.))

In [10]:
#export
def test_near_zero(a,tol=1e-3): assert a.abs()<tol, f"Near zero: {a}"

In [11]:
test_near_zero(x_train.mean())
test_near_zero(1-x_train.std())

In [12]:
# n: number of training rows, m: number of columns (features) per row
n,m = x_train.shape
# c: largest label + 1 -- presumably the number of classes, assuming labels run 0..max.
# Note this is a 0-dim tensor, not a Python int.
c = y_train.max()+1
n,m,c

(50000, 784, tensor(10))

In [13]:
#Initializing parameters for NN

In [14]:
# num hidden
nh = 50

In [15]:
# Simplified Kaiming/He init: scale standard-normal weights by 1/sqrt(fan_in),
# where fan_in is m for the first layer and nh for the second. Biases start at zero.
w1 = torch.randn(m,nh)/math.sqrt(m)
b1 = torch.zeros(nh)
w2 = torch.randn(nh,1)/math.sqrt(nh)
b2 = torch.zeros(1)

The division by the square root of the number of inputs (fan-in) follows section 2.2 of the He et al. [paper](https://arxiv.org/pdf/1502.01852.pdf).


This [blog post](https://pouannes.github.io/blog/initialization/) explains the same derivation.

For an accessible explanation of the vanishing gradient problem, see this [link](https://stats.stackexchange.com/questions/301285/what-is-vanishing-gradient/301752).

In [16]:
# This should be ~ (0,1) (mean,std)...
x_valid.mean(),x_valid.std()

(tensor(-0.0057), tensor(0.9924))

In [17]:
def lin(x, w, b):
    "Affine (linear-layer) transform: matrix-multiply `x` by `w`, then add the bias `b`."
    projected = x @ w
    return projected + b

In [18]:
t = lin(x_valid, w1, b1)

In [19]:
# ...so the linear-layer output should also be roughly (0,1) (mean,std): the 1/sqrt(m)
# weight scaling of the simplified Kaiming init is designed to preserve the input variance
t.mean(),t.std()

(tensor(-0.0756), tensor(1.0621))

In [20]:
def relu(x):
    "Rectified linear unit: negative entries of `x` become 0, the rest pass through."
    floor = 0.
    return x.clamp_min(floor)

In [21]:
t = relu(lin(x_valid, w1, b1))

In [23]:
#With RELU (0,1) (mean,std) is not achieved
t.mean(),t.std()

(tensor(0.3819), tensor(0.6089))

In [24]:
# Kaiming/He init for ReLU: weights drawn with std = sqrt(2/fan_in), fan_in = m.
# The factor 2 is the ReLU correction from He et al. relative to the plain 1/sqrt(m) scaling.
w1 = torch.randn(m,nh)*math.sqrt(2/m)

In [25]:
w1.mean(),w1.std()

(tensor(-0.0001), tensor(0.0501))

In [26]:
#export
from torch.nn import init

In [31]:
w1 = torch.zeros(m,nh)
# Same scaling as the manual math.sqrt(2/m) version, but done by PyTorch.
# w1 is laid out (in_features, out_features) -- the transpose of nn.Linear's weight --
# and for a 2-D tensor PyTorch takes fan_out from dim 0, so mode='fan_out' selects m here.
init.kaiming_normal_(w1, mode='fan_out')
# b1 is the zero bias created in the earlier init cell.
t = relu(lin(x_valid, w1, b1))

In [33]:
# The std moves closer to 1 than with the simplified init, but the mean stays
# well above 0 because ReLU clamps the negative activations to zero.
t.mean(),t.std()

(tensor(0.6250), tensor(0.8539))

In [34]:
#export
def mse(output, targ):
    "Mean squared error between `output` (trailing unit dim squeezed away) and `targ`."
    residual = output.squeeze(-1) - targ
    return residual.pow(2).mean()

### Forward and backward pass

In [35]:
#export
from torch import nn

In [36]:
class Model(nn.Module):
    """Two-layer fully connected net (Linear -> ReLU -> Linear) that returns its own loss.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of output units.

    Calling the model with `(x, targ)` runs `x` through the layers and returns
    `self.loss(output, targ)`.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # Use nn.ModuleList rather than a plain list: a plain list hides the layers from
        # nn.Module, leaving `parameters()`, `state_dict()`, `.to(device)` etc. empty.
        self.layers = nn.ModuleList([nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)])
        self.loss = mse

    def forward(self, x, targ):
        "Run `x` through every layer in order and return the loss against `targ`."
        # Defined as `forward` (not `__call__`) so nn.Module.__call__ can run its hooks;
        # `model(x, targ)` still works exactly as before.
        for l in self.layers: x = l(x)
        # Squeeze only the trailing output dim; a bare squeeze() would also drop the
        # batch dim when a single sample is passed.
        return self.loss(x.squeeze(-1), targ)

In [37]:
model = Model(m, nh, 1)

In [38]:
%time loss = model(x_train, y_train)

CPU times: user 151 ms, sys: 6.34 ms, total: 157 ms
Wall time: 30.6 ms


In [39]:
%time loss.backward()

CPU times: user 204 ms, sys: 9.39 ms, total: 213 ms
Wall time: 43.5 ms


Refer to this [paper](https://explained.ai/matrix-calculus/index.html) for the matrix calculus needed for deep learning, and to this [tutorial](https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html#sphx-glr-beginner-blitz-autograd-tutorial-py) for an introduction to PyTorch autograd.

### Export

In [44]:
! /Users/archi/opt/anaconda3/bin/python nb_to_code.py 02_fully_connected.ipynb

Converted 02_fully_connected.ipynb to exp/nb_02.py
