In [3]:
# we want to make sure that at the start we have 0 mean and 1 stdevs
# you can easily run out of gpu memory if you run cells on jupyter notebook
# jupyter notebook stores the results of your previous few results and thus allows you to use
# _ to get your previous one
# if your previous output is a GPU tensor you must clean out those _

# any cuda out of memory errors since the memory that was previously allocatd to your code will still be saved within the 
# traceback call of the error, it will keep the error problems saved

# we must scale our weight matrices so that the standard deviation of our activations stay at 1
# where 1 / root(number of inputs) - so we should scale our random numbers by 0.1
# this is called the Xavier / Glorot initialisation
# a variance is the average of how far away each point is from the mean 
# how far each point is on average from the mean
import torch

t = torch.tensor([1., 2., 3., 4., 5.])
m = t.mean()
m

tensor(3.)

In [4]:
(t - m).mean()

tensor(0.)

In [5]:
# however to calculate this we have to either use the square root or the absolute value
# whenever we are calculating the variance otherwise the positives and negatives will cancel out
# using these functions (squaring to handle the positive and negative values)
# is far easier within derivations and other calculations so we use this one as teh variance
# however it is particularly sensitive to outliers and thus we can take the sqrt of variance
# to get the standard deviation
# it will alwasy be on the same scale as the original data
(t - m).pow(2).mean()

tensor(2.)

In [6]:
sqrt((t - m).pow(2).mean())

NameError: name 'sqrt' is not defined

In [7]:
(t-m).pow(2).mean(), (t*t).mean() - (m*m)
the mean of the squared data points minus the squared mean

(tensor(2.), tensor(2.))

In [8]:
# Covariance
# Tells you how much two things vary with respect to eachother
cov = (t*v).mean() - t.mean()*v.mean(); cov

NameError: name 'v' is not defined

In [9]:
pearson correlation coefficient is the scaled version of covariance
# covaraince divided by the standard deviation of both values


SyntaxError: invalid syntax (1411608723.py, line 1)

In [11]:

# y[i] = sum([c*d for c,d in zip(a[i], x)])
# y[i] = (a[i]*x).sum()

# At the very beginning, our x vector has a mean of roughly 0. and a standard deviation of roughly 1. (since we picked it that way).

mean,sqr = 0.,0.

for i in range(100):
    x = torch.randn(100)
    a = torch.randn(512, 100)
    
    y = a @ x
    mean += y.mean().item()
    sqr  += y.pow(2).mean().item()
    
mean/100,sqr/100

# as long as both are indepent they will give a mean and std of 0 and 1
# so by using this 1/ root (number of inputs) this means that our stdevs of 1 that are added up will all converge to 1


(0.04328734785318375, 100.3774040222168)

In [12]:
# if we do a relu this means that we do not have a 0 mean or 1 stdevs
# this means htat we must use kaiming init
# instead of all of them being added all our means and variances and being changed
# 1 / root(2 / n)

# now that we know which initialisation functions we can use how will we apply them

# model.apply applying init noramlly distributed random numbers
# _ at the end of a function will change in place the method
# however we can only do this if the model is a convolutional or linear layer
# apply also returns the model meaning that we can simply
# pass our model to the momentum learner with the dataloader, cross entorpy learning rate and callbacks
# forcing our model to find ways of compressing intelligently
# by decreasing grid size, we also get a lower amount of compute necessary
# we also have to make sure we normalise our input matrix

# for this we can use batchtransform callback
# this will allow us to transform every single batch

# Problem here is we are putting our values through a relu

# allows us to normalise inputs
@inplace
def transformi(b): b[xl] = [(TF.to_tensor(o)-xmean)/xstd for o in b[xl]]
tds = dsd.with_transform(transformi)
dls = DataLoaders.from_dd(tds, bs, num_workers=4)
xb,yb = next(iter(dls.train))

# relu is incompatible with the idea of normalised data
class GeneralRelu(nn.Module):
    def __init__(self, leak=None, sub=None, maxv=None):
        super().__init__()
        self.leak,self.sub,self.maxv = leak,sub,maxv

    def forward(self, x): 
        x = F.leaky_relu(x,self.leak) if self.leak is not None else F.relu(x)
        if self.sub is not None: x -= self.sub
        if self.maxv is not None: x.clamp_max_(self.maxv)
        return x
    
# by subtracting values from our relu we can pull everything so that we can then ahve a mean of 0
# we can also use a leaky relu to not have negatives totaly flat / truncated
# but these values can be decreased by a certain amount

# leakiness of a relu means the slope instead of being a simple truncation 
# e.g leak of 0.1
# we can also subtract a certain value that  finds our values our mean of 0

# kaiming normal in pytorch provides us with the ability to give it a leaky vlaue
# we also want to minimise the amount of dead units

NameError: name 'inplace' is not defined

In [13]:
# LSUV - layer wise sequential unit variance
# allows us to do weight initialisation for deep learning


In [14]:
# Batch normalisation
# using batch normalisation allows you to train models so much faster
# by using normalisation during training for each mini-batch then this allows us to use much higher learning rates and not care as much
# about learning rates / initialisation

# multiply things by 1 and add 0
# forward function
# mean activation for each input - channel, height, width
# variance for each input too
# normalize data by subtracting the mean and dividing by the variance
class LayerNorm(nn.Module):
    def __init__(self, dummy, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.mult = nn.Parameter(tensor(1.))
        self.add  = nn.Parameter(tensor(0.))

    def forward(self, x):
        m = x.mean((1,2,3), keepdim=True)
        v = x.var ((1,2,3), keepdim=True)
        x = (x-m) / ((v+self.eps).sqrt())
        return x*self.mult + self.add
    
# by adding epsilon this means that we will not be dividing by 0 and we normalise the minibatch / layer that we decide to put this into
# if we want anything other than unit variance / unit mean then we can change these parameters
# and we can learn these parameters
# they may not actually be normalisation
# it normalises for initial layers
# however later on it actually allows it to have 2 numbers to create any distribution of outputs it wants
# this allows the neural network to learn the distribution that best suits this dataset and thus be far faster at producing accurate results

NameError: name 'nn' is not defined

In [15]:
# however it also adds a lot of complexity within our model, 
# Batchnormalisation
# we have a vector of additions and a vector of multiples
# this is why we have to pass in the number of filters that we have
# we take the average of evverything in the batch

# we generate arrays of these and 1 per filter
# then we use .lerp_()
# this allows you to have a sliding scale of a value between 0 and 1 of how much you want of either first and second vlaue combined
# exponentially weighted moving average
# and this updaets our values
# then during inference we use the calcualted values
# batch_norm uses register_buffer
# batch norm is the average of hte input batch - 1 noramlisation value over 1 channel
# layer norm evrages over teh channel - from the input

# group norm groups a part of a channel together

In [None]:
# accelerated sgd

# weight decay is similar / the same to l2 regularisation
# means add the square of the weights to the loss function
# by adding hte square of the weights to teh loss function - this means that 
# by using momentum we want to be able to follow the general direction of hte curve since the noise will cancel out itself
# 

In [16]:

class Momentum(SGD):
    def __init__(self, params, lr, wd=0., mom=0.9):
        super().__init__(params, lr=lr, wd=wd)
        self.mom=mom

    def opt_step(self, p):
        # for each parameter we must find out the moving average of gradients
        # so we can set it to 0 initially
        # then if we have it awe add what it used to be * momentum
        # and then we use a lerp p / (1 - p) of the actual gradient vs momentum
        if not hasattr(p, 'grad_avg'): 
            p.grad_avg = torch.zeros_like(p.grad)
        p.grad_avg = p.grad_avg * self.mom + p.grad*(1-self.mom)
        p -= self.lr * p.grad_avg
        
# the smaller the batch size the better - this would allow us to give the nerual network
# to update as much as possible

NameError: name 'SGD' is not defined

In [None]:
# RMS Prop
# updates the optimisation step with instead of using the lerp of the gradietn
# we use the lerp of the gradient^2
# if the gradient is moving around allover the place we don't know what it is
# if we are confident on the gradient it will do a big step

# RMS Prop + Momentum is what adam is
# Adam is rms prop + momentum

# beta 1 is momentum 
# beta 2 is the momentum for the squares

# now we can implement schedulers to optimise our learning rate - they find the optimal one by themselves
