In [None]:
## Chapter 4: Under the Hood: Training a Digit Classifier

In [None]:
from fastai.vision.all import *
from fastai.vision.widgets import ImageClassifierCleaner
from fastai.text.all import *
from fastai.tabular.all import *
from fastai.collab import *
from fastai.basics import *
from torchvision.models import resnet34
import ipywidgets as widgets
from IPython.display import display, clear_output
from ipywidgets import Dropdown, VBox, Button, Layout, Label, interact
import os
import pandas as pd
from PIL import Image
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import shutil
import torch

In [None]:
## use doc() to answer questions about a fastai method

# What is untar_data
doc(untar_data)

In [None]:
# use untar_data to extract urls and return the paths 
# this creates the following path: '/root/.fastai/data/mnist_sample'
# download a sample of the MNIST images containing only the digits 3 and 7
path = untar_data(URLs.MNIST_SAMPLE)

In [None]:
# Set the base path to the dataset root so that fastai displays paths
# relative to it (a bare 'train' string never matches the absolute dataset
# paths, so it had no effect on the listings)
Path.BASE_PATH = path

# Define the paths
train_path = path/'train'
valid_path = path/'valid'
csv_path = path/'labels.csv'

# Create directories if they don't exist (no-op when the archive already
# provides them)
train_path.mkdir(parents=True, exist_ok=True)
valid_path.mkdir(parents=True, exist_ok=True)

# Only create an empty labels.csv when the dataset does not provide one.
# The downloaded sample is expected to ship its own labels.csv; writing an
# empty frame unconditionally would overwrite it.
if not csv_path.exists():
    pd.DataFrame({'file': [], 'label': []}).to_csv(csv_path, index=False)
labels_data = pd.read_csv(csv_path)

# Check the updated directory structure
## list folders for training, validation, and test data
path.ls()

In [None]:
# list the training data folders
(path/'train').ls()

In [None]:
sevens = (path/'train'/'7').ls().sorted()
threes = (path/'train'/'3').ls().sorted()

In [None]:
# list the threes (image files)
threes

In [None]:
## Goal: demonstrate what an image looks like to a computer ##
# view an image of a three
# the Image class comes from the Python Imaging Library (PIL)
im3_path = threes[1]
im3 = Image.open(im3_path)
im3

In [None]:
## Goal: demonstrate what an image looks like to a computer ##
# convert the image of the 3 to a NumPy array 
# ensure numpy is imported as np
# [rows, columns]
# index rows / columns 4 up to 10 (not included)
array(im3)[4:10,4:10]

In [None]:
## Goal: demonstrate what an image looks like to a computer ##
# convert the image of the 3 to a PyTorch tensor
tensor(im3)[4:10,4:10]

In [None]:
## Goal Complete: This is what an image looks like to a computer ##
# slice the array to pick just the part with the top digit in it
im3_t = tensor(im3)
df = pd.DataFrame(im3_t[4:15,4:22])
df

In [None]:
## Goal Complete: This is what an image looks like to a computer ##
# use a Pandas DataFrame to color-code the values with a gradient
# 0's are white; 255's are black; shades of grey are between 0 and 255
df.style.set_properties(**{'font-size':'6pt'}).background_gradient('Greys')

In [None]:
## Goal: create a very simple baseline model using averages
# create a list of tensors, one per image, for all the 3s and 7s in the training directories
seven_tensors = [tensor(Image.open(o)) for o in sevens]
three_tensors = [tensor(Image.open(o)) for o in threes]
len(three_tensors), len(seven_tensors)

In [None]:
## Goal: create a very simple baseline model using averages
# check one of the images using the fastai show_image() method
show_image(three_tensors[1])

In [None]:
## Goal: create a very simple baseline model using averages
# combine all of the images in the 3s and 7s lists into a single three-dimensional tensor
# use the PyTorch stack() method to stack up individual tensors into a single tensor
# cast the stacked tensor into a float and divide by 255 to get a number from 0 to 1
stacked_sevens = torch.stack(seven_tensors).float()/255
stacked_threes = torch.stack(three_tensors).float()/255

In [None]:
## Goal: create a very simple baseline model using averages
# an important attribute of a tensor is its shape which tells you the length of each axis
# we have 6,131 images, each of size 28×28 pixels.
stacked_threes.shape

In [None]:
## Goal: create a very simple baseline model using averages
# the length of a tensor's shape is  its rank
# this is a rank-3 tensor
len(stacked_threes.shape)

In [None]:
## Goal: create a very simple baseline model using averages
# get a tensor's rank directly with ndim
stacked_threes.ndim

In [None]:
## Goal: create a very simple baseline model using average
# determine the ideal 3 in our rank-3 tensor
# calculate the mean of all the image tensors 
# for every pixel position, compute the average of that pixel over all images
mean3 = stacked_threes.mean(0)
show_image(mean3)

In [None]:
## Goal: create a very simple baseline model using average
# determine the ideal 7 in our rank-3 tensor
mean7 = stacked_sevens.mean(0)
show_image(mean7)

In [None]:
## Goal: create a very simple baseline model using average
# we will measure the distance from this arbitrary '3' to our "ideal 3"

# pick an arbitrary 3 
a_3 = stacked_threes[1]
show_image(a_3)

In [None]:
## Goal: create a very simple baseline model using average
# measuring the distance between the sample 3 (a_3) and the "ideal" 3 (mean3)
# method 1: mean absolute difference (a.k.a. L1 norm)
dist_3_abs = (a_3 - mean3).abs().mean()

# method 2: root mean squared error (RMSE) (a.k.a. L2 norm)
dist_3_sqr = ((a_3 - mean3)**2).mean().sqrt()

# display the distances
dist_3_abs, dist_3_sqr

In [None]:
## Goal: create a very simple baseline model using average
# we will measure the distance from this arbitrary '7' to our "ideal 7"

# pick an arbitrary 7
a_7 = stacked_sevens[1]
show_image(a_7)

In [None]:
## Goal: create a very simple baseline model using average
# measuring distance

# Both distances below are deliberately taken from the sample 3 (a_3) to the
# "ideal" 7 (mean7). The baseline decides what an image is by checking which
# ideal digit it is closer to, so the same a_3 has to be measured against both
# mean3 (previous distances) and mean7 (here).
# The a_7 variants are kept commented out for reference only.

# method 1: mean absolute difference (a.k.a. L1 norm)
# dist_7_abs = (a_7 - mean7).abs().mean() 
dist_7_abs = (a_3 - mean7).abs().mean()  # distance of the sample 3 to the ideal 7

# method 2: root mean squared error (RMSE) (a.k.a. L2 norm)
# dist_7_sqr = ((a_7 - mean7)**2).mean().sqrt()
dist_7_sqr = ((a_3 - mean7)**2).mean().sqrt()  # distance of the sample 3 to the ideal 7

# display the distance
dist_7_abs, dist_7_sqr

In [None]:
## Goal: create a very simple baseline model using average
# measuring distance using the PyTorch loss function
F.l1_loss(a_3.float(), mean7), F.mse_loss(a_3,mean7).sqrt()

In [None]:
## Goal complete: create a very simple baseline model using average
# measuring distance

# The distance between our 3 and the "ideal" 3 < the distance to the ideal 7. 
# So our simple model will give the right prediction in this case.

# now let's compare the results
print(f'3s: {dist_3_abs, dist_3_sqr}\n')
print(f'7s {dist_7_abs, dist_7_sqr}\n')
print(f'loss function: {F.l1_loss(a_3.float(), mean7), F.mse_loss(a_3,mean7).sqrt()}')

In [None]:
# my notes 
# Smaller values indicate closer similarity. 
# The RMSE is higher than the MAE  
# There are possibly some pixels with relatively larger errors,...
# pulling the RMSE up more than the MAE.

### Now, is our baseline model any good? ### 

In [None]:
# Quick lesson on numpy arrays and PyTorch tensors

data = [[1,2,3],[4,5,6]]

# create a numpy array
arr = array(data)

# create a PyTorch tensor
tns = tensor(data)


In [None]:
# Quick lesson on numpy arrays and PyTorch tensors

# display the numpy array
arr

In [None]:
# Quick lesson on numpy arrays and PyTorch tensors

# display the PyTorch tensor
tns

In [None]:
# Quick lesson on numpy arrays and PyTorch tensors
# basic operations on tensors (most are similar to numpy)

# display a row
# display index 1 (element 2) of the tensor
tns[1]

In [None]:
# Quick lesson on numpy arrays and PyTorch tensors
# basic operations on tensors (most are similar to numpy)

# display a column
# show all of the first axis (index column 1)
tns[:,1]

In [None]:
# Quick lesson on numpy arrays and PyTorch tensors
# basic operations on tensors (most are similar to numpy)

# select part of a row or column
# second list element [4,5,6] indexes 1 through 3 (excluding 3) so index 1 and 2
tns[1,1:3]

In [None]:
# Quick lesson on numpy arrays and PyTorch tensors
# basic operations on tensors (most are similar to numpy)

# perform operations on tensors
# add one to each element of the tensor
tns + 1

In [None]:
# Quick lesson on numpy arrays and PyTorch tensors
# basic operations on tensors (most are similar to numpy)
# check the tensor type
tns.type()

In [None]:
# Quick lesson on numpy arrays and PyTorch tensors
# basic operations on tensors (most are similar to numpy)
# automatically changes from int to float
tns*1.5

In [None]:
### Now, is our baseline model any good? ### 
# Goal: build a metric
# next step: check the shape of the images 
# create tensors for the 3s and 7s from the 'valid' directory
# these tensors are used to calculate a metric measuring the quality of the model
# the metric measures distance from an ideal image

valid_3_tens = torch.stack([tensor(Image.open(o)) for o in (path/'valid'/'3').ls()])
valid_3_tens = valid_3_tens.float()/255

valid_7_tens = torch.stack([tensor(Image.open(o)) for o in (path/'valid'/'7').ls()])
valid_7_tens = valid_7_tens.float()/255

# display the number of images and the shape (remember definition of shape)
valid_3_tens.shape, valid_7_tens.shape

In [None]:
### Now, is our baseline model any good? ### 
# Goal: build a metric
# next step: write a function to decide if an arbitrary image is a 3 or a 7

# define a function that calculates the distance between two images
# the subtraction and .abs() are applied "elementwise"
# e.g. for a stack of 1010 images this gives 1010 matrices of absolute differences
# (-1, -2) are the last two axes (horizontal and vertical dimensions) of an image
# .mean((-1,-2)) takes the mean over those last two axes
# after taking that mean we are left with only the first tensor axis,
# which indexes over the images
# so we get one value per image: its mean absolute pixel difference
def mnist_distance(a,b):
    """Mean absolute difference between `a` and `b` over the last two axes.

    The subtraction broadcasts, so either argument may carry extra leading
    axes; the result keeps one entry per remaining leading index.
    """
    abs_diff = (a - b).abs()
    return abs_diff.mean((-1, -2))

# call the distance calculator function
# result is the same value we previously calculated for the distance
mnist_distance(a_3, mean3)

In [None]:
### Now, is our baseline model any good? ### 
# Goal: build a metric
# next step: **Apply the broadcasting technique**

# option 1: loop over 'valid_3_tens', the stacked image tensors in the validation set
# option 2 (better): pass 'valid_3_tens' as an argument into mnist_distance function

# pass the tensor for the 3s and the mean as arguments into the distance func
# PyTorch will automatically apply the broadcasting technique
valid_3_dist = mnist_distance(valid_3_tens, mean3)
valid_3_dist, valid_3_dist.shape

In [None]:
## My Notes on Broadcasting ##
# Shape Transformation 
# Tensor A (3x4 matrix)
A = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12]
]

# Tensor B (1D vector)
B = [1, 2, 3, 4]

# Convert B to a 2D tensor with shape (1, 4)
B_padded = [B]  # This is now [[1, 2, 3, 4]], which is shape (1, 4)

# Broadcast B to match the shape of A (3, 4)
B_broadcast = [
    [1, 2, 3, 4],
    [1, 2, 3, 4],
    [1, 2, 3, 4]
]


In [None]:
### Now, is our baseline model any good? ### 
# Goal: build a metric
# next step: use mnist_distance to test an arbitrary 3
# logic: 
# if (distance between 'ideal 3' and image) < (distance to 'ideal 7') then image = 3

def is_3(x):
    """Return True where `x` is closer to the ideal 3 than to the ideal 7.

    Distances are computed with `mnist_distance` against the notebook-level
    `mean3` and `mean7`; the comparison is elementwise, so a stack of images
    gives one boolean per image.
    """
    dist_to_3 = mnist_distance(x, mean3)
    dist_to_7 = mnist_distance(x, mean7)
    return dist_to_3 < dist_to_7

# test an image
is_3(a_3), is_3(a_3).float()

In [None]:
### Now, is our baseline model any good? ### 
# Goal: build a metric
# next step: use mnist_distance to test the validation set of 3s
is_3(valid_3_tens)

In [None]:
### Our baseline model is pretty good ### 
# Goal Complete: built a metric
# final step: calculate the accuracy for each of the 3s and 7s
# take the average of the function for all 3s and its inverse for all 7s
accuracy_3s = is_3(valid_3_tens).float().mean()
accuracy_7s = (1- is_3(valid_7_tens).float()).mean()

# display the accuracy of the 3s and 7s
accuracy_3s, accuracy_7s, (accuracy_3s+accuracy_7s)/2

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: build a simple loss function 

# build the plot_function
def plot_function(f, title=None, min=-2.1, max=2.1, color='b', ylim=None):
    """Plot `f` over 100 evenly spaced points between `min` and `max`.

    Args:
        f: callable applied to a column tensor of x values.
        title: optional plot title; skipped when None.
        min, max: ends of the x range.
        color: matplotlib format/colour string passed to `plt.plot`.
        ylim: optional y-axis limits, applied only when truthy.
    """
    xs = torch.linspace(min, max, 100)[:, None]
    if ylim:
        plt.ylim(ylim)
    plt.plot(xs, f(xs), color)
    if title is not None:
        plt.title(title)

# a quadratic loss function where x is a weight parameter
def f(x):
    """Quadratic toy loss: return `x` squared."""
    squared = x**2
    return squared

# pass the f function as an argument into the plot_function method
# display the loss function
plot_function(f, title='Plot of x^2')

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: build a simple loss function 

# next step: pick some random value for a parameter 
# f(-1.5) = 2.25 (x,y) = (-1.5, 2.25)
# calculate the loss
plot_function(f, title='Plot of x^2')
plt.scatter(-1.5, f(-1.5), color='red')

In [None]:
## here is how to calculate a gradient ##

# define the function
def f(x):
    """Return x**2, the function whose gradient this cell computes."""
    result = x ** 2
    return result

# create a tensor with requires_grad=True
x = torch.tensor(3.0, requires_grad=True)

# compute the function value
y = f(x)

# compute the gradient
y.backward()

# access the gradient
gradient = x.grad

print(f"Gradient of f(x) = x^2 at x = {x.item()} is {gradient.item()}")

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: build a simple loss function 

# next step: calculate a gradient
# pick a tensor value which we want gradients at
xt = tensor(3.).requires_grad_()

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: build a simple loss function 
# next step: calculate a gradient

# calculate our function with the value
yt = f(xt)
yt

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: build a simple loss function 
# next step: calculate a gradient

# backpropagation is the process of calculating the derivative of each layer
yt.backward()

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: build a simple loss function 

# view the gradients
# check the grad attribute of our tensor
xt.grad

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: build a simple loss function 

# calculate a gradient with a vector argument
xt = tensor([3.,4.,10.]).requires_grad_()
xt

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: build a simple loss function 

# add sum to the function so it can take a vector
def f(x):
    """Square every element of `x` and add the squares up into one scalar."""
    squares = x**2
    return squares.sum()
yt = f(xt)
yt

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: build a simple loss function 

yt.backward()
xt.grad

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: build a simple loss function 

## gradients only tell us the slope of our function ##
## gradients do not tell us exactly how far to adjust the parameters ##
# a very large slope suggests we need more adjustments
# a very small slope may suggest that we are close to the optimal value

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: build a simple loss function 
# next step: stepping with a learning rate (LR)

# define the function to minimize
def h(x): return x**2

# create a tensor with requires_grad=True
# use an initial value of 3.0
x = torch.tensor(3.0, requires_grad=True)

# define the learning rate
learning_rate = 0.1

# perform gradient descent: each pass through the loop is one step
# do 10 iterations 
for i in range (10):
    y = h(x)
    y.backward() # compute the gradient of y with respect to x (stored in x.grad)
    
    # update x using the gradient and learning rate
    with torch.no_grad(): # do not track this update in autograd (x.requires_grad itself is unchanged)
        x -= learning_rate * x.grad
    
    # zero the gradient after updating; backward() accumulates into x.grad,
    # so without this the next step would add to the old gradient
    x.grad.zero_()
    
    print(f"Iteration {i+1}: x = {x.item()}")

# final value of x
print(f"Final value x: {x.item()}")

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: Build an End-to-End SGD Example 
# model the speed of a roller coaster as it went over the top of a hump

# next step: create the time tensor with values from 0 to 19 (inclusive)
# i.e. the speed is measured manually at 20 consecutive time steps (presumably one per second)
# This tensor represents the time points at which we measure the speed of the roller coaster
time = torch.arange(0,20).float()
time

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: Build an End-to-End SGD Example 
# model the speed of a roller coaster as it went over the top of a hump

# next step: generate the speed tensor
# represents the speed of the roller coaster at each time point.
speed = torch.randn(20)*3 + 0.75*(time-9.5)**2 +1
speed

#### explanation ###
# torch.randn(20) * 3: 
# creates a tensor of 20 random values sampled from a standard normal distribution 
# multiplies each value by 3
# adds a stochastic (random) component to the speed to simulate noise

# 0.75 * (time - 9.5) ** 2: 
# models a quadratic function centered around time = 9.5. The ** 2 operation squares the difference between each time value and 9.5, and the 0.75 coefficient controls the curvature of the quadratic function. This simulates the expected change in speed as the roller coaster approaches the top of the hump (slowing down as it climbs and then speeding up as it descends).

# + 1: 
# adds a baseline speed of 1 to all values
# ensures that the speed remains positive and does not drop too low

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: Build an End-to-End SGD Example 
# model the speed of a roller coaster as it went over the top of a hump

# next step: make a scatter plot of the roller coaster speed
plt.scatter(time,speed)

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: Build an End-to-End SGD Example 
# model the speed of a roller coaster as it went over the top of a hump

# intermediate goal: find a function to fit the model
# let's guess and start with a quadratic of the form a*(time**2) + (b*time) + c

# next step: collect the parameters into one argument
# find the best quadratic
def f(t, params):
    """Evaluate the quadratic a*t**2 + b*t + c.

    Args:
        t: input value(s) at which to evaluate the quadratic.
        params: iterable of exactly three coefficients, unpacked as (a, b, c).
    """
    a, b, c = params
    quadratic_term = a*(t**2)
    linear_term = b*t
    return quadratic_term + linear_term + c

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: Build an End-to-End SGD Example 
# model the speed of a roller coaster as it went over the top of a hump

# intermediate goal: find a function to fit the model

# next step: apply SGD to minimize loss
# for continuous data use the mean squared error 
def mse(pred, targets):
    """Return the mean squared error between `pred` and `targets`.

    The loss is computed from the `pred` argument itself. Reading a
    notebook-level `preds` instead would silently score stale predictions and
    tie the loss to an old autograd graph.

    Args:
        pred: predicted values.
        targets: target values, broadcastable against `pred`.

    Returns:
        The mean of the squared elementwise differences.
    """
    return ((pred - targets)**2).mean()

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: Build an End-to-End SGD Example 
# model the speed of a roller coaster as it went over the top of a hump

# intermediate goal: find a function to fit the model
# apply the 7-step process

# step 1: initialize the parameters
# initialize the parameters to random values and track their gradients
# create a 1-dimensional tensor of size 3 filled with random values
# random values are sampled from a standard normal distribution
params = torch.randn(3).requires_grad_()
params

## notes: params represent the parameters (weights) of the model that you want to optimize
## Setting requires_grad=True computes gradients during the training process. 

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: Build an End-to-End SGD Example 
# model the speed of a roller coaster as it went over the top of a hump

# intermediate goal: find a function to fit the model
# step 2: calculate the predictions
preds = f(time,params)
preds

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: Build an End-to-End SGD Example 
# model the speed of a roller coaster as it went over the top of a hump

# intermediate goal: find a function to fit the model
# step 2 (continued): create a function to see how close our predictions are to our targets
def show_preds(preds, ax=None):
    """Scatter the measured speeds and the predictions (in red) on one axis.

    Uses the notebook-level `time` and `speed`. When `ax` is None a new
    figure/axis pair is created. The y-axis is fixed to (-300, 100).
    """
    if ax is None:
        _, ax = plt.subplots()
    ax.scatter(time, speed)
    ax.scatter(time, to_np(preds), color='red')
    ax.set_ylim(-300, 100)
show_preds(preds)

## graphic displays negative speeds!!! So, we need to fix this

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: Build an End-to-End SGD Example 
# model the speed of a roller coaster as it went over the top of a hump

# intermediate goal: find a function to fit the model
# step 3: calculate the loss
loss = mse(preds, speed)
loss

# next we need to improve the loss but first we need to know the gradients

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: Build an End-to-End SGD Example 
# model the speed of a roller coaster as it went over the top of a hump

# intermediate goal: find a function to fit the model
# step 4: calculate the gradients
# calculate an approximation of how the parameters need to change
loss.backward()
params.grad

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: Build an End-to-End SGD Example 
# model the speed of a roller coaster as it went over the top of a hump

# intermediate goal: find a function to fit the model
# step 4 (continued): choose a learning rate of 1e-5
params.grad * 1e-5
params

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: Build an End-to-End SGD Example 
# model the speed of a roller coaster as it went over the top of a hump

# intermediate goal: find a function to fit the model
# step 5: step the weights
# update the parameters based on the gradients we just calculated
lr = 1e-5
params.data -= lr * params.grad.data
params.grad = None

# now see if the loss has improved
preds = f(time, params)
mse(preds, speed)

show_preds(preds)

In [None]:
# Note to self: if you get the RuntimeError: 
# "Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed)."
# go back to the top and rerun all the cells

## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal: Build an End-to-End SGD Example 
# model the speed of a roller coaster as it went over the top of a hump

# intermediate goal: find a function to fit the model
# step 6: repeat the process
# We need to repeat this a few times, so we'll create a function to apply one step
def apply_step(params, prn=True):
    """Run one gradient-descent update on `params` and return the predictions.

    Reads the notebook-level `time`, `speed` and `lr`, and calls the
    notebook's `f` and `mse`.

    Args:
        params: tensor updated in place through `.data`; its `.grad` is reset
            to None afterwards.
        prn: when true, print the loss value of this step.

    Returns:
        The predictions `f(time, params)` computed before the update.
    """
    preds = f(time, params)
    loss = mse(preds, speed)
    # NOTE(review): retain_graph=True keeps the graph alive after backward. It
    # should only be needed if the loss is built on a graph that has already
    # been backpropagated through. `mse` as written reads a name `preds` rather
    # than its own `pred` argument, so it may be using the notebook-level
    # `preds` instead of the ones computed here - confirm before removing.
    loss.backward(retain_graph=True)
    # writing through .data changes the values without autograd recording it
    params.data -= lr * params.grad.data
    # drop the gradient so the next backward() starts fresh instead of accumulating
    params.grad = None
    if prn: print(loss.item())
    return preds

# repeat the step 10 times, printing the loss each time
for i in range(10): apply_step(params)

# def apply_step(params, prn=True):
#     preds = f(time, params)
#     loss = mse(preds, speed)
#     loss.backward(retain_graph=True)  # Retain the computation graph
#     with torch.no_grad():
#         params.data -= lr * params.grad
#     params.grad.zero_()
#     if prn: print(loss.item())
#     return preds

# for i in range(10): apply_step(params)


In [None]:

# Note to self: if you get the RuntimeError: 
# "Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed)."
# go back to the top and rerun all the cells

## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal complete: Build an End-to-End SGD Example 
# model the speed of a roller coaster as it went over the top of a hump

# intermediate goal: find a function to fit the model
# step 6 (continued): display the iterations
_,axs = plt.subplots(1,4,figsize=(12,3))
for ax in axs: show_preds(apply_step(params, False), ax)
plt.tight_layout()

In [None]:
## Training Process and Stochastic Gradient Descent (SGD) ##
# Goal complete: Build an End-to-End SGD Example 
# Step 7: stop
# We just decided to stop after 10 epochs arbitrarily. 
# In practice, the training and validation losses and metrics decide when to stop


In [None]:
## Lesson: Improving the MNIST Loss Function ##
# Goal: update the parameters (weights + bias) using gradients to predict a 3

# concatenate the images (x-variables) into a single tensor
# change the shape of the tensor without changing its content
train_x = torch.cat([stacked_threes, stacked_sevens]).view(-1, 28*28)
train_y = tensor([1]*len(threes) + [0]*len(sevens)).unsqueeze(1)
train_x.shape,train_y.shape

In [None]:
## Lesson: Improving the MNIST Loss Function ##
# Goal: update the parameters (weights + bias) using gradients to predict a 3
# a dataset in PyTorch is required to return a tuple of (x,y) when indexed. 
dset = list(zip(train_x, train_y))
x,y = dset[0]
x.shape,y

In [None]:
## Lesson: Improving the MNIST Loss Function ##
# Goal: update the parameters (weights + bias) using gradients to predict a 3

valid_x = torch.cat([valid_3_tens, valid_7_tens]).view(-1, 28*28)
valid_y = tensor([1]*len(valid_3_tens) + [0]*len(valid_7_tens)).unsqueeze(1)
valid_dset = list(zip(valid_x,valid_y))

In [None]:
## Lesson: Improving the MNIST Loss Function ##
# Goal: update the parameters (weights) using gradients to predict a 3

# y=w*x+b 
# The w in the equation is called the weights and the b is called the bias. 
# Together, the weights and bias make up the parameters.

# create an (initially random) weight for every pixel
def init_params(size, std=1.0):
    """Create a gradient-tracking tensor of normal random values scaled by `std`.

    Args:
        size: shape passed to `torch.randn`.
        std: factor the standard-normal samples are multiplied by.
    """
    scaled_noise = torch.randn(size) * std
    return scaled_noise.requires_grad_()
weights = init_params((28*28,1))

# The function weights*pixels is always equal to 0 when the pixels are equal to 0
# So, add a bias initialized with a random number for more flexibility
bias = init_params(1)

# calculate a prediction for one image using y = wx + b
(train_x[0]*weights.T).sum() + bias

In [None]:
## Lesson: Improving the MNIST Loss Function ##
# Goal: update the parameters (weights) using gradients to predict a 3

# matrix multiplication
# The equation, batch@weights + bias, is one of the two fundamental equations of any neural network
def linear1(xb):
    """Linear layer: matrix-multiply the batch by `weights` and add `bias`.

    `weights` and `bias` are the notebook-level parameter tensors.
    """
    activations = xb @ weights
    return activations + bias
preds = linear1(train_x)
preds

In [None]:
## Lesson: Improving the MNIST Loss Function ##
# Goal: update the parameters (weights) using gradients to predict a 3

# use broadcasting to check our accuracy (decide if an output represents a 3 or a 7)
# an output greater than 0.0 is treated as a 3 (target 1), anything else as a 7 (target 0)
corrects = (preds>0.0).float() == train_y
corrects

In [None]:
## Lesson: Improving the MNIST Loss Function ##
# Goal: update the parameters (weights) using gradients to predict a 3

corrects.float().mean().item()

In [None]:
## Lesson: Improving the MNIST Loss Function ##
# Goal: update the parameters (weights) using gradients to predict a 3

# make a small adjustment to the weight and observe the accuracy
# note we have to ask PyTorch not to calculate gradients as we do this
with torch.no_grad(): weights[0] *=1.0001
preds = linear1(train_x)
((preds>0.0).float() == train_y).float().mean().item()

## output 
#  small changes in the value of a weight will often not change the accuracy at all
# ** It is not useful to use accuracy as a loss function ** #

In [None]:
## Lesson: Improving the MNIST Loss Function ##
# Goal: update the parameters (weights) using gradients to predict a 3

# develop a loss function 
# loss func gives us a slightly better loss when adjusting our weights result in slightly better predictions

## lesson on loss ##
# loss measures the distance between the predictions and targets
# suppose we had three images which are 3, 7, 3 (the targets)
# suppose the model predicts 0.9, 0.4,and 0.2 
trgts = tensor([1,0,1])
prds = tensor([0.9, 0.4, 0.2])

# measure how distant each prediction is from 1 if it should be 1
# measure how distant each prediction is from 0 if it should be 0
# take the mean of all those distances
# note this uses the PyTorch function where()  
def mnist_loss(predictions, targets):
    """Mean distance of each prediction from its target.

    Where the target is 1 the distance is `1 - prediction`; elsewhere it is
    the prediction itself. Lower is better.
    """
    is_positive = targets == 1
    per_item = torch.where(is_positive, 1 - predictions, predictions)
    return per_item.mean()

# now run the loss function on the test data 3, 7, 3
# In PyTorch, we always assume a lower value of a loss function is better. 
torch.where(trgts==1, 1-prds, prds)

In [None]:
## Lesson: Improving the MNIST Loss Function ##
# Goal: update the parameters (weights) using gradients to predict a 3

## lesson on loss ##
# compute the final loss
# we need a scalar for the final loss
# mnist_loss takes the mean of the previous tensor
mnist_loss(prds,trgts)

In [None]:
## Lesson: Improving the MNIST Loss Function ##
# Goal: update the parameters (weights) using gradients to predict a 3

## lesson on loss ##
# change our prediction for the "false" target from 0.2 to 0.8
# the loss will go down
mnist_loss(tensor([0.9,0.4,0.8]),trgts)

In [None]:
## Lesson: Improving the MNIST Loss Function ##
# Goal: update the parameters (weights) using gradients to predict a 3
# The Sigmoid Function

# create a sigmoid function to ensure the output is always a number between 0 and 1.

In [None]:
## Lesson: Improving the MNIST Loss Function ##
# Goal: update the parameters (weights) using gradients to predict a 3

In [None]:
## Lesson: Improving the MNIST Loss Function ##
# Goal: update the parameters (weights) using gradients to predict a 3

In [None]:
## Lesson: Improving the MNIST Loss Function ##
# Goal: update the parameters (weights) using gradients to predict a 3