In [1]:
import torch
import numpy

from torch.utils.data import Dataset, DataLoader

import torch.nn as nn

from torch.optim import SGD

In [18]:
# the dataset is in the population_profits.txt file
# we can use numpy.loadtxt to load such basic files
dataset = numpy.loadtxt("population_profits.txt", delimiter = ",")

In [3]:
# numpy.loadtxt produces a numpy array
type(dataset)

numpy.ndarray

In [5]:
# the numpy array itself contains numpy arrays
# where each such array represents an example from the dataset 
dataset[0]

array([ 6.1101, 17.592 ])

In [17]:
# pytorch dataset: must inherit from Dataset
class PopulationProfitDataset(Dataset):
    # constructor: usually this is where we place the actual data in the Dataset object
    def __init__(self, numpy_array):
        self.numpy_array = numpy_array

    # the __getitem__ magic method is what is actually called behind the scenes in Python
    # when we use indexing. It's role is to return the element in position index from our dataset
    # Usually we return 2 tensors from this method: one with the inputs to our model
    # and one with the outputs
    # generally we write __getitem__ so that its output will make our lives easier when
    # using it
    def __getitem__(self, index):
        example = self.numpy_array[index]
        feature = example[0]
        target = example[1]
        return torch.Tensor([feature]), torch.Tensor([target])
    
    # magic method the gets called behind the scenes when we want to find the length of something
    # in our case, returns the size of the dataset
    # note: numpy_array is a 90 x 2 array, because we have 90 examples in our dataset and each example
    # is described by a population and a profit value. len(numpy_array) returns 90, which is the first
    # 'dimension' of the numpy_array
    def __len__(self):
        return len(self.numpy_array)

In [19]:
# we instantiate the dataset that we defined earlier, passing it the numpy array
torch_dataset = PopulationProfitDataset(dataset)

In [21]:
# this is equivalent to torch_dataset.__getitem__(10), which we defined earlier
# it returns 2 tensors, one containing the population (model input)
# and the other containing the population (model output)
torch_dataset[10]

(tensor([5.7107]), tensor([3.2522]))

In [22]:
# this is equivalent to torch_dataset.__len__(). When we defined the __len__ method earlier,
# we essentially told Python that when we call `len` on the dataset object, we want it to 
# call the `len` method of the numpy_array that our dataset actually contains
len(torch_dataset)

90

In [23]:
# we want to use our hardware to the max when training a model
# so we usually try to pass as many examples from the dataset through
# the model as we possible can.
# A dataloader takes in a dataset and gives us multiple examples from it at once
# in this case, when asking the dataloader for more data, it will give us 10 entries
# from the dataset
torch_loader = DataLoader(torch_dataset, batch_size = 10)

In [29]:
# the dataloader is a python generator
# at this point there is nothing in the dataloader
# it hasn't queried the dataset for data yet
# and it hasn't split it into minibatches (group of 10 entries, as mentioned above)

# when we request something from the dataloader (or from a python generator, in general)
# it generates (hence its name) the data right then and there
# this is useful when we have loads and loads of data
# for example, if we train on a dataset of a million images, we can't load them all into
# memory at once. In such a case, we would write our `__getitem__` method 
# so that it loads image number `index` from disk and returns it

# in that situation, when we would ask the dataloader for a minibatch
# it would call the __getitem__ method of the dataset n times
# load the n images right then and there, and return them to us
# in this way we can work with huge datasets even though we can't
# store the whole dataset in memory at once

# dataloaders and generators remember their history
# so next time we call on the dataloader / generator for data
# it will know where it last left off and will continue by giving us the next batch
for feature_batch, target_batch in torch_loader:
    break

In [30]:
# above we used break in the for loop, which means we stop after the dataloader gives us the first batch of data
# we can see a tensor of shape 10 x 1, meaning we received 10 examples from the dataset, each of them containing once value
# this looks equivalent to a tensor of shape 10, and we could store this data in such a tensor, but
# we usually would have more than one value for each example, in which case a 2D tensor would be necessary
# so we make the decision to always use 2D tensors in this situation
# notice that the inner tensors in the batch (the [6.1101], the [5.5277], etc) are of the same shape
# as the tensors that __getitem__ returns, because the dataloader used that method to obtain 
# the data that it gave us
feature_batch

tensor([[6.1101],
        [5.5277],
        [8.5186],
        [7.0032],
        [5.8598],
        [8.3829],
        [7.4764],
        [8.5781],
        [6.4862],
        [5.0546]])

In [31]:
target_batch

tensor([[17.5920],
        [ 9.1302],
        [13.6620],
        [11.8540],
        [ 6.8233],
        [11.8860],
        [ 4.3483],
        [12.0000],
        [ 6.5987],
        [ 3.8166]])

In [149]:
# we instantiate a linear regression model. The 2 parameters of nn.Linear represent the number of inputs to the model,
# respectively how many outputs we expect it to have: in our case, we have one input (the population) 
# and one output (the profit)
# this model implements the following equation: profit = a * population + b
model = nn.Linear(1, 1)

In [150]:
# pytorch stores the coefficients that we multiply inputs by in nn.Linear(_, _).weight
# in our case, this is a single parameter, representing a
# notice that the tensor has requires_grad = True: we intend to modify it's value later 
# through gradient/derivative based optimization
model.weight

Parameter containing:
tensor([[-0.2469]], requires_grad=True)

In [151]:
# pytorch stores the free term (bias term, b in our equation) in nn.Linear(_, _).bias
# as above, this tensor has requires_grad = True
model.bias

Parameter containing:
tensor([0.3258], requires_grad=True)

In [152]:
# to use the model, we give a tensor containing one single value as input
# this value represents a population in the problem we're trying to solve
# the model outputs another value, which we interpret as profit
model(torch.Tensor([6.11]))

tensor([-1.1830], grad_fn=<ViewBackward0>)

In [153]:
# our model has random values for its parameters, so we expect it to be quite terrible
# we need an exact measurement of how terrible it is at the task we're trying to use it for
# (give us the profit when we give it a population)
# for problems like this one, we can use the mean squared error loss
# as with most things in Pytorch, we first need to instantiate the loss function
loss_function = nn.MSELoss()

In [154]:
# to use the loss function, we pass it 2 tensors
# first tensor represents the output of our model (which we expect to be useless currently)
# and the second tensor represents the correct out, i.e. what we would have liked the first
# tensor to be equal to 
# MSELoss will now subtract the corresponding values in the 2 tensors
# followed by raising the result to the second power
# and then computing the mean of all of these error terms
# good model: small loss, bad model: big loss
loss_function(torch.Tensor([100, 120]), torch.Tensor([10, 20]))

tensor(9050.)

In [155]:
# there's no point in knowing how bad the model is if we can't improve it
# to improve the model, we use what is called an optimizer
# the optimizer is going to update our model's parameters
# so it must know what those parameters are
# that is why we pass it model.parameters() as the first argument
# the optimizer will change the model's parameters by some step size
# it has isn't own logic for choosing the step size 
# (for SGD, it uses the opposite of the gradient that it finds stored in the model parameters)
# but we can control that step size to some extent by multiplying it with a value that we choose
# we usually go for small values at first, and can then increase them if we notice our model improves too slow
# or decrease them if we end up with a model that doesn't learn anything
optimizer = SGD(model.parameters(), lr = 0.005)

In [156]:
# if we look the model.parameters(), we notice it is a generator
# a generator doesn't contain any values yet, but it knows how to obtain them
# for our purpose (to see what's in model.parameters, that is) we can ask the generator
# to give us all of its values by casting it to a list
# notice in the following 2 cells that model.weight and model.bias can be found in the parameters list
list(model.parameters())

[Parameter containing:
 tensor([[-0.2469]], requires_grad=True),
 Parameter containing:
 tensor([0.3258], requires_grad=True)]

In [157]:
model.weight

Parameter containing:
tensor([[-0.2469]], requires_grad=True)

In [158]:
model.bias

Parameter containing:
tensor([0.3258], requires_grad=True)

In [159]:
# we are now ready to train the model
# we will run 20 training iterations (in ML, an iteration is called an epoch)
for epoch in range(20):
    epoch_loss = 0
    # we query the dataloader for mini-batches
    for features_batch, targets_batch in torch_loader:
        # for each minibatch, we run it through the model
        # obtaining a prediction
        # we would like the prediction to be equal to targets_batch
        # because features_batch represents populations and
        # targets_batch represents profit and we would like our model to
        # be good at predicting profit from population 
        predicted_targets_batch = model(features_batch)
        # we compute the loss of the model using MSELoss
        loss = loss_function(predicted_targets_batch, targets_batch)
        # we reset the gradient value to 0
        # otherwise PyTorch would keep adding the new gradient value over the old one
        optimizer.zero_grad()
        # at this point we ran some inputs through the model
        # and computed the loss
        # behind the scenes, PyTorch has constructed a graph of everything we have done
        # this way, it knows how to compute the gradient of the loss function with respect to
        # the parameters (model.weight and model.bias) of the model
        # we compute those gradients using loss.backward()
        loss.backward()
        # finally, we take an optimization step: we change the value of the model parameters
        # in such a way that our model should improve
        optimizer.step()
        # we also would like to compute a loss for the whole dataset
        # remember that MSELoss computes a mean of the error when we give more than one prediction - correct output pair
        # so we just add all losses up here
        # and will later on divide the epoch_loss by how many minibatches we have
        epoch_loss += loss

    epoch_loss = epoch_loss / (len(torch_dataset) / 10)
    print(epoch_loss)

tensor(32.3659, grad_fn=<DivBackward0>)
tensor(17.8078, grad_fn=<DivBackward0>)
tensor(17.4761, grad_fn=<DivBackward0>)
tensor(17.1786, grad_fn=<DivBackward0>)
tensor(16.8935, grad_fn=<DivBackward0>)
tensor(16.6201, grad_fn=<DivBackward0>)
tensor(16.3580, grad_fn=<DivBackward0>)
tensor(16.1067, grad_fn=<DivBackward0>)
tensor(15.8658, grad_fn=<DivBackward0>)
tensor(15.6348, grad_fn=<DivBackward0>)
tensor(15.4132, grad_fn=<DivBackward0>)
tensor(15.2008, grad_fn=<DivBackward0>)
tensor(14.9971, grad_fn=<DivBackward0>)
tensor(14.8018, grad_fn=<DivBackward0>)
tensor(14.6144, grad_fn=<DivBackward0>)
tensor(14.4347, grad_fn=<DivBackward0>)
tensor(14.2624, grad_fn=<DivBackward0>)
tensor(14.0971, grad_fn=<DivBackward0>)
tensor(13.9385, grad_fn=<DivBackward0>)
tensor(13.7864, grad_fn=<DivBackward0>)


In [160]:
model.weight

Parameter containing:
tensor([[0.7316]], requires_grad=True)

In [162]:
model.bias

Parameter containing:
tensor([-1.0277], requires_grad=True)

In [165]:
# to use the model (that is now trained) we simply give it some value of the population
# and it will spit out a value for the profit
# EXTRA: notice the output tensor has grad_fn=something
# This means that pytorch has constructed a graph of everything we've done so far
# so that it could compute gradients if we ask it to. Gradients are useful when trying 
# to train a model, but here we just want to use it. Use Google to figure out
# how to prevent pytorch from computing gradients, and see what the output looks like
# when PyTorch doesn't do that
model(torch.Tensor([18]))

tensor([12.1415], grad_fn=<ViewBackward0>)