In [1]:
# Imports
import torch
from torch import nn, optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
def seed_all(seed=42):
    """
    Seeds the numpy and torch global random number generators.

    Args:
        seed: Integer seed applied to both libraries (default 42).
    """
    np.random.seed(seed)
    # torch must be seeded by *calling* torch.manual_seed; assigning to
    # `torch.random.seed` would only overwrite that function and seed nothing.
    torch.manual_seed(seed)

seed_all()

In [3]:
# Create some X data
X = np.random.uniform(0, 10, 100)

In [4]:
# Define the slope (m), bias (b), and some noise we want to add to X to make y
m = 3
b = 1.8
noise = np.random.normal(scale=3, size=100)

# Exercise 2.1

Working with `np.ndarray` and `torch.Tensor` objects is very similar!
This exercise is intended as a warm-up for array operations that will be common throughout the course.
Based on your knowledge of linear regression, please use the variables `X`, `m`, `b`, and `noise` to generate a new variable `y`.
The output should be the target variable `y`, where `X` and `y` are related in a linear fashion, but with some noise.

<!-- startquestion -->

In [5]:
y = ...

In [7]:
# Scatter the generated (X, y) pairs so the noisy linear trend is visible.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set(xlabel='X', ylabel='y')
ax.scatter(X, y)

# Exercise 2.2

Now that you've learned about the Mean Squared Error (MSE) loss function, it's time to put your knowledge into practice. In this exercise, you'll be tasked with implementing the MSE loss function in Python.

Write a Python function named `mse` that takes two tensors of equal length as input: `ys` (the actual values) and `yhats` (the predicted values). Your function should return the Mean Squared Error between the actual and predicted values.

We've provided some python code to get you started.

<!-- startquestion -->

In [8]:
# Define MSE
def mse(predictions:torch.Tensor, actuals:torch.Tensor) -> torch.Tensor:
    """
    Exercise stub for the Mean Squared Error between two tensors.

    Args:
        predictions: Predicted values.
        actuals: Ground-truth values.

    Returns:
        Once implemented, the mean of the squared element-wise differences.
        The assertion a few cells below compares the result against a scalar
        tensor with torch.allclose.

    Raises:
        NotImplementedError: Always, until the body is implemented.
    """
    # your code here
    raise NotImplementedError("Implement MSE, then remove this line")

In [10]:
ys = torch.tensor([1,2,3])
yhats = torch.tensor([1.1, 2.1, 3.1])

In [11]:
assert torch.allclose(mse(ys, yhats),  torch.tensor(0.01))

In [12]:
print(mse(ys, yhats))
print(mse(ys, yhats).numpy())

In [13]:
# Now that we've defined MSE, let's just use Torch's.
mse_loss = nn.MSELoss()
mse_loss(ys, yhats)

In [14]:
# We can also use the functional API to calculate MSE
F.mse_loss(ys, yhats)

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Exercise 2.3

In the exercise below, your task is to fit a `LinearRegression` model using `scikit-learn`.
This involves training the model on our dataset and then inspecting the learned parameters - the coefficient (slope) and intercept.
You'll also calculate the mean squared error (MSE) of the model's predictions, which gives us a measure of how well the model fits the data.

<!-- startquestion -->

In [16]:
# your code here
lr = ...

In [18]:
# Display the slope and intercept
lr.coef_, lr.intercept_

In [19]:
# Calculate the mean squared error
mean_squared_error(y, lr.predict(X.reshape(-1, 1)))

In [20]:
# Overlay the fitted regression line on the scatter of the data
fig, ax = plt.subplots(figsize=(10, 10))
ax.set(xlabel='X', ylabel='y')
ax.scatter(X, y)
# Evaluate the fitted line at the integer x positions 0..9
_x = np.arange(0, 10)
_y = lr.intercept_ + lr.coef_[0] * _x
ax.plot(_x, _y, c='red', label="Line of best fit")
ax.legend()

In [21]:
# Because we're in torch now, let's just turn X and y into tensors.
# torch.as_tensor keeps this cell safe to re-run (an existing tensor is passed
# through instead of triggering torch.tensor's copy-construct warning), and
# float32 matches the dtype of the parameters created with torch.rand.
X = torch.as_tensor(X, dtype=torch.float32)
y = torch.as_tensor(y, dtype=torch.float32)

# Exercise 2.4

In the exercise below, your task is to complete the `forward` method within the `LinReg` class. This method should define how the input `X` is transformed to produce the output for a linear regression model.

Before we dive into the exercise, let's understand what `nn.Parameter` does. In PyTorch, `nn.Parameter` is a special kind of Tensor that automatically registers itself as a parameter when assigned as an attribute to a `Module`. This is particularly useful when we want certain tensors to be considered as trainable parameters of our model. In our case, the slope and the bias are the parameters that the model will learn during training.

<!-- startquestion -->

In [22]:
# Build our linear regression model
class LinReg(nn.Module):
    """Single-feature linear regression with a learnable slope and bias."""
    def __init__(self):
        super().__init__()
        # Randomly initialize 2 parameters, one for our slope and one for our bias.
        self.slope = nn.Parameter(torch.rand(1))
        self.bias = nn.Parameter(torch.rand(1))

    def forward(self, X):
        """
        Exercise stub: should turn the input X into predictions using
        self.slope and self.bias.

        Raises:
            NotImplementedError: Always, until the exercise is completed.
        """
        raise NotImplementedError()

In [24]:
lr = LinReg()

In [25]:
N_EPOCHS = 300
LR = 1e-3

In [26]:
# Initialize lists to store slopes, biases, losses, and alphas for visualization later
slopes = []
biases = []
losses = []
_alphas = []

# Log roughly every 10% of the epochs. Integer arithmetic with a floor of 1
# keeps the check meaningful when N_EPOCHS is below 10 or not a multiple of 10.
log_every = max(1, N_EPOCHS // 10)

# Loop over the number of epochs
for i in range(N_EPOCHS):
    # Generate predictions using the current model
    yhat = lr(X)

    # Compute the loss between the predictions and actual values
    loss = F.mse_loss(yhat, y)

    if i % log_every == 0:
        print(f"Epoch {i} Train Loss: {loss:.04f}")

    # Compute the gradients of the loss with respect to the model parameters
    loss.backward()

    # Update the model parameters using the computed gradients and the learning rate.
    # no_grad keeps the in-place update itself out of the autograd graph.
    with torch.no_grad():
        lr.slope.sub_(lr.slope.grad * LR)
        lr.bias.sub_(lr.bias.grad * LR)

    # Reset the gradients to zero for the next iteration
    lr.slope.grad.zero_()
    lr.bias.grad.zero_()

    # Store the current (post-update) parameters and this epoch's loss as
    # plain Python floats for visualization later
    slopes.append(lr.slope.item())
    biases.append(lr.bias.item())
    losses.append(loss.item())
    _alphas.append(i / N_EPOCHS)


In [27]:
lr.slope, lr.bias

In [28]:
fig, ax = plt.subplots(figsize=(10,10))
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.scatter(X, y)
# The x positions are the same for every line, so build them once.
_x = np.arange(0, 10)
# Loop names are chosen so the notebook-level `b` (the true bias used to
# generate y) is not overwritten. The epoch number comes from enumerate
# because alpha is a fraction in [0, 1), so int(alpha) would always be 0.
for epoch, (slope, intercept, alpha) in enumerate(zip(slopes, biases, _alphas)):
    _y = _x * slope + intercept
    ax.plot(_x, _y, alpha=alpha, c='red', label=f"Epoch {epoch}")


In [29]:
fig, ax = plt.subplots(figsize=(14, 10))
ax.plot(losses)
ax.set(xlabel='Epoch', ylabel='Loss (MSE)')
# Training counts as diverging when the final loss is NaN or ends above
# the loss of the first epoch.
first_loss, final_loss = losses[0], losses[-1]
diverged = bool(np.isnan(final_loss)) or final_loss > first_loss
ax.set_title('Diverging - BAD!' if diverged else 'Converging - goood!')

# Exercise 2.5

Now that you've seen how the training loop works, it's time for you to experiment with different hyperparameters. Specifically, we'll be adjusting the learning rate (`LR`) and the number of epochs (`N_EPOCHS`).

Hyperparameters are settings that we can tune to control how the model learns. They are not learned from the data but are set prior to training. The learning rate and the number of epochs are two of the most important hyperparameters in gradient descent.

1. **Learning Rate (`LR`):** This controls the size of the steps that we take during gradient descent. A larger learning rate means we take bigger steps, while a smaller learning rate means we take smaller steps.

2. **Number of Epochs (`N_EPOCHS`):** This is simply the number of times the training loop is run. More epochs mean more opportunities for the model to learn from the data.

Here's what you need to do:

1. Change the values of `LR` and `N_EPOCHS` in the code.
2. Re-run the code up to this point. Remember to re-instantiate your model every time to start from scratch.
3. Observe the changes in the model's performance.

Things to consider:

- What happens if you make the learning rate too large? What about if it's too small?
- How does changing the number of epochs affect the model's performance and the number of epochs required to reach the loss minimum?
- Can you find a combination of `LR` and `N_EPOCHS` that gives you the best performance in the fewest number of epochs?

Remember, machine learning involves a lot of experimentation. Don't be afraid to try different values and see what happens. Happy tuning!

<!-- startquestion -->

In [30]:
# Go back and re-run the code with different hyperparameters.

In [31]:
# Make yet another fake dataset
from sklearn.datasets import make_regression
X, y = make_regression(n_samples=1000, n_features=3, n_informative=2, bias=3, noise=2)

In [32]:
# No more bad habits, we need to split our data.
X_train, X_valid, y_train, y_valid = (torch.tensor(i).float() for i in train_test_split(X, y, test_size=0.1, random_state=42))

# Exercise 2.6

In the following exercise, your task is to create a few `nn.Parameter`s that will represent our weights and bias for the multi-variable linear regression model. Here's what you need to do:

1. **Weights Parameter:** Create a `weights` parameter that matches the number of columns in `X_train`. Each weight corresponds to a feature in our input data. Initialize these weights with random numbers, which you can generate using `torch.rand`.

2. **Bias Parameter:** Create a `bias` parameter that represents the bias term in our regression model. This should be a single value, also initialized with a random number using `torch.rand`.

Remember, these parameters (weights and bias) are the values that our model will learn to adjust during the training process to minimize the loss function. Initially, we set them to random values, but as the model learns from the data, these parameters will be updated to better fit our data.

<!-- startquestion -->

In [33]:
# Let's create some temporary weights and biases and test out our matrix operations before we build our model.
# Create a weights parameter with 1 beta per column in X
weights = ...
# Create our bias parameter
bias = ...

In [35]:
# Test out the operation we want to perform in the forward pass
torch.matmul(X_train[:10], weights) + bias

In [36]:
# FYI: @ does the same thing as matmul in this context
X_train[:10]@weights + bias

In [37]:
# Sanity check: different implementations of our forward pass are the same
assert (X_train@weights + bias == torch.matmul(X_train, weights) + bias).all()

In [38]:
# Let's make our model
class LinRegMulti(nn.Module):
    """
    Linear regression over several input features.

    Args:
        n_cols: Number of feature columns; one weight is created per column.
    """
    def __init__(self, n_cols):
        super().__init__()
        self.n_cols = n_cols

        # Weights and bias start as uniform [0, 1) samples (torch.rand).
        self.weights = nn.Parameter(torch.rand(self.n_cols))
        self.bias = nn.Parameter(torch.rand(1))

    def forward(self, X):
        """
        Return X @ weights + bias.

        For X of shape (n_rows, n_cols) the result has shape (n_rows,).
        """
        # weights is 1-D, so no transpose is needed; `.T` on a tensor that is
        # not 2-D is deprecated in PyTorch and emits a warning.
        return X @ self.weights + self.bias

In [39]:
N_EPOCHS = 10000
LR = 1e-3

In [40]:
lrm = LinRegMulti(X_train.shape[1])

In [41]:
# Instead of updating each parameter individually, let's make an update rule function.
def gd_update_rule(parameters, lr):
    """
    Apply one in-place gradient-descent step to a tensor and clear its gradient.

    Args:
        parameters: Tensor (e.g. an nn.Parameter) whose `.grad` holds the
            gradient from the most recent backward pass.
        lr: Learning rate that scales the step.

    A tensor whose `.grad` is None is left untouched, which mirrors how
    torch's built-in optimizers skip parameters without gradients.
    """
    if parameters.grad is None:
        return
    # no_grad keeps the in-place update out of the autograd graph without
    # reaching into `.data`.
    with torch.no_grad():
        parameters.sub_(parameters.grad * lr)
    parameters.grad.zero_()

In [42]:
train_losses = []
valid_losses = []

In [43]:
# Log roughly every 10% of the epochs. Integer arithmetic with a floor of 1
# keeps the check meaningful when N_EPOCHS is below 10 or not a multiple of 10.
log_every = max(1, N_EPOCHS // 10)

for i in range(N_EPOCHS):
    # Forward pass and loss on the training split
    yhat = lrm(X_train)
    loss = mse(yhat, y_train)
    loss.backward()
    # One gradient-descent step per parameter (this also zeroes each gradient)
    for p in lrm.parameters():
        gd_update_rule(p, LR)
    # Store plain Python floats rather than 0-d arrays
    train_losses.append(loss.item())

    # Validation loss is only monitored, so no gradients are tracked
    with torch.no_grad():
        yhat = lrm(X_valid)
        valid_loss = mse(yhat, y_valid)
        valid_losses.append(valid_loss.item())

    if i % log_every == 0:
        print(f"Epoch {i} Train Loss: {loss:.04f}, Valid Loss: {valid_loss:.04f}")

In [44]:
# Only the first EPOCHS_TO_SHOW epochs are drawn so the early part of the
# curves stays readable.
EPOCHS_TO_SHOW = 2000
fig, ax = plt.subplots(figsize=(12, 12))
ax.plot(train_losses[:EPOCHS_TO_SHOW], label='Train', linewidth=3, alpha=0.5)
ax.plot(valid_losses[:EPOCHS_TO_SHOW], ls='--', label='Valid')
# Label the axes so the figure can be read on its own
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss (MSE)')
ax.legend()

In [45]:
lrm.weights

In [46]:
lrm.bias

In [47]:
class Linear(nn.Module):
    """
    Fully connected layer computing X @ weights + bias.

    Args:
        dim_in: Number of input features.
        dim_out: Number of output features.
    """
    def __init__(self, dim_in, dim_out):
        super().__init__()
        # Both parameters start as uniform [0, 1) samples (torch.rand);
        # weights has shape (dim_in, dim_out) and bias has shape (dim_out,).
        weight_shape = (dim_in, dim_out)
        self.weights = nn.Parameter(torch.rand(weight_shape))
        self.bias = nn.Parameter(torch.rand(dim_out))

    def forward(self, X):
        """Return the affine transform of X, with the bias added by broadcasting."""
        projected = torch.matmul(X, self.weights)
        return projected + self.bias

In [48]:
# Let's compare our Linear class with nn.Linear
l1 = Linear(3, 5)
l2 = nn.Linear(3, 5)

In [49]:
l2.weight

In [50]:
l1.weights

In [51]:
# We need to make sure the weights have the same values.
# If they don't, we won't be able to compare the output.
# nn.Linear stores its weight with shape (out_features, in_features) and
# computes x @ weight.T + bias, whereas our Linear stores (dim_in, dim_out)
# and computes X @ weights + bias. Copying the transpose makes the two
# layers represent the same function.
l1.weights.data.copy_(l2.weight.T)
l1.bias.data.copy_(l2.bias)

In [52]:
l1(X_train[:5])

In [53]:
l2(X_train[:5])

In [54]:
# The two layers compute the same affine map through different kernels, so
# compare with a floating-point tolerance instead of exact equality.
assert torch.allclose(l1(X_train[:5]), l2(X_train[:5]), atol=1e-6)

In [55]:
%%timeit
l1(X_train[:5])

In [56]:
%%timeit
 l2(X_train[:5])

In [57]:
rng = torch.arange(-5, 5.01, 0.05)
fig, ax = plt.subplots(figsize=(14, 8))
ax.plot(rng, F.relu(rng), label='ReLU')
ax.plot(rng, torch.tanh(rng), label='tanh')
ax.plot(rng, torch.sigmoid(rng), label='sigmoid')
ax.plot(rng, F.leaky_relu(rng, negative_slope=0.01), ls='--', label='leaky ReLU')
ax.set_ylim(-1.1, 1.1)
ax.set_title('Common activation functions')
ax.legend()

In [58]:
class MultiLayerRegressor(nn.Module):
    """
    Two-layer regressor: Linear -> ReLU -> Linear with a single output column.

    The built-in nn.Linear and F.relu are used here; the hand-written
    `Linear` class defined earlier in the notebook could be substituted for
    nn.Linear.

    Args:
        dim_in: Number of input features.
        hidden_dim: Width of the hidden layer.
    """
    def __init__(self, dim_in, hidden_dim):
        super().__init__()
        self.first_layer = nn.Linear(dim_in, hidden_dim)
        self.second_layer = nn.Linear(hidden_dim, 1)

    def forward(self, X):
        """Return predictions with a trailing dimension of size 1."""
        hidden = F.relu(self.first_layer(X))
        return self.second_layer(hidden)

In [59]:
def multilayer_regressor(in_dim, hidden_dim):
    """
    Build a Linear -> ReLU -> Linear regressor as an nn.Sequential.

    Args:
        in_dim: Number of input features.
        hidden_dim: Width of the hidden layer.

    Returns:
        An nn.Sequential whose final layer has a single output feature.
    """
    layers = [
        nn.Linear(in_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, 1),
    ]
    return nn.Sequential(*layers)

In [60]:
# mlr = MultiLayerRegressor(3, 4)
mlr = multilayer_regressor(3, 4)

In [61]:
train_losses = []
valid_losses = []

In [62]:
LR = 1e-3
N_EPOCHS = 20000

In [63]:
# Notice that instead of iterating through our parameters and applying
# an update rule, we're just using torch's built in SGD optimizer.
opt = optim.SGD(mlr.parameters(), lr=LR)

In [64]:
# Log roughly every 10% of the epochs. Integer arithmetic with a floor of 1
# keeps the check meaningful when N_EPOCHS is below 10 or not a multiple of 10.
log_every = max(1, N_EPOCHS // 10)

for i in range(N_EPOCHS):
    # squeeze(-1) drops only the trailing output dimension, (N, 1) -> (N,),
    # so the predictions line up with y_train even for a batch of one.
    yhat = mlr(X_train).squeeze(-1)
    # Calculate the loss
    loss = F.mse_loss(yhat, y_train)
    # Calculate the gradients
    loss.backward()
    # Perform the update step
    opt.step()
    # Zero out the gradients
    opt.zero_grad()
    train_losses.append(loss.item())

    # Validation loss is only monitored, so no gradients are tracked
    with torch.no_grad():
        yhat = mlr(X_valid).squeeze(-1)
        valid_loss = F.mse_loss(yhat, y_valid)
        # Record the validation loss itself. Appending `loss` here would
        # duplicate the training curve, and `.numpy()` on a tensor that
        # requires grad raises a RuntimeError.
        valid_losses.append(valid_loss.item())

    if i % log_every == 0:
        print(f"Epoch {i} Train loss: {loss:.04f}, Valid loss: {valid_loss:.04f}")

In [65]:
# Only the first `idx` epochs are drawn
idx = 10000
fig, ax = plt.subplots(figsize=(12, 12))
ax.plot(train_losses[:idx], label='Train', linewidth=3, alpha=0.5)
ax.plot(valid_losses[:idx], ls='--', label='Valid')
# Label the axes so the figure can be read on its own
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss (MSE)')
ax.legend()

In [66]:
# Modify the code above to complete the exercise