# Attribution

Géron, Aurélien (2025). Hands-On Machine Learning with Scikit-Learn and PyTorch: Concepts, Tools, and Techniques to Build Intelligent Systems. O'Reilly: Santa Rosa (CA).

# Linear Regression with PyTorch

In [1]:
import torch

## Device check

In [2]:
# Pick the compute backend in order of preference:
# CUDA if available, otherwise MPS if available, otherwise the CPU.
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

device

'mps'

## Getting data

In [3]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

In [4]:
# Load the California housing dataset through scikit-learn; the returned object
# exposes the feature matrix as `.data` and the target values as `.target`.
housing_dataset = fetch_california_housing()

In [5]:
# Peek at the first three instances: the feature rows, then their targets.
print(housing_dataset.data[:3])
print(housing_dataset.target[:3])

[[ 8.32520000e+00  4.10000000e+01  6.98412698e+00  1.02380952e+00
   3.22000000e+02  2.55555556e+00  3.78800000e+01 -1.22230000e+02]
 [ 8.30140000e+00  2.10000000e+01  6.23813708e+00  9.71880492e-01
   2.40100000e+03  2.10984183e+00  3.78600000e+01 -1.22220000e+02]
 [ 7.25740000e+00  5.20000000e+01  8.28813559e+00  1.07344633e+00
   4.96000000e+02  2.80225989e+00  3.78500000e+01 -1.22240000e+02]]
[4.526 3.585 3.521]


## Test set

In [6]:
# Hold out a test set. No `test_size` is passed, so scikit-learn's default
# split proportion applies; `random_state=42` fixes the seed used for the split.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing_dataset.data,
    housing_dataset.target,
    random_state=42
)

## Train & Validation set

In [7]:
# Split the remaining (non-test) data once more into a training and a
# validation set, again with the default proportion and a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    random_state=42
)

## Conversion to Tensor & normalization

In [8]:
# Turn the three feature splits into float tensors.
X_train, X_valid, X_test = (
    torch.FloatTensor(split) for split in (X_train, X_valid, X_test)
)

# Per-feature statistics are taken from the training split only and then
# reused for the validation and test splits.
means = X_train.mean(dim=0, keepdims=True)
standards = X_train.std(dim=0, keepdims=True)

# Standardize every split with the training statistics.
X_train, X_valid, X_test = (
    (split - means) / standards for split in (X_train, X_valid, X_test)
)

## Conversion of targets to tensors

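Each target vector is reshaped from 1-D (shape `(n,)`) into a 2-D column (shape `(n, 1)`), so that it has the same shape as the model's predictions.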

In [9]:
# Convert each target split to a float tensor shaped as a single column (n, 1).
y_train, y_valid, y_test = (
    torch.FloatTensor(target).reshape(-1, 1)
    for target in (y_train, y_valid, y_test)
)

## Training function 

- creates the parameters (`w` and `b`) itself
- uses batch gradient descent (the whole training set in every step)

In [15]:
def training(lr, epochs):
    """Fit a linear model on the notebook's training tensors with batch gradient descent.

    Reads the module-level ``X_train`` and ``y_train`` and (re)creates the
    module-level parameters ``w`` and ``b``, which stay available to later
    cells after the call. The torch RNG is re-seeded with 42 on every call
    before the weights are drawn.

    Args:
        lr: learning rate that scales each gradient step.
        epochs: number of passes over the full training set (one parameter
            update per pass).

    Returns:
        None. For every epoch, the MSE loss computed before that epoch's
        update is printed.
    """
    global w, b

    torch.manual_seed(42)

    # one randomly drawn weight per input column, stored as a column vector
    w = torch.randn((X_train.shape[1], 1), requires_grad=True)
    # scalar bias, starting at zero
    b = torch.tensor(0., requires_grad=True)

    for epoch_number in range(1, epochs + 1):
        # forward pass over the whole training set, then the mean squared error
        residuals = (X_train @ w + b) - y_train
        loss = (residuals ** 2).mean()

        # autograd populates w.grad and b.grad
        loss.backward()

        # gradient-descent update, performed outside of autograd tracking;
        # each gradient is reset right after it has been applied
        with torch.no_grad():
            for param in (b, w):
                param -= lr * param.grad
                param.grad.zero_()

        print(f"Epoch {epoch_number}/{epochs}; Loss {loss.item()}")

## Training

In [16]:
# learning rate 0.4, 20 epochs
training(0.4, 20)

Epoch 1/20; Loss 16.158456802368164
Epoch 2/20; Loss 4.8793745040893555
Epoch 3/20; Loss 2.25522518157959
Epoch 4/20; Loss 1.3307634592056274
Epoch 5/20; Loss 0.9680693745613098
Epoch 6/20; Loss 0.8142677545547485
Epoch 7/20; Loss 0.7417045831680298
Epoch 8/20; Loss 0.7020701169967651
Epoch 9/20; Loss 0.6765918731689453
Epoch 10/20; Loss 0.6577964425086975
Epoch 11/20; Loss 0.6426151394844055
Epoch 12/20; Loss 0.6297222971916199
Epoch 13/20; Loss 0.6184942126274109
Epoch 14/20; Loss 0.6085968613624573
Epoch 15/20; Loss 0.5998216271400452
Epoch 16/20; Loss 0.592018723487854
Epoch 17/20; Loss 0.5850691795349121
Epoch 18/20; Loss 0.578873336315155
Epoch 19/20; Loss 0.573345422744751
Epoch 20/20; Loss 0.5684100389480591


## Making predictions

In [17]:
# Predict for the first five test instances with the `w` and `b` set by training().
X_new = X_test[:5]
# no_grad(): this is inference only, so autograd tracking is switched off
with torch.no_grad():
    y_pred = X_new @ w + b

In [18]:
# predictions for the five test instances selected above
print(y_pred)

tensor([[0.8916],
        [1.6480],
        [2.6577],
        [2.7062],
        [2.2410]])


In [19]:
# actual targets of the first five test instances, for comparison
y_test[:5]

tensor([[0.4770],
        [0.4580],
        [5.0000],
        [2.1860],
        [2.7800]])

## Using higher-level API

In [20]:
import torch.nn as nn

In [29]:
# Re-declare the values used inside training() above at notebook level:
# feature count, weight column vector, scalar bias, learning rate, epoch count.

n_features = X_train.shape[1]
# NOTE(review): no seed is set in this cell, so these weights depend on the
# current RNG state, and the assignments below overwrite the global `w`/`b`
# that training() produced.
w = torch.randn((n_features, 1), requires_grad=True)
b = torch.tensor(0., requires_grad=True)

learning_rate = 0.4
n_epochs = 20

In [22]:
# Seed immediately before construction so the layer's random initial
# parameters are the same on every run.
torch.manual_seed(42)

# A single linear layer: n_features inputs -> 1 output.
model = nn.Linear(in_features=n_features, out_features=1)

In [26]:
# parameters() yields the weights first, then the bias
print(*model.parameters(), sep="\n")

Parameter containing:
tensor([[ 0.2703,  0.2935, -0.0828,  0.3248, -0.0775,  0.0713, -0.1721,  0.2076]],
       requires_grad=True)
Parameter containing:
tensor([0.3117], requires_grad=True)


In [27]:
# Predictions of the untrained model for the first two training instances.
# The parameters still hold their random initial values at this point,
# so these predictions are not meaningful yet.
model(X_train[:2])

tensor([[-0.4718],
        [ 0.1131]], grad_fn=<AddmmBackward0>)

In [30]:
# SGD optimizer that updates the model's parameters, using the learning rate set above.
# The loss function is the mean squared error (MSE).
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
mse = nn.MSELoss()

$$
\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2
$$

In [31]:
def train_bgd(model, optimizer, criterion, X_train, y_train, n_epochs):
    """Train ``model`` with full-batch gradient descent, printing the loss per epoch.

    Each epoch runs one forward pass over all of ``X_train``, scores the
    result against ``y_train`` with ``criterion``, backpropagates, and lets
    ``optimizer`` apply a single update.

    Args:
        model: callable that maps ``X_train`` to predictions.
        optimizer: object providing ``step()`` and ``zero_grad()``; expected
            to manage the parameters of ``model``.
        criterion: loss callable, invoked as ``criterion(predictions, y_train)``.
        X_train: training inputs, passed to ``model`` as one batch.
        y_train: training targets.
        n_epochs: number of epochs, i.e. number of optimizer steps.

    Returns:
        None. The printed loss of an epoch is the one computed before that
        epoch's optimizer step.
    """
    for epoch_number in range(1, n_epochs + 1):
        batch_loss = criterion(model(X_train), y_train)
        batch_loss.backward()
        optimizer.step()
        # reset gradients immediately so the next backward() starts clean
        optimizer.zero_grad()
        print(f"Epoch {epoch_number}/{n_epochs}, Loss: {batch_loss.item()}")