In [16]:
import torch
import torch.nn as nn
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler # to scale our features
from sklearn.model_selection import train_test_split

StandardScaler is a preprocessing tool provided by the scikit-learn library, used to standardize features by removing the mean and scaling to unit variance. 
This is often an essential step in data preprocessing for machine learning algorithms, particularly those that are sensitive to the scale of the data, such as gradient descent-based algorithms.

Here is a brief explanation of how StandardScaler works:

- Mean Removal: Each feature (column) of the data is centered by subtracting the mean of that feature.
- Scaling to Unit Variance: Each feature is scaled by dividing by its standard deviation.
This process transforms the data so that each feature has a mean of 0 and a standard deviation of 1.

Let's calculate the standardization step by step for the dataset $\left[\begin{array}{cc}1 & 2 \\ 3 & 4 \\ 5 & 6 \\ 7 & 8\end{array}\right]$.

### Step 1: Compute the Mean
First, we compute the mean for each feature (column).

$$
\text{mean}_1 = \frac{1 + 3 + 5 + 7}{4} = \frac{16}{4} = 4
$$

$$
\text{mean}_2 = \frac{2 + 4 + 6 + 8}{4} = \frac{20}{4} = 5
$$

### Step 2: Compute the Standard Deviation
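Next, we compute the standard deviation for each feature. Note that `StandardScaler` uses the *population* standard deviation, i.e. it divides by $N$ (here $N = 4$) rather than by $N - 1$.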

For the first feature (column 1):

$$
\text{sd}_1 = \sqrt{\frac{(1-4)^2 + (3-4)^2 + (5-4)^2 + (7-4)^2}{4}} = \sqrt{\frac{9 + 1 + 1 + 9}{4}} = \sqrt{\frac{20}{4}} = \sqrt{5} \approx 2.236
$$

For the second feature (column 2):

$$
\text{sd}_2 = \sqrt{\frac{(2-5)^2 + (4-5)^2 + (6-5)^2 + (8-5)^2}{4}} = \sqrt{\frac{9 + 1 + 1 + 9}{4}} = \sqrt{\frac{20}{4}} = \sqrt{5} \approx 2.236
$$

### Step 3: Standardize the Data
Now, we standardize each value by subtracting the mean and dividing by the standard deviation for each feature.

$$
z_{i,j} = \frac{x_{i,j} - \text{mean}_j}{\text{sd}_j}
$$

For the first feature (column 1):

$$
z_{1,1} = \frac{1 - 4}{2.236} = \frac{-3}{2.236} \approx -1.342
$$
$$
z_{2,1} = \frac{3 - 4}{2.236} = \frac{-1}{2.236} \approx -0.447
$$
$$
z_{3,1} = \frac{5 - 4}{2.236} = \frac{1}{2.236} \approx 0.447
$$
$$
z_{4,1} = \frac{7 - 4}{2.236} = \frac{3}{2.236} \approx 1.342
$$

For the second feature (column 2):

$$
z_{1,2} = \frac{2 - 5}{2.236} = \frac{-3}{2.236} \approx -1.342
$$
$$
z_{2,2} = \frac{4 - 5}{2.236} = \frac{-1}{2.236} \approx -0.447
$$
$$
z_{3,2} = \frac{6 - 5}{2.236} = \frac{1}{2.236} \approx 0.447
$$
$$
z_{4,2} = \frac{8 - 5}{2.236} = \frac{3}{2.236} \approx 1.342
$$

### Result
The standardized data is:

$$
\left[\begin{array}{cc}
-1.342 & -1.342 \\
-0.447 & -0.447 \\
0.447 & 0.447 \\
1.342 & 1.342
\end{array}\right]
$$

This matches the results you would get using `StandardScaler` in scikit-learn.

In [17]:
# Toy 4x2 dataset from the worked example: rows are samples, columns are features
data = [
    [1, 2],
    [3, 4],
    [5, 6],
    [7, 8],
]

# Learn each column's mean and standard deviation, then standardize in one call
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

print(scaled_data)


[[-1.34164079 -1.34164079]
 [-0.4472136  -0.4472136 ]
 [ 0.4472136   0.4472136 ]
 [ 1.34164079  1.34164079]]


In [18]:
# 0) Prepare data
# Load scikit-learn's breast-cancer dataset, then pull out the feature
# matrix and the 0/1 target vector under the short names used below.
bc = datasets.load_breast_cancer()
X = bc.data
y = bc.target


In [19]:
# Inspect the raw feature matrix (NumPy abbreviates the middle rows/columns)
X

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [20]:
# Inspect the target vector: one 0/1 class label per sample
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [21]:
# X is 2-D: the first axis indexes samples, the second indexes features
n_samples, n_features = X.shape
print(f'{n_samples} {n_features}')

569 30


In [22]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [23]:
# Standardize the features: the scaler's statistics are fitted on the training
# split only and then reused for the test split, so nothing about the test set
# influences the transformation.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Cast every array to float32 before wrapping it in a tensor, so the data
# dtype does not cause an error during training later on.
X_train, X_test, y_train, y_test = (
    torch.from_numpy(split.astype(np.float32))
    for split in (X_train, X_test, y_train, y_test)
)

In [24]:
# Training labels after conversion to a float32 tensor
y_train

tensor([1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1.,
        0., 1., 0., 1., 1., 1., 1., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 0.,
        1., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1.,
        1., 1., 0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0.,
        1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0.,
        1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
        0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 1., 1.,
        0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 0., 0.,
        0., 1., 0., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1.,
        0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
        1., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0.,
        1., 1., 0., 1., 0., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1.,
        1., 1., 1., 0., 1., 1., 1., 0., 

In [25]:
# Test labels after conversion to a float32 tensor
y_test

tensor([1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1.,
        1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1.,
        1., 0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 0.,
        0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 0.,
        1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
        1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0.,
        0., 1., 0., 1., 0., 0.])

In [26]:
# Shape check: the training labels are still a flat 1-D tensor at this point
y_train.shape

torch.Size([455])

In [27]:
# Shape check: the test labels are still a flat 1-D tensor at this point
y_test.shape

torch.Size([114])

In [28]:
# Turn the label tensors into (n, 1) column vectors so their shape lines up
# with the model's one-output-per-sample predictions.
y_train = y_train.view(-1, 1)
y_test = y_test.view(-1, 1)
print(y_train.shape, y_test.shape)


torch.Size([455, 1]) torch.Size([114, 1])


In [29]:
# Confirm the training labels are now a column vector (one row per sample)
y_train

tensor([[1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
      

In [30]:
# Confirm the test labels are now a column vector (one row per sample)
y_test

tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
      

In [31]:
# 1) Model
# Logistic regression: a single linear layer (w·x + b) squashed by a sigmoid
class Model(nn.Module):
    """Binary classifier mapping each feature vector to one probability.

    Args:
        n_input_features: Number of input features per sample.
    """

    def __init__(self, n_input_features):
        super().__init__()
        # One output unit: the score for the positive class
        self.linear = nn.Linear(in_features=n_input_features, out_features=1)

    def forward(self, x):
        """Return sigmoid(linear(x)); output has one column, values in [0, 1]."""
        logits = self.linear(x)
        return torch.sigmoid(logits)

# nn.Linear initialises its weights randomly. Seed PyTorch's RNG first so the
# starting point, and therefore the loss curve and test accuracy, is the same on
# every Restart & Run All. The seed matches the random_state used for the split.
# NOTE: outputs recorded from an earlier, unseeded run may differ slightly.
torch.manual_seed(1234)
model = Model(n_features)

Binary Cross Entropy (BCE) Loss is a loss function used in binary classification tasks. It measures the performance of a classification model whose output is a probability value between 0 and 1. BCE Loss is used to quantify the difference between two probability distributions - the true labels (which are 0 or 1) and the predicted probabilities from the model.

The Binary Cross Entropy Loss function is defined as:

$$ \text{BCE Loss} = - \frac{1}{N} \sum_{i=1}^N [y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i)] $$

where:
- $N$ is the number of samples.
- $y_i$ is the true label for the $i$-th sample (0 or 1).
- $\hat{y}_i$ is the predicted probability for the $i$-th sample (between 0 and 1).



In [32]:
# 2) Loss and optimizer
# Training hyperparameters
num_epochs = 100
learning_rate = 0.01

# Binary cross-entropy on the model's probability outputs,
# minimised with plain stochastic gradient descent
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=learning_rate,
)

In [33]:
# 3) Training loop
# Each epoch is one full-batch gradient step over the whole training set
for epoch in range(num_epochs):
    # Forward: predicted probabilities, then BCE against the true labels
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)

    # Backward: compute gradients and apply the SGD update
    loss.backward()
    optimizer.step()

    # Reset gradients so the next epoch does not accumulate onto these
    optimizer.zero_grad()

    # Log on every 10th epoch (epoch is 0-based, hence the remainder of 9)
    if epoch % 10 == 9:
        print(f'epoch: {epoch+1}, loss = {loss.item():.4f}')


epoch: 10, loss = 0.5411
epoch: 20, loss = 0.4566
epoch: 30, loss = 0.4001
epoch: 40, loss = 0.3596
epoch: 50, loss = 0.3290
epoch: 60, loss = 0.3049
epoch: 70, loss = 0.2853
epoch: 80, loss = 0.2690
epoch: 90, loss = 0.2551
epoch: 100, loss = 0.2432


In [34]:
# 4) Evaluation on the held-out test set; no gradient tracking is needed here
with torch.no_grad():
    y_predicted = model(X_test)
    # Round the predicted probabilities to get hard 0/1 class labels
    y_predicted_cls = torch.round(y_predicted)
    # Accuracy = number of matching labels / number of test samples
    acc = (y_predicted_cls == y_test).sum() / float(len(y_test))
    print(f'accuracy: {acc.item():.4f}')

accuracy: 0.9211
