In [1]:
!pip install numpy pandas matplotlib torch




[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import  torch


### Download and inspect data

In [3]:
df = pd.read_csv("data_banknote_authentication.txt", header=None)
df.head()

Unnamed: 0,0,1,2,3,4
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [5]:
X_features = df[[0, 1, 2, 3]].values
y_target = df[4].values

In [6]:
X_features.shape

(1372, 4)

In [7]:
import numpy as np

np.bincount(y_target)

array([762, 610], dtype=int64)

### Create custom Dataloader

In [12]:
from torch.utils.data import DataLoader, Dataset


class BanknoteDataset(Dataset):

    def __init__(self, X, y):

        self._features = torch.tensor(X, dtype=torch.float32)
        self._labels = torch.tensor(y, dtype=torch.float32)

    def __getitem__(self, index):
        x = self._features[index]
        y = self._labels[index]

        return x, y

    def __len__(self):
        return self._labels.shape[0]


In [9]:
train_size = int(X_features.shape[0] * 0.8)
train_size

1097

In [10]:
test_size = X_features.shape[0] - train_size
test_size

275

### Split data to train and test

In [13]:
dataset = BanknoteDataset(X_features, y_target)

torch.manual_seed(1)
train_set, val_set = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(
    dataset=train_set,
    batch_size=10,
    shuffle=True,
)

val_loader = DataLoader(
    dataset=val_set,
    batch_size=10,
    shuffle=False,
)

### Implementing Logistic Regression Model

In [14]:
class LogisticRegressionModel(torch.nn.Module):

    def __init__(self, num_features):

        super().__init__()
        self._linear = torch.nn.Linear(num_features, 1)


    def forward(self, X):
        logits = self._linear(X)
        probas = torch.sigmoid(logits)
        return probas


### Model evaluation

In [15]:
def compute_accuracy(model, dataloader):

    model = model.eval()

    correct = 0.0
    total_examples = 0

    for idx, (features, class_labels) in enumerate(dataloader):

        with torch.no_grad():
            probas = model(features)

        pred = torch.where(probas > 0.5, 1, 0)
        lab = class_labels.view(pred.shape).to(pred.dtype)

        compare = lab == pred
        correct += torch.sum(compare)
        total_examples += len(compare)

    return correct / total_examples

In [36]:
import torch.nn.functional as F

num_epochs_stop = 30
learning_rate_stop = 3

def find_optimal_hyperparameters():

    torch.manual_seed(1)
    model = LogisticRegressionModel(num_features=4)

    stop_learning = False
    best_lr = 0
    best_num_epochs = 0
    for epochs_count in range(1, num_epochs_stop):
        if stop_learning:
            break
        num_epochs = epochs_count
        curr_lr = 0.1
        while curr_lr < learning_rate_stop:
            if stop_learning:
                break
            optimizer = torch.optim.SGD(model.parameters(), lr=curr_lr)
            for epoch in range(num_epochs):
                if stop_learning:
                    break
                model = model.train()
                for batch_idx, (features, class_labels) in enumerate(train_loader):

                    probas = model(features)

                    loss = F.binary_cross_entropy(probas, class_labels.view(probas.shape))

                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    ### LOGGING
                    if not batch_idx % 20: # log every 20th batch
                        print(f'Epoch: {epoch+1:03d}/{num_epochs:03d}'
                               f' | Batch {batch_idx:03d}/{len(train_loader):03d}'
                               f' | Loss: {loss:.2f}')
                    train_acc = compute_accuracy(model, train_loader)
                    val_acc = compute_accuracy(model, val_loader)
                    if train_acc * 100 > 98 and val_acc * 100 > 98:
                        best_lr = curr_lr
                        best_num_epochs = epochs_count
                        break

            curr_lr += 0.1

        return {"best_lr": best_lr, "best_num_epochs": best_num_epochs, "model": model}



In [37]:
res = find_optimal_hyperparameters()

Epoch: 001/001 | Batch 000/110 | Loss: 1.30
Epoch: 001/001 | Batch 020/110 | Loss: 0.17
Epoch: 001/001 | Batch 040/110 | Loss: 0.29
Epoch: 001/001 | Batch 060/110 | Loss: 0.07
Epoch: 001/001 | Batch 080/110 | Loss: 0.04
Epoch: 001/001 | Batch 100/110 | Loss: 0.05
Epoch: 001/001 | Batch 000/110 | Loss: 0.13
Epoch: 001/001 | Batch 020/110 | Loss: 0.23
Epoch: 001/001 | Batch 040/110 | Loss: 0.04
Epoch: 001/001 | Batch 060/110 | Loss: 0.04
Epoch: 001/001 | Batch 000/110 | Loss: 0.04
Epoch: 001/001 | Batch 000/110 | Loss: 0.02
Epoch: 001/001 | Batch 000/110 | Loss: 0.11
Epoch: 001/001 | Batch 000/110 | Loss: 0.01
Epoch: 001/001 | Batch 000/110 | Loss: 0.07
Epoch: 001/001 | Batch 000/110 | Loss: 0.01
Epoch: 001/001 | Batch 020/110 | Loss: 0.01
Epoch: 001/001 | Batch 000/110 | Loss: 0.08
Epoch: 001/001 | Batch 000/110 | Loss: 0.03
Epoch: 001/001 | Batch 000/110 | Loss: 0.03
Epoch: 001/001 | Batch 000/110 | Loss: 0.02
Epoch: 001/001 | Batch 000/110 | Loss: 0.02
Epoch: 001/001 | Batch 000/110 |

In [38]:
print(res)

{'best_lr': 2.9000000000000012, 'best_num_epochs': 1, 'model': LogisticRegressionModel(
  (_linear): Linear(in_features=4, out_features=1, bias=True)
)}


In [39]:
val_acc = compute_accuracy(res["model"], val_loader)
print(f"Accuracy: {val_acc * 100:.2f}%")

Accuracy: 98.91%
