# A deeper dive into loading data

In [None]:
import pandas as pd
import numpy as np


# Read the animal table from disk; the split below relies on column position only
animals = pd.read_csv('animals.csv')

# Inputs are every column except the first and the last;
# the last column is the ground-truth target
features, target = animals.iloc[:, 1:-1], animals.iloc[:, -1]

# Convert both selections to NumPy arrays and show the inputs
X, y = features.to_numpy(), target.to_numpy()
print(X)

In [None]:
import torch
from torch.utils.data import TensorDataset

# Convert the feature and target arrays to float tensors and pair them up
inputs_tensor = torch.tensor(X).float()
labels_tensor = torch.tensor(y).float()
dataset = TensorDataset(inputs_tensor, labels_tensor)

# Indexing the dataset yields one (input, label) pair
sample = dataset[0]
input_sample, label_sample = sample[0], sample[1]
print('input sample:', input_sample)
print('label_sample:', label_sample)

In [None]:
from torch.utils.data import DataLoader

# Loader settings
batch_size = 2
shuffle = True

# Wrap the dataset so it is served in shuffled mini-batches
dataloader = DataLoader(dataset, shuffle=shuffle, batch_size=batch_size)

# Walk through one pass over the data, batch by batch
for batch in dataloader:
    batch_inputs, batch_labels = batch
    print('batch inputs', batch_inputs)
    print('batch labels', batch_labels)

In [None]:
"""

Create a TensorDataset using the torch_features and the torch_target tensors provided (in this order).
Return the last element of the dataset.

"""

import numpy as np
import torch
from torch.utils.data import TensorDataset

# Random stand-in data: 12 samples with 8 features and 1 target value each
np_features = np.array(np.random.rand(12, 8))
np_target = np.array(np.random.rand(12, 1))

torch_features = torch.tensor(np_features)
torch_target = torch.tensor(np_target)

# Cast both tensors to float first, then pair features with targets
float_features = torch_features.float()
float_target = torch_target.float()
dataset = TensorDataset(float_features, float_target)

# Show the final (features, target) pair of the dataset
last_index = len(dataset) - 1
print(dataset[last_index])

(tensor([0.8883, 0.4074, 0.3171, 0.4329, 0.2112, 0.7095, 0.4987, 0.1853]), tensor([0.2213]))


In [None]:
"""

Extract the features (ph, Sulfate, Conductivity, Organic_carbon) and target (Potability) values and load them into tensors to represent features and targets.
Use both tensors to generate a PyTorch dataset using the tensor dataset class.

"""

# Column selection: four measurement columns as inputs, Potability as the label
feature_columns = ['ph', 'Sulfate', 'Conductivity', 'Organic_carbon']
feature_array = dataframe[feature_columns].to_numpy()
target_array = dataframe['Potability'].to_numpy()

# Turn both arrays into float tensors
features = torch.tensor(feature_array).float()
target = torch.tensor(target_array).float()

# Pair inputs with labels in a single dataset
dataset = TensorDataset(features, target)

# Evaluating model performance

**Calculating training loss**

In [None]:
# Skeleton of per-epoch training-loss tracking; the `...` lines stand for steps elided here
training_loss = 0.0  # running sum of the per-batch losses
for i, data in enumerate(trainloader, 0):
    # Run the forward pass (elided; presumably defines `outputs` and `labels` — TODO confirm)
    ...
    # Calculate the loss
    loss = criterion(outputs, labels)
    # Calculate the gradients (elided)
    ...
    # Add this batch's loss to the running total
    training_loss += loss.item()  # .item() returns the Python number contained in the loss tensor
# Mean loss over the epoch; len(trainloader) is presumably the number of batches
epoch_loss = training_loss / len(trainloader)

**Calculating validation loss**

In [None]:
# Skeleton of per-epoch validation-loss tracking; the `...` line stands for the elided forward pass
validation_loss = 0.0
model.eval() # Evaluation mode: some layers in PyTorch models behave differently at training versus validation time
with torch.no_grad(): # Gradients are not needed for validation; disabling tracking speeds up the forward pass
    for i, data in enumerate(validationloader, 0):
        # Run the forward pass (elided; presumably defines `outputs` and `labels` — TODO confirm)
        ...
        # Calculate the loss
        loss = criterion(outputs, labels)
        # Add this batch's loss (as a Python number) to the running total
        validation_loss += loss.item()
# Mean loss over the epoch; len(validationloader) is presumably the number of batches
epoch_loss = validation_loss / len(validationloader)
model.train() # Back to training mode at the end of the validation epoch, so another training epoch can run

**Calculating accuracy with torchmetrics**

In [None]:
"""

In addition to loss, we also want to keep track of other metrics to evaluate how good our model is at predicting correct answers.
To do so, a new package called torchmetrics will be used. If we are performing classification, we can use torchmetrics to create an accuracy metric.

On each iteration of dataloader, we call this metric using model outputs and ground truth labels.
The accuracy metric takes probabilities and single number labels as inputs. The output variable here would be the probabilities returned by the softmax function.
If the labels contain one-hot encoded classes, we'll need the argmax function to obtain numbers instead of one-hot vectors.

At the end of the epoch, we calculate total accuracy using the metric's .compute() method. Finally, we use .reset() to reset the metric for the next epoch.
Accuracy is calculated in the same way for training and validation.

"""

import torchmetrics
# Multiclass accuracy tracker for a three-class problem
metric = torchmetrics.Accuracy(task="multiclass", num_classes=3)

for features, labels in dataloader:
    outputs = model(features)
    # Labels are assumed to be one-hot encoded, so reduce them to class indices
    class_indices = labels.argmax(dim=-1)
    # Calling the metric updates its state and returns this batch's accuracy
    acc = metric(outputs, class_indices)

# Aggregate over every batch the metric has seen this epoch
acc = metric.compute()
print(f"Accuracy on all data: {acc}")

# Clear the accumulated state before the next epoch (training or validation)
metric.reset()

In [None]:
"""

In this exercise, you will practice writing the evaluation loop. Recall that the evaluation loop is similar to the training loop, except that you will not perform the gradient calculation and the optimizer step.

The model has already been defined for you, along with the object validationloader, which is a dataset.

"""




"""

Set the model to evaluation mode.
Sum the current batch loss to the validation_loss variable
Calculate the mean loss value for the epoch.
Set the model back to training mode.
"""

# Switch to evaluation mode before measuring the validation loss
model.eval()
validation_loss = 0.0

# No gradients are needed while evaluating
with torch.no_grad():
    for batch in validationloader:
        # batch[0] is fed to the model, batch[1] is the target for the loss
        predictions = model(batch[0])
        batch_loss = criterion(predictions, batch[1])

        # Accumulate the scalar loss of this batch
        validation_loss += batch_loss.item()

# Average the accumulated loss over the length of the loader
validation_loss_epoch = validation_loss / len(validationloader)
print(validation_loss_epoch)

# Restore training mode so training can continue afterwards
model.train()


In [None]:
"""

In this exercise, you will practice using the torchmetrics package to calculate the accuracy. You will be using a sample of the facemask dataset.
This dataset contains three different classes. The plot_errors function will display samples where the model predictions do not match the ground truth.

The torchmetrics package is already imported. The model outputs are the probabilities returned by a softmax as the last step of the model.
The labels tensor contains the labels as one-hot encoded vectors.

"""


"""

Create an accuracy metric for a "multiclass" problem with three classes.
Calculate the accuracy for each batch of the dataloader

Calculate accuracy for the epoch.
Reset the metric for the next epoch.

"""


# Multiclass accuracy tracker for a three-class problem
metric = torchmetrics.Accuracy(task="multiclass", num_classes=3)

for features, labels in dataloader:
    outputs = model(features)

    # Labels are assumed to be one-hot encoded, so reduce them to class indices
    class_indices = labels.argmax(dim=-1)
    # Calling the metric updates its state and returns this batch's accuracy
    acc = metric(outputs, class_indices)

# Aggregate over every batch the metric has seen this epoch
acc = metric.compute()

# Clear the accumulated state before the next epoch
metric.reset()

# Display the samples where predictions and ground truth disagree
plot_errors(model, dataloader)

# Fighting overfitting

In [None]:
"""

To counter overfitting, we can reduce the model size or add a new type of layer called dropout. We can also use weight decay to force the parameters to remain small.
We can get more data or use data augmentation.
"""

**"Regularization" using a dropout layer**

In [None]:
"""

Dropout is a "regularization" technique where randomly, a fraction of input neurons is set to zero at each update, effectively "dropping" them out.
Corresponding connections are temporarily removed from the network, making the network less likely to overly rely on specific features.

Dropout can be added to models as shown. The p argument indicates the probability of setting a neuron to zero.
Here, we set 50 percent of the output tensor's neurons to zero. Usually, dropout layers are added after activation functions.
Dropout layers behave differently between training and evaluation and we must not forget to switch the model mode

"""

from torch import nn  # keeps the cell runnable on its own; harmless if already imported

# Small network: linear layer (8 -> 4), ReLU activation, then dropout.
# Dropout(p=0.5) zeroes each activation with probability 0.5 while the
# model is in training mode.
model = nn.Sequential(
    nn.Linear(8, 4),
    nn.ReLU(),
    nn.Dropout(p=0.5),
)

# One random sample with 8 features, matching the first layer's input size
features = torch.randn((1, 8))

# Forward pass on the sample tensor defined just above
model(features)

**Regularization with weight decay**

In [None]:
"""

In PyTorch, weight decay can be added to the optimizer as shown. It is controlled by the weight_decay parameter,
which should range between zero and one but is typically very small. When the optimizer's weight_decay parameter is set,
it adds an additional term to the parameter update step that encourages smaller weights.

This regularization term is proportional to the current value of the weight; it is added to the gradient, so each update step shrinks the weight slightly toward zero.
The weight decay term effectively penalizes large weights and helps prevent overfitting.
The higher we set this parameter, the less likely our model is to overfit, so the model can generalize better to new data.

"""

# Plain SGD over all model parameters, with weight decay enabled
optimizer = optim.SGD(
    model.parameters(),
    lr=0.001,
    weight_decay=0.0001,
)

**Data Augmentation**

In [None]:
"""

Getting more data can be costly. However, researchers have found a way to artificially increase the size and diversity of their dataset by using data augmentation.
Data augmentation is commonly applied to image data, which can be rotated and scaled, so that different views of the same face become available as "new" data points.

"""

# Improving model performance

In [None]:
"""

Randomly sample a learning rate factor between 2 and 4 so that the learning rate (lr) is bounded between
1e-4 and 1e-2 (that is, 10^-4 and 10^-2).

Randomly sample a momentum between 0.85 and 0.99.

"""

# Ten random (learning rate, momentum) candidates.
# The exponent is drawn uniformly between 2 and 4, so lr = 10 ** -exponent
# lies between 1e-4 and 1e-2; momentum is drawn uniformly between 0.85 and 0.99.
# Within each tuple the learning-rate draw happens before the momentum draw.
values = [
    (10 ** -np.random.uniform(2, 4), np.random.uniform(.85, .99))
    for _ in range(10)
]