# Benchmarking of CPU vs GPU for a Multi Layer Perceptron on a Small Dataset

In [37]:
#uncomment if necessary
# %pip install torch sklearn

### create the Multi Layer Perceptron model  
- 3 layers   
- dynamic input layer  
- 32 nodes in 1st hidden layer  
- 16 nodes in 2nd hidden layer  
- 2 classes for output layer   
- ReLU for activation

In [38]:
from torch import nn 


class MultiLayerPerceptron(nn.Module): 
    
    def __init__(self, input_dim):
        super().__init__()

        self.fc1 = nn.Linear(in_features=input_dim, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=16)
        self.fc3 = nn.Linear(in_features=16, out_features=2)

        self.activation = nn.ReLU()

    


    def forward(self, x): 
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x)) 
        logits = self.fc3(x)
        return logits


### we will be using the `heart disease UCI` dataset for this benchmarking, I already prepared it, we will clone the following repo to get the `.joblib` ready data from it: 

In [39]:
! git clone https://github.com/zakariaaithssain/deep-learning-labs.git

fatal: destination path 'deep-learning-labs' already exists and is not an empty directory.


In [40]:
DATA_PATH = "/content/deep-learning-labs/data/ready/ready.joblib"

### get data from the `joblib` file   
the joblib file contains the following `json`-like structure:   
```python3 
{
    "train": {
            "X": X_train, 
            "y": y_train
            }, 
    "test": {
            "X": X_test, 
            "y": y_test
        }
}
```

In [41]:
import torch
import joblib


try:
    data = joblib.load(DATA_PATH)
except FileNotFoundError: 
        print(f"{DATA_PATH} not found.")
        exit(1)

training_data = data["train"]
X_train = training_data["X"]
y_train = training_data["y"]

#convert to tensors
X_train = torch.tensor(X_train, dtype=torch.float32) #float is required for gradient calcs
y_train = torch.tensor(y_train, dtype=torch.long) #long type is issential for cross entropy

### define `train` function with benchmarking using `time.perf_counter`

In [42]:
import time

def train(model, dataloader, num_epochs:int, lr:float, device): 
        #move model to device
        model.to(device, non_blocking=True)
        #Adam with L2 regularization, regularization strength 1e-4
        decay, no_decay = [], []
        for _, param in model.named_parameters():
            if param.ndim == 1:  #bias
                no_decay.append(param)
            else: #weights
                decay.append(param)

        optimizer = torch.optim.AdamW(
            [
                {"params": decay, "weight_decay": 1e-4},
                {"params": no_decay, "weight_decay": 0.0},
            ],
            lr=lr
        )

        criterion = torch.nn.CrossEntropyLoss()
        torch.cuda.synchronize()
        start = time.perf_counter()
        for epoch in range(num_epochs): 
                model.train()
                train_loss = 0.0
                n_batches = 0
                for X_batch, y_batch in dataloader: 
                        # to calculate the mean of losses per batch (train_loss/n_batches)
                        n_batches+=1
                        #tensors should be at the same device as the model
                        X_batch = X_batch.to(device, non_blocking=True)
                        y_batch = y_batch.to(device, non_blocking=True)
                        #forward pass
                        logits = model(X_batch)
                        batch_loss = criterion(logits, y_batch)

                        train_loss+= batch_loss.item()

                        #backward pass
                        optimizer.zero_grad()
                        batch_loss.backward()

                        #update params 
                        optimizer.step()
                train_loss/= n_batches
                
        
        torch.cuda.synchronize()
        end = time.perf_counter()
        return end - start


#### we use the following formula to calculate the graphical acceleration: 
**SPEED UP = CPU TIME / GPU TIME**

In [43]:
num_features = X_train.size(dim=1)
model = MultiLayerPerceptron(input_dim=num_features)

dataset = torch.utils.data.TensorDataset(X_train, y_train)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=10, shuffle=True,
                                            pin_memory=True, num_workers=2) #for GPU training, not need for CPU

num_epochs = 100
lr = 1e-3


if torch.cuda.is_available(): 
    print("GPU training started.")
    gpu_time = train(model, dataloader, num_epochs, lr, torch.device("cuda"))
    print("GPU time (s): ", gpu_time)

    print("CPU training started.")
    cpu_time = train(model, dataloader, num_epochs, lr, torch.device("cpu"))
    print("CPU time (s): ", cpu_time)
    speed_up = cpu_time/gpu_time 
    print("SPEED UP: ", speed_up)
else: 
    print("gpu not available for benchmarking")

GPU training started.
GPU time (s):  27.890532811999947
CPU training started.
CPU time (s):  27.748056419999784
SPEED UP:  0.9948915858667691


### CONCLUSION:  
for this 3 layers MLP model with less than 900 training data points, combining: 
 - GPU usage
 - parallel data preprocessing in CPU (num_workers > 0 in DataLoader)
 - memory pinning (pin_memory is True in DataLoader)
 - non blocking CPU scheduling (non_blocking is True when transfering to device)

 makes training **slower** than using CPU as the device **(SPEED UP < 1)**, especially if the batch size is small.   
 this is due to the transfer time from host to GPU being the bottleneck, and also the GPU requires warming up time before reaching its full potential. 

 **Note:** GPU only gives acceleration for heavy tasks, the heaviest the better. using GPU for liteweight tasks (e.g small datasets, small models) causes latency instead of acceleration. 