# Goal
Compare different aspects of the two frameworks in terms of:
* readability (e.g., number of lines, what patterns they follow)
* performance (e.g., loss, acc.)
* speed: data loading, training (forward+backward passes), inference

# Imports

In [None]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, metrics, losses, optimizers

In [None]:
import torch as th
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

In [None]:
import matplotlib.pyplot as plt
import numpy as np

Check if CUDA is installed properly for both frameworks

In [None]:
# PyTorch: prints whether a CUDA device is usable
print(th.cuda.is_available())
# TensorFlow: prints the list of physical GPU devices it detects
print(tf.config.list_physical_devices('GPU'))

# Hyperparameters

In [None]:
# Hyperparameters shared by the TensorFlow and the PyTorch experiments
BATCH_SIZE = 32
LR = 1e-3
NUM_EPOCHS = 10
SEED = 17
# Fall back to the CPU when PyTorch sees no CUDA device, so the `.to(device)`
# calls in later cells do not fail on machines without a GPU.
device = 'cuda' if th.cuda.is_available() else 'cpu'

# Convenience

In [None]:
# Registry holding, per framework ("th" = PyTorch, "tf" = TensorFlow), the
# datasets and models created by later cells together with their results.
# Only a CNN is analysed in this notebook.

config = {
    framework: {"datasets": {}, "models": {}}
    for framework in ("th", "tf")
}

In [None]:
# Seed the global random number generators of both frameworks with the same value
th.manual_seed(SEED)
tf.random.set_seed(SEED)

# Dataset

In [None]:
# Keras returns CIFAR-10 as NumPy arrays of uint8 pixel values in [0, 255]
(train_images, train_labels), (val_images, val_labels) = datasets.cifar10.load_data()
# Transforms can also be done with the TFX library or implemented as tf layers
# (which have the advantage of being part of the model, i.e. usable at inference too).
# Rescale to [0, 1] first and only then normalize with mean 0.5 / std 0.5, so the
# inputs end up in [-1, 1] exactly like the PyTorch ToTensor + Normalize pipeline.
# float32 keeps the dtype aligned with the model weights.
train_images = (train_images.astype("float32") / 255.0 - 0.5) / 0.5
val_images = (val_images.astype("float32") / 255.0 - 0.5) / 0.5

# from_tensor_slices splits the input data along the first dimension.
# A tf.data.Dataset represents a sequence of elements where an element is either
# a single data point (e.g., a feature-label pair) or a batch of data points.
train_dataset = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
# keep in mind that buffer_size=card(dataset) loads all the training data into memory
train_loader = train_dataset.shuffle(buffer_size=train_dataset.cardinality()).batch(BATCH_SIZE)

# Prepare the validation dataset (batched, not shuffled).
val_dataset = tf.data.Dataset.from_tensor_slices((val_images, val_labels))
val_loader = val_dataset.batch(BATCH_SIZE)

print(len(train_images), len(val_labels), train_dataset.element_spec)

# Register datasets and loaders so later cells can look them up through config
config["tf"]["datasets"]["cifar10"] = dict(train_dataset=train_dataset,
                                          val_dataset=val_dataset,
                                          train_loader=train_loader,
                                          val_loader=val_loader)

In [None]:
# Convert each image to a tensor and normalize every RGB channel with mean 0.5 / std 0.5
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# Training split: downloaded to ./data if missing, reshuffled by the loader
# (num_workers=0 for Windows)
train_dataset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform,
)
train_loader = th.utils.data.DataLoader(
    dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0,
)

# Validation split: same transform, fixed order
val_dataset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform,
)
val_loader = th.utils.data.DataLoader(
    dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0,
)

class_names = train_dataset.classes
print(len(train_dataset), len(val_dataset), train_dataset[0][0].shape, train_dataset[0][1])
print(class_names)

# Register datasets and loaders so later cells can look them up through config
config["th"]["datasets"]["cifar10"] = {
    "train_dataset": train_dataset,
    "val_dataset": val_dataset,
    "train_loader": train_loader,
    "val_loader": val_loader,
}

### Conclusions so far
* TF is channels-last while TH is channels-first
* Both dataset implementations use lazy loading 
* Both can parallelize loading and processing
* Both have sparse labels => SparseCategoricalCrossEntropy
* PyTorch has built-in transforms for data augmentation, while TensorFlow has to rely directly on Python or other libraries (e.g., TFX)

# Architectures

In [None]:
class TF_CNN(models.Model):
    """Small convolutional classifier that outputs 10 raw logits per image.

    Three 3x3 ReLU convolutions (16, 32 and 64 filters) with 2x2 max pooling
    after the first two, then a flatten, a 64-unit ReLU dense layer and a
    linear 10-unit output layer (no softmax).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = layers.Conv2D(filters=16, kernel_size=(3, 3), activation='relu')
        self.pool1 = layers.MaxPooling2D(pool_size=(2, 2))
        self.conv2 = layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu')
        self.pool2 = layers.MaxPooling2D(pool_size=(2, 2))
        self.conv3 = layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu')
        # yields 4 * 4 * 64 features for 32x32 inputs
        self.flatten = layers.Flatten()
        self.fc1 = layers.Dense(units=64, activation='relu')
        self.fc2 = layers.Dense(units=10)

    def call(self, x):
        """Apply the layers in order and return the logits."""
        pipeline = (
            self.conv1, self.pool1,
            self.conv2, self.pool2,
            self.conv3,
            self.flatten,
            self.fc1, self.fc2,
        )
        for layer in pipeline:
            x = layer(x)
        return x
# Instantiate the network and build it for channels-last batches of 32x32 images with 3 channels
model = TF_CNN()
model.build(input_shape=(BATCH_SIZE, 32, 32, 3))
model.summary()
# Register the model so later cells can look it up through config
config["tf"]["models"]["cnn"] = {"model": model}

In [None]:
class TH_CNN(nn.Module):
    """Small convolutional classifier that outputs 10 raw logits per image.

    Three 3x3 convolutions (3->16, 16->32, 32->64 channels) with ReLU, 2x2 max
    pooling after the first two, then a 64-unit ReLU linear layer and a linear
    10-unit output layer (no softmax). `fc1` is sized for channels-first
    3x32x32 inputs.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(16, 32, 3)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(32, 64, 3)
        # spatial size for a 32x32 input: 32 -> 30 -> 15 -> 13 -> 6 -> 4
        self.fc1 = nn.Linear(64*4*4, 64)
        self.fc2 = nn.Linear(64, 10)

    def forward(self, x):
        """Return the logits for a batch `x` of shape (N, 3, 32, 32)."""
        x = self.pool1(F.relu(self.conv1(x)))
        # use the second pooling layer here; it was previously defined but never called
        x = self.pool2(F.relu(self.conv2(x)))
        x = F.relu(self.conv3(x))
        # flatten everything except the batch dimension
        x = th.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
# Instantiate the PyTorch network and report how many parameters require gradients
model = TH_CNN()
print(f"Number of trainable paramters PT: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
# Register the model so later cells can look it up through config
config["th"]["models"]["cnn"] = {"model": model}

### Conclusions so far
* Both can use Sequential and Functional APIs
* TensorFlow layers infer the input feature dimensions, so developers only specify the output channels (filters/units)
* PyTorch layers require both the input and the output dimensions to be given explicitly
* PyTorch conv layers expect channels-first inputs while TensorFlow conv layers expect channels-last inputs

# Criterion, metrics & optimizers

In [None]:
# Loss on raw logits with integer labels, Adam optimizer and an accuracy metric
sce_tf = losses.SparseCategoricalCrossentropy(from_logits=True)
adam_tf = optimizers.Adam(learning_rate=LR)
accuracy_tf = metrics.Accuracy()

# Attach all three to the TensorFlow CNN entry in one go
config["tf"]["models"]["cnn"].update(
    criterions={"sce": sce_tf},
    optimizers={"adam": adam_tf},
    metrics={"acc": accuracy_tf},
)

In [None]:
# Cross-entropy loss and an Adam optimizer bound to the parameters of the registered PyTorch CNN
ce_th = nn.CrossEntropyLoss()
adam_th = optim.Adam(config["th"]["models"]["cnn"]["model"].parameters(), lr=LR)

def accuracy_th(logits, targs):
    """Return the fraction of rows whose arg-max over the last axis equals the target.

    Args:
        logits: tensor of scores with the classes along the last dimension.
        targs: tensor of integer class indices, one per row of `logits`.

    Returns:
        A Python float: number of matches divided by `len(targs)`.
    """
    hits = th.eq(logits.argmax(dim=-1), targs)
    return hits.sum().item() / len(targs)

# Attach loss, optimizer and metric to the PyTorch CNN entry in one go
config["th"]["models"]["cnn"].update(
    criterions={"ce": ce_th},
    optimizers={"adam": adam_th},
    metrics={"acc": accuracy_th},
)

### Conclusions so far
* CategoricalCrossEntropy expects targets to be one-hot encoded, from_logits=True means no Softmax in the last layer
* SparseCategoricalCrossEntropy expects targets to be integer indices, from_logits=True means no Softmax in the last layer
* CrossEntropyLoss always expects logits as predictions and targets as either integers or one hot encodings
* CrossEntropyLoss <=> LogSoftmax + NLLLoss
* PyTorch has no built-in metrics; rely on plain Python or an external library, e.g., torchmetrics or sklearn
* optimizer in Pytorch expects model parameters

# Train loops

### TensorFlow

In [None]:
# Drop the decorator below to see its performance impact
@tf.function
def train_step_tf(model, inps, targs, criterion, optimizer):
    """Run one optimization step on a single batch and return its loss.

    Args:
        model: callable Keras model, invoked with `training=True`.
        inps: batch of inputs.
        targs: batch of targets, passed to `criterion` as the first argument.
        criterion: loss callable taking `(targets, predictions)`.
        optimizer: optimizer exposing `apply_gradients`.

    Returns:
        The loss value computed for this batch.
    """
    # Everything executed inside the tape context is recorded so that
    # gradients can be derived from it afterwards.
    with tf.GradientTape() as tape:
        loss_value = criterion(targs, model(inps, training=True))

    # Read the weights only after the forward pass has run.
    weights = model.trainable_weights
    gradients = tape.gradient(loss_value, weights)
    optimizer.apply_gradients(zip(gradients, weights))
    return loss_value

def train_tf(model, train_loader, val_loader, criterion, optimizer):
    """Train `model` for NUM_EPOCHS epochs and print the per-epoch mean losses.

    After every training epoch the validation loader is evaluated with
    `training=False`. Both losses are averaged over the number of batches.

    Returns:
        The same `model` object after training.
    """
    for epoch in range(NUM_EPOCHS):
        # training pass: one optimizer step per batch
        train_loss = 0.0
        for batch_inps, batch_targs in train_loader:
            step_loss = train_step_tf(model, batch_inps, batch_targs, criterion, optimizer)
            train_loss += float(step_loss)

        # validation pass: forward only
        val_loss = 0.0
        for batch_inps, batch_targs in val_loader:
            batch_preds = model(batch_inps, training=False)
            val_loss += float(criterion(batch_targs, batch_preds))

        mean_train_loss = train_loss/len(train_loader)
        mean_val_loss = val_loss/len(val_loader)
        print(f"Epoch {epoch} \t Train loss {mean_train_loss} \t Val loss {mean_val_loss}")
    return model

def evaluate_tf(model, data_loader, metric, criterion):
    """Evaluate `model` on `data_loader` and return `(metric, loss)`.

    The metric state is accumulated over the whole loader and the loss is
    weighted by batch size. Both results are therefore averages over samples,
    not averages of per-batch values, which would over-weight a smaller
    final batch.

    Args:
        model: callable Keras model, invoked with `training=False`.
        data_loader: iterable of `(inputs, targets)` batches.
        metric: Keras metric; it is reset before and after the evaluation.
        criterion: loss callable taking `(targets, predictions)`; assumed to
            return the mean loss of the batch (the Keras default reduction).

    Returns:
        Tuple `(metric_res, criterion_res)` of Python floats.

    Raises:
        ZeroDivisionError: if `data_loader` yields no samples.
    """
    # start from a clean state in case the metric was used before
    metric.reset_state()
    total_loss, num_samples = 0.0, 0
    for inp, targ in data_loader:
        pred = model(inp, training=False)
        batch_size = int(targ.shape[0])

        # compare predicted class indices with the targets; the trailing axis
        # is added so predictions line up with targets carrying a last axis of size 1
        metric.update_state(targ, tf.argmax(pred, -1)[..., None])
        total_loss += float(criterion(targ, pred)) * batch_size
        num_samples += batch_size

    metric_res = float(metric.result().numpy())
    metric.reset_state()
    criterion_res = total_loss / num_samples
    return metric_res, criterion_res

In [None]:
# Train the TensorFlow CNN with the datasets, loss and optimizer registered in config
model = train_tf(model=config["tf"]["models"]["cnn"]["model"], 
        train_loader=config["tf"]["datasets"]["cifar10"]["train_loader"],
        val_loader=config["tf"]["datasets"]["cifar10"]["val_loader"],
        criterion=config["tf"]["models"]["cnn"]["criterions"]["sce"],
        optimizer=config["tf"]["models"]["cnn"]["optimizers"]["adam"])

# Final accuracy and loss on the validation loader; reused by the results table
acc_tf, loss_tf = evaluate_tf(model=model,
                  data_loader=config["tf"]["datasets"]["cifar10"]["val_loader"],
                  metric=config["tf"]["models"]["cnn"]["metrics"]["acc"],
                  criterion=config["tf"]["models"]["cnn"]["criterions"]["sce"])
print(acc_tf, loss_tf)

### PyTorch

In [None]:
def train_step_th(model, inps, targs, criterion, optimizer):
    """Run one optimization step on a single batch and return its loss tensor.

    Args:
        model: callable module producing predictions for `inps`.
        inps: batch of inputs.
        targs: batch of targets, passed to `criterion` as the second argument.
        criterion: loss callable taking `(predictions, targets)`.
        optimizer: optimizer exposing `zero_grad` and `step`.

    Returns:
        The loss tensor of this batch (call `.item()` for a Python number).
    """
    # gradients accumulate across backward() calls, so clear them first
    optimizer.zero_grad()

    loss = criterion(model(inps), targs)   # forward
    loss.backward()                        # backward
    optimizer.step()                       # parameter update
    return loss

def train_th(model, train_loader, val_loader, criterion, optimizer):
    """Train `model` on `device` for NUM_EPOCHS epochs and print the mean losses.

    Every epoch runs a training pass in train mode followed by a validation
    pass in eval mode without gradient tracking. Both losses are averaged over
    the number of batches.

    Returns:
        The same `model` object after training.
    """
    model.to(device)
    for epoch in range(NUM_EPOCHS):

        # training pass: one optimizer step per batch
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            inps = batch[0].to(device)
            targs = batch[1].to(device)
            train_loss += train_step_th(model, inps, targs, criterion, optimizer).item()

        # validation pass: forward only
        model.eval()
        val_loss = 0.0
        with th.no_grad():
            for val_batch in val_loader:
                val_inps = val_batch[0].to(device)
                val_targs = val_batch[1].to(device)
                val_loss += criterion(model(val_inps), val_targs).item()

        mean_train_loss = train_loss/len(train_loader)
        mean_val_loss = val_loss/len(val_loader)
        print(f"Epoch {epoch} \t Train loss {mean_train_loss} \t Val loss {mean_val_loss}")
    return model
    
def evaluate_th(model, data_loader, metric, criterion):
    """Evaluate `model` on `data_loader` and return `(metric, loss)`.

    Per-batch results are weighted by batch size, so both values are averages
    over samples rather than averages of batch means (which would over-weight
    a smaller final batch). The loader only needs to be iterable.

    Args:
        model: module to evaluate; it is moved to `device` and put in eval mode.
        data_loader: iterable of `(inputs, targets)` batches.
        metric: callable `(predictions, targets) -> number`, assumed to return
            the mean over the batch.
        criterion: loss callable `(predictions, targets) -> tensor`, assumed to
            return the mean loss of the batch.

    Returns:
        Tuple `(metric_res, criterion_res)`.

    Raises:
        ZeroDivisionError: if `data_loader` yields no samples.
    """
    # moving the model is loop-invariant, so do it once up front
    model.to(device)
    model.eval()
    metric_res, criterion_res, num_samples = 0.0, 0.0, 0
    with th.no_grad():
        for inps, targs in data_loader:
            inps = inps.to(device)
            targs = targs.to(device)
            preds = model(inps)

            batch_size = len(targs)
            metric_res += metric(preds, targs) * batch_size
            criterion_res += criterion(preds, targs).item() * batch_size
            num_samples += batch_size

    metric_res /= num_samples
    criterion_res /= num_samples
    return metric_res, criterion_res

In [None]:
# Train the PyTorch CNN with the datasets, loss and optimizer registered in config
model = train_th(model=config["th"]["models"]["cnn"]["model"], 
        train_loader=config["th"]["datasets"]["cifar10"]["train_loader"],
        val_loader=config["th"]["datasets"]["cifar10"]["val_loader"],
        criterion=config["th"]["models"]["cnn"]["criterions"]["ce"],
        optimizer=config["th"]["models"]["cnn"]["optimizers"]["adam"])
# Final accuracy and loss on the validation loader; reused by the results table
acc_th, loss_th = evaluate_th(model=model,
                  data_loader=config["th"]["datasets"]["cifar10"]["val_loader"],
                  metric=config["th"]["models"]["cnn"]["metrics"]["acc"],
                  criterion=config["th"]["models"]["cnn"]["criterions"]["ce"])
print(acc_th, loss_th)

# Let's compare
* parameters, loading speed, forward+backward pass, inference
* how fast each framework is on CUDA
* how fast each framework is on CUDA + CPU (including data transfer)
* PyTorch has direct access to CUDA events but TensorFlow uses a profiler, so the built-in tools do not allow a fair comparison => use CuPy events for both

In [None]:
import time
# use CuPy to measure GPU time
import cupy as cp

### Define more or less the same test data

In [None]:
# Random channels-first input batch and random integer targets drawn from [0, 10) for the PyTorch benchmarks
inps_th = th.randn((BATCH_SIZE, 3, 32, 32), dtype=th.float32)
targs_th = th.randint(0, 10, (BATCH_SIZE,))

In [None]:
# Random channels-last input batch and random integer targets drawn from [0, 10) for the TensorFlow benchmarks
inps_tf = tf.random.normal((BATCH_SIZE, 32, 32, 3), dtype=tf.float32)
targs_tf = tf.random.uniform(shape=(BATCH_SIZE,), minval=0, maxval=10, dtype=tf.int32)

### Train step time (CPU + GPU)

In [None]:
def measure_train_step_time_th(model, inps, targs, criterion, optimizer, n=10000):
    """Average the duration of one PyTorch training step over `n` runs.

    Every run includes moving the model and the batch to `device`, because
    TensorFlow handles that migration implicitly inside its own step.

    Args:
        model, inps, targs, criterion, optimizer: forwarded to `train_step_th`.
        n: number of timed repetitions.

    Returns:
        Tuple `(avg_cpu_time_ms, avg_gpu_time_ms)`. The first value is the
        host-side `time.perf_counter` duration, the second the time between
        the two CuPy CUDA events. Both are in milliseconds.
    """
    avg_cpu_time, avg_gpu_time = 0.0, 0.0
    start_gpu = cp.cuda.Event()
    end_gpu = cp.cuda.Event()

    for _ in range(n):
        start_gpu.record()
        start_cpu = time.perf_counter()

        # As TensorFlow handles the data migration by itself we have to include this for a fair comparison
        model.to(device)
        inps = inps.to(device)
        targs = targs.to(device)

        train_step_th(model, inps, targs, criterion, optimizer)

        # The host timestamp is taken before the GPU event is synchronized,
        # so it may not include GPU work that is still in flight.
        end_cpu = time.perf_counter()
        end_gpu.record()
        end_gpu.synchronize() # GPU is running async

        avg_cpu_time += end_cpu-start_cpu
        avg_gpu_time += cp.cuda.get_elapsed_time(start_gpu, end_gpu)

    avg_gpu_time /= n
    avg_cpu_time /= n
    # perf_counter() counts seconds while the CUDA events report milliseconds:
    # convert seconds -> milliseconds so both values share one unit.
    return avg_cpu_time * 1000, avg_gpu_time

In [None]:
def measure_train_step_time_tf(model, inps, targs, criterion, optimizer, n=10000):
    """Average the duration of one TensorFlow training step over `n` runs.

    Args:
        model, inps, targs, criterion, optimizer: forwarded to `train_step_tf`.
        n: number of timed repetitions.

    Returns:
        Tuple `(avg_cpu_time_ms, avg_gpu_time_ms)`. The first value is the
        host-side `time.perf_counter` duration, the second the time between
        the two CuPy CUDA events. Both are in milliseconds.
    """
    avg_cpu_time, avg_gpu_time = 0.0, 0.0
    start_gpu = cp.cuda.Event()
    end_gpu = cp.cuda.Event()

    for _ in range(n):
        start_gpu.record()
        start_cpu = time.perf_counter()

        train_step_tf(model, inps, targs, criterion, optimizer)

        # The host timestamp is taken before the GPU event is synchronized,
        # so it may not include GPU work that is still in flight.
        end_cpu = time.perf_counter()
        end_gpu.record()
        end_gpu.synchronize() # GPU is running async

        avg_cpu_time += end_cpu-start_cpu
        avg_gpu_time += cp.cuda.get_elapsed_time(start_gpu, end_gpu)

    avg_gpu_time /= n
    avg_cpu_time /= n
    # perf_counter() counts seconds while the CUDA events report milliseconds:
    # convert seconds -> milliseconds so both values share one unit.
    return avg_cpu_time * 1000, avg_gpu_time

In [None]:
# Benchmark one PyTorch training step on the random test batch
cpu_time_th, gpu_time_th = measure_train_step_time_th(model=config["th"]["models"]["cnn"]["model"], 
                                 inps=inps_th, 
                                 targs=targs_th,
                                 criterion=config["th"]["models"]["cnn"]["criterions"]["ce"],
                                 optimizer=config["th"]["models"]["cnn"]["optimizers"]["adam"])

# Benchmark one TensorFlow training step on the random test batch
cpu_time_tf, gpu_time_tf = measure_train_step_time_tf(model=config["tf"]["models"]["cnn"]["model"], 
                                 inps=inps_tf, 
                                 targs=targs_tf,
                                 criterion=config["tf"]["models"]["cnn"]["criterions"]["sce"],
                                 optimizer=config["tf"]["models"]["cnn"]["optimizers"]["adam"])

In [None]:
# Average train-step timings: PyTorch on the first line, TensorFlow on the second
print(cpu_time_th, gpu_time_th)
print(cpu_time_tf, gpu_time_tf)

### Batch Loading Time

In [None]:
def get_batch_loading_speed(loader):
    """Return the average wall-clock time, in seconds, needed to load one batch.

    The loader is iterated once from start to end and the batches are counted
    on the way, so any iterable works (it does not need to support `len`).

    Args:
        loader: iterable yielding batches.

    Returns:
        Elapsed `time.perf_counter` seconds divided by the number of batches,
        or 0.0 when the loader yields nothing.
    """
    num_batches = 0
    start_cpu = time.perf_counter()
    for _ in loader:
        num_batches += 1
    elapsed = time.perf_counter() - start_cpu

    # an empty loader has no per-batch time; avoid dividing by zero
    if num_batches == 0:
        return 0.0
    return elapsed / num_batches

In [None]:
# Average per-batch loading time of the validation loader of each framework
batch_time_th = get_batch_loading_speed(config["th"]["datasets"]["cifar10"]["val_loader"])
batch_time_tf = get_batch_loading_speed(config["tf"]["datasets"]["cifar10"]["val_loader"])

### Model Inference Time

In [None]:
def get_inference_time_th(model, inp, n=10000):
    """Average the GPU time of a gradient-free PyTorch forward pass over `n` runs.

    The model (in eval mode) and the input are moved to `device` once, before
    timing starts. Each forward pass is bracketed by two CuPy CUDA events.

    Returns:
        Sum of the elapsed times reported by CuPy divided by `n`.
    """
    model.eval()
    model.to(device)
    inp = inp.to(device)

    start_gpu, end_gpu = cp.cuda.Event(), cp.cuda.Event()
    total_elapsed = 0.0
    for _ in range(n):
        start_gpu.record()
        with th.no_grad():
            model(inp)
        end_gpu.record()
        # wait for the stop event, since GPU work runs asynchronously
        end_gpu.synchronize()
        total_elapsed += cp.cuda.get_elapsed_time(start_gpu, end_gpu)
    return total_elapsed/n

In [None]:
@tf.function
def run_model(model, inp):
    """Call `model` on `inp` inside a tf.function and return its output."""
    output = model(inp)
    return output

def get_inference_time_tf(model, inp, n=10000):
    """Average the GPU time of a TensorFlow forward pass over `n` runs.

    Each `run_model` call is bracketed by two CuPy CUDA events.

    Returns:
        Sum of the elapsed times reported by CuPy divided by `n`.
    """
    start_gpu, end_gpu = cp.cuda.Event(), cp.cuda.Event()
    total_elapsed = 0.0
    for _ in range(n):
        start_gpu.record()
        run_model(model, inp)
        end_gpu.record()
        # wait for the stop event, since GPU work runs asynchronously
        end_gpu.synchronize()
        total_elapsed += cp.cuda.get_elapsed_time(start_gpu, end_gpu)
    return total_elapsed/n

In [None]:
# Average forward-pass time of both models on the random test batches
inf_gpu_time_th = get_inference_time_th(model=config["th"]["models"]["cnn"]["model"], inp = inps_th)
inf_gpu_time_tf = get_inference_time_tf(model=config["tf"]["models"]["cnn"]["model"], inp=inps_tf)
print(inf_gpu_time_th, inf_gpu_time_tf)

# Final Results
Please run all the above cells to populate the following pandas table

In [None]:
import pandas as pd

# One row per framework, one column per measurement collected in the cells above.
# get_batch_loading_speed returns seconds per batch, so the values are converted
# to milliseconds here to match the "[ms]" column label.
cnn_data = {'Train step time CPU [ms]': [cpu_time_th, cpu_time_tf],
            'Train step time GPU [ms]': [gpu_time_th, gpu_time_tf],
            'Inference time GPU [ms]': [inf_gpu_time_th, inf_gpu_time_tf],
            'Batch loading speed [ms]': [batch_time_th * 1000, batch_time_tf * 1000],
            'CE Loss': [loss_th, loss_tf],
            'Val Accuracy': [acc_th, acc_tf]}
df = pd.DataFrame(cnn_data, index=['PyTorch', 'TensorFlow'])
df

### Exercise: Compare both frameworks for RNNs