# Librerie e Funzioni

In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch import optim, nn
from torch.nn import functional
import os
import time
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
from torchvision.utils import make_grid
from random import randint
from PIL import Image
import random

from einops import rearrange
from einops.layers.torch import Rearrange

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from vit_pytorch.ats_vit import ViT as ATSViT
from vit_pytorch import ViT

from MultiViT import MultiViT
from TopK import ViT_topk
from PatchMergerViT import PatchMergerViT


from tqdm import tqdm

import torch
from torch import nn

from einops import rearrange, repeat
from einops.layers.torch import Rearrange

from thop import profile

import torch
import time

## Benchmark

In [2]:
def benchmark_time(
    model,
    device,
    input_size,
    batch_size,
    runs,
    throw_out,
    verbose,
):
    """
    Measure the inference throughput of a model on random inputs.

    The model is switched to eval mode, moved to ``device`` and called
    ``runs`` times on one random batch under ``torch.autocast`` and
    ``torch.no_grad``. When the loop reaches iteration
    ``int(runs * throw_out)`` the image counter and the clock are reset, so
    the iterations before it act as warm-up and are not timed.

    Args:
     - model: the module to benchmark
     - device: the device to use for benchmarking (``torch.device`` or anything
       accepted by ``torch.device``)
     - input_size: the input size to pass to the model (channels, h, w)
     - batch_size: the batch size to use for evaluation
     - runs: the number of total runs to do
     - throw_out: the fraction of runs (0-1) to throw out at the start of testing
     - verbose: whether or not to use tqdm to print progress / print throughput at end

    Returns:
     - a one-element list holding the throughput in images / second,
       formatted as a string with two decimals
    """
    if not isinstance(device, torch.device):
        device = torch.device(device)
    is_cuda = device.type == "cuda"

    model = model.eval().to(device)
    print(device)
    batch = torch.rand(batch_size, *input_size, device=device)

    warm_up = int(runs * throw_out)
    # Only wrap the loop in tqdm when progress is wanted, so that nothing but
    # the forward pass sits inside the timed loop otherwise.
    iterations = tqdm(range(runs), desc="Benchmarking") if verbose else range(runs)

    total = 0
    # perf_counter is monotonic and high resolution, unlike wall-clock time.time.
    start = time.perf_counter()

    with torch.autocast(device.type), torch.no_grad():
        for i in iterations:
            if i == warm_up:
                # Warm-up is over: drain pending GPU work, then restart the
                # counter and the clock.
                if is_cuda:
                    torch.cuda.synchronize()
                total = 0
                start = time.perf_counter()

            model(batch)
            total += batch_size

    # CUDA kernels run asynchronously; wait for them before stopping the clock.
    if is_cuda:
        torch.cuda.synchronize()

    elapsed = time.perf_counter() - start
    throughput = total / elapsed

    # Print while the value is still a float: applying ":.2f" to the
    # already-formatted string raises ValueError.
    if verbose:
        print(f"Throughput: {throughput:.2f} im/s")

    return [f'{throughput:.2f}']





def benchmark_flops(
    model,
    device,
    input_size,
    batch_size,
    runs,
    verbose,
):
    """
    Count operations and parameters of the given model with ``thop.profile``.

    The model is switched to eval mode, moved to ``device`` and profiled
    ``runs`` times on one random batch under ``torch.autocast`` and
    ``torch.no_grad``. The operation counts of all runs are summed and divided
    by the total number of images processed, giving a per-image average.

    Args:
     - model: the module to benchmark
     - device: the device to use for benchmarking (``torch.device`` or anything
       accepted by ``torch.device``)
     - input_size: the input size to pass to the model (channels, h, w)
     - batch_size: the batch size to use for evaluation
     - runs: the number of total runs to do
     - verbose: whether or not to use tqdm to print progress / print the results at end

    Returns:
     - a tuple ``([flops], [params])``: ``flops`` is the per-image operation
       count divided by 1e9, formatted as a string with four decimals;
       ``params`` is the parameter count returned by the last profile call

    Raises:
     - ValueError: if ``runs`` or ``batch_size`` is smaller than 1
    """
    # With no runs (or an empty batch) there is nothing to average and
    # ``params`` would never be assigned.
    if runs < 1 or batch_size < 1:
        raise ValueError("runs and batch_size must both be at least 1")

    if not isinstance(device, torch.device):
        device = torch.device(device)

    model = model.eval().to(device)
    print(device)
    batch = torch.rand(batch_size, *input_size, device=device)

    total = 0
    total_flops = 0

    with torch.autocast(device.type), torch.no_grad():
        for _ in tqdm(range(runs), disable=not verbose, desc="Benchmarking"):
            # NOTE(review): thop's README names the first return value "macs";
            # confirm whether a factor of 2 is wanted before reporting FLOPs.
            flops, params = profile(model, inputs=(batch, ), verbose=False)
            total += batch_size
            total_flops += flops

    gflops_per_image = total_flops / total / 1e9

    # Print from the numeric values: dividing the formatted string by 1e9
    # raises TypeError.
    if verbose:
        print(f"Total FLOPs: {gflops_per_image:.4f} GFLOPs")
        print(f"Total Parameters: {params / 1e6:.4f} M")

    return [f'{gflops_per_image:.4f}'], [params]



def create_patch_list(total_patches, cut, depth=12, stage_len=3):
    """
    Build the per-layer patch budget for a staged token-reduction schedule.

    Layers are grouped in consecutive stages of ``stage_len`` layers. The first
    stage keeps ``total_patches``; at the start of every later stage the budget
    is multiplied by ``cut / 100`` and truncated to an integer.

    Args:
     - total_patches: number of patches kept in the first stage
     - cut: percentage of the previous stage's patches kept by the next stage
     - depth: number of layers, i.e. length of the returned list (default 12)
     - stage_len: number of consecutive layers sharing one budget (default 3)

    Returns:
     - a list of ``depth`` integers, one patch budget per layer

    Raises:
     - ValueError: if ``stage_len`` is smaller than 1
    """
    if stage_len < 1:
        raise ValueError("stage_len must be at least 1")

    kept = total_patches
    schedule = []
    for layer in range(depth):
        # Shrink the budget on the first layer of every stage except the first.
        if layer and layer % stage_len == 0:
            kept = int(kept * (cut / 100))
        schedule.append(kept)
    return schedule

In [3]:
def _benchmark_model(label, model, device, batch_size, input_size, runs_time, runs_flops):
    """Benchmark one model and return its results as a one-row DataFrame."""
    model.to(device)

    throughput = benchmark_time(
        model,
        device=device,
        verbose=False,
        runs=runs_time,
        batch_size=batch_size,
        input_size=input_size,
        throw_out=0.25,
    )

    # The FLOP count gets its own run budget (runs_flops, not runs_time).
    flops, params = benchmark_flops(
        model,
        device=device,
        verbose=False,
        runs=runs_flops,
        batch_size=batch_size,
        input_size=input_size,
    )

    return pd.DataFrame({'Modello': label, 'Throughput': throughput, 'Flops': flops, 'Params': params})


def get_results(batch_size, patch_size, runs_time, runs_flops, input_size, img_size, att_dim, depth, heads, mlp_dim, max_tokens_per_depth, patch_merge_layers, type, df, device):
    """
    Benchmark ViT, ATSViT, MultiViT, ViT_topk and PatchMergerViT and append the results to ``df``.

    Every model is built with the same image size, patch size, width, depth,
    head count, MLP width and 10 output classes, then measured with
    ``benchmark_time`` (25% of the runs discarded as warm-up) and
    ``benchmark_flops``.

    Args:
     - batch_size: batch size used for both benchmarks
     - patch_size: patch size passed to every model
     - runs_time: number of runs for the throughput benchmark
     - runs_flops: number of runs for the FLOP benchmark
     - input_size: input size passed to the benchmarks (channels, h, w)
     - img_size: image size passed to every model
     - att_dim: ``dim`` passed to every model
     - depth: number of layers passed to every model
     - heads: number of attention heads passed to every model
     - mlp_dim: MLP width passed to every model
     - max_tokens_per_depth: passed as ``max_tokens_per_depth`` to ATSViT and
       as ``n_patch`` to MultiViT and ViT_topk
     - patch_merge_layers: passed as ``patch_merge_layers`` to PatchMergerViT
     - type: suffix appended to each model name in the 'Modello' column
       (shadows the builtin; the name is kept for caller compatibility)
     - df: DataFrame the new rows are appended to
     - device: device the models are moved to and benchmarked on

    Returns:
     - a new DataFrame: the rows of ``df`` followed by one row per model, with
       columns 'Modello', 'Throughput', 'Flops' and 'Params'
    """
    shared_kwargs = dict(
        image_size=img_size,
        patch_size=patch_size,
        num_classes=10,
        dim=att_dim,
        depth=depth,
        heads=heads,
        mlp_dim=mlp_dim,
    )

    # (banner printed before the run, row label, model class, model-specific kwargs)
    candidates = [
        ('##### ViT ####', f'ViT_net_{type}', ViT, {}),
        ('##### ATSViT ####', f'ATS_{type}', ATSViT, {'max_tokens_per_depth': max_tokens_per_depth}),
        ('##### MultiViT ####', f'MultiViT_{type}', MultiViT, {'n_patch': max_tokens_per_depth}),
        ('##### Transformer_topk ####', f'Transformer_topk_{type}', ViT_topk, {'n_patch': max_tokens_per_depth}),
        ('##### PatchMerger ####', f'PatchMerger_{type}', PatchMergerViT, {'patch_merge_layers': patch_merge_layers}),
    ]

    rows = []
    for banner, label, model_cls, extra_kwargs in candidates:
        print(banner)
        # Build one model at a time so the previous one can be released
        # before the next is moved to the device.
        net = model_cls(**shared_kwargs, **extra_kwargs)
        rows.append(
            _benchmark_model(label, net, device, batch_size, input_size, runs_time, runs_flops)
        )

    return pd.concat([df, *rows], ignore_index=True)

# Creazione tabella

## GPU

In [62]:
# Use the first CUDA device when one is available, otherwise fall back to the
# CPU (assign torch.device('cpu') here to force a CPU run).
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Benchmark settings shared by every model size.
batch_size, patch_size = 64, 16
runs_time = runs_flops = 10
img_size = 160
input_size = (3, img_size, img_size)

# 'Base' model dimensions.
att_dim = 768
depth = heads = 12
mlp_dim = 4 * att_dim

# Per-layer token budget: start from every patch of the image and keep
# `cut` percent at each reduction.
cut = 50
total_patches = int((img_size / patch_size) ** 2)
max_tokens_per_depth = create_patch_list(total_patches, cut)

# Each merge after layer 2, 5 and 8 targets the budget of the following layer.
patch_merge_layers = [(layer, max_tokens_per_depth[layer + 1]) for layer in (2, 5, 8)]

type = 'Base'

# Empty results table that get_results extends.
df_gpu = pd.DataFrame(columns=['Modello', 'Throughput', 'Flops', 'Params'])

In [63]:
# Benchmark every model with the 'Base' configuration.
df_gpu = get_results(
    batch_size=batch_size,
    patch_size=patch_size,
    runs_time=runs_time,
    runs_flops=runs_flops,
    input_size=input_size,
    img_size=img_size,
    att_dim=att_dim,
    depth=depth,
    heads=heads,
    mlp_dim=mlp_dim,
    max_tokens_per_depth=max_tokens_per_depth,
    patch_merge_layers=patch_merge_layers,
    type=type,
    df=df_gpu,
    device=device,
)

##### ViT ####
cuda:0
cuda:0


  df = pd.concat([df, pd.DataFrame({'Modello': f'ViT_net_{type}', 'Throughput': baseline_throughput, 'Flops': baseline_flops, 'Params': baseline_params})], ignore_index=True)


##### ATSViT ####
cuda:0
cuda:0
##### MultiViT ####
cuda:0
cuda:0
##### Transformer_topk ####
cuda:0
cuda:0
##### PatchMerger ####
cuda:0
cuda:0


In [64]:
# 'Small' model dimensions; the MLP is four times as wide as the attention dim.
att_dim, depth, heads = 384, 12, 6
mlp_dim = 4 * att_dim

type = 'Small'

# Empty results table for the 'Small' runs.
df_small_gpu = pd.DataFrame(columns=['Modello', 'Throughput', 'Flops', 'Params'])

In [65]:
# Benchmark every model with the 'Small' configuration.
df_small_gpu = get_results(
    batch_size=batch_size,
    patch_size=patch_size,
    runs_time=runs_time,
    runs_flops=runs_flops,
    input_size=input_size,
    img_size=img_size,
    att_dim=att_dim,
    depth=depth,
    heads=heads,
    mlp_dim=mlp_dim,
    max_tokens_per_depth=max_tokens_per_depth,
    patch_merge_layers=patch_merge_layers,
    type=type,
    df=df_small_gpu,
    device=device,
)

##### ViT ####
cuda:0
cuda:0
##### ATSViT ####


  df = pd.concat([df, pd.DataFrame({'Modello': f'ViT_net_{type}', 'Throughput': baseline_throughput, 'Flops': baseline_flops, 'Params': baseline_params})], ignore_index=True)


cuda:0
cuda:0
##### MultiViT ####
cuda:0
cuda:0
##### Transformer_topk ####
cuda:0
cuda:0
##### PatchMerger ####
cuda:0
cuda:0


In [66]:
# 'Tiny' model dimensions; the MLP is four times as wide as the attention dim.
att_dim, depth, heads = 192, 12, 3
mlp_dim = 4 * att_dim

type = 'Tiny'

# Empty results table for the 'Tiny' runs.
df_tiny_gpu = pd.DataFrame(columns=['Modello', 'Throughput', 'Flops', 'Params'])

In [67]:
# Benchmark every model with the 'Tiny' configuration.
df_tiny_gpu = get_results(
    batch_size=batch_size,
    patch_size=patch_size,
    runs_time=runs_time,
    runs_flops=runs_flops,
    input_size=input_size,
    img_size=img_size,
    att_dim=att_dim,
    depth=depth,
    heads=heads,
    mlp_dim=mlp_dim,
    max_tokens_per_depth=max_tokens_per_depth,
    patch_merge_layers=patch_merge_layers,
    type=type,
    df=df_tiny_gpu,
    device=device,
)

##### ViT ####
cuda:0
cuda:0


  df = pd.concat([df, pd.DataFrame({'Modello': f'ViT_net_{type}', 'Throughput': baseline_throughput, 'Flops': baseline_flops, 'Params': baseline_params})], ignore_index=True)


##### ATSViT ####
cuda:0
cuda:0
##### MultiViT ####
cuda:0
cuda:0
##### Transformer_topk ####
cuda:0
cuda:0
##### PatchMerger ####
cuda:0
cuda:0


### Display

In [68]:
# Render the three result tables. Separate statements are used because a
# tuple of display() calls makes the notebook echo "(None, None, None)".
display(df_gpu)
display(df_small_gpu)
display(df_tiny_gpu)

Unnamed: 0,Modello,Throughput,Flops,Params
0,ViT_net_Base,800.46,8.6502,85629706.0
1,ATS_Base,1037.71,4.0943,85629706.0
2,MultiViT_Base,1408.75,4.3279,85626634.0
3,Transformer_topk_Base,1459.81,4.3285,85629706.0
4,PatchMerger_Base,1636.96,4.0357,85634314.0


Unnamed: 0,Modello,Throughput,Flops,Params
0,ViT_net_Small,2031.59,2.1806,21581962.0
1,ATS_Small,1618.46,1.038,21581962.0
2,MultiViT_Small,3319.77,1.0981,21579658.0
3,Transformer_topk_Small,3502.97,1.0986,21581962.0
4,PatchMerger_Small,3868.73,1.0254,21584266.0


Unnamed: 0,Modello,Throughput,Flops,Params
0,ViT_net_Tiny,4026.1,0.5543,5483338.0
1,ATS_Tiny,1855.31,0.2682,5483338.0
2,MultiViT_Tiny,6478.06,0.2826,5481418.0
3,Transformer_topk_Tiny,6711.6,0.283,5483338.0
4,PatchMerger_Tiny,7911.24,0.2647,5484490.0


(None, None, None)

# Prove a caso

In [11]:
import functools

def quad(lista):
    """Return the sum of the squares of the items in ``lista`` (0 when empty)."""
    return sum(elem**2 for elem in lista)

In [12]:
# Scratch check: mean of a 12-value schedule that decreases gradually from 100 to 30
# (presumably per-layer token percentages for a 12-layer model - TODO confirm).
sum([100, 90, 80, 70, 65, 60, 55, 50, 45, 40, 35, 30])/12

60.0

In [13]:
# Scratch check: sum of squares of the 12-value schedule decreasing gradually from 100 to 30.
quad([100, 90, 80, 70, 65, 60, 55, 50, 45, 40, 35, 30])

48500

In [14]:
# Scratch check: mean of a 12-value step schedule (six values of 100, then six of 20).
sum([100, 100, 100, 100, 100, 100, 20, 20, 20, 20, 20, 20])/12

60.0

In [15]:
# Scratch check: sum of squares of the step schedule (six values of 100, then six of 20).
quad([100, 100, 100, 100, 100, 100, 20, 20, 20, 20, 20, 20])

62400

In [16]:
# Scratch check: mean of 12 values decreasing from 60 to 5 in steps of 5.
sum([60, 55, 50, 45, 40, 35, 30, 25, 20, 15, 10, 5])/12

32.5

In [17]:
# Scratch check: sum of squares of 12 values decreasing from 60 to 5 in steps of 5.
quad([60, 55, 50, 45, 40, 35, 30, 25, 20, 15, 10, 5])

16250

In [18]:
# Scratch check: plain sum (not divided by 12, unlike the other sum cells) of three values of 100 followed by nine of 8.
sum([100, 100, 100, 8, 8, 8, 8, 8, 8, 8, 8, 8])

372

In [19]:
# Scratch check: sum of squares of three values of 100 followed by nine of 8.
quad([100, 100, 100, 8, 8, 8, 8, 8, 8, 8, 8, 8])

30576