# Initial setup (imports, settings)

In [1]:
!nvidia-smi

Sun Dec 10 18:36:42 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.98                 Driver Version: 535.98       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 2080 Ti     Off | 00000000:1B:00.0 Off |                  N/A |
| 30%   31C    P8               4W / 250W |      0MiB / 11264MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import torch
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from time import perf_counter
from optimum.onnxruntime import (
    AutoOptimizationConfig,
    ORTModelForSequenceClassification,
    ORTOptimizer
)

In [3]:
# Settings
CUTOFF_LENGTH = 256

# SENTIMENT <---> Class ID mappings
sentiments = ['positive', "negative"]
ID2SENT = { idx: sentiment for idx, sentiment in enumerate(sorted(sentiments)) }
SENT2ID = { sentiment: idx for idx, sentiment in enumerate(sorted(sentiments)) }
NUM_LABELS = len(sentiments)

print("=" * 50)
print("ID <----> Sentiment mappings:")
print("-" * 50)
print(f"ID to sentiment: {ID2SENT}")
print(f"Sentiment to ID: {SENT2ID}")
print("=" * 50)

LABEL_COLUMN = "labels"
TEXT_COLUMN = "text"

ID <----> Sentiment mappings:
--------------------------------------------------
ID to sentiment: {0: 'negative', 1: 'positive'}
Sentiment to ID: {'negative': 0, 'positive': 1}


# Optimizing the finetuned model

In [4]:
# Load finetuned model
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

base_model = "cardiffnlp/twitter-roberta-base-sentiment-latest"
finetuned_model_id = "00BER/imbd-roberta-base-sentiment-merged-latest"
finetuned_model = AutoModelForSequenceClassification.from_pretrained(
    finetuned_model_id, 
    num_labels=NUM_LABELS, 
    ignore_mismatched_sizes=True
).to(device)
tokenizer = AutoTokenizer.from_pretrained(base_model, add_prefix_space=True)
finetuned_model.eval();

## Conversion to ORT model + Optimization

In [5]:
# Convert to ORT model

ort_model = ORTModelForSequenceClassification.from_pretrained(finetuned_model_id, export=True).to(device)
optimized_model_save_path = "../models/imbd-roberta-base-sentiment-onxx-optimized-latest"
optimization_config = AutoOptimizationConfig.O3()
optimizer = ORTOptimizer.from_pretrained(ort_model)
optimizer.optimize(save_dir=optimized_model_save_path, optimization_config=optimization_config)

Framework not specified. Using pt to export to ONNX.
Using the export variant default. Available variants are:
    - default: The default ONNX variant.
Using framework PyTorch: 2.1.1+cu121
Overriding 1 configuration item(s)
	- use_cache -> False
2023-12-10 18:37:00.953359146 [E:onnxruntime:Default, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 3352038, index: 1, mask: {2, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.
2023-12-10 18:37:00.964354940 [E:onnxruntime:Default, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 3352039, index: 2, mask: {3, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.
2023-12-10 18:37:00.979467350 [E:onnxruntime:Default, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 3352040, index: 3, mask: {4, }, error code: 22 error msg: Invalid argument. Specify the number of threads e

PosixPath('../models/imbd-roberta-base-sentiment-onxx-optimized-latest')

## Preliminary checks

In [6]:
%%timeit

# Finetuned model inference
text=["What a lovely day!"]
inputs = tokenizer(text,return_tensors='pt').to(device)
outputs = finetuned_model(**inputs)

8.65 ms ± 388 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit

# Optimized model inference
text=["What a lovely day!"]
inputs = tokenizer(text,return_tensors='pt').to(device)
outputs = ort_model(**inputs)

2.26 ms ± 3.84 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Performance Evaluation

In [8]:
# Load optimized model
optimized_model = ORTModelForSequenceClassification.from_pretrained(optimized_model_save_path).to(device)

2023-12-10 18:37:17.164838324 [E:onnxruntime:Default, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 3352189, index: 0, mask: {1, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.
2023-12-10 18:37:17.186338419 [E:onnxruntime:Default, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 3352190, index: 1, mask: {2, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.
2023-12-10 18:37:17.201466107 [E:onnxruntime:Default, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 3352191, index: 2, mask: {3, }, error code: 22 error msg: Invalid argument. Specify the number of threads explicitly so the affinity is not set.
2023-12-10 18:37:17.216625744 [E:onnxruntime:Default, env.cc:254 ThreadMain] pthread_setaffinity_np failed for thread: 3352192, index: 3, mask: {4, }, error code: 22 error msg: Invalid argument. Specify the n

## Load and tokenize dataset

In [9]:
BATCH_SIZE = 32

dataset = load_dataset("csv", data_files="../data/imdb-dataset.csv", split="train[:12000]")
dataset = dataset.class_encode_column("sentiment")
dataset = dataset.align_labels_with_mapping(SENT2ID, "sentiment")
dataset = (dataset
    .rename_column("review", TEXT_COLUMN)
    .rename_column("sentiment", LABEL_COLUMN)
)

def tokenize_dataset(examples):
    return tokenizer(examples[TEXT_COLUMN], truncation=True, max_length=CUTOFF_LENGTH)

dataset = dataset.map(tokenize_dataset, batched=True, remove_columns=TEXT_COLUMN)
dataset.set_format("torch")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    collate_fn=data_collator
)

print("Total Samples: ", len(dataset))

Total Samples:  12000


In [10]:
def measure_latency(target_model):
    with torch.no_grad():
        latencies = []
        # warm up
        for _ in range(10):
            tokens = tokenizer(["It's such a lovely day today!"], return_tensors="pt")
            input_ids = tokens["input_ids"].to(device)
            attention_mask = tokens["attention_mask"].to(device)
            _ = target_model(input_ids=input_ids, attention_mask=attention_mask)
       
        # Timed run
        for i, batch in tqdm(enumerate(dataloader)):
            start_time = perf_counter()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            _ = target_model(input_ids=input_ids, attention_mask=attention_mask)
            latency = perf_counter() - start_time
            latencies.append(latency)

        # Compute run statistics
        time_avg_ms = 1000 * np.mean(latencies)
        time_std_ms = 1000 * np.std(latencies)
        return f"Average latency (ms) - {time_avg_ms:.2f} +\- {time_std_ms:.2f}"

In [12]:
print("\nStarting benchmark:")
print("=" * 60)
print(f"Batch size: {BATCH_SIZE}")
print("-" * 60)
print(f"Vanilla model {measure_latency(finetuned_model)}")
print(f"Optimized model {measure_latency(optimized_model)}")


Starting benchmark:
Batch size: 32
------------------------------------------------------------


375it [00:59,  6.28it/s]


Vanilla model Average latency (ms) - 153.15 +\- 7.57


375it [00:56,  6.68it/s]

Optimized model Average latency (ms) - 143.49 +\- 0.90



