### Performance Benchmarks for Mediations Classification Pipeline

This notebook implements a benchmark class to test pipeline performance across four different pipeline permutations:
1. DistilBERT base uncased Feature Extractor > Logistic Regression Classifier
2. DistilBERT base uncased Feature Extractor (Quantized) > Logistic Regression Classifier
3. Longformer Feature Extractor >  Logistic Regression Classifier
4. Longformer Feature Extractor (Quantized) >  Logistic Regression Classifier

Each of these pipelines is tested against three metrics:
1. Size
2. Latency
3. F1 score

The aim is to minimize latency and maximize F1 score with a slight preference for latency.


In [7]:
import tempfile
from pathlib import Path
from time import perf_counter

import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset
from datasets import load_metric
from joblib import load, dump
from torch.quantization import quantize_dynamic
from tqdm.notebook import tqdm
from transformers import pipeline, AutoTokenizer, AutoModel

In [2]:
# Set device: use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Load the train/test CSV splits; each file is named "<base_path>-<split>.csv".
base_path = '../data/mediation_search_results'
data_files = {split: f"{base_path}-{split}.csv" for split in ('train', 'test')}
mediations = load_dataset('csv', data_files=data_files)

Using custom data configuration default-31792a7577eeeaac
Reusing dataset csv (/home/evan/.cache/huggingface/datasets/csv/default-31792a7577eeeaac/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# class CustomPipeline:
#     def __init__(self, model_name, path_to_clf, device=torch.device("cpu")):
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
#         self.model = AutoModel.from_pretrained(model_name)
#         self.device = device
#         self.clf = load(path_to_clf)
        
#     def __call__(self, batch, batch_size=8):
#         # Tokenize batch
#         batch_encoded = batch.map(self.tokenize, batched=True, batch_size=None)

#         # Convert to torch and extract features
#         batch_encoded.set_format("torch",
#                               columns=["input_ids", "attention_mask", "label"])
#         batch_hidden = batch_encoded.map(self.extract_hidden_states, batched=True, batch_size=batch_size)
        
#         # Get predicts from trained classifier
#         X = np.array(batch_hidden['train']["hidden_state"])
#         return self.clf.predict(batch_hidden)
        
#     def tokenize(self, batch):
#         return self.tokenizer(batch["text"], padding=True, truncation=True)
    
#     def extract_hidden_states(self, batch):
#         # Place model inputs on the GPU
#         inputs = {k:v.to(device) for k,v in batch.items()
#                 if k in self.tokenizer.model_input_names}
#         # Extract last hidden states
#         with torch.no_grad():
#             last_hidden_state = self.model(**inputs).last_hidden_state
#         # Return vector for [CLS] token
#         return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}
    
    
class PerformanceBenchmark:
    """Benchmark a feature-extraction pipeline paired with a fitted classifier.

    Args:
        pipeline: Callable feature extractor, invoked as
            ``pipeline(query, **tokenizer_kwargs)``.
        classifier: Object exposing ``predict``; it receives a ``(1, n_features)``
            array built from the pipeline output.
        tokenizer_kwargs: Keyword arguments forwarded to every pipeline call.
        dataset: Examples made available to the benchmark methods.
        optim_type: Label under which this run's metrics are reported.
    """

    def __init__(self, pipeline, classifier, tokenizer_kwargs, dataset, optim_type="BERT Baseline"):
        self.pipeline = pipeline
        self.tokenizer_kwargs = tokenizer_kwargs
        self.classifier = classifier
        self.dataset = dataset
        self.optim_type = optim_type

    def query_custom_pipline(self, query):
        """Run one query through the feature extractor and classify it.

        Args:
            query: Input passed straight to ``self.pipeline``.

        Returns:
            The first element of ``self.classifier.predict(...)``.
        """
        # Compute hidden states from the feature extractor (first output only)
        hidden_states = self.pipeline(query, **self.tokenizer_kwargs)[0]

        # Keep the first token's hidden state (the [CLS] position for BERT-style
        # tokenizers) and reshape it to a single-row feature matrix
        first_token_state = np.array(hidden_states)[0].reshape(1, -1)

        # Predict label
        return self.classifier.predict(first_token_state)[0]

    def compute_f1(self):
        """Placeholder; expected to be replaced by an implementation returning a dict."""
        # We'll define this later
        pass

    def compute_size(self):
        """Placeholder; expected to be replaced by an implementation returning a dict."""
        # We'll define this later
        pass

    def time_pipeline(self):
        """Placeholder; expected to be replaced by an implementation returning a dict."""
        # We'll define this later
        pass

    def run_benchmark(self, size=True, time=True, f1=True):
        """Run the selected benchmarks and collect their metrics.

        Args:
            size: Include the result of ``compute_size()``.
            time: Include the result of ``time_pipeline()``.
            f1: Include the result of ``compute_f1()``.

        Returns:
            dict: ``{self.optim_type: merged_metrics}`` where ``merged_metrics``
            is the union of the dicts returned by the selected benchmarks.

        Raises:
            AssertionError: If all three flags are false.
        """
        assert (size or time or f1), "One benchmark flag must be set to True."
        # Start from an empty dict so any combination of flags works; previously
        # the entry only existed when ``size`` was requested.
        run_metrics = {}
        if size:
            # ``or {}`` tolerates benchmark methods that return None (the placeholders)
            run_metrics.update(self.compute_size() or {})
        if time:
            run_metrics.update(self.time_pipeline() or {})
        if f1:
            run_metrics.update(self.compute_f1() or {})
        return {self.optim_type: run_metrics}

In [5]:
# Define benchmark metrics
# The "f1" metric object is loaded once here; compute_f1() below calls f1_score.compute(...).
# NOTE(review): load_metric is reportedly deprecated in newer `datasets` releases in favour of
# the separate `evaluate` package — confirm against the installed version.
f1_score = load_metric("f1")

def compute_f1(self):
    """Replacement for the PerformanceBenchmark.compute_f1() placeholder.

    Classifies the "text" of every example in ``self.dataset`` one at a time,
    pairs each prediction with the example's "label", and scores the pairs with
    the module-level ``f1_score`` metric.

    Returns:
        The dict produced by ``f1_score.compute`` (its "f1" entry is printed).
    """
    print("------ Benchmarking F1 Score ------")
    # Prediction first, then label, for each example in dataset order
    scored = [
        (self.query_custom_pipline(row["text"]), row["label"])
        for row in tqdm(self.dataset)
    ]
    predictions = [pred for pred, _ in scored]
    references = [gold for _, gold in scored]
    result = f1_score.compute(predictions=predictions, references=references)
    print(f"F1 Score on test set - {result['f1']:.3f}")
    return result

def compute_size(self):
    """Replacement for the PerformanceBenchmark.compute_size() placeholder.

    Serialises the feature extractor's state dict with ``torch.save`` and the
    classifier with ``dump`` into a throwaway directory, then reports the
    on-disk size of each file in megabytes (1 MB = 1024 * 1024 bytes).

    Returns:
        dict: ``{"size_model_mb": float, "size_clf_mb": float}``.
    """
    print("------ Benchmarking Size ------")
    bytes_per_mb = 1024 * 1024

    # Work inside a private temporary directory: nothing in the current working
    # directory can be overwritten, and both files are removed on exit even if
    # serialisation raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Feature extractor state dict (preferred way of storing a nn model)
        model_path = Path(tmp_dir) / "model.pt"
        torch.save(self.pipeline.model.state_dict(), model_path)
        size_model_mb = model_path.stat().st_size / bytes_per_mb

        # Fitted classifier
        clf_path = Path(tmp_dir) / "clf.joblib"
        with clf_path.open('wb') as f:
            dump(self.classifier, f)
        size_clf_mb = clf_path.stat().st_size / bytes_per_mb

    # Print and store
    print(f"Feature Extraction Model size (MB) - {size_model_mb:.2f}")
    print(f"Classifier size (MB) - {size_clf_mb:.2f}")
    return {"size_model_mb": size_model_mb, "size_clf_mb": size_clf_mb}


def time_pipeline(self, n_warmup=10, n_runs=100):
    """Replacement for the PerformanceBenchmark.time_pipeline() placeholder.

    Measures the wall-clock latency of ``self.query_custom_pipline`` on
    randomly chosen examples from ``self.dataset``.

    Args:
        n_warmup: Number of untimed queries issued before measuring.
        n_runs: Number of timed queries; must be at least 1.

    Returns:
        dict: ``{"time_avg_ms": mean latency, "time_std_ms": standard deviation}``,
        both in milliseconds.

    Raises:
        ValueError: If the dataset is empty or ``n_runs`` is less than 1.
    """
    n_examples = len(self.dataset)
    if n_examples == 0:
        raise ValueError("Cannot time the pipeline on an empty dataset.")
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1.")

    def _random_query():
        # Pick a single row by index (numpy's global RNG, so np.random.seed applies)
        # instead of reshuffling the dataset and reading its whole "text" column.
        return self.dataset[int(np.random.randint(n_examples))]["text"]

    print("------ Benchmarking Time ------")

    # Warmup
    print("\tStarting warmup")
    for _ in tqdm(range(n_warmup)):
        self.query_custom_pipline(_random_query())

    # Timed run: only the pipeline call sits between the two clock reads
    print("\tStarting run")
    latencies = []
    for _ in tqdm(range(n_runs)):
        q = _random_query()
        start_time = perf_counter()
        self.query_custom_pipline(q)
        latencies.append(perf_counter() - start_time)

    # Compute run statistics
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    # "+/-" replaces the former "+\-", which relied on an invalid escape sequence
    print(f"Average latency (ms) - {time_avg_ms:.2f} +/- {time_std_ms:.2f}")
    return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}


# Override methods: rebind the three placeholder methods on PerformanceBenchmark to the
# module-level functions of the same names, so instances use these implementations.
PerformanceBenchmark.compute_f1 = compute_f1
PerformanceBenchmark.time_pipeline = time_pipeline
PerformanceBenchmark.compute_size = compute_size

In [8]:
# Define models
classifier_end = 'mediations-logistic-classifier'
classifiers = [f"distilbert-base-uncased-{classifier_end}", f"longformer-base-4096-{classifier_end}"]
models = ["distilbert-base-uncased", "allenai/longformer-base-4096"]

# Specify constant tokenizer kwargs
tokenizer_kwargs = {'padding': True, 'truncation': True}

# results[classifier_name] maps each optim_type ("Baseline", "quantized") to its metrics
results = {}

for model_name, classifier_name in zip(models, classifiers):
    # Load classifier
    clf = load(f"../models/{classifier_name}.joblib")

    # Instantiate tokenizer, and pipeline
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Per-model copy so the shared defaults are not mutated from one model to the next
    model_tokenizer_kwargs = {
        **tokenizer_kwargs,
        'max_length': tokenizer.init_kwargs['model_max_length'],
    }
    pipe = pipeline('feature-extraction', model=model_name, tokenizer=tokenizer)

    # Collect both variants under one entry; assigning the entry twice would
    # discard the baseline metrics when the quantized run finishes
    results[classifier_name] = {}

    # Benchmark Baseline
    # NOTE(review): F1 is skipped for the baseline (f1=False), so only the quantized
    # variant reports an F1 score — confirm this is intentional.
    pb = PerformanceBenchmark(
        pipeline=pipe,
        classifier=clf,
        tokenizer_kwargs=model_tokenizer_kwargs,
        dataset=mediations["test"],
        optim_type="Baseline"
    )
    results[classifier_name].update(pb.run_benchmark(f1=False))

    # Build the quantized variant: dynamic int8 quantization of the nn.Linear layers on CPU
    model = AutoModel.from_pretrained(model_name).to("cpu")
    model_quantized = quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)
    pipe = pipeline('feature-extraction', model=model_quantized, tokenizer=tokenizer)

    # Benchmark quantized
    pb = PerformanceBenchmark(
        pipeline=pipe,
        classifier=clf,
        tokenizer_kwargs=model_tokenizer_kwargs,
        dataset=mediations["test"],
        optim_type="quantized"
    )
    results[classifier_name].update(pb.run_benchmark())

results
    

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


------ Benchmarking Size ------
Feature Extraction Model size (MB) - 253.19
Classifier size (MB) - 0.01
------ Benchmarking Time ------
	Starting warmup


  0%|          | 0/10 [00:00<?, ?it/s]

	Starting run


  0%|          | 0/100 [00:00<?, ?it/s]

Average latency (ms) - 323.32 +\- 109.23


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


------ Benchmarking Size ------
Feature Extraction Model size (MB) - 131.71
Classifier size (MB) - 0.01
------ Benchmarking Time ------
	Starting warmup


  0%|          | 0/10 [00:00<?, ?it/s]

	Starting run


  0%|          | 0/100 [00:00<?, ?it/s]

Average latency (ms) - 216.84 +\- 88.17
------ Benchmarking F1 Score ------


  0%|          | 0/31181 [00:00<?, ?it/s]

F1 Score on test set - 0.027


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


------ Benchmarking Size ------
Feature Extraction Model size (MB) - 567.22
Classifier size (MB) - 0.01
------ Benchmarking Time ------
	Starting warmup


  0%|          | 0/10 [00:00<?, ?it/s]

	Starting run


  0%|          | 0/100 [00:00<?, ?it/s]

Average latency (ms) - 2093.27 +\- 1620.77


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


------ Benchmarking Size ------
Feature Extraction Model size (MB) - 261.86
Classifier size (MB) - 0.01
------ Benchmarking Time ------
	Starting warmup


  0%|          | 0/10 [00:00<?, ?it/s]

	Starting run


  0%|          | 0/100 [00:00<?, ?it/s]

Average latency (ms) - 1805.46 +\- 1444.78
------ Benchmarking F1 Score ------


  0%|          | 0/31181 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [23]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer_kwargs['max_length'] = tokenizer.init_kwargs['model_max_length']

# Keep using GPU index 1 when a second GPU exists; otherwise fall back to GPU 0,
# or to the CPU (-1), so the cell also runs on hosts without two GPUs.
if torch.cuda.device_count() > 1:
    pipe_device = 1
elif torch.cuda.is_available():
    pipe_device = 0
else:
    pipe_device = -1
pipe = pipeline('feature-extraction', model="distilbert-base-uncased", tokenizer=tokenizer, device=pipe_device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
# Slice the first 8 rows of the test split and pass their "text" values to the
# pipeline in a single call, using the shared tokenizer kwargs.
batch = mediations["test"][:8]
hidden_states = pipe(batch["text"], **tokenizer_kwargs)

In [None]:
    # NOTE(review): this cell repeats PerformanceBenchmark.query_custom_pipline. It is
    # indented like a method but has no enclosing class in this cell and shows no
    # execution count, so it looks like leftover scratch — confirm whether it can be removed.
    def query_custom_pipline(self, query):
        """Run one query through the feature extractor and return the classifier's label."""
        # Compute hidden states from feature extract
        hidden_states = self.pipeline(query, **self.tokenizer_kwargs)[0]
        
        # Fetch [CLS] token hidden state (last hidden state)
        last_hidden_state = np.array(hidden_states)[0].reshape(1, -1)
        
        # Predict label
        return self.classifier.predict(last_hidden_state)[0]