
# TinyBERT for Real-Time NER on Mobile Devices/Edge Computing

## Introduction
TinyBERT is a distilled, compact version of BERT, designed to achieve high accuracy with much smaller size and faster inference. 
In this notebook, we implement TinyBERT for Named Entity Recognition (NER), focusing on real-time inference on edge devices (like mobile phones or IoT devices).



## Setup
We will use the Hugging Face `transformers` library and `torch` for model inference, and `matplotlib` to plot inference times. Later in the notebook, PyTorch dynamic quantization is applied as an optional optimization to speed up inference.

Install required packages:

```bash
pip install transformers torch matplotlib
```


In [None]:

import json
import torch
import matplotlib.pyplot as plt
import logging
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from torch.quantization import quantize_dynamic
from time import time
from typing import List, Dict

# Configure logging: set the logging threshold to INFO and create the
# module-level logger that the helper functions in later cells write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [None]:

def load_tinybert_model(model_name: str = "adel-cybral/TinyBERT-finetuned-NER"):
    """Load a token-classification model and its tokenizer by checkpoint name.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(model, tokenizer)`` tuple on success, or ``(None, None)`` if
        loading failed. Failures are logged rather than raised, so callers
        must check for ``None``.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForTokenClassification.from_pretrained(model_name)
    except Exception as e:
        # Keep the traceback in the log: the message alone rarely shows
        # where inside the loading code the failure originated.
        logger.error("Error loading model: %s", e, exc_info=True)
        return None, None
    logger.info("Model loaded successfully!")
    return model, tokenizer

# Load the checkpoint once; later cells reuse `model` and `tokenizer`.
model_name = "adel-cybral/TinyBERT-finetuned-NER"
model, tokenizer = load_tinybert_model(model_name)
if model is None or tokenizer is None:
    # The loader reports failure as (None, None). Stop here with a specific
    # exception type; RuntimeError is still caught by `except Exception`.
    raise RuntimeError("Model or tokenizer could not be loaded!")


In [None]:

# Baseline NER pipeline built from the model and tokenizer loaded earlier.
# NOTE(review): aggregation_strategy="simple" presumably merges word pieces
# into whole-entity groups — confirm against the transformers pipeline docs.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


In [None]:

# Sample sentences used as the inference workload; the same list is passed to
# both the original and the quantized pipeline so their timings are comparable.
texts = [
    "John Doe works at Acme Corp in San Francisco.",
    "Alice Johnson lives in New York City and works at Microsoft.",
    "Google was founded by Larry Page and Sergey Brin while they were students at Stanford University.",
    "Elon Musk is the CEO of SpaceX and Tesla.",
    "Dr. Strange is a fictional character in the Marvel Universe."
]


In [None]:

def perform_inference(pipeline, texts: List[str]) -> Dict[str, Dict]:
    """Run ``pipeline`` on each text and record its output and latency.

    Args:
        pipeline: Callable invoked once per text with the text as its only
            argument. (The name shadows the imported ``pipeline`` factory
            inside this function; it is kept for keyword-call compatibility.)
        texts: Input sentences, processed one at a time in order.

    Returns:
        A dict keyed ``"Sentence_1"``, ``"Sentence_2"``, ... (1-based, in input
        order). Each value is a dict with ``"entities"`` (whatever the
        pipeline returned for that text) and ``"time_taken"`` (elapsed
        seconds as a float). An empty ``texts`` yields an empty dict.
    """
    # perf_counter is monotonic and high-resolution; wall-clock time() can
    # report 0.0 or even negative durations for short calls.
    from time import perf_counter

    inference_results = {}
    for idx, text in enumerate(texts, start=1):
        start_time = perf_counter()
        entities = pipeline(text)
        elapsed = perf_counter() - start_time
        inference_results[f"Sentence_{idx}"] = {
            "entities": entities,
            "time_taken": elapsed,
        }
    return inference_results

# Baseline run with the unquantized pipeline; the printing, plotting, saving
# and comparison cells below all read from this result.
inference_results = perform_inference(ner_pipeline, texts)


In [None]:

# Print each sentence's latency followed by its detected entities as JSON.
for sentence, result in inference_results.items():
    print(f"{sentence} - Inference Time: {result['time_taken']:.4f} seconds")
    # Entity scores from the transformers pipeline are typically NumPy scalars,
    # which json cannot encode; .item() converts them to plain Python numbers.
    print(json.dumps(result['entities'], indent=2, default=lambda value: value.item()))
    # Blank separator between sentences (the escape had been expanded into a
    # literal line break, which left the string unterminated).
    print("\n")


In [None]:

# Per-sentence latency of the baseline run, in the order the sentences were
# processed. Reused later by the comparison plot and the performance summary.
inference_times = [entry['time_taken'] for entry in inference_results.values()]

# One bar per sentence, positioned by its 0-based index in `texts`.
plt.figure(figsize=(10, 6))
plt.bar(
    range(len(texts)),
    inference_times,
    color='blue',
)
plt.title("Inference Time per Sentence (Before Optimization)")
plt.xlabel("Sentence Index")
plt.ylabel("Inference Time (seconds)")
plt.show()


In [None]:

def quantize_model(model):
    """Apply dynamic int8 quantization to the ``torch.nn.Linear`` layers of ``model``.

    Args:
        model: The PyTorch module to quantize.

    Returns:
        The module returned by ``quantize_dynamic``, or ``None`` if
        quantization raised. Failures are logged rather than raised, so
        callers must check for ``None``.
    """
    try:
        quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
    except Exception as e:
        # Keep the traceback in the log; quantization errors usually originate
        # deep inside torch and the message alone is hard to act on.
        logger.error("Error during model quantization: %s", e, exc_info=True)
        return None
    logger.info("Model quantization successful!")
    return quantized_model

# Quantize the already-loaded model; quantize_model returns None on failure.
model_quantized = quantize_model(model)
if model_quantized is None:
    # Stop with a specific exception type; RuntimeError is still caught by
    # any `except Exception` handler.
    raise RuntimeError("Quantization failed!")

# Same pipeline configuration as the baseline so the two runs are comparable.
ner_pipeline_quant = pipeline("ner", model=model_quantized, tokenizer=tokenizer, aggregation_strategy="simple")


In [None]:

# Repeat the timing run on the same sentences with the quantized pipeline.
inference_results_quant = perform_inference(ner_pipeline_quant, texts)


In [None]:

# Per-sentence latency of the quantized run, in processing order.
inference_times_quant = [result['time_taken'] for result in inference_results_quant.values()]

# Draw the two series side by side: bars that share one x position overlap
# and partly hide each other, which makes the comparison hard to read.
bar_width = 0.4
positions = range(len(texts))

plt.figure(figsize=(12, 6))
plt.bar([x - bar_width / 2 for x in positions], inference_times, width=bar_width, color='blue', alpha=0.6, label='Original Model')
plt.bar([x + bar_width / 2 for x in positions], inference_times_quant, width=bar_width, color='green', alpha=0.6, label='Quantized Model')
# Keep one tick per sentence, centred between its pair of bars.
plt.xticks(list(positions))
plt.xlabel("Sentence Index")
plt.ylabel("Inference Time (seconds)")
plt.title("Comparison of Inference Time (Original vs Quantized Model)")
plt.legend()
plt.show()


In [None]:

def save_results_to_file(results: Dict, filename: str = "ner_results.json"):
    """Write ``results`` to ``filename`` as indented JSON (best effort).

    Args:
        results: Data to serialize. Values that json cannot encode natively
            but that expose an ``item()`` method (NumPy or torch scalars, as
            found in pipeline scores) are converted through that method.
        filename: Destination path; an existing file is overwritten.

    Returns:
        None. Any failure is logged and swallowed so a notebook run can
        continue.
    """
    def _to_builtin(value):
        """Convert a scalar wrapper into a plain Python number for json."""
        convert = getattr(value, "item", None)
        if callable(convert):
            return convert()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    try:
        # Serialize before opening the file so an encoding failure cannot
        # leave a truncated, invalid JSON file behind.
        payload = json.dumps(results, indent=2, default=_to_builtin)
        with open(filename, "w", encoding="utf-8") as file:
            file.write(payload)
    except Exception as e:
        logger.error("Error saving results: %s", e, exc_info=True)
    else:
        logger.info("Results saved to %s.", filename)

# Persist both runs: the baseline goes to the default "ner_results.json",
# the quantized run to its own file.
save_results_to_file(inference_results)
save_results_to_file(inference_results_quant, "ner_results_quant.json")


In [None]:

def compare_performance(original_times: List[float], quantized_times: List[float]) -> None:
    """Print a per-sentence comparison of original and quantized latency.

    Args:
        original_times: Baseline latency in seconds, one entry per sentence.
        quantized_times: Quantized-model latency in seconds, in the same order.

    The two lists are paired positionally; if their lengths differ, the extra
    entries of the longer list are ignored. Improvement is the relative
    reduction versus the original time, as a percentage. It is reported as
    ``n/a`` when the original time is not positive, because the ratio is
    undefined there (a coarse timer can report 0.0 for a very short call).
    """
    for number, (orig, quant) in enumerate(zip(original_times, quantized_times), start=1):
        if orig > 0:
            improvement = ((orig - quant) / orig) * 100
            improvement_text = f"{improvement:.2f}%"
        else:
            improvement_text = "n/a"
        print(f"Sentence {number}: Original Time = {orig:.4f}s, Quantized Time = {quant:.4f}s, Improvement = {improvement_text}")

# Summarize the speed-up (or slow-down) of the quantized model per sentence.
compare_performance(inference_times, inference_times_quant)
