## Imports

In [32]:
import time

import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from transformers import (BertTokenizerFast,TFBertTokenizer,BertTokenizer,RobertaTokenizerFast,
                          DataCollatorWithPadding,TFRobertaForSequenceClassification,TFBertForSequenceClassification,
                          TFBertModel,create_optimizer)
from datasets import load_dataset
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense,Flatten,InputLayer,BatchNormalization,Dropout,Input,LayerNormalization
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy

In [3]:
# Number of examples per batch; passed as `batch_size` to every `to_tf_dataset` call below.
BATCH_SIZE = 4

## Data Preparation for BERT

In [4]:
# Load the IMDB dataset through `datasets.load_dataset` using its dataset id.
dataset_id = "imdb"
dataset = load_dataset(dataset_id)

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
# Display the splits, their columns and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [6]:
# Peek at the first raw training example (a dict with 'text' and 'label').
dataset['train'][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [10]:
# Load the fast tokenizer that belongs to the `bert-base-uncased` checkpoint.
# NOTE: `model_id` and `tokenizer` are re-bound by later sections of this notebook.
model_id="bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
# Check whether the loaded tokenizer reports itself as a "fast" tokenizer.
tokenizer.is_fast

True

In [12]:
# Two sample sentences used to illustrate what the tokenizer does.
test_input_1='The Weather of Today is Gréat! zwp'
test_input_2='How are you doing?'
inputs=[test_input_1,test_input_2]

# Split the samples into sub-word tokens (see the lower-casing and `##` continuation pieces in the output).
tokenizer.tokenize(inputs,)

['the',
 'weather',
 'of',
 'today',
 'is',
 'great',
 '!',
 'z',
 '##w',
 '##p',
 'how',
 'are',
 'you',
 'doing',
 '?']

In [13]:
# Encode both sentences as one batch: pad to the longest sentence, truncate at 128 tokens.
output = tokenizer(inputs, padding=True, truncation=True, max_length=128)
print(output)

{'input_ids': [[101, 1996, 4633, 1997, 2651, 2003, 2307, 999, 1062, 2860, 2361, 102], [101, 2129, 2024, 2017, 2725, 1029, 102, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]}


In [15]:
# Map the first sentence's ids back to text to see the special tokens that were added.
tokenizer.decode(output['input_ids'][0])

'[CLS] the weather of today is great! zwp [SEP]'

In [17]:
# Same for the shorter sentence: the trailing [PAD] tokens come from `padding=True`.
tokenizer.decode(output['input_ids'][1])

'[CLS] how are you doing? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [20]:
def preprocess_function(example):
    """Tokenize the ``"text"`` field of ``example`` with the notebook-level ``tokenizer``.

    Args:
        example: Mapping with a ``"text"`` entry (a single string or a list of
            strings when used with ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer's encoding, produced with padding and truncation enabled.
    """
    texts = example["text"]
    encoding = tokenizer(texts, truncation=True, padding=True)
    return encoding

In [21]:
# Tokenize every split; `batched=True` hands `preprocess_function` many examples per call.
tokenized_data = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [22]:
# Display the splits again: the tokenizer columns have been added next to 'text' and 'label'.
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

In [23]:
# Inspect the first tokenized training example.
tokenized_data['train'][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [25]:
# Batched, shuffled tf.data view of the training split, restricted to the model inputs and the label.
tf_train_dataset = tokenized_data['train'].to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    shuffle=True,
    batch_size=BATCH_SIZE
)

In [26]:
# Batched tf.data view of the test split, used as validation data.
# Validation data is not shuffled: shuffling has no effect on the metrics and
# makes every pass over the dataset visit the examples in a different order,
# which prevents reproducible evaluation on a fixed subset.
tf_val_dataset = tokenized_data['test'].to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    shuffle=False,
    batch_size=BATCH_SIZE
)

In [27]:
def swap_position(dataset):
    """Split one dataset element into the ``(features, label)`` pair Keras expects.

    Args:
        dataset: Mapping holding at least ``'input_ids'``, ``'token_type_ids'``,
            ``'attention_mask'`` and ``'label'``.

    Returns:
        A tuple ``(features, label)`` where ``features`` is a dict with the three
        model-input entries and ``label`` is ``dataset['label']``.
    """
    feature_names = ('input_ids', 'token_type_ids', 'attention_mask')
    features = {name: dataset[name] for name in feature_names}
    return features, dataset['label']

In [28]:
# Reshape every batch into (features, label) and prefetch so input preparation overlaps with training.
train_dataset = tf_train_dataset.map(swap_position).prefetch(tf.data.AUTOTUNE)
val_dataset = tf_val_dataset.map(swap_position).prefetch(tf.data.AUTOTUNE)

In [29]:
# Print a single batch to check the (features, label) structure, shapes and dtypes.
for i in train_dataset.take(1):
    print(i)

({'input_ids': <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
array([[  101,  7592,  1012, ...,     0,     0,     0],
       [  101,  2092,  2073, ..., 16718,  2182,   102],
       [  101,  8003,  2472, ...,     0,     0,     0],
       [  101,  2023,  3185, ...,     0,     0,     0]], dtype=int64)>, 'token_type_ids': <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)>, 'attention_mask': <tf.Tensor: shape=(4, 512), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int64)>}, <tf.Tensor: shape=(4,), dtype=int64, numpy=array([0, 0, 0, 0], dtype=int64)>)


In [31]:
# Display the element spec of the validation pipeline.
val_dataset

<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

## Data Preparation for the RoBERTa Model

In [None]:
# Switch to the RoBERTa checkpoint. This re-binds the notebook-level `model_id` and `tokenizer`.
model_id="roberta-base"
tokenizer=RobertaTokenizerFast.from_pretrained(model_id)

In [None]:
def preprocess_function(examples):
  """Tokenize ``examples["text"]`` with the current notebook-level ``tokenizer``.

  Args:
    examples: Mapping with a ``"text"`` entry (one string or a list of strings).

  Returns:
    The tokenizer's encoding, produced with padding and truncation enabled.
  """
  encoded = tokenizer(examples["text"], truncation=True, padding=True)
  return encoded

In [None]:
# Tokenize every split in batches. `batched=True` matters here: with one example
# per call, `padding=True` has nothing to pad against, so every row would keep
# its own length and could not be stacked into fixed-shape batches later on.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Inspect the first tokenized training example.
tokenized_dataset['train'][0]

In [None]:
# Display the splits and the columns added by tokenization.
tokenized_dataset

In [None]:
# Batched, shuffled tf.data view of the training split (no 'token_type_ids' column is requested here).
tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=['input_ids','attention_mask', 'label'],
    shuffle=True,
    batch_size=BATCH_SIZE,
)

In [None]:
# Batched tf.data view of the test split, used as validation data.
# Not shuffled: shuffling does not change the metrics, and a stable order keeps
# evaluation on a fixed subset reproducible.
tf_val_dataset = tokenized_dataset["test"].to_tf_dataset(
    columns=['input_ids','attention_mask', 'label'],
    shuffle=False,
    batch_size=BATCH_SIZE,
)

In [None]:
def swap_positions(dataset):
  """Split one dataset element into a ``(features, label)`` pair.

  Args:
    dataset: Mapping holding at least ``'input_ids'``, ``'attention_mask'``
      and ``'label'``.

  Returns:
    A tuple ``(features, label)``; ``features`` is a dict with the two
    model-input entries and ``label`` is ``dataset['label']``.
  """
  features = {key: dataset[key] for key in ('input_ids', 'attention_mask')}
  target = dataset['label']
  return features, target

In [None]:
# Reshape every batch into (features, label) and prefetch.
# NOTE(review): both names are re-bound to their own mapped version, so this cell is not
# idempotent; running it a second time would presumably fail because the elements are
# already (features, label) tuples. Confirm before re-running.
tf_train_dataset=tf_train_dataset.map(swap_positions).prefetch(tf.data.AUTOTUNE)
tf_val_dataset=tf_val_dataset.map(swap_positions).prefetch(tf.data.AUTOTUNE)

In [None]:
# Print a single training batch to check its structure.
for i in tf_train_dataset.take(1):
  print(i)

## Data Preparation for XtremeDistil

# Switch to the XtremeDistil checkpoint. This re-binds the notebook-level `model_id` and `tokenizer`.
model_id="microsoft/xtremedistil-l6-h256-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_id)

In [None]:
# Check whether the loaded tokenizer reports itself as a "fast" tokenizer.
tokenizer.is_fast

In [None]:
def preprocess_function(examples):
  """Tokenize ``examples["text"]`` with the current notebook-level ``tokenizer``.

  Args:
    examples: Mapping with a ``"text"`` entry (one string or a list of strings).

  Returns:
    The tokenizer's encoding, produced with ``max_length=512`` and with
    padding and truncation enabled.
  """
  tokenizer_options = dict(max_length=512, padding=True, truncation=True)
  return tokenizer(examples["text"], **tokenizer_options)

In [None]:
# Tokenize every split; `batched=True` hands `preprocess_function` many examples per call.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Batched, shuffled tf.data view of the training split, restricted to the model inputs and the label.
tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    shuffle=True,
    batch_size=BATCH_SIZE,
)

In [None]:
# Batched tf.data view of the test split, used for validation and for the
# accuracy comparison of the ONNX models further down.
# Not shuffled: with shuffling, every pass over the dataset visits the examples
# in a different order, so two evaluations on "the first N samples" would score
# two different subsets and their accuracies would not be comparable.
tf_val_dataset = tokenized_dataset["test"].to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    shuffle=False,
    batch_size=BATCH_SIZE,
)

In [None]:
def swap_positions(dataset):
  """Split one dataset element into a ``(features, label)`` pair.

  Args:
    dataset: Mapping holding at least ``'input_ids'``, ``'token_type_ids'``,
      ``'attention_mask'`` and ``'label'``.

  Returns:
    A tuple ``(features, label)``; ``features`` is a dict with the three
    model-input entries and ``label`` is ``dataset['label']``.
  """
  input_keys = ('input_ids', 'token_type_ids', 'attention_mask')
  model_inputs = {key: dataset[key] for key in input_keys}
  return model_inputs, dataset['label']

In [None]:
# Reshape every batch into (features, label) and prefetch.
# NOTE(review): both names are re-bound to their own mapped version, so this cell is not
# idempotent; running it a second time would presumably fail because the elements are
# already (features, label) tuples. Confirm before re-running.
tf_train_dataset=tf_train_dataset.map(swap_positions).prefetch(tf.data.AUTOTUNE)
tf_val_dataset=tf_val_dataset.map(swap_positions).prefetch(tf.data.AUTOTUNE)

In [None]:
# Print a single validation batch to check its structure.
for i in tf_val_dataset.take(1):
  print(i)

In [None]:
# Display the element spec of the validation pipeline.
tf_val_dataset

## Modeling

### TFBertForSequenceClassification

In [None]:
# BERT with a sequence-classification head.
# Two labels (negative / positive) so the head emits one logit per class, like the
# other classifier cells in this notebook and as the argmax-based evaluation expects.
# `num_labels=1` would instead build a single-output regression head.
model=TFBertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=2)
model.summary()

### XtremeDistil for Sequence Classification

In [None]:
# Two-label classification head on top of the checkpoint currently named by `model_id`.
# NOTE(review): `model_id` is re-bound in several sections above; this cell presumably expects
# the XtremeDistil id, so run the XtremeDistil data-preparation section first. Confirm.
model=TFBertForSequenceClassification.from_pretrained(model_id,num_labels=2)
model.summary()

### TFBert Model

In [None]:
# Bare BERT encoder (no classification head); the next cell adds a custom head on top of it.
model=TFBertModel.from_pretrained("bert-base-uncased")
model.summary()

In [None]:
# Symbolic int64 inputs of fixed length 512, named after the tokenizer's output columns.
input_ids=Input(shape = (512,),dtype=tf.int64,name='input_ids')
token_type_ids=Input(shape = (512,),dtype=tf.int64,name='token_type_ids')
attention_mask=Input(shape = (512,),dtype=tf.int64,name='attention_mask')

# Pass the mask and segment ids by keyword. A positional list is interpreted in the
# order of the model's call signature (input_ids, attention_mask, token_type_ids),
# so a list ordered [input_ids, token_type_ids, attention_mask] risks swapping the two.
x = model(input_ids,
          attention_mask=attention_mask,
          token_type_ids=token_type_ids)
print(x)
# x[0] is the first output of the encoder; [:,0,:] keeps position 0 of every
# sequence (the [CLS] token the tokenizer puts first) as the sentence representation.
x=Dense(128,activation='relu')(x[0][:,0,:])
# Single sigmoid unit: the head outputs a probability, not a logit.
output=Dense(1,activation='sigmoid',name='label')(x)

custom_bert = tf.keras.Model(inputs=[input_ids,token_type_ids,attention_mask], outputs=output)

In [None]:
# Show the layers and parameter counts of the custom model.
custom_bert.summary()

### TFRobertaForSequenceClassification

In [None]:
# RoBERTa with a two-label sequence-classification head.
# The checkpoint is named explicitly instead of through `model_id`: that variable is
# re-bound to the XtremeDistil (BERT-architecture) checkpoint earlier in the notebook,
# which must not be loaded into a RoBERTa class.
model=TFRobertaForSequenceClassification.from_pretrained("roberta-base",num_labels=2)
model.summary()

## Training

In [None]:
num_epochs = 3
# The fit cell trains on `tf_train_dataset.take(1000)`, i.e. at most 1000 batches per
# epoch. The learning-rate schedule must span the steps that are actually run,
# otherwise the decay barely progresses. Keep this value in sync with that `.take(...)`.
MAX_TRAIN_BATCHES_PER_EPOCH = 1000
batches_per_epoch = min(len(tokenized_dataset["train"]) // BATCH_SIZE,
                        MAX_TRAIN_BATCHES_PER_EPOCH)
total_train_steps = batches_per_epoch * num_epochs

In [None]:
# Build the optimizer together with its learning-rate schedule: initial rate 2e-5, no warm-up,
# scheduled over `total_train_steps` steps.
optimizer, schedule = create_optimizer(init_lr=2e-5,num_warmup_steps=0, num_train_steps=total_train_steps)

In [None]:
# The `...ForSequenceClassification` models above output raw logits (one per class)
# and the dataset labels are integer class ids, so the matching loss is sparse
# categorical cross-entropy computed on logits. Binary cross-entropy without
# `from_logits` would treat the logits as probabilities.
# (For `custom_bert`, whose head is a single sigmoid unit, use BinaryCrossentropy() instead.)
model.compile(loss=SparseCategoricalCrossentropy(from_logits=True),
    optimizer=optimizer,
    metrics=['accuracy'],)

In [None]:
# Train on the first 1000 batches per epoch and validate on the full validation set.
# The epoch count comes from `num_epochs` so it stays consistent with the value used
# to size the learning-rate schedule.
history=model.fit(
    tf_train_dataset.take(1000),
    validation_data=tf_val_dataset,
    epochs=num_epochs,)

In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
for series_name in ('loss', 'val_loss'):
    ax.plot(history.history[series_name])
ax.set_title('model_loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
for series_name in ('accuracy', 'val_accuracy'):
    ax.plot(history.history[series_name])

ax.set_title('model_accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()

## Testing

# Tokenize two hand-written reviews as TensorFlow tensors, padded to the longer one.
# NOTE: this re-binds the notebook-level name `inputs`.
inputs = tokenizer(["this movie looks very interesting, i love the fact that the actors do a great job in showing how people lived in the 18th century, which wasn't very good at all. But atleast this movie recreates this scenes! ",
                    "very good start, but movie started becoming uninteresting at some point though initially i thought it would have been much more fun. There was too much background noise, but later on towards the middle of the movie, my favorite character got in and he did a great job, so over "], padding=True,return_tensors="tf")

# Run the model and print its `.logits` output for both reviews.
logits = model(**inputs).logits
print(logits)

## Conversion to ONNX

In [None]:
# !pip install -U tf2onnx
# !pip install onnxruntime

In [None]:
# onnxruntime (as `rt`) runs the exported model; tf2onnx performs the Keras -> ONNX conversion.
import onnxruntime as rt
import tf2onnx
# Presumably reports which device this onnxruntime build targets; check the cell output.
rt.get_device()

### from keras model

In [None]:
# Destination of the exported ONNX model.
# NOTE(review): absolute path under /content/drive (looks like a mounted Google Drive in Colab);
# adjust when running elsewhere.
output_path = "/content/drive/MyDrive/nlp/sentiment_analysis/xtremedistill.onnx"

In [None]:
# Input signature for the export: three int64 tensors of shape (batch, 512),
# named after the tokenizer's output columns.
spec = [tf.TensorSpec((None,512),tf.int64, name="input_ids"),
        tf.TensorSpec((None,512),tf.int64, name="token_type_ids"),
        tf.TensorSpec((None,512),tf.int64, name="attention_mask")]

# Convert the Keras model to ONNX (opset 17) and write it to `output_path`.
model_proto, _ = tf2onnx.convert.from_keras(
    model, input_signature=spec,
    opset=17, output_path=output_path,)
# Collect the names of the exported graph's outputs (printed in the next cell).
output_names = [n.name for n in model_proto.graph.output]

In [None]:
# Show the output names; the inference cells below request the output called "logits".
print(output_names)

### Inference

#### Benchmarking

In [None]:
# Single review used as the benchmark input.
text=["this movie looks very interesting, i love the fact that the actors do a great job in showing how people lived in the 18th century, which wasn't very good at all. But atleast this movie recreates this scenes!"]

# text = ["this movie looks very interesting, i love the fact that the actors do a great job in showing how people lived in the 18th century, which wasn't very good at all. But atleast this movie recreates this scenes! ",
#                     "very good start, but movie started becoming uninteresting at some point though initially i thought it would have been much more fun. There was too much background noise, but later on towards the middle of the movie, my favorite character got in and he did a great job, so over ",
#                     "very good start, but movie started becoming uninteresting at some point though initially i thought it would have been much more fun. There was too much background noise, but later on towards the middle of the movie, my favorite character got in and he did a great job, so overall i will give this movie a pass "]


# Tokenize to NumPy arrays padded/truncated to exactly 512 tokens, matching the
# (None, 512) signature the model was exported with.
inputs = tokenizer(text,padding='max_length',max_length=512,truncation=True,return_tensors="np")

# Number of timed predictions the benchmark loops average over.
N_PREDICTIONS = 1
print(inputs)

In [None]:
# CPU-only ONNX Runtime session for the exported model.
providers=['CPUExecutionProvider']
m = rt.InferenceSession(output_path, providers=providers)

# The model was exported with an int64 signature. Cast explicitly so the feed does
# not depend on the platform's default NumPy integer width.
onnx_inputs = {name: np.asarray(inputs[name], dtype=np.int64)
               for name in ('input_ids', 'token_type_ids', 'attention_mask')}

# One untimed warm-up call so one-off initialisation is not attributed to inference.
m.run(["logits"], onnx_inputs)

# perf_counter is a monotonic, high-resolution clock intended for measuring intervals.
t1 = time.perf_counter()
for _ in range(N_PREDICTIONS):
  onnx_pred = m.run(["logits"], onnx_inputs)
print("Time for a single Prediction", (time.perf_counter() - t1)/N_PREDICTIONS)

In [None]:
# Show the output of the last ONNX prediction (a list holding the requested "logits" output).
print(onnx_pred)

#### Benchmarking TF

# One untimed warm-up call so one-off set-up work is not attributed to inference.
model(**inputs)

# perf_counter is a monotonic, high-resolution clock intended for measuring intervals.
t1 = time.perf_counter()
for _ in range(N_PREDICTIONS):
  logits = model(**inputs).logits
# Stop the clock before printing so console output is not part of the measurement.
elapsed = time.perf_counter() - t1
print(logits)
print("Time for a single Prediction", elapsed/N_PREDICTIONS)

In [None]:
# tf, cpu = 600ms
# tf, gpu = 130ms
# tf_size = 50MB

# onnx, cpu = 400ms
# onnx, gpu = 8ms
# onnx_size = 50MB
# onnx_acc  = 91.9%

# onnx_quantized, cpu = 190ms
# onnx_quantized, gpu = 140ms
# onnx_quantized_size = 13MB
# onnx_quantized_acc  = 89.7%

### Quantization with ONNX

import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

In [None]:
# Input (FP32) and output (quantized) model files for quantization.
# `model_fp32` is the same file the export cell wrote to `output_path`.
model_fp32 = '/content/drive/MyDrive/nlp/sentiment_analysis/xtremedistill.onnx'
model_quant = '/content/drive/MyDrive/nlp/sentiment_analysis/xtremedistill_quantized.onnx'

In [None]:
# Dynamically quantize the FP32 model with unsigned 8-bit weights; the result goes to `model_quant`.
# NOTE(review): `quantize_dynamic` presumably writes the file and returns None, in which case
# `quantized_model` holds nothing useful. Confirm against the onnxruntime docs.
quantized_model = quantize_dynamic(model_fp32, model_quant, weight_type = QuantType.QUInt8)

### accuracy drop due to quantization

In [None]:
# Flatten the batched validation pipeline into single (features, label) examples.
# NOTE(review): if `tf_val_dataset` was built with shuffle=True, each pass over this dataset
# may yield a different order, so repeated `.take(N)` calls would see different samples. Confirm.
unbatched_val_dataset=tf_val_dataset.unbatch()

In [None]:
# Number of validation examples used to estimate accuracy.
N_SAMPLES=1024

In [None]:
def accuracy(model):
  """Return the top-1 accuracy, in percent, of an ONNX Runtime session.

  The session is evaluated one example at a time on the first ``N_SAMPLES``
  elements of the notebook-level ``unbatched_val_dataset``.

  Args:
    model: Object exposing ``run(output_names, feed_dict)`` (an
      ``onnxruntime.InferenceSession``) whose ``"logits"`` output holds one
      score per class.

  Returns:
    Percentage (0-100) of evaluated examples whose highest-scoring class
    equals the label; ``0.0`` when the dataset yields no examples.
  """
  input_names = ('input_ids', 'token_type_ids', 'attention_mask')
  correct = 0
  evaluated = 0
  for text, label in unbatched_val_dataset.take(N_SAMPLES):
    # Add a leading batch axis so each feed is a proper (1, seq_len) array
    # rather than a Python list wrapping an array.
    feed = {name: text[name].numpy()[np.newaxis, ...] for name in input_names}
    (logits,) = model.run(["logits"], feed)
    if int(np.argmax(logits, axis=-1)[0]) == int(label.numpy()):
      correct += 1
    evaluated += 1
  if evaluated == 0:
    return 0.0
  # Divide by the number of examples actually seen: the dataset may hold fewer
  # than N_SAMPLES elements, which would otherwise deflate the score.
  return (correct / evaluated) * 100

In [None]:
# CPU sessions for the FP32 model (`m`) and its quantized counterpart (`m_q`).
providers=['CPUExecutionProvider']
m = rt.InferenceSession(model_fp32, providers=providers)
m_q = rt.InferenceSession(model_quant, providers=providers)
# Accuracy in percent: quantized model first, then the FP32 model.
print(accuracy(m_q))
print(accuracy(m))

## Understanding temperature in distillation

In [None]:
def softmax(logits,T):
  """Temperature-scaled softmax.

  Computes ``exp(z_i / T) / sum_j exp(z_j / T)`` for every logit ``z_i``.
  Larger ``T`` flattens the distribution; smaller ``T`` sharpens it.

  Args:
    logits: Sequence of numeric scores.
    T: Temperature; must be non-zero.

  Returns:
    List of floats, one probability per logit, summing to 1. An empty input
    gives an empty list.

  Raises:
    ZeroDivisionError: If ``T`` is zero.
  """
  if T == 0:
    raise ZeroDivisionError("temperature T must be non-zero")
  scaled = np.asarray(logits, dtype=np.float64) / T
  if scaled.size == 0:
    return []
  # Subtracting the maximum leaves the result unchanged mathematically but keeps
  # exp() from overflowing to inf (which would give nan) for large logits or small T.
  weights = np.exp(scaled - np.max(scaled))
  return (weights / np.sum(weights)).tolist()

In [None]:
# Toy logits for the temperature demo (this re-binds the notebook-level name `logits`).
logits=[10,13,17,5]

In [None]:
# Print the softmax of the toy logits at increasing temperatures:
# the distribution goes from almost one-hot towards uniform.
for heading, temperature in (
    ("For T=1 ------>", 1),
    ("For T=2 ------>", 2),
    ("For T=3 ------>", 3),
    ("For T=5 ------>", 5),
    ("For T=10 ----->", 10),
    ("For T=10000 -->", 10000),
):
  print(heading, softmax(logits, temperature))