In [1]:
!pip install transformers 



### Load imdb dataset
The IMDB dataset contains 50k labeled reviews in 2 classes (positive and negative), split evenly into train and test sets, plus 50k unlabeled reviews (100k in total).

In [3]:
import tensorflow_datasets as tfds
# Download (on first use) and load the IMDB reviews dataset.
# as_supervised=True yields (review, label) pairs; with_info=True additionally
# returns the dataset metadata object.
(train, test), info = tfds.load(
    name="imdb_reviews",
    split=("train", "test"),
    as_supervised=True,
    with_info=True,
)



[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteI5HXFJ/imdb_reviews-train.tfrecord…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteI5HXFJ/imdb_reviews-test.tfrecord*…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteI5HXFJ/imdb_reviews-unsupervised.t…

[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [8]:
# Peek at five training examples: the first 150 characters of each review,
# followed by its integer label.
for raw_review, sentiment in tfds.as_numpy(train.take(5)):
    preview = raw_review.decode()[:150]
    print(preview, '\t', sentiment)

This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be the 	 0
I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable  	 0
Mann photographs the Alberta Rocky Mountains in a superb fashion, and Jimmy Stewart and Walter Brennan give enjoyable performances as they always seem 	 0
This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chai 	 1
As others have mentioned, all the women that go nude in this film are mostly absolutely gorgeous. The plot very ably shows the hypocrisy of the female 	 1


### Tokenizer

In [9]:
from transformers import BertTokenizer

# Tokenizer that matches the 'bert-base-uncased' checkpoint; do_lower_case=True
# makes it lower-case the text before splitting it into tokens.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### Preparing data for fine-tuning
1. Tokenizing  
       --> Add the special tokens [CLS] and [SEP]
       --> Set the max length to 512
       --> Add [PAD] tokens up to the max length
       --> Add an attention mask so the model does not attend to [PAD] tokens

In [17]:
def review_to_embedding(review, max_length=512):
    """Tokenize one review into fixed-length BERT inputs.

    The tokenizer's ``encode_plus`` tokenizes the raw text, adds the special
    tokens, and pads (or truncates) the result to exactly ``max_length`` tokens.

    Args:
        review: The raw review text.
        max_length: Length of the encoded sequence, in tokens. Shorter reviews
            are padded up to this length and longer ones are truncated to it.
            Defaults to 512.

    Returns:
        The tokenizer's encoding, which provides the ``'input_ids'``,
        ``'token_type_ids'`` and ``'attention_mask'`` entries.
    """
    return tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=max_length,
        # `pad_to_max_length=True` is deprecated; this is its replacement.
        padding='max_length',
        # Request truncation explicitly instead of relying on the implicit
        # (warning-emitting) fallback that kicks in when only max_length is set.
        truncation=True,
        return_attention_mask=True,
    )

In [20]:
import tensorflow as tf

def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Repackage one encoded example as a ``(features, label)`` pair.

    Args:
        input_ids: Token ids of the example.
        attention_masks: Attention mask of the example.
        token_type_ids: Token type ids of the example.
        label: Label of the example; passed through unchanged.

    Returns:
        A tuple ``(features, label)`` where ``features`` is a dict with the
        keys ``"input_ids"``, ``"token_type_ids"`` and ``"attention_mask"``.
    """
    features = dict(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_masks,
    )
    return features, label

def encode_examples(dataset, limit=-1):
    """Encode a dataset of (review, label) pairs into model-ready examples.

    Every review is decoded from bytes, encoded with ``review_to_embedding``,
    and collected in memory; the result is then wrapped in a
    ``tf.data.Dataset`` whose elements are shaped by ``map_example_to_dict``.

    Args:
        dataset: Dataset yielding ``(review, label)`` pairs, with the review
            as bytes.
        limit: When greater than 0, only the first ``limit`` examples are
            encoded; otherwise the whole dataset is used.

    Returns:
        A ``tf.data.Dataset`` of ``(features, label)`` pairs.
    """
    if limit > 0:
        dataset = dataset.take(limit)

    columns = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
    labels = []
    for raw_review, raw_label in tfds.as_numpy(dataset):
        encoding = review_to_embedding(raw_review.decode())
        for key, values in columns.items():
            values.append(encoding[key])
        # Each label is wrapped in a one-element list.
        labels.append([raw_label])

    # The order here must match the parameter order of map_example_to_dict.
    tensors = (
        columns['input_ids'],
        columns['attention_mask'],
        columns['token_type_ids'],
        labels,
    )
    return tf.data.Dataset.from_tensor_slices(tensors).map(map_example_to_dict)

In [21]:
# Mini-batch size shared by the training and evaluation pipelines.
batch_size = 6

# The shuffle buffer (100000) is larger than the 25,000-example training split,
# so the buffer can hold the entire split.
train_encoded = (
    encode_examples(train)
    .shuffle(100000)
    .batch(batch_size)
)
# The evaluation data is batched only; its order is left as-is.
test_encoded = encode_examples(test).batch(batch_size)

### Model loading

In [22]:
from transformers import TFBertForSequenceClassification

# Load the pre-trained BERT weights into a sequence-classification model.
# The classifier weights are newly initialised (see the load log below) and are
# learned during fine-tuning.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Fine-tuning configuration and Training

In [23]:
# Fine-tuning hyperparameters.
learning_rate = 2e-5
number_of_epochs = 1

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# from_logits=True: the loss is computed on the model's raw outputs; softmax is
# only applied separately at prediction time.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [24]:
# Fine-tune on the encoded training set and validate on the encoded test set;
# the value returned by fit() is kept in bert_history.
bert_history = model.fit(
    train_encoded,
    validation_data=test_encoded,
    epochs=number_of_epochs,
)



### Testing

In [25]:
def get_predictions(review):
    """Print the predicted sentiment of a single review.

    The review is encoded with the module-level ``tokenizer``, scored by the
    module-level ``model``, and the name of the highest-scoring class
    (``'Negative'`` or ``'Positive'``) is printed.

    Args:
        review: The raw review text.

    Returns:
        None. The predicted class name is written to standard output.
    """
    class_names = ('Negative', 'Positive')  # index 0 -> negative, 1 -> positive

    token_ids = tokenizer.encode(
        review, truncation=True, padding=True, return_tensors="tf"
    )
    logits = model.predict(token_ids)[0]
    probabilities = tf.nn.softmax(logits, axis=1)
    # argmax runs over the class axis; [0] selects the single review's result.
    predicted_index = tf.argmax(probabilities, axis=1).numpy()[0]
    print(class_names[predicted_index])

In [27]:
# Sanity check: a clearly positive review.
review = "This is a really good movie. I loved it and will watch again"
get_predictions(review)

Positive


In [28]:
# Sanity check: a clearly negative sentence.
review = "I hate the selfishness in you"
get_predictions(review)

Negative
