<a href="https://colab.research.google.com/github/A1DS19/Movie-review-classifier-BIRNN/blob/master/Transfer_learning_IMDB_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Instantiate BERT tokenizer

In [8]:
from transformers import BertTokenizer

# Checkpoint shared by the tokenizer and the classification model.
bert_name = 'bert-base-cased'

# Only tokenizer-construction options belong here. Special tokens, max_length
# and padding are per-call encoding arguments, so they are supplied where text
# is actually encoded (see the encode_plus calls) rather than at load time.
# do_lower_case=False keeps the original casing for the cased checkpoint.
tokenizer = BertTokenizer.from_pretrained(bert_name, do_lower_case=False)

In [10]:
# Sanity check on a short sentence: pad/truncate to 9 tokens and show the
# ids, segment ids and attention mask the tokenizer produces.
# padding='max_length' replaces the deprecated pad_to_max_length=True.
tokenizer.encode_plus(" Don't be lured",
                      add_special_tokens=True,
                      max_length=9,
                      padding='max_length',
                      return_attention_mask=True,
                      return_token_type_ids=True,
                      truncation=True)

{'input_ids': [101, 1790, 112, 189, 1129, 19615, 1181, 102, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0]}

In [11]:
def bert_encoder(review, max_length=150):
    """Tokenize a single review into fixed-length BERT inputs.

    Args:
        review: Object whose ``.numpy()`` returns the review text as UTF-8
            encoded bytes (e.g. a scalar string tensor from the dataset).
        max_length: Length every sequence is padded or truncated to.
            Defaults to 150, the value this notebook was written with.

    Returns:
        A tuple ``(input_ids, token_type_ids, attention_mask)`` taken from
        the tokenizer's ``encode_plus`` result, in that order.
    """
    txt = review.numpy().decode('utf-8')
    encoded = tokenizer.encode_plus(
                      txt,
                      add_special_tokens=True,
                      max_length=max_length,
                      # Replaces the deprecated pad_to_max_length=True.
                      padding='max_length',
                      return_attention_mask=True,
                      return_token_type_ids=True,
                      truncation=True)
    return encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']

### Load dataset

In [12]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
import tensorflow_datasets as tfds

In [13]:
# Load the labelled IMDB training split as (text, label) pairs, together with
# the dataset metadata, which is displayed as the cell output.
train_data, ds_info = tfds.load(
    'imdb_reviews',
    split='train',
    with_info=True,
    as_supervised=True,
)
ds_info

[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteBLK4XK/imdb_reviews-train.tfrecord*...…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteBLK4XK/imdb_reviews-test.tfrecord*...:…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteBLK4XK/imdb_reviews-unsupervised.tfrec…

[1mDataset imdb_reviews downloaded and prepared to ~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='~/tensorflow_datasets/imdb_reviews/plain_text/1.0.0',
    file_format=tfrecord,
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        '

In [14]:
# Encode every review and collect its label in a single pass over the
# dataset. One pass halves the I/O and guarantees each label stays paired
# with its own review regardless of the dataset's iteration order.
bert_train, bert_lbl = [], []
for review, label in train_data:
    bert_train.append(bert_encoder(review))
    bert_lbl.append(label)

# Stack the (input_ids, token_type_ids, attention_mask) triples into one array
# with one row per review.
bert_train = np.array(bert_train)
# One-hot encode the binary sentiment labels.
bert_lbl = tf.keras.utils.to_categorical(bert_lbl, num_classes=2)



### Create splits

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the encoded training reviews for validation; the fixed
# random_state keeps the split reproducible between runs.
x_train, x_val, y_train, y_val = train_test_split(
    bert_train,
    bert_lbl,
    random_state=42,
    test_size=0.2,
)
print(x_train.shape, y_train.shape)

(20000, 3, 150) (20000, 2)


In [16]:
# Axis 1 of x_train / x_val holds the three encoder outputs in the order
# returned by bert_encoder: input ids, token type (segment) ids, attention mask.
# Squeeze only axis 1: a bare squeeze() would also drop the batch axis if a
# split ever contained exactly one example.
tr_reviews, tr_segments, tr_masks = (
    part.squeeze(axis=1) for part in np.split(x_train, 3, axis=1))

val_reviews, val_segments, val_masks = (
    part.squeeze(axis=1) for part in np.split(x_val, 3, axis=1))

In [17]:
def example_to_features(input_ids, attention_masks, token_type_ids, y):
    """Pack one example into a ``(features, label)`` pair.

    The three encoder outputs are placed in a dict under the keys
    ``input_ids``, ``attention_mask`` and ``token_type_ids``; ``y`` is
    returned unchanged as the second element.
    """
    features = dict(
        input_ids=input_ids,
        attention_mask=attention_masks,
        token_type_ids=token_type_ids,
    )
    return features, y

In [18]:
# Training pipeline: pair each example's features with its label, shuffle
# within a 100-element buffer and batch by 16.
train_ds = (
    tf.data.Dataset.from_tensor_slices(
        (tr_reviews, tr_masks, tr_segments, y_train))
    .map(example_to_features)
    .shuffle(100)
    .batch(16)
)

# Validation pipeline: same feature mapping and batch size, but no shuffle,
# since example order does not affect validation loss or accuracy.
val_ds = (
    tf.data.Dataset.from_tensor_slices(
        (val_reviews, val_masks, val_segments, y_val))
    .map(example_to_features)
    .batch(16)
)

### Load BERT classification pretrained model

In [19]:
from transformers import TFBertForSequenceClassification

# Load the pretrained encoder with a fresh classification head. num_labels=2
# (the library default) is spelled out so it visibly matches the two-class
# one-hot labels produced with to_categorical(..., num_classes=2).
bert_model = TFBertForSequenceClassification.from_pretrained(
    bert_name,
    num_labels=2,
)

Downloading:   0%|          | 0.00/527M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Small learning rate for fine-tuning all of the pretrained weights.
optimizer = keras.optimizers.Adam(learning_rate=2e-5)
# from_logits=True: the loss is applied directly to the model's raw outputs.
loss = keras.losses.BinaryCrossentropy(from_logits=True)

bert_model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [23]:
# Print the layer list and parameter counts of the compiled model.
bert_model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 108,311,810
Trainable params: 108,311,810
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Fine-tune for three epochs, evaluating on the held-out validation set after
# each one; the returned History object is kept in bert_history.
bert_history = bert_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


### Test model

In [25]:
# Load the IMDB test split as (text, label) pairs.
test_data = tfds.load(
    'imdb_reviews',
    as_supervised=True,
    split='test',
)

In [26]:
# Encode every test review and collect its label in a single pass over the
# dataset, so each label stays paired with its own review and the data is
# only read once.
bert_test, bert_test_lbl = [], []
for review, label in test_data:
    bert_test.append(bert_encoder(review))
    bert_test_lbl.append(label)

# Stack the (input_ids, token_type_ids, attention_mask) triples into one array
# with one row per review.
bert_test_clean = np.array(bert_test)
# One-hot encode the binary sentiment labels.
bert_test_lbl_clean = tf.keras.utils.to_categorical(bert_test_lbl,
                                                    num_classes=2)



In [27]:
# Axis 1 of bert_test_clean holds the three encoder outputs in the order
# returned by bert_encoder: input ids, token type (segment) ids, attention mask.
# Squeeze only axis 1 so the batch axis survives even for a one-example set.
ts_reviews, ts_segments, ts_masks = (
    part.squeeze(axis=1) for part in np.split(bert_test_clean, 3, axis=1))

In [28]:
# Evaluation pipeline: same feature mapping and batch size as training, but
# without shuffling, since example order does not affect the test metrics.
test_ds = (
    tf.data.Dataset.from_tensor_slices((ts_reviews,
                                        ts_masks,
                                        ts_segments,
                                        bert_test_lbl_clean))
    .map(example_to_features)
    .batch(16)
)

In [29]:
# Evaluate on the test set; the result is the loss followed by the compiled
# 'accuracy' metric.
bert_model.evaluate(test_ds)



[0.43125054240226746, 0.8799600005149841]