In [None]:
!pip install transformers
!pip install tensorflow_addons
!curl -LO https://raw.githubusercontent.com/MohamadMerchant/SNLI/master/data.tar.gz
!tar -xvzf data.tar.gz

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_addons as tfa
import transformers
from sklearn.manifold import TSNE
from tensorflow.keras.utils import plot_model
import logging
# Disable the 'tensorflow' Python logger for the rest of the notebook.
logging.getLogger('tensorflow').disabled = True
# Let pandas show up to 400 characters per column so long sentences display more fully.
pd.set_option('max_colwidth', 400)

In [None]:
# Create the distribution strategy and report how many replicas it spans.
strategy = tf.distribute.MirroredStrategy()
replica_count = strategy.num_replicas_in_sync
print(f'Number of devices: {replica_count}')

Number of devices: 1


In [None]:
# Optional TPU setup, left commented out. When enabled it rebinds `strategy`
# to a TPUStrategy in place of the MirroredStrategy created earlier.
# resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
# tf.config.experimental_connect_to_cluster(resolver)
# # This is the TPU initialization code that has to be at the beginning.
# tf.tpu.experimental.initialize_tpu_system(resolver)
# print("All devices: ", tf.config.list_logical_devices('TPU'))
# strategy = tf.distribute.TPUStrategy(resolver)

# 1) Text Classification

## 1.1 Data Import

In [None]:
# Load the SNLI splits; only the first 100,000 training rows are read.
train_df = pd.read_csv("./data/SNLI_Corpus/snli_1.0_train.csv", nrows=100000)
valid_df = pd.read_csv("./data/SNLI_Corpus/snli_1.0_dev.csv")
test_df = pd.read_csv("./data/SNLI_Corpus/snli_1.0_test.csv")

# Drop rows labelled "-" from every split. The label map is built from the
# filtered training labels, so a "-" row left in any split would be encoded as
# NaN and later cast to a meaningless int32 label.
# Train and validation are additionally shuffled with a fixed seed.
train_df = train_df[train_df.similarity != "-"].sample(frac=1.0, random_state=42).reset_index(drop=True)
valid_df = valid_df[valid_df.similarity != "-"].sample(frac=1.0, random_state=42).reset_index(drop=True)
test_df = test_df[test_df.similarity != "-"].reset_index(drop=True)
train_df.head()

Unnamed: 0,similarity,sentence1,sentence2
0,contradiction,A woman is using toy which blows giant bubbles.,A little girl is playing with chalk on a driveway.
1,neutral,A young Asian girl holds a stuffed cat toy in a classroom.,"A young Asian girl sits in class with a stuffed cat toy, the only surviving possession remaining after the tsunami."
2,entailment,A young woman with an afro and an electronic device in her hands walks next to an orange bike.,A young woman walks next to an orange bike.
3,neutral,A young asian girl is sliding down a pole on outdoor playground equipment.,The girl has yellow skin
4,entailment,a man is walking with a cane.,The man is walking.


In [None]:
# Label encoding: `label_map` goes id -> class name (categories of the training
# labels); `label_to_id` is its inverse and is applied to every split.
categories = train_df['similarity'].astype('category').cat.categories
label_map = dict(enumerate(categories))
label_to_id = {name: class_id for class_id, name in label_map.items()}
y_train, y_val, y_test = (
    split_df['similarity'].map(label_to_id).values
    for split_df in (train_df, valid_df, test_df)
)

## 1.2 Pre-processing

In [None]:
# Token length passed to the tokenizer as `max_length`; every encoded pair is padded to it.
max_length = 64
# Default number of sentence pairs per generated batch.
batch_size = 32

In [None]:
# Load the pretrained uncased BERT tokenizer, print its vocabulary size and
# display the token ids it produces for a short example string.
tokenizer = transformers.BertTokenizer.from_pretrained( "bert-base-uncased", do_lower_case=True)
print(len(tokenizer.get_vocab()))
tokenizer.encode('Hello Tensorflow')

In [None]:
# Encode the first five training pairs to inspect what the tokenizer returns.
sentence_pairs = train_df[["sentence1", "sentence2"]].values[:5]
encoded = tokenizer.batch_encode_plus(
    sentence_pairs.tolist(),
    add_special_tokens=True,
    max_length=max_length,
    # Request truncation explicitly: `max_length` alone does not state how
    # over-long pairs should be shortened.
    truncation=True,
    return_attention_mask=True,
    return_token_type_ids=True,
    padding='max_length',
    return_tensors="tf")

print(encoded.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [None]:
# Show the first 32 positions of each encoded field for the first sentence pair.
print(encoded['input_ids'][0][:32])
print(encoded['token_type_ids'][0][:32])
print(encoded['attention_mask'][0][:32])

tf.Tensor(
[  101  1037  2450  2003  2478  9121  2029 13783  5016 17255  1012   102
  1037  2210  2611  2003  2652  2007 16833  2006  1037 11202  1012   102
     0     0     0     0     0     0     0     0], shape=(32,), dtype=int32)
tf.Tensor([0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0], shape=(32,), dtype=int32)
tf.Tensor([1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0], shape=(32,), dtype=int32)


In [None]:
# Decode the same 32 ids back to text to see the special tokens around the pair.
tokenizer.decode(encoded['input_ids'][0][:32])

'[CLS] a woman is using toy which blows giant bubbles. [SEP] a little girl is playing with chalk on a driveway. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [None]:
class BertDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of BERT-encoded sentence pairs.

    Args:
        sentence_pairs: NumPy array of [premise, hypothesis] sentence pairs.
        labels: Array of labels aligned with `sentence_pairs`; only read when
            `include_targets=True`, so it may be None otherwise.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the sample order at construction
            and again after every epoch.
        include_targets: boolean, whether to include the labels.

    Returns:
        Tuples `([input_ids, attention_mask, token_type_ids], labels)`
        (or just `[input_ids, attention_mask, token_type_ids]`
         if `include_targets=False`)
    """

    def __init__(self, sentence_pairs, labels, batch_size=batch_size, shuffle=True, include_targets=True):

        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        self.tokenizer = transformers.BertTokenizer.from_pretrained( "bert-base-uncased", do_lower_case=True)
        self.indexes = np.arange(len(self.sentence_pairs))
        # One seeded generator kept for the object's lifetime: shuffles stay
        # reproducible, but each epoch draws a new permutation instead of
        # re-applying the same one.
        self._rng = np.random.RandomState(42)
        self.on_epoch_end()

    def __len__(self):
        # Number of batches per epoch, rounded up so that the final partial
        # batch is served instead of being silently dropped.
        return (len(self.sentence_pairs) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        # Positions (in the possibly shuffled order) that make up batch `idx`.
        start = idx * self.batch_size
        batch_indexes = self.indexes[start : start + self.batch_size]
        batch_pairs = self.sentence_pairs[batch_indexes]

        # batch_encode_plus encodes both sentences of each pair together,
        # separated by the [SEP] token. The module-level `max_length` fixes
        # the padded length.
        encoded = self.tokenizer.batch_encode_plus(
            batch_pairs.tolist(),
            add_special_tokens=True,
            max_length=max_length,
            # Explicit truncation so over-long pairs are cut to `max_length`.
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            padding='max_length',
            return_tensors="tf",
        )

        # Convert the encoded features to int32 NumPy arrays.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")
        features = [input_ids, attention_masks, token_type_ids]

        # Labels are only attached when the generator feeds training/evaluation.
        if self.include_targets:
            labels = np.array(self.labels[batch_indexes], dtype="int32")
            return features, labels
        return features

    def on_epoch_end(self):
        # Reshuffle the sample order after each epoch if shuffle is enabled.
        if self.shuffle:
            self._rng.shuffle(self.indexes)

## 1.3 Model Building 

The BERT model's outputs comprise:

- ***last_hidden_state*** with shape=(m, seq_len, emb_dim)
- ***pooler_output*** with shape=(m, emb_dim)
- ***hidden_states***: the hidden states of the embedding output and of every transformer layer. Returned only when *output_hidden_states=True*; a tuple of num_layers + 1 tensors, each with shape=(m, seq_len, emb_dim)
- ***attentions***: the attention weights from each layer. Returned only when *output_attentions=True*; a tuple of num_layers tensors, each with shape=(m, num_heads, seq_len, seq_len)





In [None]:
def build_model(lstm_units=2, dropout_rate=0.3, num_classes=3):
  """Build the classifier: frozen pretrained BERT followed by a BiLSTM head.

  Args:
      lstm_units: Hidden units per direction of the bidirectional LSTM.
      dropout_rate: Dropout rate applied before the output layer.
      num_classes: Number of output classes of the softmax layer.

  Returns:
      An uncompiled `tf.keras.Model` taking `[input_ids, attention_masks,
      token_type_ids]` (each of length `max_length`) and returning class
      probabilities of shape (m, num_classes).
  """

  # Encoded token ids from BERT tokenizer
  input_ids = tf.keras.layers.Input(shape=(max_length, ), dtype=tf.int32, name="input_ids")
  # Attention masks indicates to the model which tokens should be attended to
  attention_masks = tf.keras.layers.Input(shape=(max_length, ), dtype=tf.int32, name="attention_masks")
  # Token type ids are binary masks identifying different sequences in the model
  token_type_ids = tf.keras.layers.Input(shape=(max_length, ), dtype=tf.int32, name="token_type_ids")

  # Load the pretrained BERT model and freeze its weights so that only the
  # layers added below are trained.
  bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")
  bert_model.trainable = False

  bert_output = bert_model(input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids, output_attentions=False, output_hidden_states=False)
  # Only the per-token output is used by the head; the pooled output is not needed.
  sequence_output = bert_output["last_hidden_state"] # (m, seq_len, emb_dim)

  # Add trainable layers on top of Bert to adapt the pretrained features on the new data.
  bi_lstm = tf.keras.layers.Bidirectional(
      tf.keras.layers.LSTM(lstm_units, return_sequences=True)
  )(sequence_output) # (m, seq_len, lstm_units * 2)

  # Applying hybrid pooling approach to bi_lstm sequence output.
  avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm) # (m, lstm_units * 2)
  max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm) # (m, lstm_units * 2)
  concat = tf.keras.layers.concatenate([avg_pool, max_pool]) # (m, lstm_units * 4)
  dropout = tf.keras.layers.Dropout(dropout_rate)(concat)
  output = tf.keras.layers.Dense(num_classes, activation="softmax")(dropout) # (m, num_classes)

  model = tf.keras.models.Model(inputs=[input_ids, attention_masks, token_type_ids], outputs=output)
  return model

In [None]:
# Build and compile inside the strategy scope so the model's variables are
# created under the distribution strategy defined at the top of the notebook.
with strategy.scope():
    bert_encoder = build_model()
    bert_encoder.compile(optimizer=tf.keras.optimizers.Adam(), loss="sparse_categorical_crossentropy", metrics=["acc"])
bert_encoder.summary()

## 1.4 Model Training (Frozen Pre-trained Bert)

In [None]:
# generate batch data
# Batch generators: training data is shuffled, validation data keeps its order.
pair_columns = ["sentence1", "sentence2"]
train_pairs = train_df[pair_columns].values.astype("str")
valid_pairs = valid_df[pair_columns].values.astype("str")
train_data = BertDataGenerator(train_pairs, y_train, batch_size=batch_size, shuffle=True)
valid_data = BertDataGenerator(valid_pairs, y_val, batch_size=batch_size, shuffle=False)

In [None]:
# Train the head on top of the frozen BERT; each epoch runs 10 steps.
epochs = 2
history = bert_encoder.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
    use_multiprocessing=True,
    workers=-1,
    steps_per_epoch=10,
)

## 1.5 Fine Tuning Bert

This step must only be performed after the feature extraction model has been trained to convergence on the new data.

This is an optional last step in which bert_model is unfrozen and retrained with a very low learning rate. This can deliver a meaningful improvement by incrementally adapting the pretrained features to the new data.

In [None]:
# Unfreeze the BERT layer. It is located by type rather than by a positional
# index, so the lookup does not depend on the order of `bert_encoder.layers`.
bert_layers = [layer for layer in bert_encoder.layers if isinstance(layer, transformers.TFBertModel)]
if not bert_layers:
    raise ValueError("No TFBertModel layer found in bert_encoder; nothing to unfreeze.")
for bert_layer in bert_layers:
    bert_layer.trainable = True
# Recompile the model to make the change effective.
bert_encoder.compile(optimizer=tf.keras.optimizers.Adam(1e-5), loss="sparse_categorical_crossentropy", metrics=["accuracy"])
bert_encoder.summary()

# train entire model
history = bert_encoder.fit(train_data, validation_data=valid_data, epochs=epochs, use_multiprocessing=True, workers=-1,)

## 1.6 Evaluation

In [None]:
# Evaluate the trained model on the held-out test split. The model is bound to
# `bert_encoder`, which is the name the build, compile and fit cells use.
test_data = BertDataGenerator(test_df[["sentence1", "sentence2"]].values.astype("str"), y_test, batch_size=batch_size, shuffle=False)
bert_encoder.evaluate(test_data, verbose=1)

In [None]:
def check_similarity(sentence1, sentence2):
    """Predict the label for a single sentence pair.

    Args:
        sentence1: First sentence; converted with `str()`.
        sentence2: Second sentence; converted with `str()`.

    Returns:
        Tuple `(pred, proba)`: the predicted label name looked up in
        `label_map`, and the probability of that class formatted as a
        percentage string with two decimals (e.g. "85.00%").
    """
    sentence_pairs = np.array([[str(sentence1), str(sentence2)]])
    test_data = BertDataGenerator(sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False)

    # The trained model is bound to `bert_encoder`.
    proba = bert_encoder.predict(test_data)[0]
    idx = np.argmax(proba)
    # The softmax output lies in [0, 1]; scale it so the "%" suffix is accurate.
    proba = f"{proba[idx] * 100:.2f}%"
    pred = label_map.get(idx)
    return pred, proba

In [None]:
# Example: classify one hand-written sentence pair with the trained model.
sentence1 = "Two women are observing something together."
sentence2 = "Two women are standing with their eyes closed."
check_similarity(sentence1, sentence2)