In [3]:
import numpy as np 
import pandas as pd 
import tensorflow as tf 
import transformers 

# Introduction 
Semantic Similarity is the task of determining how similary two sentences are, in terms of what they mean. The SNLI (Stanford natural Language Inference) Corpus is used to predict sentence semantic similarity with transformers. BERT model will be fine-tuned to take two sentences as inputs and output a similarity score for two sentences. 

In [4]:
max_length = 128 # max len of input sentence 
batch_size = 32 
epochs = 2 

labels = ['contradictoin', 'entailment', 'neutral']

In [5]:
!curl -LO https://raw.githubusercontent.com/MohamadMerchant/SNLI/master/data.tar.gz
!tar -xvzf data.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 14 11.1M   14 1687k    0     0  2390k      0  0:00:04 --:--:--  0:00:04 2390k
100 11.1M  100 11.1M    0     0  7214k      0  0:00:01  0:00:01 --:--:-- 7219k


SNLI_Corpus/
SNLI_Corpus/snli_1.0_dev.csv
SNLI_Corpus/snli_1.0_train.csv
SNLI_Corpus/snli_1.0_test.csv


In [6]:
# There are more than 550k samples in total; we will use 500K for this example.
train_df = pd.read_csv("SNLI_Corpus/snli_1.0_train.csv", nrows=500_000) #100000)
valid_df = pd.read_csv("SNLI_Corpus/snli_1.0_dev.csv")
test_df = pd.read_csv("SNLI_Corpus/snli_1.0_test.csv")


In [7]:
# Shape of the data
print(f"Total train samples : {train_df.shape[0]}")
print(f"Total validation samples: {valid_df.shape[0]}")
print(f"Total test samples: {valid_df.shape[0]}")

Total train samples : 500000
Total validation samples: 10000
Total test samples: 10000


In [8]:
# We have some NaN entries in our train data, we will simply drop them.
print("Number of missing values")
print(train_df.isnull().sum())
train_df.dropna(axis=0, inplace=True)

Number of missing values
similarity    0
sentence1     0
sentence2     6
dtype: int64


In [9]:
print("Train Target Distribution")
print(train_df.similarity.value_counts())

Train Target Distribution
entailment       166712
contradiction    166510
neutral          166091
-                   681
Name: similarity, dtype: int64


In [10]:
train_df = (
    train_df[train_df.similarity != "-"]
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)
valid_df = (
    valid_df[valid_df.similarity != "-"]
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

In [11]:
# One hot encoding 
train_df["label"] = train_df["similarity"].apply(
    lambda x: 0 if x == "contradiction" else 1 if x == "entailment" else 2
)
y_train = tf.keras.utils.to_categorical(train_df.label, num_classes=3)

valid_df["label"] = valid_df["similarity"].apply(
    lambda x: 0 if x == "contradiction" else 1 if x == "entailment" else 2
)
y_val = tf.keras.utils.to_categorical(valid_df.label, num_classes=3)

test_df["label"] = test_df["similarity"].apply(
    lambda x: 0 if x == "contradiction" else 1 if x == "entailment" else 2
)
y_test = tf.keras.utils.to_categorical(test_df.label, num_classes=3)

In [13]:
# Custom Data Generator
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    def __init__(
        self, 
        sentence_pairs, 
        labels, 
        batch_size=batch_size, 
        shuffle=True, 
        include_targets=True
        ):
        
        self.sentence_pairs = sentence_pairs 
        self.labels = labels 
        self.shuffle = shuffle 
        self.batch_size = batch_size 
        self.include_targets = include_targets 

        # load bert tokenier to encode text 
        # will use bert-base-uncased 
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            'bert-base-uncased', do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end() 

    def __len__(self):
        return len(self.sentence_pairs) // self.batch_size 

    def __getitem__(self, idx):
        
        indexes = self.indexes[idx * self.batch_size: (idx+1)*self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(), 
            add_special_tokens=True, 
            max_length=max_length, 
            return_attention_mask=True, 
            return_token_type_ids=True, 
            pad_to_max_length=True, 
            return_tensors='tf' 
        )

        input_ids = np.array(encoded['input_ids'], dtype='int32')
        attention_masks = np.array(encoded['attention_mask'], dtype='int32')
        token_type_ids = np.array(encoded['token_type_ids'], dtype='int32')

        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype='int32')
            return [input_ids, attention_masks, token_type_ids], labels 
        else: 
            return [input_ids, attention_masks, token_type_ids]


    def on_epoch_end(self): 
        if self.shuffle: 
            np.random.RandomState(42).shuffle(self.indexes)






In [14]:
# Create the model under a distribution strategy scope.
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


In [15]:
with strategy.scope():
    input_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name='input_ids', 
    )

    attention_masks = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name='attention_mask'
    )
    token_type_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name='token_type_ids'
    )

    bert_model = transformers.TFBertModel.from_pretrained('bert-base-uncased')
    bert_model.trainable = False 

    sequence_output, pooled_output = bert_model(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )

    bi_lstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    )(sequence_output)

    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm)
    max_pool = tf.keras.layers.GlobalMaxPool1D()(bi_lstm)
    concat = tf.keras.layers.concatenate([avg_pool, max_pool])
    droput = tf.keras.layers.Dropout(0.3)(concat)
    output = tf.keras.layers.Dense(3, activation='softmax')(droput)
    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], output=output
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(), 
        loss='categorical_crossentropy', 
        metrics=['acc']
    )

print(f"Strategy: {strategy}")
model.summary()

Downloading: 100%|██████████| 511M/511M [00:53<00:00, 10.1MB/s]
Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


TypeError: Inputs to a layer should be tensors. Got: last_hidden_state

In [None]:
train_data = BertSemanticDataGenerator(
    train_df[["sentence1", "sentence2"]].values.astype("str"),
    y_train,
    batch_size=batch_size,
    shuffle=True,
)
valid_data = BertSemanticDataGenerator(
    valid_df[["sentence1", "sentence2"]].values.astype("str"),
    y_val,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
    use_multiprocessing=True,
    workers=-1,
)