In [3]:
import numpy as np 
import pandas as pd 
import tensorflow as tf 
import transformers 

# Introduction 
Semantic similarity is the task of determining how similar two sentences are in terms of their meaning. This notebook uses the SNLI (Stanford Natural Language Inference) Corpus to predict sentence semantic similarity with transformers. A BERT model is fine-tuned to take two sentences as input and output a similarity score for the pair.

In [4]:
# Notebook-wide configuration.
max_length = 128  # max len of input sentence
batch_size = 32
epochs = 2

# Class names for the three similarity classes.
# Fixed the misspelling 'contradictoin' so the name matches the dataset's "contradiction" label.
labels = ['contradiction', 'entailment', 'neutral']

In [5]:
# Download the SNLI archive (following redirects, keeping the remote file name)
# and unpack it into the current working directory.
!curl -LO https://raw.githubusercontent.com/MohamadMerchant/SNLI/master/data.tar.gz
!tar -xvzf data.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 14 11.1M   14 1687k    0     0  2390k      0  0:00:04 --:--:--  0:00:04 2390k
100 11.1M  100 11.1M    0     0  7214k      0  0:00:01  0:00:01 --:--:-- 7219k


SNLI_Corpus/
SNLI_Corpus/snli_1.0_dev.csv
SNLI_Corpus/snli_1.0_train.csv
SNLI_Corpus/snli_1.0_test.csv


In [6]:
# The training CSV holds more than 550k samples; only the first 500k rows are read here.
# The validation and test CSVs are read in full.
train_df = pd.read_csv(
    "SNLI_Corpus/snli_1.0_train.csv",
    nrows=500_000,
)
valid_df = pd.read_csv("SNLI_Corpus/snli_1.0_dev.csv")
test_df = pd.read_csv("SNLI_Corpus/snli_1.0_test.csv")


In [7]:
# Report the number of rows in each split.
print(f"Total train samples : {train_df.shape[0]}")
print(f"Total validation samples: {valid_df.shape[0]}")
# Fixed: this line previously printed valid_df's row count under the test label.
print(f"Total test samples: {test_df.shape[0]}")

Total train samples : 500000
Total validation samples: 10000
Total test samples: 10000


In [8]:
# Show the per-column NaN counts of the training frame, then discard
# (in place) every training row that contains at least one NaN.
print("Number of missing values")
print(train_df.isna().sum())
train_df.dropna(axis="index", inplace=True)

Number of missing values
similarity    0
sentence1     0
sentence2     6
dtype: int64


In [9]:
# Show how many training rows carry each value of the `similarity` column.
print("Train Target Distribution")
print(train_df["similarity"].value_counts())

Train Target Distribution
entailment       166712
contradiction    166510
neutral          166091
-                   681
Name: similarity, dtype: int64


In [10]:
# Rows whose `similarity` value is the placeholder "-" do not belong to any of
# the three classes, so they are removed from every split.
# The train and validation splits are additionally shuffled with a fixed seed
# (frac=1.0 keeps all rows) and re-indexed from 0.
train_df = (
    train_df[train_df.similarity != "-"]
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)
valid_df = (
    valid_df[valid_df.similarity != "-"]
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)
# Apply the same "-" filter to the test split (previously left unfiltered), so
# that any such rows are not encoded as a real class by the label-encoding step.
# The test split is left unshuffled; only its index is reset.
test_df = test_df[test_df.similarity != "-"].reset_index(drop=True)

In [11]:
# One hot encoding
def _encode_similarity(df):
    """Add an integer ``label`` column to ``df`` and return its one-hot encoding.

    "contradiction" maps to 0 and "entailment" to 1; every other value of the
    ``similarity`` column (e.g. "neutral") maps to 2. The ``label`` column is
    written onto ``df`` itself.

    Returns the result of ``tf.keras.utils.to_categorical`` with
    ``num_classes=3``.
    """
    df["label"] = (
        df["similarity"]
        .map({"contradiction": 0, "entailment": 1})
        .fillna(2)  # anything outside the mapping falls back to class 2
        .astype("int64")
    )
    return tf.keras.utils.to_categorical(df.label, num_classes=3)


# The same encoding is applied to all three splits.
y_train = _encode_similarity(train_df)
y_val = _encode_similarity(valid_df)
y_test = _encode_similarity(test_df)