In [None]:
pip install -U sentence_transformers

In [None]:
import numpy as np
import pandas as pd
from keras.utils import np_utils
from sentence_transformers import CrossEncoder
from sklearn.preprocessing import LabelEncoder

In [71]:
# Label names, indexed by the column of the model's score output.
label_mapping = ['contradiction', 'entailment', 'neutral']

# Load the pretrained NLI cross-encoder and score one demo sentence pair.
model = CrossEncoder('cross-encoder/nli-roberta-base')
scores = model.predict(
    [
        ('A man is eating pizza', 'A man eats something'),
    ]
)

# For every scored pair, take the best-scoring column and look up its label name.
labels = [label_mapping[column] for column in scores.argmax(axis=1)]
labels

In [None]:
# downloading the dataset from the url
!wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip
#unzip the files
!unzip snli_1.0.zip

# importing the dataset into dataframes
df_train = pd.read_csv("snli_1.0/snli_1.0_train.txt", sep="\t")
df_dev = pd.read_csv("snli_1.0/snli_1.0_dev.txt", sep="\t")
df_test = pd.read_csv("snli_1.0/snli_1.0_test.txt", sep="\t")

# extracting the required columns form the dataset
df_train = df_train[['gold_label','sentence1','sentence2']]
df_dev = df_dev[['gold_label','sentence1','sentence2']]
df_test = df_test[['gold_label','sentence1','sentence2']]

# Analyzing the data
df_train.groupby('gold_label').count()

# removing the entries from all train, dev and test datasets with label '-'
df_train = df_train[df_train['gold_label'] != '-']
df_dev = df_dev[df_dev['gold_label'] != '-']
df_test = df_test[df_test['gold_label'] != '-']

df_test = df_test.head(200)   # dropping the rows from the data with NaN values
df_train = df_train.dropna(subset = ['sentence2'])
df_train.groupby('gold_label').count() 

In [74]:
# Extract both sentence columns of the test split as arrays (premise = sentence1,
# hypothesis = sentence2) so they can be indexed by position.
premise, hypothesis = (
    df_test[column].values for column in ('sentence1', 'sentence2')
)

# Raw class scores for the first test pair.
model.predict([(premise[0], hypothesis[0])])

array([[ 0.41002804, -2.0508602 ,  2.702842  ]], dtype=float32)

In [75]:
# Predictions for the test set.
# All (premise, hypothesis) pairs are scored in a single predict() call so the
# model can batch them, instead of running one call per pair.
label_mapping = ['contradiction', 'entailment', 'neutral']
pairs = list(zip(premise, hypothesis))
if pairs:
    preds = model.predict(pairs)
    # One predicted label name per pair: the label of the best-scoring column.
    listt = [label_mapping[score_max] for score_max in preds.argmax(axis=1)]
else:
    # Nothing to score; keep the same "list of label names" result type.
    listt = []

In [81]:
# Encode gold labels and predictions with ONE shared label -> integer mapping.
# Fitting the encoder separately on each list would assign different integers
# (and different one-hot widths) to the same label whenever the two lists do
# not contain exactly the same set of classes.
gold_labels = df_test["gold_label"].values
le = LabelEncoder()
le.fit(list(gold_labels) + list(listt))
n_classes = len(le.classes_)

# One-hot matrices with the same column order and width for both label lists.
Y_test = np_utils.to_categorical(le.transform(gold_labels), num_classes=n_classes).astype("int64")
Y_predi = np_utils.to_categorical(le.transform(listt), num_classes=n_classes).astype("int64")

In [82]:
# Accuracy in percent: share of rows whose predicted class index (argmax of the
# one-hot row) equals the gold class index. Requires numpy imported as `np`.
is_correct = np.argmax(Y_predi, axis=1) == np.argmax(Y_test, axis=1)
test_acc = is_correct.mean() * 100
print("Accuracy on test set is: %"+str(test_acc))

Accuracy on test set is: %90.5
