In [5]:
import os
from io import StringIO

import faiss
import numpy as np
import pandas as pd
import requests

In [15]:
# Download the SICK 2014 training split as raw tab-separated text.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() turns an HTTP error into an exception instead of
# letting an error page be parsed as data further down.
res = requests.get(
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/sick2014/SICK_train.txt',
    timeout=30,
)
res.raise_for_status()

text = res.text
# Peek at the first 100 characters to check the header row and the delimiter.
text[:100]

'pair_ID\tsentence_A\tsentence_B\trelatedness_score\tentailment_judgment\n1\tA group of kids is playing in '

In [16]:
# Parse the downloaded tab-separated text into a DataFrame through an
# in-memory text buffer, then display the first five rows.
data = pd.read_csv(
    filepath_or_buffer=StringIO(text),
    sep='\t',
)
data.head(5)

Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,entailment_judgment
0,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,4.5,NEUTRAL
1,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,3.2,NEUTRAL
2,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,4.7,ENTAILMENT
3,5,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,3.4,NEUTRAL
4,9,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...,3.7,NEUTRAL


In [17]:
# Start the sentence pool from the 'sentence_A' column and preview five entries.
sentences = data.loc[:, 'sentence_A'].tolist()
sentences[0:5]

['A group of kids is playing in a yard and an old man is standing in the background',
 'A group of children is playing in the house and there is no man standing in the background',
 'The young boys are playing outdoors and the man is smiling nearby',
 'The kids are playing outdoors near a man with a smile',
 'The young boys are playing outdoors and the man is smiling nearby']

In [18]:
# Append the 'sentence_B' column to the pool (in place), then count how many
# distinct values the pool holds.
sentence_b = data.loc[:, 'sentence_B'].tolist()
sentences += sentence_b
len(set(sentences))

4802

In [19]:
# Additional STS files to pull sentences from. Each one is read as a
# header-less tab-separated table; columns 1 and 2 are added to the pool.
urls = [
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.train.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2013/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/images.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2015/images.test.tsv'
]

for url in urls:
    try:
        # Timeout + status check: a stalled or failed download is reported
        # below instead of hanging or being parsed as if it were data.
        res = requests.get(url, timeout=30)
        res.raise_for_status()
        # extract to dataframe
        data = pd.read_csv(StringIO(res.text), sep='\t', header=None, on_bad_lines='skip')
        # Read both columns before touching `sentences`, so a missing column
        # cannot leave the pool half-updated for this file.
        first_column = data[1].tolist()
        second_column = data[2].tolist()
        sentences.extend(first_column)
        sentences.extend(second_column)
    except (
        requests.RequestException,
        pd.errors.ParserError,
        pd.errors.EmptyDataError,
        KeyError,
    ) as err:
        # Stay best-effort (skip this file, keep going) but only for the
        # expected failure modes, and show why the file was skipped.
        print(f"Error for url: {url} ({err!r})")

In [20]:
# Count the distinct values currently held in the sentence pool.
len(set(sentences))

14505

In [21]:
# remove duplicates and NaN
sentences = [
    sentence.replace('\n', '') for sentence in list(set(sentences)) if type(sentence) is str
    ]

In [22]:
# Persist the cleaned sentences, newline-separated. The encoding is pinned to
# UTF-8 so the file content does not depend on the platform's default locale
# encoding (which can fail on non-ASCII text).
with open('sentences.txt', 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(sentences))

In [23]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model by its model name.
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Encode the whole sentence list in one call, then display the shape of the
# resulting array.
sentence_embeddings = model.encode(sentences)
sentence_embeddings.shape

(14504, 768)

In [25]:
# saving data
split = 256
file_count = 0
for i in range(0, sentence_embeddings.shape[0], split):
    end = i + split
    if end > sentence_embeddings.shape[0] + 1:
        end = sentence_embeddings.shape[0] + 1
    file_count = '0' + str(file_count) if file_count < 0 else str(file_count)
    with open(f'./sim_sentences/embeddings_{file_count}.npy', 'wb') as fp:
        np.save(fp, sentence_embeddings[i:end, :])
    print(f"embeddings_{file_count}.npy | {i} -> {end}")
    file_count = int(file_count) + 1

embeddings_0.npy | 0 -> 256
embeddings_1.npy | 256 -> 512
embeddings_2.npy | 512 -> 768
embeddings_3.npy | 768 -> 1024
embeddings_4.npy | 1024 -> 1280
embeddings_5.npy | 1280 -> 1536
embeddings_6.npy | 1536 -> 1792
embeddings_7.npy | 1792 -> 2048
embeddings_8.npy | 2048 -> 2304
embeddings_9.npy | 2304 -> 2560
embeddings_10.npy | 2560 -> 2816
embeddings_11.npy | 2816 -> 3072
embeddings_12.npy | 3072 -> 3328
embeddings_13.npy | 3328 -> 3584
embeddings_14.npy | 3584 -> 3840
embeddings_15.npy | 3840 -> 4096
embeddings_16.npy | 4096 -> 4352
embeddings_17.npy | 4352 -> 4608
embeddings_18.npy | 4608 -> 4864
embeddings_19.npy | 4864 -> 5120
embeddings_20.npy | 5120 -> 5376
embeddings_21.npy | 5376 -> 5632
embeddings_22.npy | 5632 -> 5888
embeddings_23.npy | 5888 -> 6144
embeddings_24.npy | 6144 -> 6400
embeddings_25.npy | 6400 -> 6656
embeddings_26.npy | 6656 -> 6912
embeddings_27.npy | 6912 -> 7168
embeddings_28.npy | 7168 -> 7424
embeddings_29.npy | 7424 -> 7680
embeddings_30.npy | 7680 -> 7

## Flat L2 Index
We initialize the flat L2 distance index `IndexFlatL2`; all we need to do is specify the vector dimensionality, which in this case is d == 768 (to align with the sentence-BERT model output embeddings of size 768).

In [27]:
# Vector dimensionality, read from the second axis of the embedding array.
d = sentence_embeddings.shape[1]
# Flat L2-distance index sized for d-dimensional vectors.
index = faiss.IndexFlatL2(d)

In [30]:
# Print the index's training flag and the number of vectors it currently holds.
print(index.is_trained, index.ntotal)

True 0


In [31]:
# Add every sentence embedding to the index, then show the stored-vector count.
index.add(sentence_embeddings)
index.ntotal

14504

In [32]:
# Embed the query sentence with the same model that encoded the corpus.
xq = model.encode(["Someone sprints with a football"])

In [33]:
%%time
# Number of results requested from the index for the query.
k = 4
# search() returns two arrays; I holds the ids that are used afterwards to look
# up entries in `sentences` (D presumably holds the matching distances —
# confirm against the faiss docs).
D, I = index.search(xq, k=k)
I

CPU times: user 12.9 ms, sys: 2.94 ms, total: 15.8 ms
Wall time: 10.1 ms


array([[4266, 9982, 5512, 6376]])

In [34]:
# Show each returned id from the first result row next to the sentence stored
# at that list position.
[f'{i}: {sentences[i]}' for i in I[0]]

['4266: A group of football players is running in the field',
 '9982: A group of people playing football is running in the field',
 '5512: Two groups of people are playing football',
 '6376: A person playing football is running past an official carrying a football']

In [40]:
# nlist is passed as the third argument of the IVF index below
# (presumably the number of partitions/cells — confirm against the faiss docs).
nlist = 50
# A flat L2 index is handed to the IVF index as its quantizer.
quantizer = faiss.IndexFlatL2(d)
# NOTE: this rebinds `index`, replacing the flat index built earlier.
index = faiss.IndexIVFFlat(quantizer, d, nlist)

In [41]:
# Inspect the training flag of the freshly constructed IVF index.
index.is_trained

False

In [42]:
# Train the index on the sentence embeddings.
index.train(sentence_embeddings)

In [43]:
# Re-check the training flag now that train() has been called.
index.is_trained

True

In [44]:
# Number of vectors stored in the index at this point (add() has not been called yet).
index.ntotal

0

In [45]:
# Add every sentence embedding to the trained index, then show the stored-vector count.
index.add(sentence_embeddings)
index.ntotal

14504

In [46]:
%%time
# Run the same query (xq) against the IVF index, again asking for k results.
k = 4
# I holds the returned ids used afterwards to look up entries in `sentences`.
D, I = index.search(xq, k=k)
I

CPU times: user 1.65 ms, sys: 0 ns, total: 1.65 ms
Wall time: 994 µs


array([[9982, 5512, 6376,  185]])

In [47]:
# Show each returned id from the first result row next to the sentence stored
# at that list position.
[f'{i}: {sentences[i]}' for i in I[0]]

['9982: A group of people playing football is running in the field',
 '5512: Two groups of people are playing football',
 '6376: A person playing football is running past an official carrying a football',
 '185: A football player kicks the ball.']

In [48]:
# Extra arguments for the product-quantized IVF index.
# NOTE(review): m is presumably the number of sub-vectors each embedding is
# split into, and bits the code size per sub-vector — confirm against the
# faiss IndexIVFPQ documentation.
m = 8
bits = 8
# A flat L2 index is handed to the new index as its quantizer.
quantizer = faiss.IndexFlatL2(d)
# Reuses d and nlist defined in earlier cells; rebinds `index` once more.
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, bits)

In [49]:
# Inspect the training flag of the freshly constructed IVFPQ index.
index.is_trained

False

In [50]:
# Train the index on the sentence embeddings.
index.train(sentence_embeddings)

In [51]:
# Add every sentence embedding to the trained index.
index.add(sentence_embeddings)

In [52]:
%%time
# Run the same query (xq) against the IVFPQ index, again asking for k results.
k = 4
# I holds the returned ids used afterwards to look up entries in `sentences`.
D, I = index.search(xq, k=k)
I

CPU times: user 102 µs, sys: 944 µs, total: 1.05 ms
Wall time: 655 µs


array([[6341, 6376, 9982,  185]])

In [53]:
# Show each returned id from the first result row next to the sentence stored
# at that list position.
[f'{i}: {sentences[i]}' for i in I[0]]



['6341: A football player is running past an official carrying a football',
 '6376: A person playing football is running past an official carrying a football',
 '9982: A group of people playing football is running in the field',
 '185: A football player kicks the ball.']