# Overview

In this notebook, we use the semantic search from sentence_transformers to retrieves the most similar questions from the [Quora duplicate questions](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs) dataset. And we are going to use `distilbert-multilingual-nil-stsb-quora-ranking`, which was trained to identify similar questions and supports 50+ languages. This is a **symmetric search task**, as the search queries have the same length and content as the questions in the corpus.

In [1]:
!pip install sentence-transformers==2.3.1
!pip install datasets==2.15.0

Collecting sentence-transformers==2.3.1
  Downloading sentence_transformers-2.3.1-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-2.3.1
Collecting datasets==2.15.0
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets==2.15.0)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.15.0)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspe

In [2]:
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login
import os

user_secrets = UserSecretsClient()
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))
os.environ['MODEL_NAME']='quora-distilbert-multilingual'

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Loading and preprocess the data

You can downoad it directly from the internet or use the dataset.

In [3]:
from datasets import load_dataset

max_corpus_size=100000

ds=load_dataset('aisuko/quora_duplicate_questions')
ds

##  You can also download it from the internet.

# from sentence_transformers.util import http_get
# url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
# dataset_path = "quora_duplicate_questions.tsv"
# http_get(url,dataset_path)

# # Get all unique sentences from the file
# corpus_sentences = set()
# with open(dataset_path, encoding='utf8') as fIn:
#     reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
#     for row in reader:
#         corpus_sentences.add(row['question1'])
#         if len(corpus_sentences) >= max_corpus_size:
#             break

#         corpus_sentences.add(row['question2'])
#         if len(corpus_sentences) >= max_corpus_size:
#             break

# corpus_sentences = list(corpus_sentences)
# print("Encode the corpus. This might take a while")
# corpus_embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_tensor=True)

# ###############################
# print("Corpus loaded with {} sentences / embeddings".format(len(corpus_sentences)))

Downloading readme:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/58.2M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


DatasetDict({
    train: Dataset({
        features: ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate'],
        num_rows: 404290
    })
})

In [4]:
ds=ds['train'].remove_columns(['id','qid1','qid2','is_duplicate'])
ds

Dataset({
    features: ['question1', 'question2'],
    num_rows: 404290
})

In [5]:
# Get all unique sentences from the file
corpus_sentences = set()

num=len(ds['question1'])
num2=len(ds['question2'])

while len(corpus_sentences)<max_corpus_size:
    if num>0:
        for i in ds['question1']:
            corpus_sentences.add(i)
            num-=1
    elif num2>0:
        for i in ds['question2']:
            corpus_sentences.add(i)
            num2-=1
    break

corpus_sentences=list(corpus_sentences)
len(corpus_sentences)

290457

# Loading the model

In [6]:
from sentence_transformers import SentenceTransformer

model=SentenceTransformer(os.getenv('MODEL_NAME'))
model

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/572 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/539M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/447 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
)

# Computing the embeddings

In [7]:
from sentence_transformers.util import normalize_embeddings, dot_score

corpus_embeddings=model.encode(corpus_sentences, show_progress_bar=True,convert_to_tensor=True).to('cuda')
corpus_embeddings=normalize_embeddings(corpus_embeddings)
corpus_embeddings

Batches:   0%|          | 0/9077 [00:00<?, ?it/s]

tensor([[ 0.0062, -0.0119,  0.0053,  ..., -0.0135,  0.0120,  0.0040],
        [ 0.0069,  0.0272,  0.0277,  ..., -0.0172, -0.0045,  0.0316],
        [ 0.0186,  0.0128,  0.0379,  ..., -0.0215, -0.0149,  0.0216],
        ...,
        [ 0.0122,  0.0100,  0.0380,  ..., -0.0137,  0.0222, -0.0008],
        [ 0.0055,  0.0540,  0.0410,  ..., -0.0069,  0.0075,  0.0201],
        [-0.0109,  0.0048,  0.0187,  ..., -0.0191, -0.0301, -0.0096]],
       device='cuda:0')

In [8]:
import time
from sentence_transformers.util import semantic_search

# Function that searches the corpus and prints the results
def search(inp_question):
    start_time=time.time()
    question_embedding=model.encode(inp_question,show_progress_bar=True, convert_to_tensor=True).to('cuda')
#     question_embedding=normalize_embeddings(question_embedding)
    hits=semantic_search(question_embedding, corpus_embeddings)
    end_time=time.time()
    # only on the first element
    hits=hits[0]
    
    print('Input question',{inp_question})
    print('Results (after {:.3f} seconds):'.format(end_time-start_time))
    
    for hit in hits[0:5]:
        print("\t{:.3f}\t{}".format(hit['score'], corpus_sentences[hit['corpus_id']]))

In [9]:
search('How can I learn Python online?')

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Input question {'How can I learn Python online?'}
Results (after 0.064 seconds):
	0.988	How and from where can I learn Python language via video tutorials online?
	0.983	Which is the best resource to learn Python? Online, books, video tutes?
	0.982	What is the best online resource to learn Python?
	0.981	How do I learn Python at home?
	0.980	Where I should learn Python?


In [10]:
# German: How can I learn Python online?
search("Wie kann ich Python online lernen?")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Input question {'Wie kann ich Python online lernen?'}
Results (after 0.032 seconds):
	0.988	How and from where can I learn Python language via video tutorials online?
	0.985	Which is the best resource to learn Python? Online, books, video tutes?
	0.984	What is the best online resource to learn Python?
	0.981	How do I learn Python at home?
	0.980	Where I should learn Python?
