In [1]:
!pip install "pinecone-client[grpc]" sentence-transformers datasets



### Load [dataset](https://huggingface.co/datasets/quora)

In [2]:
from datasets import load_dataset

dataset = load_dataset("quora", split="train[0:50000]")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset

Dataset({
    features: ['questions', 'is_duplicate'],
    num_rows: 50000
})

In [4]:
dataset[:5]

{'questions': [{'id': [1, 2],
   'text': ['What is the step by step guide to invest in share market in india?',
    'What is the step by step guide to invest in share market?']},
  {'id': [3, 4],
   'text': ['What is the story of Kohinoor (Koh-i-Noor) Diamond?',
    'What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?']},
  {'id': [5, 6],
   'text': ['How can I increase the speed of my internet connection while using a VPN?',
    'How can Internet speed be increased by hacking through DNS?']},
  {'id': [7, 8],
   'text': ['Why am I mentally very lonely? How can I solve it?',
    'Find the remainder when [math]23^{24}[/math] is divided by 24,23?']},
  {'id': [9, 10],
   'text': ['Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?',
    'Which fish would survive in salt water?']}],
 'is_duplicate': [False, False, False, False, False]}

In [6]:
import pandas as pd
from IPython.display import display

pd.DataFrame(dataset[:5])

Unnamed: 0,questions,is_duplicate
0,"{'id': [1, 2], 'text': ['What is the step by s...",False
1,"{'id': [3, 4], 'text': ['What is the story of ...",False
2,"{'id': [5, 6], 'text': ['How can I increase th...",False
3,"{'id': [7, 8], 'text': ['Why am I mentally ver...",False
4,"{'id': [9, 10], 'text': ['Which one dissolve i...",False


### Preprocess dataset
Wrapping all questions in one text

In [7]:
questions = []

for question in dataset['questions']:
    questions.extend(question['text'])
    
questions = list(set(questions)) # Remove duplicates
print('\n'.join(questions[:5]))
print(len(questions))

What is a good way to make an eCommerce website?
What is the equation to convert a decimal into a fraction?5.83
Which phone to buy Moto m or one plus 3t?
If you're in a mediocre relationship and connect deeper than ever before with another, what is the better path - staying true, or pursuing the other?
I leveraged 100K to secure a loan for a startup, I don't need to touch the 100K to get the business started, what should I do with the 100K?
88884


### Model Loading 

In [8]:
from sentence_transformers import SentenceTransformer
import torch 

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SentenceTransformer('all-MiniLM-L6-v2', device = device)
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

##### Notes
1. max_seq_length: 256 the max number of tokens that can be encoded into one vector (Longer will be truncated)
2. word_embedding_dimension: 384 the dim of output vector embedding
3. Normalize 

### Creating Index

In [None]:
_id = '0'
metadata = {'text', query}

# The format of the index 
vectors = [(_id, xq, metadata)]

In [9]:
import os 
import pinecone

pinecone.init(
    api_key = '28cf584c-61e4-4185-81eb-aff482cef12c',
    environment = 'gcp-starter'
)

In [10]:
index_name = "semantic-search"

pinecone.create_index(
    name = index_name,
    dimension = model.get_sentence_embedding_dimension(),
    metric = 'cosine'
)

index = pinecone.GRPCIndex(index_name)

In [11]:
from tqdm.auto import tqdm

batch_size = 128
vector_limit = 100000

questions = questions[:vector_limit]

for i in tqdm(range(0, len(questions), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(questions))
    # create IDs batch
    ids = [str(x) for x in range(i, i_end)]
    # create metadata batch
    metadatas = [{'text': text} for text in questions[i:i_end]]
    # create embeddings
    xc = model.encode(questions[i:i_end])
    # create records list for upsert
    records = zip(ids, xc, metadatas)
    # upsert to Pinecone
    index.upsert(vectors=records)

# check number of records in the index
index.describe_index_stats()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 695/695 [20:38<00:00,  1.78s/it]


{'dimension': 384,
 'index_fullness': 0.8768,
 'namespaces': {'': {'vector_count': 87680}},
 'total_vector_count': 87680}

### Testing

In [12]:
query = "which city has the highest population in the world?"

# create the query vector
xq = model.encode(query).tolist()

# now query
xc = index.query(xq, top_k=5, include_metadata=True)
xc

{'matches': [{'id': '35907',
              'metadata': {'text': "What's the world's largest city?"},
              'score': 0.7859108,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '51947',
              'metadata': {'text': 'What is the biggest city?'},
              'score': 0.72731686,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '72666',
              'metadata': {'text': "What are the world's most advanced "
                                   'cities?'},
              'score': 0.71006703,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': '13794',
              'metadata': {'text': 'Which city in the world has the lowest '
                                   'crime rate and why?'},
              'score': 0.64893097,
              'sparse_values': {'indices': [], 'values': []},
              'va

In [13]:
for result in xc['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['text']}")

0.79: What's the world's largest city?
0.73: What is the biggest city?
0.71: What are the world's most advanced cities?
0.65: Which city in the world has the lowest crime rate and why?
0.64: Which is the largest country in the world?


In [14]:
query = "what is the capital of Egypt?"

# create the query vector
xq = model.encode(query).tolist()

# now query
xc = index.query(xq, top_k=5, include_metadata=True)
for result in xc['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['text']}")
     

0.66: What are some interesting facts about Egypt?
0.66: What is the capital of India?
0.66: What are interesting facts about Egypt?
0.64: What is it like to live in Egypt?
0.63: What is it like to live in Egypt today?


In [15]:
query = "How can I increase the speed of my internet connection?"

# create the query vector
xq = model.encode(query).tolist()

# now query
xc = index.query(xq, top_k=5, include_metadata=True)
for result in xc['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['text']}")
     

0.93: How can I speed up my Internet connection?
0.9: How can I speed up my Internet connectionn?
0.82: How do I increase download speed?
0.82: How can one speed up the Internet?
0.79: How can I increase the speed of my internet connection while using a VPN?
