In [1]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from DLAIUtils import Utils
import DLAIUtils

import os
import time 
import torch

from tqdm.auto import tqdm

In [2]:
# Load the [240000:290000] slice of the "quora" dataset's train split.
dataset = load_dataset("quora", split="train[240000:290000]")

In [3]:
# Preview the first five records to see how the dataset is structured.
dataset[:5]

{'questions': [{'id': [207550, 351729],
   'text': ['What is the truth of life?', "What's the evil truth of life?"]},
  {'id': [33183, 351730],
   'text': ['Which is the best smartphone under 20K in India?',
    'Which is the best smartphone with in 20k in India?']},
  {'id': [351731, 351732],
   'text': ['Steps taken by Canadian government to improve literacy rate?',
    'Can I send homemade herbal hair oil from India to US via postal or private courier services?']},
  {'id': [37799, 94186],
   'text': ['What is a good way to lose 30 pounds in 2 months?',
    'What can I do to lose 30 pounds in 2 months?']},
  {'id': [351733, 351734],
   'text': ['Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?',
    'How do you graph x + 2y = -2?']}],
 'is_duplicate': [False, True, False, True, False]}

In [4]:
# Flatten every record's 'text' entries into one list of question strings.
questions = [text for record in dataset['questions'] for text in record['text']]

# De-duplicate while keeping first-seen order. `list(set(...))` produces an
# order that changes between kernel restarts (string hashing is randomized
# per process), so any later slice of `question` would pick a different
# subset on every run. `dict.fromkeys` removes duplicates deterministically.
question = list(dict.fromkeys(questions))

# The preview and the count below describe the raw, pre-deduplication list.
print("\n".join(questions[:10]))
print("-" * 50)
print(f"Number of questions: {len(questions)}")


What is the truth of life?
What's the evil truth of life?
Which is the best smartphone under 20K in India?
Which is the best smartphone with in 20k in India?
Steps taken by Canadian government to improve literacy rate?
Can I send homemade herbal hair oil from India to US via postal or private courier services?
What is a good way to lose 30 pounds in 2 months?
What can I do to lose 30 pounds in 2 months?
Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?
How do you graph x + 2y = -2?
--------------------------------------------------
Number of questions: 100000


In [5]:
# Use the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cpu


In [6]:
# Load the "all-MiniLM-L6-v2" sentence-embedding model onto the selected device.
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

In [7]:
# Encode one sample query to check what the model returns.
query = "which city is the most populated in the world?"
xq = model.encode(query)
# Display the shape of the resulting embedding.
xq.shape

(384,)

In [9]:
from dotenv import find_dotenv, load_dotenv
import hashlib
import openai

# Load environment variables from the nearest .env file, if one is found.
_ = load_dotenv(find_dotenv())

pinecone = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
openai.api_key = os.environ['OPENAI_API_KEY']

# Derive a stable, per-user index name from the OpenAI key WITHOUT exposing
# the key: an index name is not a secret (it appears in consoles, logs and
# printed output), so a SHA-256 digest prefix is used instead of raw key
# characters. Hex digits are lowercase alphanumerics, so no further
# sanitizing is needed.
# NOTE: an index created earlier under a name built from raw key characters
# is not touched by this cell and has to be deleted manually.
key_digest = hashlib.sha256(openai.api_key.encode("utf-8")).hexdigest()
INDEX_NAME = f'dl-ai-{key_digest[:36]}'

# Start from a clean slate: drop any existing index with this name.
if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
    pinecone.delete_index(INDEX_NAME)

# The index dimension must match the embedding size produced by the model.
pinecone.create_index(name=INDEX_NAME, 
    dimension=model.get_sentence_embedding_dimension(), 
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-west-2'))

index = pinecone.Index(INDEX_NAME)
print(index)

<pinecone.data.index.Index object at 0x000002828EA97590>


In [10]:
import json

# Upload settings: how many vectors go into each upsert call, and the cap on
# how many questions are indexed in total.
batch_size = 200
vector_limit = 10000

# Keep only the first `vector_limit` entries of the de-duplicated list.
questions = question[:vector_limit]

for start in tqdm(range(0, len(questions), batch_size)):
    stop = min(start + batch_size, len(questions))
    batch = questions[start:stop]
    # Vector IDs are the questions' positions in `questions`, as strings.
    ids = [str(position) for position in range(start, stop)]
    # Keep the raw text as metadata so matches can be displayed as text.
    metadatas = [{"text": text} for text in batch]
    xc = model.encode(batch)
    # Each record sent to the index is an (id, embedding, metadata) tuple.
    index.upsert(vectors=zip(ids, xc, metadatas))

  0%|          | 0/50 [00:00<?, ?it/s]

In [11]:
# Display the index statistics (dimension and vector counts).
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}

In [13]:
def run_query(query, top_k=10):
    """Print the indexed questions most similar to ``query``.

    Encodes ``query`` with the notebook-level ``model``, searches the
    notebook-level ``index``, and prints one ``score: text`` line per match.
    Nothing is returned, so a cell ending in a call to this function shows
    only the printed lines.

    Args:
        query: Text to search for.
        top_k: Number of matches to request from the index. Defaults to 10.
    """
    # Convert the encoded query to a plain Python list before sending it.
    embedding = model.encode(query).tolist()
    results = index.query(
        vector=embedding,
        top_k=top_k,
        include_metadata=True,
        include_values=False,
    )
    for match in results['matches']:
        print(f"{round(match['score'], 2)}: {match['metadata']['text']}")

In [14]:
# Example search: print the indexed questions closest to a question about city population.
run_query('which city has the highest population in the world?')

0.67: Which city in the world is the most beautiful to live in?
0.67: Which is the most urbanised city in India?
0.61: Which city deserves to be the Capital of the World?
0.52: How many cities are there in China?
0.51: What do you think is the greatest country in the world?
0.51: What do you think are the top 3 countries to live in?
0.5: Which country has the most attractive girls?
0.49: What's the most religious city in California?
0.49: What were the largest cities in the Roman Empire?
0.47: How many countries are there in the world?


In [15]:
# Example search: print the indexed questions closest to a baking question.
query = 'how do i make chocolate cake?'
run_query(query)

0.61: What is a cake mix?
0.57: How do you make candles?
0.55: What is the difference between chocolate and truffles and how are they made?
0.55: How do I bake a cake in a microwave oven?
0.49: How do I make art?
0.49: Are You Looking For Tasty Chocolates in Bangalore?
0.45: How do you make cotton candy flavoring? How is cotton candy made?
0.43: Where can I get best flavors, designs and decorations for cupcakes at Gold Coast?
0.43: Where can I get highest quality, tastiest cupcakes across the Gold Coast?
0.43: What is the recipe for bruchetta?
