In [2]:
from datasets import load_dataset

dataset = load_dataset("quora", split="train[240000:250000]")
dataset

Dataset({
    features: ['questions', 'is_duplicate'],
    num_rows: 10000
})

In [3]:
questions = []

for record in dataset['questions']:
    questions.extend(record['text'])

questions = list(set(questions))
print('\n'.join(questions[:5]))
print(len(questions))

What is the difference with ground and without ground?
Why do some people on QUORA ask questions that they can easily findout on Google?
I want to buy a long fur German shephard can u suggest me where to buy I live in Srinagar ..9018328789 anyone can call me?
What is the difference between striploin and sirloin steak?
What is the most beautiful classical music?
19299


In [34]:
from sentence_transformers import SentenceTransformer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device != "cuda":
    print("WARNING: No CUDA device found. Using CPU.")

model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
model



SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [5]:
query = "What is the best way to learn programming?"
query_embedding = model.encode(query)
query_embedding.shape
query_embedding

array([ 3.77160013e-02,  1.54312719e-02, -5.12053333e-02,  2.37614606e-02,
       -9.23588127e-02, -3.74537185e-02,  7.15376586e-02,  6.00062609e-02,
       -5.67687452e-02,  5.09975627e-02, -5.91733269e-02,  7.15176910e-02,
        4.94287647e-02,  3.07181366e-02,  5.34070982e-03,  3.29329781e-02,
       -8.97877589e-02,  8.38545784e-02,  8.23937580e-02, -5.11909090e-02,
       -5.47390468e-02, -7.05551207e-02, -2.13751812e-02, -5.56690581e-02,
        8.15913454e-02,  7.65197426e-02, -1.22078126e-02, -8.89020215e-04,
        4.43999059e-02,  2.63738981e-03, -4.25614715e-02,  1.56520084e-02,
       -6.98529650e-03,  2.64594108e-02, -4.10793647e-02,  4.98036966e-02,
        4.65675108e-02, -6.08673543e-02,  1.02313850e-02,  1.55912358e-02,
       -1.03648715e-01,  9.29048434e-02, -2.61920672e-02, -7.99211413e-02,
        6.46724775e-02,  1.13762310e-02, -9.74962953e-03, -1.46046430e-02,
        6.61429241e-02, -3.15685378e-04, -9.06648561e-02, -1.70479398e-02,
       -3.48122232e-02, -

In [6]:
_id = 0

metadata = {
    'text': query,
}

vector = [(_id, query_embedding, metadata)]

In [9]:
import os
from pinecone import Pinecone, ServerlessSpec

api_key = os.getenv("PINECONE_API_KEY")

pc = Pinecone(api_key=api_key)

pc

<pinecone.control.pinecone.Pinecone at 0x14e536350>

In [14]:
index_name  = "semantic-search"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name, 
        vector_type="dense",
        dimension=model.get_sentence_embedding_dimension(),
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

index = pc.Index(index_name)


In [16]:
from tqdm.auto import tqdm

batch_size = 128

for i in tqdm(range(0, len(questions), batch_size)):
    
    i_end = min(i + batch_size, len(questions))

    ids = [str(x) for x in range(i, i_end)]
    
    metadata = [{'text': q} for q in questions[i:i_end]]

    embeddings = model.encode(questions[i:i_end])

    records = zip(ids, embeddings, metadata)

    index.upsert(vectors=records)

index.describe_index_stats()

100%|██████████| 151/151 [02:11<00:00,  1.15it/s]


{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 19840}},
 'total_vector_count': 19840,
 'vector_type': 'dense'}

In [None]:
query = "How to install mac os on windows?"

query_embedding = model.encode(query).tolist()

answer = index.query_namespaces(vector=query_embedding, 
                                namespaces=['(Default)'],
                                metric="cosine",
                                top_k=5, 
                                include_metadata=True
                                )

answer

{
    "usage": {
        "read_units": 1
    },
    "matches": []
}