<a href="https://colab.research.google.com/github/Adnya-01/AI-projects/blob/main/semantic_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Semantic Search**
In this notebook, we'll walk through how semantic search can be used to find the sentences most relevant to a query in a multilingual translation dataset.

Semantic search refers to a retrieval method in which related search results are retrieved based on the context or the intent of the query, rather than just using keywords (as in lexical search).
It can be used in applications where traditional lexical search is insufficient and the intent of the user's input is important, as well as in multimodal and multilingual applications.

Install required libraries

In [None]:
!uv pip install -qU \
  pinecone~=7.3.0 \
  pinecone-notebooks==0.1.1 \
  numpy==2.0.2 \
  datasets==3.5.1

Authenticate your Pinecone account and generate an API key

In [None]:
from pinecone_notebooks.colab import Authenticate

# Run the Colab authentication helper.
# NOTE(review): presumably this stores the generated key in the PINECONE_API_KEY
# environment variable that the client-setup cell reads -- confirm against the
# pinecone-notebooks documentation.
Authenticate()

Fetch your API key and initialize a Pinecone client which will be used to perform searches.

In [None]:
import os

from pinecone import Pinecone

# Read the key from the environment instead of hardcoding it in the notebook.
api_key = os.environ.get("PINECONE_API_KEY")

# Initialize the client that the later cells use for index operations.
pc = Pinecone(api_key=api_key)

Create (if needed), connect to, and inspect a Pinecone semantic search index.

In [None]:

index_name = "semantic-search"

# Embedding setup for the index: the model name plus a field map that points
# the model's "text" input at each record's "chunk_text" field.
embed_config = {
    "model": "llama-text-embed-v2",
    "field_map": {"text": "chunk_text"},
}

# Create the index only when it does not exist yet, so re-running this cell
# reuses the existing index.
if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        embed=embed_config,
    )

# Handle used by the upsert and search cells below.
index = pc.Index(name=index_name)

# Kept as the cell's final expression so the notebook displays the stats.
index.describe_index_stats()

Load English-Spanish sentence pairs from the Tatoeba dataset, which contains thousands of translated sentence pairs.

In [None]:
from datasets import load_dataset
# Request the English-Spanish pairs (lang1="en", lang2="es") from the "train" split.
# NOTE(review): trust_remote_code=True presumably lets the dataset repository's own
# loading script execute in this runtime -- confirm the source is trusted before running.
tatoeba = load_dataset("Helsinki-NLP/tatoeba", lang1="en", lang2="es", trust_remote_code=True, split="train")

In [None]:
# Preview the first five rows to see how each record is laid out.
tatoeba[0:5]

In [None]:
keywords = ["fan"]


def simple_keyword_filter(sentence, keywords):
    """Report whether ``sentence`` contains at least one of ``keywords``.

    Matching is a plain substring test, so a keyword also matches inside a
    longer word.

    Args:
        sentence: Text to inspect.
        keywords: Iterable of substrings to look for.

    Returns:
        True if any keyword occurs in ``sentence``, otherwise False (including
        when ``keywords`` is empty).
    """
    return any(keyword in sentence for keyword in keywords)

def transform_dataset_for_pinecone(dataset, use_filter=True):
    """Turn translation pairs into the list of record dicts that gets upserted.

    Args:
        dataset: Dataset whose rows hold a nested ``translation`` mapping with
            ``en`` and ``es`` entries. It must provide ``filter``, ``flatten``,
            ``shuffle``, ``rename_column``, ``remove_columns`` and
            ``add_column``.
        use_filter: When true, keep only rows whose English sentence passes
            ``simple_keyword_filter`` against the module-level ``keywords``.
            When false, every row is used; the full set is 200k+ rows, so
            only do this if you intend to embed that many records.

    Returns:
        A list of dicts, one per English sentence, with the keys ``id`` (the
        row's position after shuffling, as a string), ``chunk_text`` (the
        sentence that will be embedded) and ``lang`` (always ``"en"``).
    """
    if use_filter:
        # A small keyword-matched subset helps build intuition about semantic search.
        pairs = dataset.filter(
            lambda row: simple_keyword_filter(
                sentence=row["translation"]["en"], keywords=keywords
            )
        )
    else:
        pairs = dataset

    # Flatten the nested translation column, then shuffle with a fixed seed.
    pairs = pairs.flatten().shuffle(seed=1)

    # Keep only the English side, exposed under the column name "text".
    english = pairs.rename_column("translation.en", "text")
    english = english.remove_columns("translation.es")

    # Tag every row with the language its embedding will come from.
    english = english.add_column("lang", ["en"] * len(english))

    # One record per sentence: an ID, the text to embed, and a metadata field
    # that can be used for filtering.
    return [
        {"id": str(position), "chunk_text": row["text"], "lang": row["lang"]}
        for position, row in enumerate(english)
    ]


# use_filter defaults to True, so only the keyword-matched sentences become records.
records = transform_dataset_for_pinecone(tatoeba)

In [None]:
from tqdm import tqdm

batch_size = 96
namespace = "english-sentences"

# Send the records one slice of `batch_size` at a time; 96 per request was
# chosen to avoid hitting the embedding model's rate limit.
batch_starts = range(0, len(records), batch_size)
for start in tqdm(batch_starts, desc="Upserting records batch: "):
    batch = records[start:start + batch_size]
    index.upsert_records(records=batch, namespace=namespace)

In [None]:
search_query = "I am your biggest fan"

# Request the top 10 hits for the query text from the namespace upserted above.
query = {"top_k": 10, "inputs": {"text": search_query}}
results = index.search(namespace=namespace, query=query)

# Print each returned sentence next to its score.
for hit in results["result"]["hits"]:
    sentence = hit["fields"]["chunk_text"]
    score = hit["_score"]
    print(f'Sentence: {sentence} Semantic Similarity Score: {score}\n')

In [None]:
search_query = "We definately need a fan in this hot summer"

# Request the top 10 hits for the query text from the namespace upserted above.
query = {"top_k": 10, "inputs": {"text": search_query}}
results = index.search(namespace=namespace, query=query)

# Print each returned sentence next to its score.
for hit in results["result"]["hits"]:
    sentence = hit["fields"]["chunk_text"]
    score = hit["_score"]
    print(f'Sentence: {sentence} Semantic Similarity Score: {score}\n')

In [None]:
search_query = "Stop fanning yourself"

# Request the top 10 hits for the query text from the namespace upserted above.
query = {"top_k": 10, "inputs": {"text": search_query}}
results = index.search(namespace=namespace, query=query)

# Print each returned sentence next to its score.
for hit in results["result"]["hits"]:
    sentence = hit["fields"]["chunk_text"]
    score = hit["_score"]
    print(f'Sentence: {sentence} Semantic Similarity Score: {score}\n')

In [None]:
# Clean up: delete the index used in this notebook once the searches are done.
pc.delete_index(name=index_name)