# Building a search engine
Use the information of the dataset that you created to build a search engine that can help to find answers of the  most pressing questions about the FAISS library [Semantic Search with FAISS](https://huggingface.co/learn/nlp-course/chapter5/6?fw=pt) !

In [34]:
!pip install sentence_transformers



In [35]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sample dataset
data = {
    'text': [
        "The quick brown jumps over the lazy dog.",
        "A day in the park is wonderful.",
        "She sells sea by the seashore."
    ]
}

# Load a pre-trained model for generating embeddings
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Generate embeddings for each text entry
embeddings = model.encode(data['text'])

# Convert embeddings to a NumPy array
embeddings = np.array(embeddings)

# Display embeddings shape
print(embeddings.shape)  # Output should be (number_of_texts, embedding_dimension)


(3, 384)


In [36]:
!pip install faiss-cpu



In [37]:
import faiss

# Dimension of the embeddings
embedding_dimension = embeddings.shape[1]

# Initialize a FAISS index
index = faiss.IndexFlatL2(embedding_dimension)

# Add embeddings to the index
index.add(embeddings)

# Check the number of indexed vectors
print("Total vectors in index:", index.ntotal)  # Output should match the number of texts


Total vectors in index: 3


In [38]:
# Sample query
query = "It's a beautiful day to be outside."

# Encode the query to obtain its embedding
query_embedding = model.encode([query])

# Perform the search in the FAISS index
k = 2  # Number of top results to return
distances, indices = index.search(query_embedding, k)

# Display results
print("Top results:")
for i, idx in enumerate(indices[0]):
    print(f"Text: {data['text'][idx]}")
    print(f"Distance: {distances[0][i]}")


Top results:
Text: A day in the park is wonderful.
Distance: 0.6938740015029907
Text: She sells sea by the seashore.
Distance: 1.8053944110870361


In [39]:
def search(query, model, index, data, top_k=3):
    # Encode the query
    query_embedding = model.encode([query])

    # Perform the search
    distances, indices = index.search(query_embedding, top_k)

    # Collect results
    results = []
    for i, idx in enumerate(indices[0]):
        results.append({
            'text': data['text'][idx],
            'distance': distances[0][i]
        })

    return results

# Example usage
query = "A relaxing time by the sea."
results = search(query, model, index, data, top_k=3)

# Display search results
for result in results:
    print(f"Text: {result['text']}")
    print(f"Distance: {result['distance']}\n")


Text: She sells sea by the seashore.
Distance: 1.17189621925354

Text: A day in the park is wonderful.
Distance: 1.274449110031128

Text: The quick brown jumps over the lazy dog.
Distance: 1.5749938488006592



Creating text embeddings


In [40]:
from transformers import AutoTokenizer, AutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [41]:
import torch

device = torch.device("cuda")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [42]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [43]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [44]:
embedding = get_embeddings(data['text'][0])
embedding.shape

torch.Size([1, 768])

In [45]:
!pip install datasets
from datasets import Dataset



In [46]:
# Assuming 'data' is your dictionary
dataset = Dataset.from_dict(data)

embeddings_dataset = dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [47]:
print(embeddings_dataset)

Dataset({
    features: ['text', 'embeddings'],
    num_rows: 3
})


In [48]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'embeddings'],
    num_rows: 3
})

In [49]:
question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [50]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [51]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [52]:
for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.text}")
    print(f"SCORE: {row.scores}")
    print(f"TITLE: {row.embeddings}")
    print("=" * 50)
    print()

COMMENT: The quick brown jumps over the lazy dog.
SCORE: 72.14004516601562
TITLE: [-0.07283102720975876, -0.6770839691162109, -0.35731345415115356, -0.29229459166526794, 0.17197395861148834, 0.012172823771834373, 0.1538657695055008, 0.17560841143131256, -0.3977195918560028, 0.03237605094909668, 0.20412681996822357, 0.4029882550239563, -0.18322621285915375, -0.14170695841312408, 0.16373388469219208, -0.008798811584711075, 0.5085492730140686, 0.06594230979681015, 0.1743410974740982, -0.20824234187602997, -0.16542866826057434, -0.12899313867092133, -0.08397825807332993, 0.10811526328325272, -0.09620143473148346, 0.29099541902542114, 0.013769888319075108, 0.1105225682258606, 0.05719054862856865, -0.11325012892484665, 0.10608988255262375, 0.19963178038597107, -0.24760885536670685, 0.2864415943622589, -0.00010167317668674514, -0.1952168494462967, 0.06855903565883636, 0.3118581473827362, 0.4992114305496216, -0.3182898461818695, -0.21471835672855377, -0.19841268658638, -0.22566843032836914, -0

Creating text embeddings



In [53]:
!pip install sentence-transformers




In [54]:
from sentence_transformers import SentenceTransformer

# Initialize the model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Sample text data
data = {
    'text': [
        "The quick brown jumps over the lazy dog.",
        "A day in the park is wonderful.",
        "She sells sea by the seashore."
    ]
}

# Create embeddings for each text
embeddings = model.encode(data['text'])

# Display the shape of the embeddings
print("Embeddings shape:", embeddings.shape)


Embeddings shape: (3, 384)


In [55]:
!pip install tensorflow tensorflow_hub



In [56]:
import tensorflow_hub as hub

# Load the Universal Sentence Encoder model
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(model_url)

# Sample text data
texts = [
    "The quick brown [MASK] jumps over the lazy dog.",
    "A [MASK] day in the park is wonderful.",
    "She sells sea [MASK] by the seashore."
]


# Create embeddings
use_embeddings = model(texts)

# Display the shape of the embeddings
print("USE Embeddings shape:", use_embeddings.shape)


USE Embeddings shape: (3, 512)
