# Semantic Search Engine

If you are using Google Colab, make sure you change _Runtime_ -> _Change runtime type_ -> _Hardware Accelerator: GPU_ and restart the runtime.

In [None]:
# PREREQUISITS

# uncomment below to install required modules if needed
# !pip install transformers
# !pip install torch
# !pip install faiss-gpu
# !pip install datasets
# %load_ext autoreload

# import modules
import numpy as np
import pandas as pd
import faiss
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel

print("--all modules imported--")

--all modules imported--


In [None]:
# LOAD DATASET

# I am using the "ecommerce items" dataset from Kaggle
# you can download it here: https://www.kaggle.com/cclark/product-item-data?select=sample-data.csv

from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Code/Repositories/transformer_language_models/semanticSearch/sample-data.csv'
df = pd.read_csv(path)
dataset = Dataset.from_pandas(df)
dataset

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Dataset({
    features: ['id', 'description'],
    num_rows: 500
})

In [None]:
# TEXT EMBEDDINGS

# load pre-trained model and tokenizer from huggingface model hub
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

# place model on GPU for faster embedding
device = torch.device("cuda")
model.to(device)


# helper functions 
# apply this function to represent each search query as a single vector
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

# apply this function to tokenize your the documents in your dataset
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

# test the function using the first entry
# result should be a 768d vector
embedding = get_embeddings(dataset["description"][0])
print("Embedding shape: ", embedding.shape)

# finally, create embeddings on the whole corpus and add it to the dataset
embeddings_dataset = dataset.map(
    lambda x: {"embeddings": get_embeddings(x["description"]).detach().cpu().numpy()[0]}
)

Embedding shape:  torch.Size([1, 768])


  0%|          | 0/500 [00:00<?, ?ex/s]

In [None]:
# COMPUTE SIMILIARITY METRIC

# use faiss index for similiarity search on embedding column
embeddings_dataset.add_faiss_index(column="embeddings")
embeddings_dataset

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['id', 'description', 'embeddings'],
    num_rows: 500
})

In [None]:
# ADD SEARCH DETAILS

# finally. add your search term below. 
search_term = "Shorts from H&M" # adding an example to search for
results_k = 5 # change to how many results you'd like to return

In [12]:
# COMPUTE SEARCH RESULTS

# embedding the search query text. Same as we did with the dataset before
search_term_embedding = get_embeddings([search_term]).cpu().detach().numpy()

# compute k nearest documents
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", search_term_embedding, k=results_k
)

# store results in pandas dataframe
query_results = pd.DataFrame.from_dict(samples)
query_results["scores"] = scores
query_results.sort_values("scores", ascending=False, inplace=True)

# print results
print("Search term: ", search_term, "\n")
for _, row in query_results.iterrows():
    print(f"DESCRIPTION: {row.description}")
    print(f"SCORE: {row.scores}")
    print("=" * 50)
    print()

Search term:  Shorts from H&M 

DESCRIPTION: Baggies shorts - Even Baggies, our most popular shorts for anything, or nothing, occasionally need an update. This season we've increased the inseam length. Their casual fit, quick-drying water-repellent nylon and elasticized waistband with an internal drawstring remain the same as ever. Other features include a polyester mesh lining, a rear snap pocket and front pockets (with self-draining mesh corners) positioned to reduce drag in the water. Inseam (size M) is 7". Recyclable through the Common Threads Recycling Program.<br><br><b>Details:</b><ul> <li>Quick-drying nylon with a DWR (durable water repellent) finish</li> <li>Elasticized waistband with internal drawstring; black mesh liner</li> <li>Vertical on-seam side pockets for reduced drag in the water; pocket bags have quick-drain-and-dry mesh corners; snap-closed back pocket</li> <li>"Inseam is 7"""</li></ul><br><br><b>Fabric: </b>4.2-oz 100% nylon with a DWR finish. Lining: 5.2-oz 100% 