<a href="https://colab.research.google.com/github/2003Yash/semantic-search/blob/main/semantic_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence-transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

Import Libraries

In [None]:
# Sentence transformers take a whole sentence and convert it into a single context-aware embedding.
# Flowchart: sentence -> BERT -> pooling -> embedding
# Other ways to build a dense vector for a sentence include unigram or n-gram features, or averaging all the word vectors in the sentence.
# Sentence transformers are superior to the methods above because they capture bidirectional context.

In [2]:
from sentence_transformers import SentenceTransformer
import faiss # faiss is a high-performance library for similarity search, used here for fast retrieval.
import numpy as np

In [3]:
# Instantiate the pre-trained sentence encoder that produces every embedding
# used in this notebook (headlines and query alike).
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Input-data:

In [6]:
# Search corpus: ten short world-news headlines, a mix of positive and
# negative stories. Their list positions are what the search later returns.
news_headlines = [
    "Global economy recovers as inflation slows down",
    "New species of butterfly discovered in the Amazon",
    "Scientists develop breakthrough cancer treatment",
    "Wildfires continue to spread in Australia",
    "Stock markets crash amid geopolitical tensions",
    "Countries sign historic climate change agreement",
    "Major earthquake causes damage in Japan",
    "AI technology helps doctors diagnose diseases faster",
    "New educational reforms promise better schooling system",
    "Conflict in Middle East escalates further",
]

# Free-text description of what "good news" means. It is kept as a
# one-element list so it is encoded as a batch containing a single sentence.
query = [
    "Positive global developments, "
    "scientific breakthroughs, "
    "environmental success stories"
]

Embedding input data:

In [7]:
# Embed the headlines first and the query second, using identical settings
# (normalize_embeddings=True) so both sets of vectors are directly comparable.
news_embeddings, query_embedding = (
    model.encode(texts, normalize_embeddings=True)
    for texts in (news_headlines, query)
)

 Build a FAISS Index for Similarity Search

In [8]:
# Create a FAISS index for similarity search
# - Faiss is a library, developed by Facebook AI, that enables efficient similarity search.
#   Given a set of vectors, we can index them using Faiss;
#   then, using another vector (the query vector), we search for the most similar vectors within the index.

In [None]:
# news_embeddings holds one row per headline and one column per embedding
# feature (e.g. 10 headlines with 384-dimensional embeddings give a 10 x 384
# array), so shape[1] is the number of features per vector. FAISS needs this
# to size the index.
dimension = news_embeddings.shape[1]
# IndexFlatIP ranks stored vectors by inner (dot) product with the query.
# The embeddings were produced with normalize_embeddings=True, so they are
# presumably unit-length, in which case the inner product equals cosine similarity.
index = faiss.IndexFlatIP(dimension)

In [None]:
# Index.add appends to whatever the index already holds, so re-running this
# cell would store every headline twice and make the search return duplicate
# hits. Clearing the index first keeps the cell idempotent (reset is a no-op
# on a freshly created index).
index.reset()
index.add(news_embeddings)  # add all the encoded news headlines to the FAISS search index

Semantic Search

In [9]:
# Number of nearest neighbours to retrieve: the top 3 most relevant headlines.
k = 3
# index.search(query_embedding, k) returns two arrays:
#   D: similarity scores (higher is better)
#   I: indices of the top k most relevant headlines
# The print cell below reads I[0], i.e. the result row for the first query vector.
D, I = index.search(query_embedding, k)

Print Output

In [10]:
# Print the headlines the search selected for the "good news" query,
# in the order the index returned them.
print("Top Good News Articles:")
for idx in I[0]:
    # FAISS fills result slots with -1 when fewer than k vectors are indexed
    # (e.g. if the index.add cell was skipped). A negative index would
    # silently wrap around to the end of the Python list and print an
    # unrelated headline, so such slots are skipped.
    if idx < 0:
        continue
    print(f"- {news_headlines[idx]}")

Top Good News Articles:
- Scientists develop breakthrough cancer treatment
- Countries sign historic climate change agreement
- Global economy recovers as inflation slows down
