# Semantic Search

Colab notebook for trying out the semantic search module.

## Install necessary packages

In [None]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m78.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.

In [None]:
from typing import List
import torch
import logging
from sentence_transformers import SentenceTransformer, CrossEncoder, util

In [None]:
class NoCrossEncoder(Exception):
    """Raised when re-ranking is requested on a model that has no cross-encoder."""

class NoDataAvailable(Exception):
    """Raised when a model is used although the data it needs (e.g. its corpus) was not provided."""

In [None]:
class SearchModel:
    """
    Semantic search over a fixed list of text passages.

    A SentenceTransformer bi-encoder embeds the corpus and every query; an optional
    CrossEncoder can re-score the retrieved passages (retrieve and re-rank).
    Embeddings must be produced with train() or restored with load() before predict().
    """

    def __init__(
        self,
        corpus_passages: List[str] = [],
        bi_encoder: str = '',
        use_cross_encoder: bool = False,
        cross_encoder: str = '',
        top_k: int = 32,
        name: str = 'semantic_search_model'
        ) -> None:
        """
        corpus_passages: A list of all the text you wish to generate embeddings for
        bi_encoder: A string representing a path to a SentenceTransformer bi-encoder model.
        An empty string selects 'multi-qa-MiniLM-L6-cos-v1'.
        use_cross_encoder: Bool to select whether a cross encoder should be added to the model.
        cross_encoder: A string representing a path to a SentenceTransformer cross-encoder model.
        An empty string selects 'cross-encoder/ms-marco-MiniLM-L-6-v2'. A cross encoder
        won't be used if use_cross_encoder is False.
        top_k: The number of passages we want to retrieve
        name: Name of your search model (used in error messages)
        """
        # Store a private copy: the default list object is shared between calls, so
        # keeping a reference to it would let one instance leak passages into another.
        self._corpus_passages = list(corpus_passages)
        self._top_k = top_k
        self._name = name

        self._bi_encoder = SentenceTransformer(bi_encoder or 'multi-qa-MiniLM-L6-cos-v1')

        if use_cross_encoder:
            self._cross_encoder = CrossEncoder(cross_encoder or 'cross-encoder/ms-marco-MiniLM-L-6-v2')
        else:
            self._cross_encoder = None

        # Filled in by train() or load().
        self._corpus_embeddings = None

    def train(self) -> None:
        """
        Use bi-encoder to generate embeddings for the entire corpus

        Raises NoDataAvailable if the corpus is empty.
        """
        if not self._corpus_passages:
            raise NoDataAvailable(f"Corpus for model: {self._name} is empty")
        if not torch.cuda.is_available():
            logging.warning("Warning: No GPU found. Please use GPU as running on CPU may take a lot of time")

        # Cap how much of each passage the bi-encoder reads.
        self._bi_encoder.max_seq_length = 256
        # compute the corpus_embeddings from scratch (which can take a while depending on the GPU)
        self._corpus_embeddings = self._bi_encoder.encode(
            self._corpus_passages,
            convert_to_tensor=True,
            show_progress_bar=True,
        )

    def load(self, embeddings_path: str, is_cpu: bool = False) -> None:
        """
        Load embeddings from pt file

        Use is_cpu = True to force the embeddings onto the CPU. When no GPU is
        available they are mapped to the CPU automatically, even if is_cpu is False.

        NOTE(security): torch.load unpickles the file, so only load embedding
        files that come from a trusted source.
        """
        if is_cpu or not torch.cuda.is_available():
            map_location = torch.device('cpu')
        else:
            # None keeps torch.load's default behaviour.
            map_location = None
        self._corpus_embeddings = torch.load(embeddings_path, map_location=map_location)

    def save(self, filename: str) -> None:
        """
        Save embeddings in torch format (.pt) extension

        Raises NoDataAvailable if no embeddings have been computed or loaded yet.
        """
        if self._corpus_embeddings is None:
            raise NoDataAvailable(
                f"No embeddings to save for model: {self._name}; call train() or load() first"
            )
        torch.save(self._corpus_embeddings, filename)

    def predict(self, query: str, re_rank: bool = False, is_cpu: bool = False) -> List[dict]:
        """
        Run semantic search on a single query

        query: The text to search for
        re_rank: If True, re-score the retrieved passages with the cross encoder
        is_cpu: Kept for backward compatibility. The query embedding is always moved to
        the device that holds the corpus embeddings, so this flag is no longer needed.

        Returns a list of {'score': ..., 'text': ...} dicts, highest score first.
        Raises NoDataAvailable if the corpus is empty or no embeddings are available,
        and NoCrossEncoder if re_rank is True but the model has no cross encoder.
        """
        if not self._corpus_passages:
            raise NoDataAvailable(f"Corpus for model: {self._name} is empty")
        if re_rank and self._cross_encoder is None:
            raise NoCrossEncoder(f"No cross encoder specified for model: {self._name}")
        if self._corpus_embeddings is None:
            raise NoDataAvailable(
                f"No embeddings available for model: {self._name}; call train() or load() first"
            )

        # Encode the query using the bi-encoder and find potentially relevant passages
        question_embedding = self._bi_encoder.encode(query, convert_to_tensor=True)
        # Follow the corpus embeddings instead of calling .cuda() unconditionally,
        # which fails on machines without a GPU. Loaded embeddings might not be a
        # tensor, hence the getattr.
        corpus_device = getattr(self._corpus_embeddings, 'device', None)
        if corpus_device is not None:
            question_embedding = question_embedding.to(corpus_device)

        hits = util.semantic_search(question_embedding, self._corpus_embeddings, top_k=self._top_k)
        hits = hits[0]  # Get the hits for the first (and only) query

        if re_rank:
            cross_inp = [[query, self._corpus_passages[hit['corpus_id']]] for hit in hits]
            cross_scores = self._cross_encoder.predict(cross_inp)
            scored = [(score, hit['corpus_id']) for score, hit in zip(cross_scores, hits)]
        else:
            scored = [(hit['score'], hit['corpus_id']) for hit in hits]

        # sort scores in descending order (stable, so ties keep retrieval order)
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [
            {'score': score, 'text': self._corpus_passages[corpus_id]}
            for score, corpus_id in scored
        ]
            

# Training

In [None]:
import pandas as pd
from google.colab import drive

# Mount Google Drive at /content/drive so that files stored there can be read by the cells below.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the dataset (a `title` and an `id` per row) from the mounted Drive.
# NOTE(review): the preview output also shows two leftover index columns
# ("Unnamed: 0.1", "Unnamed: 0"); passing usecols=["title", "id"] would presumably
# drop them. Confirm nothing relies on them before changing this.
df = pd.read_csv("/content/drive/MyDrive/SO_df.csv")

In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,title,id
0,0,Stack data structure in python,4688859
1,1,How to create a glib.Source from Python?,4688943
2,2,What to reference in the shebang python26 or p...,4689233
3,3,python multiple imports for a common module,4689252
4,4,Anybody know a valid mechanism to detect the d...,4689456


In [None]:
# Summary statistics of the numeric columns; the `count` row shows how many rows were loaded.
df.describe()

Unnamed: 0.1,Unnamed: 0,id
count,1667288.0,1667288.0
mean,833643.5,46467660.0
std,481304.7,19694580.0
min,0.0,337.0
25%,416821.8,32254170.0
50%,833643.5,50821680.0
75%,1250465.0,62916410.0
max,1667287.0,73842220.0


In [None]:
# Number of leading rows to generate embeddings for.
# If you like, lower this to limit the number of passages you want to use.
MAX_PASSAGES = 300000

indexes = list(df.index)

# Slice once instead of appending row by row. The resulting lists are the same,
# and no loop variable named `id` is left behind to shadow the built-in id().
passages = {
    'title': df['title'][:MAX_PASSAGES].tolist(),
    'id': indexes[:MAX_PASSAGES],  # row index of each title, not the dataset's `id` column
}

print("Passages:", len(passages['title']))

Passages: 300000


In [None]:
# Index only the titles; every other constructor argument keeps its default
# (default bi-encoder, no cross-encoder).
my_search_model = SearchModel(corpus_passages=passages['title'], name='my_search_model')

In [None]:
# Generate embeddings for every corpus passage with the bi-encoder (slow without a GPU).
my_search_model.train()

Batches:   0%|          | 0/9375 [00:00<?, ?it/s]

In [None]:
# Persist the corpus embeddings so they can be reloaded without re-encoding.
my_search_model.save(filename='my_embeddings.pt')

In [None]:
# Retrieve the passages closest to the query, using the bi-encoder only.
results = my_search_model.predict(query="Hello World in python")

In [None]:
# Show every retrieved passage with its score; predict() returns them highest score first.
for data in results:
    print(f"Text: {data['text']}, Score: {data['score']}")

Text: Hello World in Python, Score: 0.9999998807907104
Text: Python Newbie hello world (Why???), Score: 0.7648884654045105
Text: Hello_World is a word in Python, it is different in an other mode. How I change it?, Score: 0.7475423216819763
Text: Hello, World! in Python + Glade 2, Score: 0.7379846572875977
Text: Syntax error on hello world with python, Score: 0.721665620803833
Text: How can I print "hello world" every 34 minutes in python?, Score: 0.7170544862747192
Text: Need a simple "Hello World" example using the Webkit library in Python, Score: 0.7016738653182983
Text: Simple "Hello-World" program for python-evince, Score: 0.688801646232605
Text: Hello world in Java: Understanding the concept versus in python, Score: 0.6830593943595886
Text: Python Hello World in PyObjC on iPhone?, Score: 0.6741775274276733
Text: simple hello world program gives issue in webpy, Score: 0.671778678894043
Text: "Hello World" programme for Bottle.py, Score: 0.6667776107788086
Text: Can't run Python 'He

# Inference

In [None]:
# Build a model without any corpus passages, then attach the embeddings saved earlier.
saved_model = SearchModel(name='incomplete_model')
saved_model.load(embeddings_path='my_embeddings.pt', is_cpu=False)  # Using GPU

Watch what happens when I try to make a prediction without specifying my passages: `predict` raises `NoDataAvailable`.

In [None]:
query = "Hello world in python"

# saved_model was created without corpus passages, so predict() is expected to
# raise NoDataAvailable. Catch it and show the message, so this demonstration
# does not abort "Run All".
try:
    print("Without retrieve and re-ranking: ")
    results = saved_model.predict(query)
    for data in results[:5]:
        print(f"Text: {data['text']}, Score: {data['score']}")

    print("Using retrieve and re-ranking: ")
    results = saved_model.predict(query, re_rank=True)
    for data in results[:5]:
        print(f"Text: {data['text']}, Score: {data['score']}")
except NoDataAvailable as err:
    print(f"NoDataAvailable: {err}")

Without retrieve and re-ranking: 


NoDataAvailable: ignored

In [None]:
# Same embeddings as before, but this time the model also gets the passages they belong to.
saved_model = SearchModel(corpus_passages=passages['title'], name='complete_model')
saved_model.load(embeddings_path='my_embeddings.pt', is_cpu=False)

In [None]:
# Query shared by the prediction cells that follow.
query = "Hello world in python"

In [None]:
# Bi-encoder retrieval only; show the five best matches.
print("Without retrieve and re-ranking: ")
results = saved_model.predict(query=query, re_rank=False)
for data in results[:5]:
    print(f"Text: {data['text']}, Score: {data['score']}")

Without retrieve and re-ranking: 
Text: Hello World in Python, Score: 0.9999998807907104
Text: Python Newbie hello world (Why???), Score: 0.7648884654045105
Text: Hello_World is a word in Python, it is different in an other mode. How I change it?, Score: 0.7475423216819763
Text: Hello, World! in Python + Glade 2, Score: 0.7379846572875977
Text: Syntax error on hello world with python, Score: 0.721665620803833


Trying to use re-ranking without specifying a cross-encoder raises `NoCrossEncoder`:

In [None]:
print("Using retrieve and re-ranking: ")
# saved_model was built without a cross-encoder, so re-ranking is expected to raise
# NoCrossEncoder. Catch it and show the message, so this demonstration does not
# abort "Run All".
try:
    results = saved_model.predict(query, re_rank=True)
except NoCrossEncoder as err:
    print(f"NoCrossEncoder: {err}")
else:
    for data in results[:5]:
        print(f"Text: {data['text']}, Score: {data['score']}")

Using retrieve and re-ranking: 


NoCrossEncoder: ignored

Correct Usage

In [None]:
# Rebuild the model with the default cross-encoder enabled, then reload the saved embeddings.
saved_model = SearchModel(
    corpus_passages=passages['title'],
    use_cross_encoder=True,
    name='complete_model',
)
saved_model.load(embeddings_path='my_embeddings.pt', is_cpu=False)

Downloading:   0%|          | 0.00/794 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Retrieve with the bi-encoder, re-score with the cross-encoder, show the five best matches.
print("Using retrieve and re-ranking: ")
results = saved_model.predict(query=query, re_rank=True)
for data in results[:5]:
    print(f"Text: {data['text']}, Score: {data['score']}")

Using retrieve and re-ranking: 
Text: Hello_World is a word in Python, it is different in an other mode. How I change it?, Score: 8.804879188537598
Text: Hello World in Python, Score: 8.680126190185547
Text: How to print 'Hello world in python' through pressing a button connected to the terminal on a piface digital 2, Score: 8.500879287719727
Text: Hello, World! in Python + Glade 2, Score: 8.458288192749023
Text: Simple "Hello-World" program for python-evince, Score: 8.171799659729004


Thank you!!!