In [2]:
import warnings
warnings.filterwarnings("ignore")

In [8]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.cross_encoder import CrossEncoder

In [14]:
# Checkpoint name of the bi-encoder that the embedding cells call through `model`.
# NOTE(review): this cell shows execution count 14 while the encode cell shows 6,
# so the saved outputs were produced out of order; confirm with Restart & Run All.
BI_ENCODER_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(BI_ENCODER_NAME)

In [15]:
# Read the loaded model's `max_seq_length` attribute and report it.
# Presumably this is the input length limit in tokens; confirm in the library docs.
max_seq_length = model.max_seq_length
print("Max Sequence Length:", max_seq_length)

Max Sequence Length: 256


Characteristics of Sentence Transformer (a.k.a. bi-encoder) models:

- Calculates a fixed-size vector representation (embedding) given texts or images.

- Embedding calculation is often efficient, and embedding similarity calculation is very fast.

- Applicable for a wide range of tasks, such as semantic textual similarity, semantic search, clustering, classification, paraphrase mining, and more.

- Often used as a first step in a two-step retrieval process, where a Cross-Encoder (a.k.a. reranker) model is used to re-rank the top-k results from the bi-encoder.



In [5]:
# Example inputs for the bi-encoder: two sentences about Apple hardware and
# one unrelated sentence, so the similarity matrix has a clear contrast.
sentences = [
    "The macbook pro is a better laptop.",
    "The apple silicon is performant!",
    "He is playing outside.",
]

In [6]:
# Encode all example sentences in one call; `model` must still be the
# bi-encoder here (it is rebound to a CrossEncoder in a later cell).
embeddings = model.encode(sentences)
# The shape is printed as a sanity check on what the encoder returned.
print(embeddings.shape)

(3, 384)


In [7]:
# Compare the embeddings against themselves to get all pairwise scores.
# NOTE(review): the metric is whatever the model is configured with,
# presumably cosine similarity; confirm against the library docs.
similarities = model.similarity(embeddings, embeddings)
print(similarities)

tensor([[ 1.0000,  0.3927, -0.0111],
        [ 0.3927,  1.0000, -0.1030],
        [-0.0111, -0.1030,  1.0000]])


In [9]:
# NOTE(review): this rebinds `model`, which held the SentenceTransformer
# bi-encoder until now. Re-running the encode/similarity cells after this
# cell would call them on the cross-encoder instead; a distinct name
# (e.g. `cross_encoder`) would avoid that hidden-state hazard.
CROSS_ENCODER_NAME = "cross-encoder/stsb-distilroberta-base"
model = CrossEncoder(CROSS_ENCODER_NAME)

Downloading config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/148 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [10]:
# Search query that every corpus sentence is scored against in the ranking cell.
query = "A man is eating pasta."

# Candidate sentences to rank. The first two share the "A man is eating"
# wording with the query; the remaining ones describe unrelated scenes.
corpus = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
]

In [11]:
# Score every corpus sentence against the query using the cross-encoder
# currently bound to `model`. The printing cell reads each result's
# "score" and "corpus_id" keys, with "corpus_id" indexing into `corpus`.
ranks = model.rank(query, corpus)

In [12]:
# Show the query, then one line per ranked result: the score to two
# decimals, a tab, and the corpus sentence the result points at.
print("Query: ", query)
for rank in ranks:
    score = rank["score"]
    passage = corpus[rank["corpus_id"]]
    print(f"{score:.2f}\t{passage}")

Query:  A man is eating pasta.
0.67	A man is eating food.
0.34	A man is eating a piece of bread.
0.08	A man is riding a horse.
0.07	A man is riding a white horse on an enclosed ground.
0.01	The girl is carrying a baby.
0.01	Two men pushed carts through the woods.
0.01	A monkey is playing drums.
0.01	A woman is playing violin.
0.01	A cheetah is running behind its prey.
