In [1]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Instantiate the sentence-embedding model from the "all-MiniLM-L6-v2" checkpoint name.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [7]:
# Two sample sentences to embed. Note that "pan" appears in both with a
# different meaning ("pan pizza" vs. "pan number").
document = [
    "ankush is a boy, aadhaar number is 12324242, he likes pan pizza",
    "ankush is a boy pan number is 232525 and needs to have tablet named dopamicnine",
]

# Encode the whole list in one call; the result has one embedding row per sentence.
embeddings = model.encode(document)

In [8]:
# Display the shape of the encoded batch: (number of sentences, embedding dimension).
embeddings.shape

(2, 384)

In [None]:
# Limitations of sentence transformers

# Loss of Long-Range Context:

# Sentence transformers encode each sentence into a fixed-size vector, which can lose subtle context and longer dependencies between words. While they capture semantic meaning, they may struggle with nuanced or long-range relationships that span multiple sentences or paragraphs.

# Computational Cost:

# Sentence transformers can be computationally expensive, especially when processing large datasets or running inference on long texts. This is particularly an issue for transformer models with many layers, such as BERT-based models, which require significant memory and processing power.
# Inability to Handle OOV (Out of Vocabulary) Words Well:

# While models like BERT and RoBERTa use subword tokenization (WordPiece and byte-level Byte Pair Encoding, respectively), they still face challenges with rare or previously unseen words. Such words are split into many small subword pieces, so their embeddings can be imprecise, potentially leading to less accurate results.
# Biases:

# Like many machine learning models, sentence transformers can inherit biases from the data they are trained on. This could lead to biased sentence embeddings when used for tasks such as sentiment analysis, information retrieval, or other NLP tasks.
# Dependence on Pre-trained Data:

# Sentence transformers rely on pre-trained models, which means their performance is heavily influenced by the quality of the training corpus. If the training data doesn't cover certain topics or domains well, the model's performance may degrade on tasks outside the domain it was trained on.
# Difficulty in Handling Ambiguity:

# Sentence transformers may struggle with sentences that are highly ambiguous or contain contradictory information. Since the model relies on vector representations of sentences, it might fail to distinguish subtle differences between sentences that require deep understanding or reasoning.
# Limited Interpretability:

# The embeddings generated by sentence transformers are high-dimensional vectors, which can be difficult to interpret directly. While there are methods to analyze them, they do not inherently provide insights into why a particular sentence was encoded in a certain way, making them less transparent.
# Performance on Short Texts:

# While sentence transformers work well for full sentences or longer texts, their performance on very short texts (e.g., single-word inputs or short phrases) can be suboptimal since the representation might lack enough information to form a meaningful embedding.
# Lack of Fine-Tuning for Specific Tasks:

# While general-purpose pre-trained models perform well in a variety of scenarios, they may not always perform optimally on highly specialized tasks without fine-tuning. Fine-tuning requires a domain-specific dataset, which may not always be available.
# Difficulty with Out-of-Context Phrases:

# Sentence transformers are typically trained to handle sentences in context, and they may have difficulty when a sentence or phrase is presented out of its original context. For example, using a sentence fragment in isolation may lead to less accurate embeddings because the model misses surrounding cues.

In [8]:

from sentence_transformers import SentenceTransformer





In [11]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Universal Sentence Encoder (USE), version 4, fetched from TensorFlow Hub.
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Sentence-BERT (SBERT) encoder built from the "all-MiniLM-L6-v2" checkpoint name.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")







In [29]:



# Example paraphrase pairs for comparing USE and SBERT.
#
# All pairs are kept together in one list so that every example stays
# available; assigning each pair to the same two variables in turn would leave
# only the last one reachable. The trailing note on a pair is the result that
# was recorded for it when it was tried.
sentence_pairs = [
    (
        "The teacher enthusiastically explained the complex topic, ensuring that every student understood the concepts thoroughly.",
        "The instructor eagerly described the intricate subject, making sure that each learner grasped the ideas completely.",
    ),  # SBERT WON
    (
        "X loves Y",
        "Y is loved by X",
    ),  # SBERT WON
    (
        "the cat sat on the windowsill in the sunlight",
        "the cat rested on the windowsill under the sun",
    ),  # SBERT WON
    (
        "she is reading a book in the garden",
        "she sits in the garden, reading the book",
    ),  # SBERT WON
    (
        "he went to the store to buy groceries",
        "he visited the shop to get some groceries",
    ),
]

# The comparison cell reads `sentence1` and `sentence2`; bind them to the last
# pair in the list. Change the index to evaluate a different pair.
sentence1, sentence2 = sentence_pairs[-1]

# Finding: spelling "groceries" as "grocerries" has an effect on the similarity score (TODO: confirm; the original note said "store").


In [30]:
# ---- Universal Sentence Encoder (USE) ----
# Each sentence is passed as a one-item batch; the first row of the result is
# taken and converted to a NumPy array.
use_embeddings1 = use_model([sentence1])[0].numpy()
use_embeddings2 = use_model([sentence2])[0].numpy()

# cosine_similarity works on 2-D inputs, so present each vector as a single row.
use_similarity = cosine_similarity(
    use_embeddings1.reshape(1, -1),
    use_embeddings2.reshape(1, -1),
)[0][0]

# ---- Sentence-BERT (SBERT) ----
# Encoding a one-item list yields a 2-D array, which can be passed to
# cosine_similarity as is.
sbert_embeddings1 = sbert_model.encode([sentence1])
sbert_embeddings2 = sbert_model.encode([sentence2])
sbert_similarity = cosine_similarity(sbert_embeddings1, sbert_embeddings2)[0][0]

# ---- Report ----
# Similarity scores first, then the shape of one embedding from each model.
print("Cosine Similarity using Universal Sentence Encoder (USE):", use_similarity)
print("Cosine Similarity using Sentence-BERT (SBERT):", sbert_similarity)

print("\nEmbedding size for Universal Sentence Encoder (USE):", use_embeddings1.shape)
print("Embedding size for Sentence-BERT (SBERT):", sbert_embeddings1.shape)


Cosine Similarity using Universal Sentence Encoder (USE): 0.795874
Cosine Similarity using Sentence-BERT (SBERT): 0.895548

Embedding size for Universal Sentence Encoder (USE): (512,)
Embedding size for Sentence-BERT (SBERT): (1, 384)


In [None]:
# Here the embedding sizes are 512 (USE) and 384 (SBERT). Although the 384-dimensional SBERT embedding is smaller, it captures the semantic relation between these sentences well.
# Question: does a bigger embedding size capture context better?

# Yes, in general, a larger embedding size can help capture more contextual information and semantic nuances. However, the relationship between embedding size and contextual understanding is nuanced. 

# The benefits of a larger embedding size depend on the task. For simple tasks like text classification or sentiment analysis, the difference in performance between a 384-dimensional and a 768-dimensional model may be small. However, for more complex tasks like semantic textual similarity or paraphrase detection, a larger embedding size might yield better results, since capturing subtle differences in meaning is more critical there.