In [None]:
# Hugging Face Embeddings
from langchain_huggingface import HuggingFaceEmbeddings # no api key is required

In [None]:
embeddings = HuggingFaceEmbeddings(
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
)

# different version of sentence-transformers can be used, L6-v2 has fast speed and it's lightweight
# Accuracy wise it's not the best option  

In [None]:
# Creating Embedding : Sentence -> Vector Representation 
text = "This is Akash, Learning about the Embeddings"

#As only using one sentence - will use embddings.embed_query
embeddings_query = embeddings.embed_query(text)
print(f"Text : {text}")
print("Embedding Length : ", len(embeddings_query))
print(embeddings_query)

Text : This is Akash, Learning about the Embeddings
Embedding Length :  384
[-0.03704123944044113, -0.0713258907198906, 0.028184927999973297, -0.029531314969062805, 0.025108153000473976, 0.06139608845114708, 0.02622235007584095, -0.03704600781202316, 0.01679334044456482, 0.013683684170246124, 0.03765535354614258, -0.03162381425499916, 0.0056676254607737064, 0.02162349969148636, -0.10918711870908737, 0.07156123220920563, 0.059338171035051346, 0.015210182406008244, -0.05483749136328697, -0.0638241171836853, -0.01933957450091839, 0.038291558623313904, 0.04736733064055443, -0.06276071816682816, 0.06299927085638046, -0.05287834629416466, 0.002413781126961112, 0.054061152040958405, 0.08417198061943054, -0.08109980821609497, 0.01553500909358263, -0.030653171241283417, -0.0042478167451918125, 0.07539594173431396, -0.022349486127495766, 0.09310410916805267, 0.029386594891548157, 0.039234794676303864, -0.06933228671550751, 0.02945619262754917, 0.04004446789622307, 0.024685034528374672, -0.025250

In [None]:
# use embed_documents for the big sentence 
sentence = [
    "Right now, Mbapp√© is the best player in the world",
    "Real Madrid isn't player well this season, magic is missing",
    "Border collie's are the smartest dogs!",
    "ICPC is one of the goated coding competition",
    "Border collie's are the smartest dogs!"]

embeddings_sentence = embeddings.embed_documents(sentence)
print(f"Text : {text}")
print("Embedding Length : ", len(embeddings_sentence))
print(embeddings_sentence)



print(embeddings_sentence[2])
print(embeddings_sentence[-1])
# it will always create the same embedding for the same sentence

Text : This is Akash, Learning about the Embeddings
Embedding Length :  5
[[0.04127988964319229, -0.06877169758081436, -0.04591580107808113, -0.04263904318213463, -0.04286706820130348, 0.0427071675658226, 0.13852304220199585, 0.12668652832508087, 0.05442973971366882, 0.04587589576840401, -0.002500329166650772, -0.0015840985579416156, 0.03959256038069725, 0.08191744238138199, 0.03902001678943634, 0.0028829157818108797, 0.051961690187454224, -0.03501810133457184, 0.006702239625155926, -0.0851140022277832, -0.054876212030649185, -0.06793175637722015, -0.011227781884372234, -0.009686342440545559, -0.011510477401316166, -0.021900298073887825, 0.05048456788063049, 0.014548021368682384, -0.01890289969742298, -0.060498449951410294, 0.03013012744486332, -0.009589503519237041, -0.009269790723919868, 0.003756123362109065, -0.0043289209716022015, 0.03685056045651436, -0.0024412316270172596, -0.00371880573220551, 0.046327073127031326, -0.07025890052318573, -0.0022712629288434982, -0.110805943608284

Model: sentence-transformers/all-MiniLM-L6-v2
  Embedding size: 384 dimensions
  Description: Fast and efficient, good quality
  Use case: General purpose, real-time applications

Model: sentence-transformers/all-mpnet-base-v2
   Embedding size: 768 dimensions
   Description: Best quality, slower than MiniLM
   Use case: When quality matters more than speed

Model: sentence-transformers/all-MiniLM-L12-v2
   Embedding size: 384 dimensions
   Description: Slightly better than L6, bit slower
   Use case: Good balance of speed and quality

Model: sentence-transformers/multi-qa-MiniLM-L6-cos-v1
   Embedding size: 384 dimensions
   Description: Optimized for question-answering
   Use case: Q&A systems, semantic search

Model: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
   Embedding size: 384 dimensions
   Description: Supports 50+ languages
   Use case: Multilingual applications

In [37]:
# Semantic Search

doc = [
    "Lanchain is a framework to develop applications powered by language models",
    "Python is high-level programming language",
    "Machine Learning is subset of AI",
    "ICPC is a coding competition for the University Students"
]

query = "what is langchain ?"

In [51]:
import numpy as np

def cosine_similarity(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    normal_v1 = np.linalg.norm(vector1)
    normal_v2 = np.linalg.norm(vector2)
    similarity  = dot_product/ (normal_v1* normal_v2)
    return similarity

In [54]:
def semantic_search(query, documents, embeddings_model, top_k=3):
    """ simple semantic search implementation """

    query_embedding = embeddings_model.embed_query(query)
    doc_embedding = embeddings_model.embed_documents(documents)

    # similarity score
    
    similarities = []

    for i, doc_emb in enumerate(doc_embedding) :
        similarity = cosine_similarity(query_embedding, doc_emb)
        similarities.append((similarity, documents[i])) 

    # sort it by similarity
    similarities.sort(reverse = True)
    return similarities[ : top_k]



In [57]:
result = semantic_search(query, doc, embeddings)
result

# first Langchain match = 54%

[(np.float64(0.5408776558349455),
  'Lanchain is a framework to develop applications powered by language models'),
 (np.float64(0.16212926686480256),
  'Python is high-level programming language'),
 (np.float64(0.04392206304691663),
  'ICPC is a coding competition for the University Students')]

In [58]:
query2 = "what's is ICPC"
r = semantic_search(query2, doc, embeddings)
r

[(np.float64(0.7647518661598605),
  'ICPC is a coding competition for the University Students'),
 (np.float64(0.17996600718342606),
  'Lanchain is a framework to develop applications powered by language models'),
 (np.float64(0.12173999633452681), 'Machine Learning is subset of AI')]