### OpenAI Embeddings

In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [3]:
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [4]:
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x10ab095b0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x110b34140>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [None]:
## Example 1: Single text embeddings
single_text="Langchain and Rag are amazing frameworks and projects to work on"
single_embeddings=embeddings.embed_query(single_text)
print(len(single_embeddings))
print(single_embeddings)

1536
[-0.050007786601781845, -0.031081510707736015, -0.0034399048890918493, -0.003286944702267647, 0.03265869989991188, -0.03132624924182892, -0.014466634951531887, 0.001493061427026987, -0.01057804748415947, -0.033909574151039124, 0.018178468570113182, 0.004568411037325859, -0.03817886486649513, 0.05003497749567032, 0.005989241413772106, 0.01439865306019783, -8.14194354461506e-05, -0.06227179616689682, 0.04057184234261513, 0.06520862877368927, -0.001464169006794691, -0.006070820149034262, -0.018246451392769814, 0.03295782208442688, -0.006907002534717321, -0.008334631100296974, -0.00758002744987607, 0.06330512464046478, 0.01789294369518757, -0.019796447828412056, 8.497788803651929e-05, -0.033936768770217896, -0.006420928984880447, 0.033719226717948914, 0.014942510984838009, 0.021835917606949806, -0.00574450520798564, 0.004095933865755796, -0.008096693083643913, 0.022556530311703682, 0.009626294486224651, 0.03804289922118187, 0.004422249272465706, -0.0075188432820141315, 0.0178929436951

In [6]:
print("📝 Single Text Embedding:")
print(f"Input: {single_text}")
print(f"Output: Vector of {len(single_embeddings)} dimensions")
print(f"Sample values: {single_embeddings[:5]}")

📝 Single Text Embedding:
Input: Langchain and Rag are amazing frameworks and projects to work on
Output: Vector of 1536 dimensions
Sample values: [-0.050007786601781845, -0.031081510707736015, -0.0034399048890918493, -0.003286944702267647, 0.03265869989991188]


In [7]:
# Example 2: Multiple texts at once
multiple_texts = [
    "Python is a programming language",
    "LangChain is a framework for LLM applications",
    "Embeddings convert text to numbers",
    "Vectors can be compared for similarity"
]

multiple_embeddings = embeddings.embed_documents(multiple_texts)

In [8]:
multiple_embeddings

[[-0.011024138890206814,
  -0.02037579007446766,
  0.018795086070895195,
  -0.0028325202874839306,
  0.015684667974710464,
  -0.026657816022634506,
  0.0005908519960939884,
  0.03720264509320259,
  -0.0016941581852734089,
  0.01301276683807373,
  0.021538373082876205,
  -0.02474057488143444,
  -0.009494424797594547,
  0.0018139858730137348,
  0.003936463966965675,
  0.015470507554709911,
  -0.032919444143772125,
  0.029758036136627197,
  -0.027228908613324165,
  0.010402055457234383,
  -0.001435381593182683,
  -0.009938041679561138,
  -0.053845930844545364,
  0.015429714694619179,
  0.03683551400899887,
  -0.04287278279662132,
  0.005476376041769981,
  0.03630521148443222,
  -0.019498754292726517,
  0.0010886464733630419,
  0.012971974909305573,
  -0.03232795372605324,
  -0.03648877888917923,
  0.051235221326351166,
  -0.03118576854467392,
  -0.045034781098365784,
  0.04585062712430954,
  -0.010493838228285313,
  0.06840880960226059,
  -0.015103375539183617,
  0.004053742159157991,
  -

In [9]:
print("\n📚 Multiple Text Embeddings:")
print(f"Number of texts: {len(multiple_texts)}")
print(f"Number of embeddings: {len(multiple_embeddings)}")
print(f"Each embedding size: {len(multiple_embeddings[0])}")


📚 Multiple Text Embeddings:
Number of texts: 4
Number of embeddings: 4
Each embedding size: 1536


In [None]:
multiple_embeddings[0][:30]

[-0.011024138890206814,
 -0.02037579007446766,
 0.018795086070895195,
 -0.0028325202874839306,
 0.015684667974710464,
 -0.026657816022634506,
 0.0005908519960939884,
 0.03720264509320259,
 -0.0016941581852734089,
 0.01301276683807373,
 0.021538373082876205,
 -0.02474057488143444,
 -0.009494424797594547,
 0.0018139858730137348,
 0.003936463966965675,
 0.015470507554709911,
 -0.032919444143772125,
 0.029758036136627197,
 -0.027228908613324165,
 0.010402055457234383,
 -0.001435381593182683,
 -0.009938041679561138,
 -0.053845930844545364,
 0.015429714694619179,
 0.03683551400899887,
 -0.04287278279662132,
 0.005476376041769981,
 0.03630521148443222,
 -0.019498754292726517,
 0.0010886464733630419]

In [11]:
from langchain_openai import OpenAIEmbeddings

# Different OpenAI embedding models
models_comparison = {
    "text-embedding-3-small": {
        "dimensions": 1536,
        "description": "Good balance of performance and cost",
        "cost_per_1m_tokens": 0.02,
        "use_case": "General purpose, cost-effective"
    },
    "text-embedding-3-large": {
        "dimensions": 3072,
        "description": "Highest quality embeddings",
        "cost_per_1m_tokens": 0.13,
        "use_case": "When accuracy is critical"
    },
    "text-embedding-ada-002": {
        "dimensions": 1536,
        "description": "Previous generation model",
        "cost_per_1m_tokens": 0.10,
        "use_case": "Legacy applications"
    }
}

# Display comparison
print("📊 OpenAI Embedding Models Comparison:\n")
for model_name, details in models_comparison.items():
    print(f"Model: {model_name}")
    print(f"  📏 Dimensions: {details['dimensions']}")
    print(f"  💰 Cost: ${details['cost_per_1m_tokens']}/1M tokens")
    print(f"  📝 Description: {details['description']}")
    print(f"  🎯 Use case: {details['use_case']}\n")

📊 OpenAI Embedding Models Comparison:

Model: text-embedding-3-small
  📏 Dimensions: 1536
  💰 Cost: $0.02/1M tokens
  📝 Description: Good balance of performance and cost
  🎯 Use case: General purpose, cost-effective

Model: text-embedding-3-large
  📏 Dimensions: 3072
  💰 Cost: $0.13/1M tokens
  📝 Description: Highest quality embeddings
  🎯 Use case: When accuracy is critical

Model: text-embedding-ada-002
  📏 Dimensions: 1536
  💰 Cost: $0.1/1M tokens
  📝 Description: Previous generation model
  🎯 Use case: Legacy applications



### Cosine Similarity With OpenAI Embeddings

In [12]:
# Example 1: Finding similar sentences
sentences = [
    "The cat sat on the mat",
    "A feline rested on the rug",
    "The dog played in the yard",
    "I love programming in Python",
    "Python is my favorite programming language"
]

In [13]:
import numpy as np
def cosine_similarity(vec1, vec2):
    """
    Cosine similarity measures the angle between two vectors.
    - Result close to 1: Very similar
    - Result close to 0: Not related
    - Result close to -1: Opposite meanings
    """

    dot_product=np.dot(vec1,vec2)
    norm_a=np.linalg.norm(vec1)
    norm_b=np.linalg.norm(vec2)
    return dot_product/(norm_a * norm_b)

In [14]:
from langchain_openai import OpenAIEmbeddings
embeddings=OpenAIEmbeddings(model="text-embedding-3-small")
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x1119af440>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x1118b74d0>, model='text-embedding-3-small', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [15]:
sentence_embeddings=embeddings.embed_documents(sentences)
sentence_embeddings

[[-0.030758168548345566,
  -0.04960077628493309,
  -0.005053127650171518,
  -0.0015193307772278786,
  0.03628946840763092,
  -0.002040313323959708,
  -0.008891437202692032,
  0.027165407314896584,
  0.007082132622599602,
  -0.011896173469722271,
  0.04161399230360985,
  -0.0013739402638748288,
  0.04523260146379471,
  0.052702441811561584,
  0.03202468156814575,
  0.03248992934823036,
  -0.012426041066646576,
  0.0031016638968139887,
  -0.06601374596357346,
  0.04748130589723587,
  0.025821352377533913,
  -0.04538768157362938,
  -0.0034893720876425505,
  0.014590744860470295,
  0.00907236710190773,
  0.014771674759685993,
  -0.011172452010214329,
  -0.012070642784237862,
  0.010778282769024372,
  0.01282667275518179,
  0.01228388212621212,
  -0.036056842654943466,
  -0.026519227772951126,
  -0.04536183550953865,
  -0.03458355367183685,
  0.004807579331099987,
  -0.01986357383430004,
  -0.011728166602551937,
  -0.042027547955513,
  -0.02282307855784893,
  -0.036341164261102676,
  -0.005

In [17]:
## Calculate the similarity between all pairs and track top 3
import heapq

# List to store top 3 similarities (we'll use a min-heap of size 3)
top_similarities = []

for i in range(len(sentences)):
    for j in range(i+1,len(sentences)):
        similarity=cosine_similarity(sentence_embeddings[i],sentence_embeddings[j])
        
        # Add to top similarities tracking
        pair_info = {
            'similarity': similarity,
            'sentence1': sentences[i],
            'sentence2': sentences[j],
            'indices': (i, j)
        }
        
        # Keep only top 3 similarities using a min-heap
        if len(top_similarities) < 3:
            heapq.heappush(top_similarities, (similarity, pair_info))
        elif similarity > top_similarities[0][0]:  # If current similarity is better than the worst in top 3
            heapq.heapreplace(top_similarities, (similarity, pair_info))
        
        print(f"'{sentences[i]}' vs '{sentences[j]}'")
        print(f"Similarity: {similarity:.3f}\n")

'The cat sat on the mat' vs 'A feline rested on the rug'
Similarity: 0.656

'The cat sat on the mat' vs 'The dog played in the yard'
Similarity: 0.324

'The cat sat on the mat' vs 'I love programming in Python'
Similarity: 0.089

'The cat sat on the mat' vs 'Python is my favorite programming language'
Similarity: 0.120

'A feline rested on the rug' vs 'The dog played in the yard'
Similarity: 0.296

'A feline rested on the rug' vs 'I love programming in Python'
Similarity: 0.055

'A feline rested on the rug' vs 'Python is my favorite programming language'
Similarity: 0.103

'The dog played in the yard' vs 'I love programming in Python'
Similarity: 0.126

'The dog played in the yard' vs 'Python is my favorite programming language'
Similarity: 0.085

'I love programming in Python' vs 'Python is my favorite programming language'
Similarity: 0.708



In [18]:
## Display the top 3 most similar pairs
print("🏆 TOP 3 MOST SIMILAR PAIRS:")
print("=" * 50)

# Sort the top similarities in descending order
sorted_top = sorted(top_similarities, key=lambda x: x[0], reverse=True)

for rank, (similarity, pair_info) in enumerate(sorted_top, 1):
    print(f"{rank}. Similarity: {similarity:.3f}")
    print(f"   '{pair_info['sentence1']}'")
    print(f"   '{pair_info['sentence2']}'")
    print()

🏆 TOP 3 MOST SIMILAR PAIRS:
1. Similarity: 0.708
   'I love programming in Python'
   'Python is my favorite programming language'

2. Similarity: 0.656
   'The cat sat on the mat'
   'A feline rested on the rug'

3. Similarity: 0.324
   'The cat sat on the mat'
   'The dog played in the yard'



## Semantic Search Implementation

In [19]:
### Example- Semantic Search- Retireve the similar sentence
# Test semantic search
documents = [
    "LangChain is a framework for developing applications powered by language models",
    "Python is a high-level programming language",
    "Machine learning is a subset of artificial intelligence",
    "Embeddings convert text into numerical vectors",
    "The weather today is sunny and warm"
]
query="What is Langchain?"

In [20]:
def semantic_search(query,documents,embeddings_models,top_k=3):
    """Simple semantic search implementation"""

    ## embed query and doument

    query_embedding=embeddings_models.embed_query(query)
    doc_embeddings = embeddings_models.embed_documents(documents)

    ## Calculate the similarity score

    similarties=[]

    for i,doc_emb in enumerate(doc_embeddings):
        similarity=cosine_similarity(query_embedding,doc_emb)
        similarties.append((similarity,documents[i]))

    ## Sort by similarity
    similarties.sort(reverse=True)
    return similarties[:top_k]

In [21]:
results=semantic_search(query,documents,embeddings)
results

[(np.float64(0.6755734259243745),
  'LangChain is a framework for developing applications powered by language models'),
 (np.float64(0.1302479588370226),
  'Python is a high-level programming language'),
 (np.float64(0.10104655373322172),
  'Embeddings convert text into numerical vectors')]

In [22]:
print(f"\n🔎 Semantic Search Results for: '{query}'")
for score, doc in results:
    print(f"Score: {score:.3f} | {doc}")


🔎 Semantic Search Results for: 'What is Langchain?'
Score: 0.676 | LangChain is a framework for developing applications powered by language models
Score: 0.130 | Python is a high-level programming language
Score: 0.101 | Embeddings convert text into numerical vectors


In [23]:
query="What is Embeddings?"
results=semantic_search(query,documents,embeddings)
results

[(np.float64(0.6226925178962247),
  'Embeddings convert text into numerical vectors'),
 (np.float64(0.25208926551734484),
  'Machine learning is a subset of artificial intelligence'),
 (np.float64(0.22919241324405523),
  'LangChain is a framework for developing applications powered by language models')]

In [24]:
print(f"\n🔎 Semantic Search Results for: '{query}'")
for score, doc in results:
    print(f"Score: {score:.3f} | {doc}")


🔎 Semantic Search Results for: 'What is Embeddings?'
Score: 0.623 | Embeddings convert text into numerical vectors
Score: 0.252 | Machine learning is a subset of artificial intelligence
Score: 0.229 | LangChain is a framework for developing applications powered by language models
