# Exploring Embedding Models in LangChain

In [1]:
!pip install langchain==0.2.0
!pip install langchain-openai==0.1.7
!pip install langchain-community==0.2.0
!pip install langchain-huggingface==0.0.1

!pip install openai==1.55.3 httpx==0.27.2 --force-reinstall --quiet

Collecting langchain==0.2.0
  Downloading langchain-0.2.0-py3-none-any.whl.metadata (13 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain==0.2.0)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain-core<0.3.0,>=0.2.0 (from langchain==0.2.0)
  Downloading langchain_core-0.2.43-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain==0.2.0)
  Downloading langchain_text_splitters-0.2.4-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain==0.2.0)
  Downloading langsmith-0.1.147-py3-none-any.whl.metadata (14 kB)
Collecting tenacity<9.0.0,>=8.1.0 (from langchain==0.2.0)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain==0.2.0)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->lang

In [2]:
import os
from google.colab import userdata



# Copy the API credentials from Colab's secret store into environment
# variables, so the keys themselves never appear in the notebook.
# NOTE(review): the Colab secret read here is named 'OPEN_API_KEY' (no "AI"),
# while the environment variable is 'OPENAI_API_KEY' - confirm the secret name
# matches what is saved in the Colab Secrets panel.
os.environ['OPENAI_API_KEY'] = userdata.get('OPEN_API_KEY')
os.environ['HUGGINGFACEHUB_API_TOKEN'] = userdata.get('HF_TOKEN')

## Embedding models

The `Embeddings` class is designed for interfacing with text embedding models. There are many embedding model providers (OpenAI, Cohere, Hugging Face, etc.), and this class provides a standard interface for all of them.

Embeddings create a vector representation of a piece of text. This is useful because it means we can think about text in the vector space, and do things like semantic search where we look for pieces of text that are most similar in the vector space.

In [3]:
pip install -qU langchain-openai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/411.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.2/411.2 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain 0.2.0 requires langchain-core<0.3.0,>=0.2.0, but you have langchain-core 0.3.25 which is incompatible.
langchain-community 0.2.0 requires langchain-core<0.3.0,>=0.2.0, but you have langchain-core 0.3.25 which is incompatible.
langchain-huggingface 0.0.1 requires langchain-core<0.3,>=0.1.52, but you have langchain-core 0.3.25 which i

In [4]:
# Toy corpus: two sentences about animals and two about vehicles, so a good
# embedding model should score each pair as more similar to each other than
# to the sentences from the other topic.
docs = [
    "cats eat and sleep",
    "dogs eat and bark",
    "cars drive fast",
    "vehicles include trucks and cars"
]

### OpenAI Embedding Models

In [35]:
from langchain_openai import OpenAIEmbeddings

# Embedding client for OpenAI's "text-embedding-3-small" model.
# NOTE(review): this cell's execution count (35) is later than the cells below
# that use it, and the vector length recorded there (3072) does not look like
# this model's default size - the saved outputs were probably produced with a
# different model. Restart the kernel and run all cells to confirm.
open_embed_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [7]:
embeddings = open_embed_model.embed_documents(docs)

In [8]:
len(embeddings)

4

In [9]:
len(embeddings[0])

3072

In [10]:
print(embeddings[0])

[-0.008327714167535305, 0.0008373516029678285, -0.0011941941920667887, -0.03789442777633667, 0.038747187703847885, 0.006309076212346554, -0.012738071382045746, -0.01857413351535797, 0.01025974377989769, -0.03949334844946861, -0.006079230923205614, 0.022264976054430008, -0.0005258951568976045, -0.035202912986278534, 0.005766109097748995, 0.042717840522527695, 0.002151881344616413, 0.01387064065784216, -0.008854025043547153, -0.00274481438100338, 0.015389615669846535, -0.014203748665750027, 0.014856641180813313, 0.007681483402848244, -0.012897963635623455, 0.02875393070280552, 0.009533566422760487, 0.0005916840746067464, -0.007748105097562075, -0.03533615544438362, -0.009893324226140976, 0.04407692328095436, 0.032351501286029816, -0.007388347759842873, -0.01717507652938366, -0.07866691797971725, 0.005606216844171286, -0.027874523773789406, 0.04287773370742798, 0.003032953478395939, -0.004896695725619793, -0.010379662737250328, -0.06933987885713577, -0.05217812582850456, -0.02251813746988

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Called with a single argument, this compares every embedding with every
# other one, giving a square matrix (4 x 4 here) with 1.0 on the diagonal.
cosine_similarity(embeddings)

array([[1.        , 0.52144132, 0.19885222, 0.13880775],
       [0.52144132, 1.        , 0.21429906, 0.20798207],
       [0.19885222, 0.21429906, 1.        , 0.48642271],
       [0.13880775, 0.20798207, 0.48642271, 1.        ]])

## Open-Source Embedding Models on Hugging Face


In [13]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Hugging Face Hub id of the open-source embedding model to load.
model_name = "mixedbread-ai/mxbai-embed-large-v1"

# Creating the wrapper fetches the model files (the recorded output shows a
# ~670 MB model.safetensors download), so the first run can take a while.
hf_embeddings = HuggingFaceEmbeddings(model_name=model_name)

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/114k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/677 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [14]:
embeddings = hf_embeddings.embed_documents(docs)

In [15]:
len(embeddings)

4

In [16]:
len(embeddings[0])

1024

In [17]:
sim_matrix = cosine_similarity(embeddings)
sim_matrix

array([[1.        , 0.52516038, 0.3422186 , 0.33691793],
       [0.52516038, 1.        , 0.31728604, 0.33082582],
       [0.3422186 , 0.31728604, 1.        , 0.7225333 ],
       [0.33691793, 0.33082582, 0.7225333 , 1.        ]])

## Build a small search engine!

In [28]:
documents = [
    'Quantum mechanics describes the behavior of very small particles.',
    'Photosynthesis is the process by which green plants make food using sunlight.',
    "Shakespeare's plays are a testament to English literature.",
    'Artificial Intelligence aims to create machines that can think and learn.',
    'The pyramids of Egypt are historical monuments that have stood for thousands of years.',
    'Biology is the study of living organisms and their interactions with the environment.',
    'Music therapy can aid in the mental well-being of individuals.',
    'The Milky Way is just one of billions of galaxies in the universe.',
    'Economic theories help understand the distribution of resources in society.',
    'Yoga is an ancient practice that involves physical postures and meditation.'
]

In [29]:
len(documents)

10

In [36]:
document_embeddings = open_embed_model.embed_documents(documents)

### Find most similar document for one query

In [41]:
new_text = "What is Artificial Intelligence?"
new_text

'What is Artificial Intelligence?'

In [42]:
query_embedding = open_embed_model.embed_query(new_text)

In [45]:
cosine_similarities = cosine_similarity([query_embedding], document_embeddings)
cosine_similarities

array([[ 0.10697394,  0.09800227, -0.01664217,  0.6612003 ,  0.02615491,
         0.14776126,  0.06916632,  0.04312317,  0.03792733,  0.05892435]])

In [46]:
import numpy as np

documents[np.argmax(cosine_similarities[0])]

'Artificial Intelligence aims to create machines that can think and learn.'

### Create search engine function

In [47]:
def semantic_search_engine(query, embedder_model, corpus=None, corpus_embeddings=None):
  """Return the corpus document that is most similar to ``query``.

  The query is embedded with ``embedder_model`` and compared, by cosine
  similarity, against one embedding per corpus document; the document with the
  highest score is returned.

  Args:
    query: Text to search for.
    embedder_model: Object exposing ``embed_query(text)`` and
      ``embed_documents(texts)``. It must be the same model that produced
      ``corpus_embeddings``; vectors from different models are not comparable.
    corpus: Optional sequence of texts to search. Defaults to the
      notebook-level ``documents`` list.
    corpus_embeddings: Optional precomputed embeddings, one per corpus entry.
      When ``corpus`` is omitted this defaults to the notebook-level
      ``document_embeddings``; when ``corpus`` is given without embeddings,
      they are computed here with ``embedder_model``.

  Returns:
    The corpus entry with the highest cosine similarity to the query.

  Raises:
    ValueError: If the corpus is empty, if the number of embeddings differs
      from the number of documents, or if the query and corpus embeddings
      have different dimensions (i.e. came from different models).
  """
  if corpus is None:
    # Backward-compatible default: search the notebook-level corpus.
    corpus = documents
    if corpus_embeddings is None:
      corpus_embeddings = document_embeddings
  elif corpus_embeddings is None:
    # A custom corpus was supplied without vectors: embed it with the same
    # model as the query so the two are guaranteed to be comparable.
    corpus_embeddings = embedder_model.embed_documents(corpus)

  if len(corpus) == 0:
    raise ValueError("Cannot search an empty corpus.")
  if len(corpus_embeddings) != len(corpus):
    raise ValueError(
        f"Got {len(corpus_embeddings)} embeddings for {len(corpus)} documents."
    )

  query_embedding = embedder_model.embed_query(query)
  if len(query_embedding) != len(corpus_embeddings[0]):
    # Fail with an actionable message instead of a shape error from sklearn.
    raise ValueError(
        f"Query embedding has dimension {len(query_embedding)} but corpus "
        f"embeddings have dimension {len(corpus_embeddings[0])}; embed the "
        "corpus with the same model that is passed as embedder_model."
    )

  cos_score = cosine_similarity([query_embedding], corpus_embeddings)[0]
  top_result_id = int(np.argmax(cos_score))
  return corpus[top_result_id]

In [48]:
new_sentence = 'Tell me about AI'
semantic_search_engine(new_sentence, open_embed_model)

'Artificial Intelligence aims to create machines that can think and learn.'

In [49]:
new_sentence = 'Do you know about the pyramids?'
semantic_search_engine(new_sentence, open_embed_model)

'The pyramids of Egypt are historical monuments that have stood for thousands of years.'

In [50]:
new_sentence = 'How do plants survive?'
semantic_search_engine(new_sentence, open_embed_model)

'Photosynthesis is the process by which green plants make food using sunlight.'