In [1]:
import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain.embeddings import OpenAIEmbeddings

# Tiny corpus: two sentences about a cat on a mat, two about a dog in a yard.
documents = [
    "The cat is on the mat.",
    "There is a cat on the mat.",
    "The dog is in the yard.",
    "There is a dog in the yard.",
]
query = "A cat is sitting on a mat."

# Embed the corpus and the query with the same model so the vectors are comparable.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
document_embeddings = embeddings.embed_documents(documents)
query_embedding = embeddings.embed_query(query)

# cosine_similarity works on 2-D inputs, so the single query vector is wrapped
# in a list; the result holds one row of scores (one score per document).
similarity_scores = cosine_similarity([query_embedding], document_embeddings)

# With a single query row, the flat argmax is directly the document index.
most_similar_index = similarity_scores.argmax()
most_similar_document = documents[most_similar_index]

print(
    "Most similar document to the query",
    f"'{query}':",
    most_similar_document,
    sep="\n",
)



Most similar document to the query
'A cat is sitting on a mat.':
The cat is on the mat.


In [2]:
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings

# Sentence-transformers model, configured to run on the CPU.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device="cpu")
hf = HuggingFaceEmbeddings(model_kwargs=model_kwargs, model_name=model_name)

# Placeholder inputs: "Document 1" through "Document 3".
documents = [f"Document {n}" for n in range(1, 4)]

# One embedding vector is computed per input string.
doc_embeddings = hf.embed_documents(documents)

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)a8e1d/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<?, ?B/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 191kB/s]
Downloading (…)b20bca8e1d/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 11.9MB/s]
Downloading (…)0bca8e1d/config.json: 100%|██████████| 571/571 [00:00<?, ?B/s] 
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<?, ?B/s] 
Downloading (…)e1d/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 332kB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [02:14<00:00, 3.25MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<?, ?B/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 266kB/s]
Downloading (…)a8e1d/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.14MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 400kB/s]
Downloading (…)8e1d/train

Cohere embeddings

In [4]:
import os

import cohere
from langchain.embeddings import CohereEmbeddings

# SECURITY: credentials must never be committed in a notebook. The API key is
# read from the COHERE_API_KEY environment variable; if it is not set, the
# lookup raises KeyError immediately instead of failing later inside the client.
# NOTE(review): this cell previously contained a hard-coded key. Treat that key
# as compromised and revoke it in the Cohere dashboard.
#
# The client is bound to `cohere_embeddings` (not `cohere`) so it no longer
# shadows the imported `cohere` module.
cohere_embeddings = CohereEmbeddings(
    model="embed-multilingual-v2.0",
    cohere_api_key=os.environ["COHERE_API_KEY"],
)

# The same greeting in several languages, to exercise the multilingual model.
texts = [
    "Hello from Cohere!",
    "مرحبًا من كوهير!",
    "Hallo von Cohere!",
    "Bonjour de Cohere!",
    "¡Hola desde Cohere!",
    "Olá do Cohere!",
    "Ciao da Cohere!",
    "您好，来自 Cohere！",
    "कोहेरे से नमस्ते!"
]

document_embeddings = cohere_embeddings.embed_documents(texts)

# Show only the first five components of each vector to keep the output short.
for text, embedding in zip(texts, document_embeddings):
    print(f"Text: {text}")
    print(f"Embedding: {embedding[:5]}")

TextL Hello from Cohere!
Embedding: [0.23449707, 0.50146484, -0.048797607, 0.13989258, -0.18005371]
TextL مرحبًا من كوهير!
Embedding: [0.25341797, 0.30029297, 0.010414124, 0.12585449, -0.18237305]
TextL Hallo von Cohere!
Embedding: [0.10290527, 0.2836914, -0.049560547, 0.23706055, -0.07165527]
TextL Bonjour de Cohere!
Embedding: [0.15161133, 0.28222656, -0.057128906, 0.117370605, -0.04421997]
TextL ¡Hola desde Cohere!
Embedding: [0.2512207, 0.43139648, -0.086120605, 0.24658203, -0.11669922]
TextL Olá do Cohere!
Embedding: [0.18664551, 0.39086914, -0.046051025, 0.14562988, -0.11242676]
TextL Ciao da Cohere!
Embedding: [0.115722656, 0.43310547, -0.026016235, 0.14526367, 0.07104492]
TextL 您好，来自 Cohere！
Embedding: [0.24597168, 0.3088379, -0.11212158, 0.26611328, -0.0513916]
TextL कोहेरे से नमस्ते!
Embedding: [0.1928711, 0.6352539, 0.03225708, 0.11767578, -0.2607422]


DeepLake

In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# Four one-sentence facts that form the knowledge base for retrieval.
texts = [
    "Napoleon Bonaparte was born in 15 August 1769",
    "Louis XIV was born in 5 September 1638",
    "Lady Gaga was born in 28 March 1986",
    "Michael Jeffrey Jordan was born in 17 February 1963"
]

# Split into Document objects: chunks of up to 1000 characters, no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
docs = text_splitter.create_documents(texts)

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Deep Lake dataset location has the form hub://<org id>/<dataset name>.
my_activeloop_org_id = "veaceslavcalestru"
my_activeloop_dataset_name = "langchain_course_embeddings"
dataset_path = "hub://" + "/".join(
    (my_activeloop_org_id, my_activeloop_dataset_name)
)
db = DeepLake(embedding_function=embeddings, dataset_path=dataset_path)

# Embed and store the documents. As the cell's last expression, the value
# returned by add_documents is displayed by the notebook.
db.add_documents(docs)

Your Deep Lake dataset has been successfully created!
The dataset is private so make sure you are logged in!


 

Dataset(path='hub://veaceslavcalestru/langchain_course_embeddings', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
 embedding  embedding  (4, 1536)  float32   None   
    id        text      (4, 1)      str     None   
 metadata     json      (4, 1)      str     None   
   text       text      (4, 1)      str     None   


['044bbcc8-6e70-11ee-b60b-cc4740c98b6b',
 '044bbcc9-6e70-11ee-98a7-cc4740c98b6b',
 '044bbcca-6e70-11ee-b082-cc4740c98b6b',
 '044bbccb-6e70-11ee-b8f9-cc4740c98b6b']

In [8]:
# Use the vector store built above as the retriever for question answering.
retriever = db.as_retriever()
model = ChatOpenAI(model="gpt-3.5-turbo")
qa_chain = RetrievalQA.from_llm(llm=model, retriever=retriever)

# Last expression of the cell, so the notebook displays the chain's answer.
qa_chain.run("When was Michael Jordan born?")

'Michael Jordan was born on 17 February 1963.'