## Embeddings, another example using 

In [2]:
import pandas as pd
import pprint as pp
from pathlib import Path

from dotenv import find_dotenv, load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from openai.embeddings_utils import cosine_similarity

load_dotenv(Path('../../.env'))

True

In [7]:
embeddings = OpenAIEmbeddings()
df = pd.read_csv('./data.csv')
print(df.head(10))

# Save to csv so you dont need to go to Openai 
# each time. 
df['embedding'] = df['Words'].apply(lambda x: embeddings.embed_query(x))
df.to_csv('word_embeddings.csv')

        Words
0    Elephant
1        Lion
2       Tiger
3         Dog
4     Cricket
5     Footbal
6      Tennis
7  Basketball
8       Apple
9      Orange


In [8]:
new_df = pd.read_csv('word_embeddings.csv')
new_df

Unnamed: 0.1,Unnamed: 0,Words,embedding
0,0,Elephant,"[-0.01782482331077295, -0.008749894932295476, ..."
1,1,Lion,"[-0.001506082151574034, -0.01005350346088858, ..."
2,2,Tiger,"[-0.013397357906240956, -0.009651300171978608,..."
3,3,Dog,"[-0.0009682854718578403, -0.015160790406976526..."
4,4,Cricket,"[0.003916181735831925, -0.007183414623500497, ..."
5,5,Footbal,"[-0.011428742569128604, -0.008062036637177512,..."
6,6,Tennis,"[-0.022973087948412095, 0.0016404849747739204,..."
7,7,Basketball,"[-0.012690632047445981, -0.013235320727905299,..."
8,8,Apple,"[0.014517220425915257, -0.003906639012518288, ..."
9,9,Orange,"[0.020745464394357635, -0.029286145324851107, ..."


In [10]:
text = "Mango"
text_embedding = embeddings.embed_query(text)
pp.pprint(f"Our embedding is {text_embedding}")

('Our embedding is [-0.0033660750329263744, -0.019877087575502057, '
 '0.010494233241386207, -0.016172489355289036, 0.006269075055495474, '
 '0.008526964220638826, -0.02520404601586244, -0.01423076837573066, '
 '0.001090621007388965, -0.03298370209204527, -0.0011105812294751386, '
 '0.003672662470234827, 0.0019449142518763545, 0.01682398675735074, '
 '-0.021576093781861608, 0.004784042075997122, 0.03267711442190618, '
 '-0.008220376550499727, 0.009408403306627443, -0.015750931774509067, '
 '-0.01591699963017312, 0.002847111819822978, 0.0014938154679030428, '
 '-0.012391243286218651, 0.004633941746076198, 0.00949782424475672, '
 '0.024501448806132378, -0.01363036798736955, 0.011548128124658713, '
 '0.00302435764199916, 0.047827646739376714, -0.014971688578566816, '
 '-0.0043880330020674525, -0.009868283880513506, -0.017616005836532667, '
 '0.004605199268415981, -0.010328165385722154, -0.006336140991923079, '
 '-0.006316979495370032, -0.018344151087451736, 0.014447935107740482, '
 '-0.00

In [13]:
# Openai package
df["similarity score"] = df['embedding'].apply(lambda x: cosine_similarity(x, text_embedding))
df.sort_values("similarity score", ascending=False).head(10)

## As you can see, banana is considered similar to Mango

Unnamed: 0,Words,embedding,similarity score
10,Banana,"[-0.01302593772857713, -0.019896900474770694, ...",0.898691
2,Tiger,"[-0.013397357906240956, -0.009651300171978608,...",0.852014
9,Orange,"[0.020745464394357635, -0.029286145324851107, ...",0.843953
0,Elephant,"[-0.01782482331077295, -0.008749894932295476, ...",0.830545
1,Lion,"[-0.001506082151574034, -0.01005350346088858, ...",0.827259
4,Cricket,"[0.003916181735831925, -0.007183414623500497, ...",0.817998
8,Apple,"[0.014517220425915257, -0.003906639012518288, ...",0.813918
6,Tennis,"[-0.022973087948412095, 0.0016404849747739204,...",0.805554
7,Basketball,"[-0.012690632047445981, -0.013235320727905299,...",0.794429
5,Footbal,"[-0.011428742569128604, -0.008062036637177512,...",0.777427
